In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostRegressor
from sklearn. feature_selection import SelectFromModel
from sklearn. linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [18]:
# preprocess data

def preprocess_data(df, max_vid=600):
    """Clean a raw observation frame and aggregate it to hourly resolution.

    The input frame is NOT modified; all work happens on derived copies.

    Steps:
      1. Replace +/-inf with NaN and drop every row that contains a NaN.
      2. Parse the 'datetime' column (format ``%d/%m/%y %H:%M``) and use it
         as the index.
      3. Resample into one-hour bins: 'vid' is summed, every other listed
         column is averaged.
      4. Drop hourly bins whose 't2m' is NaN, then add 'hour', 'day',
         'month' and 'year' columns derived from the bin timestamp.
      5. Round 't2m', 'sp', 'spDayBefore' and 'tHeightAvg' to whole numbers
         and cast them to int.
      6. Keep only bins whose summed 'vid' is <= ``max_vid``.

    No per-feature range filtering is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string 'datetime' column plus 'vid' and every column
        named in ``mean_columns`` below.
    max_vid : int or float, default 600
        Upper bound (inclusive) on the hourly 'vid' sum; bins above it are
        discarded.

    Returns
    -------
    pandas.DataFrame
        Hourly frame with a 'datetime' column (not the index), the
        aggregated columns, and the four calendar columns.
    """
    mean_columns = [
        't2m', 'sp', 'spDayBefore', 'tcc', 'tp',
        'tHeightAvg', 'uHeightAvg', 'vHeightAvg',
        'u10', 'v10', 'u100', 'v100',
        'Noorway_sp', 'MeckPom_sp', 'Denmark_sp', 'Sweden_sp',
        'Noorway_u10', 'Noorway_v10',
        'MeckPom_u10', 'MeckPom_v10',
        'Denmark_u10', 'Denmark_v10',
        'Sweden_u10', 'Sweden_v10',
    ]
    integer_columns = ['t2m', 'sp', 'spDayBefore', 'tHeightAvg']

    # Non-inplace replace/dropna return new frames, so the caller's
    # DataFrame keeps its rows, its 'datetime' column and its index.
    cleaned = df.replace([np.inf, -np.inf], np.nan).dropna()
    cleaned = cleaned.assign(
        datetime=pd.to_datetime(cleaned['datetime'], format='%d/%m/%y %H:%M')
    ).set_index('datetime')

    # 'vid' first so the output column order stays vid, t2m, sp, ...
    aggregations = {'vid': 'sum'}
    aggregations.update({column: 'mean' for column in mean_columns})

    # A Timedelta rule gives the same one-hour bins as the 'H' alias, which
    # newer pandas releases deprecate.
    data_hourly = (
        cleaned
        .resample(pd.Timedelta(hours=1))
        .agg(aggregations)
        .reset_index()
        .replace([np.inf, -np.inf], np.nan)
        # Hours with no observations have NaN means; drop them before the
        # integer casts below, which cannot represent NaN.
        .dropna(subset=['t2m'])
    )

    timestamps = data_hourly['datetime'].dt
    data_hourly['hour'] = timestamps.hour
    data_hourly['day'] = timestamps.day
    data_hourly['month'] = timestamps.month
    data_hourly['year'] = timestamps.year

    for column in integer_columns:
        data_hourly[column] = data_hourly[column].round().astype(int)

    # Discard hours whose summed 'vid' exceeds the threshold.
    data_hourly = data_hourly[data_hourly['vid'] <= max_vid]

    return data_hourly


In [19]:
# Read the training and test CSVs, aggregate each to hourly rows with
# preprocess_data, and build the feature matrices / target vectors.
missing_values = ["n/a", "na", "--", "NA", "N/A"]


def _load_hourly(csv_path):
    """Read one CSV, drop incomplete rows, preprocess it, and drop any
    rows that still hold inf/NaN afterwards."""
    raw_frame = pd.read_csv(csv_path, na_values=missing_values).dropna()
    hourly_frame = preprocess_data(raw_frame)
    return hourly_frame.replace([np.inf, -np.inf], np.nan).dropna()


fallDataTrain = _load_hourly("../datasets/fallDataEx19.csv")
fallDataTest = _load_hourly("../datasets/fallData19.csv")

# Alternative feature sets tried earlier (kept for reference):
# ['minute','hour', 'date', 'month', 't2m', 'sp', 'spDayBefore', 'tcc', 'tp', 'tHeightAvg', 'uHeightAvg',
#  'vHeightAvg', 'u10','v10', 'u100', 'v100', 'Noorway_sp', 'MeckPom_sp', 'Denmark_sp', 'Sweden_sp',
#  'Noorway_u10', 'Noorway_v10', 'MeckPom_u10', 'MeckPom_v10', 'Denmark_u10', 'Denmark_v10',
#  'Sweden_u10', 'Sweden_v10']
# ['hour','uHeightAvg','MeckPom_v10','t2m','spDayBefore','sp','month', 'tHeightAvg',  'day']
features = [
    'hour', 'day', 'month',
    't2m', 'sp', 'spDayBefore', 'tp',
    'tHeightAvg', 'uHeightAvg', 'vHeightAvg',
    'u10', 'v10', 'u100', 'v100',
    'Noorway_sp', 'MeckPom_sp', 'Denmark_sp', 'Sweden_sp',
    'Noorway_u10', 'Noorway_v10',
    'MeckPom_u10', 'MeckPom_v10',
    'Denmark_u10', 'Denmark_v10',
    'Sweden_u10', 'Sweden_v10',
]

# 'vid' is the regression target for both splits.
X_train, y_train = fallDataTrain[features], fallDataTrain['vid']
X_test, y_test = fallDataTest[features], fallDataTest['vid']

print(X_train.shape[0])

24025


In [20]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
# Lasso-based feature selector fitted on the raw (unscaled) training features.
# NOTE(review): the features here are on their original scales, and an L1
# penalty is generally scale-sensitive - confirm that is intended.
feature_sel_model = SelectFromModel(
    Lasso(alpha=0.005, max_iter=10000, random_state=0)
).fit(X_train, y_train)

# Boolean mask, one entry per input column: True means the column is kept.
print(feature_sel_model.get_support())

[ True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]


In [13]:
# Same Lasso-based selector, but on standardized inputs.
# It must be fitted on the training split: fitting on X_test_scaled / y_test
# would let test-set targets drive the feature choice (data leakage).
feature_sel_model_scaled = SelectFromModel(Lasso(alpha=0.005, max_iter=10000, random_state=0))
feature_sel_model_scaled.fit(X_train_scaled, y_train)

# Boolean mask, one entry per input column: True means the column is kept.
print(feature_sel_model_scaled.get_support())

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]


In [22]:
# Summarize what the unscaled Lasso selector kept.
selected_mask = feature_sel_model.get_support()
selected_feat = X_train.columns[selected_mask]

# Count coefficients the fitted Lasso set exactly to zero.
n_zeroed = np.sum(feature_sel_model.estimator_.coef_ == 0)

print(f'total features: {X_train.shape[1]}')
print(f'selected features: {len(selected_feat)}')
print(f'features with coefficients shrank to zero: {n_zeroed}')

# Names of the retained columns.
print(selected_feat)


total features: 26
selected features: 25
features with coefficients shrank to zero: 1
Index(['hour', 'day', 'month', 't2m', 'sp', 'spDayBefore', 'tHeightAvg',
       'uHeightAvg', 'vHeightAvg', 'u10', 'v10', 'u100', 'v100', 'Noorway_sp',
       'MeckPom_sp', 'Denmark_sp', 'Sweden_sp', 'Noorway_u10', 'Noorway_v10',
       'MeckPom_u10', 'MeckPom_v10', 'Denmark_u10', 'Denmark_v10',
       'Sweden_u10', 'Sweden_v10'],
      dtype='object')


In [23]:
# Restrict both splits to the columns retained by the selector.
# This rebinds X_train / X_test; X_train_scaled and X_test_scaled are not
# recomputed in this cell.
X_train, X_test = X_train.loc[:, selected_feat], X_test.loc[:, selected_feat]

In [24]:
# Fix the Python / NumPy / TensorFlow seeds so weight initialization,
# dropout masks and batch shuffling repeat from run to run (the AdaBoost cell
# already pins random_state=42; use the same seed here).
set_random_seed(42)

# Fully connected regressor: 64 -> 32 -> 16 -> 1, with dropout after the
# first two hidden layers to limit overfitting.
model = Sequential([
    Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='linear'),  # linear output for regression
])

# Adam optimizer, mean squared error loss.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train on the scaled training matrix; validation_split=0.2 holds out part
# of the training data for per-epoch validation loss.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

# The single-unit output layer yields an (n, 1) array; flatten it so the
# predictions have the same 1-D shape as y_test.
y_pred = model.predict(X_test_scaled).ravel()
y_pred_rounded = np.round(y_pred).astype(int)

# Metrics are computed on the rounded (integer) predictions.
mae = mean_absolute_error(y_test, y_pred_rounded)
mse = mean_squared_error(y_test, y_pred_rounded)
r2 = r2_score(y_test, y_pred_rounded)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [7]:
# AdaBoost ensemble of decision-tree regressors, trained on X_train / y_train.
base_estimator = DecisionTreeRegressor(random_state=42)
ada_model = AdaBoostRegressor(
    estimator=base_estimator,
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Predict on the test split and round to whole numbers.
y_pred = ada_model.predict(X_test)
y_pred_rounded = y_pred.round().astype(int)

# Score the rounded predictions against the true targets.
mae = mean_absolute_error(y_test, y_pred_rounded)
mse = mean_squared_error(y_test, y_pred_rounded)
r2 = r2_score(y_test, y_pred_rounded)

print("ADA Boost")
for label, value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("R2 Score:", r2),
):
    print(label, value)

ADA Boost
Mean Absolute Error: 3.1578301670785627
Mean Squared Error: 58.601682696636
R2 Score: 0.1585903267724561


In [28]:
# Debug output: show the rounded predictions from the most recently run model cell.
print(f"🚀 ~ y_pred_rounded: {y_pred_rounded}")

🚀 ~ y_pred_rounded: [2 2 2 ... 2 2 2]


In [29]:
# Debug output: show the true test targets. The label now names the variable
# actually printed (y_test), not a prediction.
print("🚀 ~ y_test:", y_test)

🚀 ~ y_pred_test: 0        3
1        2
2        2
3        3
4        2
        ..
34411    3
34412    3
34413    3
34414    3
34415    3
Name: vid, Length: 34416, dtype: int64
