In [178]:
import numpy as np
import pandas as pd

In [179]:
# open final data
data = pd.read_csv('../../data/processed/final_data.csv')

# get the list of columns
columns = data.columns
positive_columns = [col for col in columns if '+' in col]
negative_columns = [col for col in columns if '-' in col]
original_columns = [col for col in columns if '+' not in col and '-' not in col]

In [180]:
print(original_columns)

['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'year', 'month', 'day', 'hour', 'dataset', 'dayofyear', 'weekofyear', 'dayofweek']


In [181]:
# use only the training data
model_training_data = data[data['dataset'] == 'train']

# convert the datetime columns to datetime
model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])


In [182]:
# mask the data from day 3 to 15 of each month using 'datetime' column
negative_train_mask = model_training_data['datetime'].apply(lambda x: x.day >= 3 and x.day <= 15)
negative_test_mask = model_training_data['datetime'].apply(lambda x: x.day > 15)

# get the negative training data
negative_train_data = model_training_data[negative_train_mask][original_columns + negative_columns].copy()
negative_test_data = model_training_data[negative_test_mask][original_columns + negative_columns].copy()

# maske the data from day 5 to 17 of each month using 'datetime' column
positive_train_mask = model_training_data['datetime'].apply(lambda x: x.day >= 5 and x.day <= 17)
positive_test_mask = model_training_data['datetime'].apply(lambda x: x.day < 5)

# get the positive training data
positive_train_data = model_training_data[positive_train_mask][original_columns + positive_columns].copy()
positive_test_data = model_training_data[positive_test_mask][original_columns + positive_columns].copy()

In [183]:
# for each train or test assert there are no NaN values
assert not negative_train_data.isnull().values.any()
assert not negative_test_data.isnull().values.any()
assert not positive_train_data.isnull().values.any()
assert not positive_test_data.isnull().values.any()

In [184]:
# imoprt pipeline, scaler, RFregressor, MLPregressor and metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

target_columns = ['casual', 'registered']

for target in target_columns:
    for direction in ['positive', 'negative']:

        # create random forest pipeline for the target and direction
        globals()[f'{direction}_{target}_pipeline_1'] = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', RandomForestRegressor(n_estimators=100)),
        ])
        
        """
        # create MLP pipeline for the target and direction
        globals()[f'{direction}_{target}_mlp_pipeline_2'] = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', MLPRegressor(max_iter=2000, hidden_layer_sizes=(100, 100), verbose=True, n_iter_no_change=25)),
        ])
        """

In [185]:
# select columns not to be used in the model
columns_not_to_use = ['datetime', 'dataset', 'day', 'month', 'year', 'casual', 'registered', 'count']

# train the models
for target in target_columns:
    for direction in ['positive', 'negative']:
        # train the first pipeline
        globals()[f'{direction}_{target}_pipeline_1'].fit(
            globals()[f'{direction}_train_data'].drop(columns=columns_not_to_use, axis=1),
            globals()[f'{direction}_train_data'][target]
        )

        # save the most important feature into a dataframe
        feature_importances = pd.DataFrame(
            globals()[f'{direction}_{target}_pipeline_1'].named_steps['regressor'].feature_importances_,
            index=globals()[f'{direction}_train_data'].drop(columns=columns_not_to_use, axis=1).columns,
            columns=['importance']
        )

        # sort the dataframe
        feature_importances = feature_importances.sort_values(by='importance', ascending=False)

        # print the most important features
        print(f'{direction} {target} most important features:')
        print(feature_importances)
        print('')

        # train the second pipeline
        globals()[f'{direction}_{target}_mlp_pipeline_2'].fit(
            globals()[f'{direction}_train_data'].drop(columns=columns_not_to_use, axis=1),
            globals()[f'{direction}_train_data'][target]
        )

positive casual most important features:
                importance
casual, +1        0.531137
dayofweek         0.161453
humidity          0.070568
atemp             0.027914
hour              0.025007
casual, +2        0.023519
registered, +1    0.018377
temp              0.016690
humidity, +1      0.015712
humidity, +2      0.012161
windspeed, +2     0.009579
registered, +2    0.009482
windspeed         0.008177
windspeed, +1     0.007368
atemp, +2         0.006897
temp, +1          0.006791
workingday        0.006438
temp, +2          0.006303
atemp, +1         0.005351
weather, +1       0.004699
dayofyear         0.004600
holiday, +1       0.004250
holiday, +2       0.003412
weekofyear        0.003247
workingday, +2    0.002641
holiday           0.002340
weather           0.002147
workingday, +1    0.001839
weather, +2       0.001248
season            0.000652

Iteration 1, loss = 1435.60512487
Iteration 2, loss = 500.50356986
Iteration 3, loss = 360.57507992
Iteration 4, loss = 3

In [186]:
# evaluate the models

for target in target_columns:
    for direction in ['positive', 'negative']:

        # print the statetment of target and direction
        print(f'{direction} {target} evaluation')

        # predict the target using the first pipeline
        globals()[f'{direction}_{target}_predictions_1'] = globals()[f'{direction}_{target}_pipeline_1'].predict(
            globals()[f'{direction}_test_data'].drop(columns=columns_not_to_use, axis=1)
        )

        # calculate the mean absolute error
        globals()[f'{direction}_{target}_mae_1'] = mean_absolute_error(globals()[f'{direction}_test_data'][target], globals()[f'{direction}_{target}_predictions_1'])

        # print the results
        print(f'MAE RF: {globals()[f"{direction}_{target}_mae_1"]}')

        # predict the target using the second pipeline
        globals()[f'{direction}_{target}_predictions_2'] = globals()[f'{direction}_{target}_mlp_pipeline_2'].predict(
            globals()[f'{direction}_test_data'].drop(columns=columns_not_to_use, axis=1)
        )

        # calculate the mean absolute error
        globals()[f'{direction}_{target}_mae_2'] = mean_absolute_error(globals()[f'{direction}_test_data'][target], globals()[f'{direction}_{target}_predictions_2'])

        # print the results
        print(f'MAE MLP: {globals()[f"{direction}_{target}_mae_2"]}')

        # take the average of the two predictions
        globals()[f'{direction}_{target}_predictions'] = (globals()[f'{direction}_{target}_predictions_1'] + globals()[f'{direction}_{target}_predictions_2']) / 2

        # calculate the mean absolute error
        globals()[f'{direction}_{target}_mae'] = mean_absolute_error(globals()[f'{direction}_test_data'][target], globals()[f'{direction}_{target}_predictions'])

        # print the results
        print(f'MAE AVG: {globals()[f"{direction}_{target}_mae"]}')
        print('')

positive casual evaluation
MAE RF: 13.076622299497112
MAE MLP: 16.805391534946576
MAE AVG: 13.48302419342169

negative casual evaluation
MAE RF: 11.77374913136866
MAE MLP: 15.853117782956026
MAE AVG: 12.480025738556735

positive registered evaluation
MAE RF: 29.64201440282047
MAE MLP: 38.71121546399405
MAE AVG: 30.3797139428525

negative registered evaluation
MAE RF: 28.529174921094917
MAE MLP: 37.43387570976768
MAE AVG: 29.481224579370814



In [187]:
# evaluate the models:

for target in target_columns:
    for direction in ['positive', 'negative']:

SyntaxError: incomplete input (3423881943.py, line 4)