In [143]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [144]:
# open final data
data = pd.read_csv('../../data/processed/final_data.csv')

# get the list of columns
columns = data.columns
positive_columns = [col for col in columns if '+' in col]
negative_columns = [col for col in columns if '-' in col]
original_columns = [col for col in columns if '+' not in col and '-' not in col]

In [145]:
# use only the training data
model_training_data = data[data['dataset'] == 'train']

# convert the datetime columns to datetime
model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])


In [146]:
test_period = 4
lag_period = 2
maximum_day = 20

# mask the data from day 3 to 15 of each month using 'datetime' column
negative_train_mask = model_training_data['datetime'].apply(lambda x: x.day > lag_period and x.day < maximum_day - test_period)
negative_test_mask = model_training_data['datetime'].apply(lambda x: x.day >= maximum_day - test_period)

# get the negative training data
negative_train_data = model_training_data[negative_train_mask][original_columns + negative_columns].copy()
negative_test_data = model_training_data[negative_test_mask][original_columns + negative_columns].copy()

# maske the data from day 5 to 17 of each month using 'datetime' column
positive_train_mask = model_training_data['datetime'].apply(lambda x: x.day > test_period and x.day < maximum_day - lag_period)
positive_test_mask = model_training_data['datetime'].apply(lambda x: x.day <= test_period)

# get the positive training data
positive_train_data = model_training_data[positive_train_mask][original_columns + positive_columns].copy()
positive_test_data = model_training_data[positive_test_mask][original_columns + positive_columns].copy()

# save the data into a dictionary
training_data = {
    'positive': {
        'X': positive_train_data,
        'y': positive_train_data
        },
    'negative': {
        'X': negative_train_data,
        'y': negative_train_data
        }
    }

In [147]:
training_data['negative']['X'].columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'dataset', 'dayofyear', 'weekofyear',
       'dayofweek', 'windspeed, -1', 'atemp, -1', 'humidity, -1',
       'holiday, -1', 'workingday, -1', 'weather, -1', 'temp, -1',
       'casual, -1', 'registered, -1', 'count, -1', 'windspeed, -2',
       'atemp, -2', 'humidity, -2', 'holiday, -2', 'workingday, -2',
       'weather, -2', 'temp, -2', 'casual, -2', 'registered, -2', 'count, -2'],
      dtype='object')

In [148]:
# for each train or test assert there are no NaN values
assert not negative_train_data.isnull().values.any()
assert not negative_test_data.isnull().values.any()
assert not positive_train_data.isnull().values.any()
assert not positive_test_data.isnull().values.any()

In [149]:
def create_pipelines(target_columns, directions, rf=True, mlp=False):
    # create a dictionary to store the pipelines
    pipelines = {}

    for target in target_columns:
        for direction in directions:
                
            if rf:
                # create random forest pipeline for the target and direction
                globals()[f'{direction}_{target}_pipeline_rf'] = Pipeline([
                    ('scaler', StandardScaler()),
                    ('regressor', RandomForestRegressor(n_estimators=2)),
                ])

                # save the pipeline
                pipelines[f'{direction}_{target}_pipeline_rf'] = globals()[f'{direction}_{target}_pipeline_rf']
            
            if mlp:
                # create MLP pipeline for the target and direction
                globals()[f'{direction}_{target}_pipeline_mlp'] = Pipeline([
                    ('scaler', StandardScaler()),
                    ('regressor', MLPRegressor(max_iter=2000, hidden_layer_sizes=(100, 100), verbose=True, n_iter_no_change=25)),
                ])

                # save the pipeline
                pipelines[f'{direction}_{target}_pipeline_mlp'] = globals()[f'{direction}_{target}_pipeline_mlp']

    return pipelines

In [150]:
def fit_pipelines(pipelines, train_data, target_columns, columns_not_to_use, directions, rf = True, mlp = False):
    # create a dictionary to store the fitted pipelines
    fitted_pipelines = {}

    for target in target_columns:
        print(f'Fitting pipelines for {target}')
        
        for direction in directions:

            df = train_data[direction]['X'].copy()
            target_data = df[target]
            drop_coumns = columns_not_to_use + target_columns
            df = df.drop(drop_coumns, axis=1)
            
            # get the pipelines
            if rf:
                pipeline_rf = pipelines[f'{direction}_{target}_pipeline_rf']
                
                # fit the pipeline
                pipeline_rf.fit(df, target_data)

                # save the fitted pipeline
                fitted_pipelines[f'{direction}_{target}_pipeline_rf'] = pipeline_rf

                # create a dataframe to store the feature importances
                feature_importances = pd.DataFrame({
                    'feature': df.columns,
                    'importance': pipeline_rf.named_steps['regressor'].feature_importances_
                })

                # sort the features by importance
                feature_importances = feature_importances.sort_values('importance', ascending=False)

                # print the feature importances
                print(f'{direction}_{target}_pipeline_rf')
                print(feature_importances)

            if mlp:
                pipeline_mlp = pipelines[f'{direction}_{target}_pipeline_mlp']
                
                # fit the pipeline
                pipeline_mlp.fit(df, target_data)

                # save the fitted pipeline
                fitted_pipelines[f'{direction}_{target}_pipeline_mlp'] = pipeline_mlp

    return fitted_pipelines

In [151]:
def store_prediction_with_lags(df, mask, direction, target, prediction):

    # store the prediction in the masked dataframe
    df.loc[mask, target] = prediction

    lags = [1, 2]

    if direction == 'negative':
        lags = [-1, -2]

    # get the 'datetime' column for the masked data
    datetime = df[mask]['datetime']

    for lag in lags:
        # get the lagged 'datetime' column
        lagged_datetime = datetime + pd.DateOffset(days=lag)

        # get the mask for the lagged 'datetime' column
        lagged_mask = df['datetime'].isin(lagged_datetime)

        # insert the prediction into the dataframe subsetted by the lagged mask
        df.loc[lagged_mask, target] = prediction

    return df

In [152]:
def predict_pipelines(fitted_pipelines, training_data, 
                      target_columns, columns_not_to_use, 
                      test_period, lag_period, maximum_day,
                      directions, rf=True, mlp=False):

    # create a dictionary to store the predictions
    predictions = {}
    dataframes = {}

    for target in target_columns:

        for direction in directions:

            # get the y data
            df = training_data[direction]['y'].copy()
            drop_columns = columns_not_to_use + target_columns

            if direction == 'negative':
                start_day = maximum_day - test_period
            elif direction == 'positive':
                start_day = test_period

            day = start_day
            days_predicted = 0

            while days_predicted < test_period:
                # mask the data to select all rows corresponding to a day of the month equal to start_day
                mask = df['datetime'].apply(lambda x: x.day == day)

                # get the data
                df_days = df[mask].copy()
                df_days = df_days.drop(drop_columns, axis=1)

                # get the pipeline
                if rf:
                    pipeline1 = fitted_pipelines[f'{direction}_{target}_pipeline_rf']
                if mlp:
                    pipeline2 = fitted_pipelines[f'{direction}_{target}_pipeline_mlp']

                # take the mean of the predictions if both pipelines are used
                if rf and mlp:
                    prediction1 = pipeline1.predict(df_days)
                    prediction2 = pipeline2.predict(df_days)
                    prediction = (prediction1 + prediction2) / 2
                elif rf:
                    prediction = pipeline1.predict(df_days)
                elif mlp:
                    prediction = pipeline2.predict(df_days)

                # store the prediction
                df = store_prediction_with_lags(df, mask, direction, target, prediction)
            
                if direction == 'negative':
                    day += 1

                elif direction == 'positive':
                    day -= 1

            prediction = df[target]

            # assert there are no NaN values in the prediction
            assert not prediction.isna().values.any()

            # store the predictions
            predictions[f'{direction}_{target}'] = prediction

            # store the dataframe
            dataframes[f'{direction}_{target}'] = df

    return predictions, dataframes

In [153]:
def evaluate_pipelines(predictions, training_data, target_columns, directions):
    # create a dictionary to store the evaluation metrics
    evaluation = {}

    for target in target_columns:
        for direction in directions:

            # get the y data
            y = training_data[direction]['y'][target]

            # get the predictions
            prediction = predictions[f'{direction}_{target}']

            # calculate the evaluation metrics
            mse = mean_squared_error(y, prediction)
            mae = mean_absolute_error(y, prediction)

            # save the evaluation metrics
            evaluation[f'{direction}_{target}'] = {
                'mse': mse,
                'mae': mae
            }

            # print the evaluation metrics
            print(f'{direction}_{target}')
            print(f'MSE: {mse}')
            print(f'MAE: {mae}')

    return evaluation

In [154]:
columns_not_to_use = ['datetime', 'dataset', 'day', 'month', 'year', 'casual', 'registered', 'count']
target_columns = ['casual', 'registered', 'count']
directions = ['negative', 'positive']
rf = True
mlp = False

# create the pipelines
pipelines = create_pipelines(target_columns, directions, rf=rf, mlp=mlp)

test_period = 4
lag_period = 2
maximum_day = 20

# fit the pipelines
fitted_pipelines = fit_pipelines(pipelines, training_data, 
                                 target_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the pipelines
predictions, dataframes = predict_pipelines(fitted_pipelines, training_data, 
                                target_columns, columns_not_to_use, 
                                test_period, lag_period, maximum_day,
                                directions, rf=rf, mlp=mlp)

# evaluate the pipelines
evaluation = evaluate_pipelines(predictions, training_data, target_columns, directions)



Fitting pipelines for casual
negative_casual_pipeline_rf
           feature  importance
19      casual, -1    0.557664
2       workingday    0.118125
6         humidity    0.077284
29      casual, -2    0.043451
11       dayofweek    0.032330
8             hour    0.032056
5            atemp    0.015320
7        windspeed    0.013417
21       count, -1    0.012643
4             temp    0.009240
9        dayofyear    0.008803
12   windspeed, -1    0.008563
24    humidity, -2    0.007599
31       count, -2    0.006545
28        temp, -2    0.006195
18        temp, -1    0.006068
30  registered, -2    0.005526
22   windspeed, -2    0.005416
23       atemp, -2    0.005289
20  registered, -1    0.004942
14    humidity, -1    0.004834
10      weekofyear    0.004406
13       atemp, -1    0.004382
16  workingday, -1    0.002913
3          weather    0.001977
17     weather, -1    0.001574
27     weather, -2    0.001522
1          holiday    0.001055
0           season    0.000616
15     holida

ValueError: Found array with 0 sample(s) (shape=(0, 32)) while a minimum of 1 is required by StandardScaler.

In [None]:
# select columns not to be used in the model
columns_not_to_use = ['datetime', 'dataset', 'day', 'month', 'year', 'casual', 'registered', 'count']

# train the models
for target in target_columns:
    for direction in ['positive', 'negative']:
        # train the first pipeline
        globals()[f'{direction}_{target}_pipeline_1'].fit(
            globals()[f'{direction}_train_data'].drop(columns=columns_not_to_use, axis=1),
            globals()[f'{direction}_train_data'][target]
        )

        # save the most important feature into a dataframe
        feature_importances = pd.DataFrame(
            globals()[f'{direction}_{target}_pipeline_1'].named_steps['regressor'].feature_importances_,
            index=globals()[f'{direction}_train_data'].drop(columns=columns_not_to_use, axis=1).columns,
            columns=['importance']
        )

        # sort the dataframe
        feature_importances = feature_importances.sort_values(by='importance', ascending=False)

        # print the most important features
        print(f'{direction} {target} most important features:')
        print(feature_importances)
        print('')

        # train the second pipeline
        globals()[f'{direction}_{target}_mlp_pipeline_2'].fit(
            globals()[f'{direction}_train_data'].drop(columns=columns_not_to_use, axis=1),
            globals()[f'{direction}_train_data'][target]
        )

KeyboardInterrupt: 

In [None]:
# evaluate the models

for target in target_columns:
    for direction in ['positive', 'negative']:

        # print the statetment of target and direction
        print(f'{direction} {target} evaluation')

        # predict the target using the first pipeline
        globals()[f'{direction}_{target}_predictions_1'] = globals()[f'{direction}_{target}_pipeline_1'].predict(
            globals()[f'{direction}_test_data'].drop(columns=columns_not_to_use, axis=1)
        )

        # calculate the mean absolute error
        globals()[f'{direction}_{target}_mae_1'] = mean_absolute_error(globals()[f'{direction}_test_data'][target], globals()[f'{direction}_{target}_predictions_1'])

        # print the results
        print(f'MAE RF: {globals()[f"{direction}_{target}_mae_1"]}')

        # predict the target using the second pipeline
        globals()[f'{direction}_{target}_predictions_2'] = globals()[f'{direction}_{target}_mlp_pipeline_2'].predict(
            globals()[f'{direction}_test_data'].drop(columns=columns_not_to_use, axis=1)
        )

        # calculate the mean absolute error
        globals()[f'{direction}_{target}_mae_2'] = mean_absolute_error(globals()[f'{direction}_test_data'][target], globals()[f'{direction}_{target}_predictions_2'])

        # print the results
        print(f'MAE MLP: {globals()[f"{direction}_{target}_mae_2"]}')

        # take the average of the two predictions
        globals()[f'{direction}_{target}_predictions'] = (globals()[f'{direction}_{target}_predictions_1'] + globals()[f'{direction}_{target}_predictions_2']) / 2

        # calculate the mean absolute error
        globals()[f'{direction}_{target}_mae'] = mean_absolute_error(globals()[f'{direction}_test_data'][target], globals()[f'{direction}_{target}_predictions'])

        # print the results
        print(f'MAE AVG: {globals()[f"{direction}_{target}_mae"]}')
        print('')

    # combine predictions from both directions
    globals()[f'{target}_predictions'] = (globals()[f'positive_{target}_predictions'] + globals()[f'negative_{target}_predictions']) / 2

    # calculate the mean absolute error
    globals()[f'{target}_mae'] = mean_absolute_error(data[data['dataset'] == 'test'][target], globals()[f'{target}_predictions'])

    # print the results
    print(f'{target} MAE: {globals()[f"{target}_mae"]}')
    print('\n')

NameError: name 'target_columns' is not defined

In [None]:
# evaluate the models:

for target in target_columns:
    for direction in ['positive', 'negative']:

SyntaxError: incomplete input (3423881943.py, line 4)