In [14]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA

In [15]:
# load the processed dataset produced by the earlier preparation steps
data = pd.read_csv('../../data/processed/final_data.csv')

# group the column names by the marker they carry:
#   '+' in the name  -> positive_columns
#   '-' in the name  -> negative_columns
#   neither marker   -> original_columns
columns = data.columns
positive_columns = [name for name in columns if '+' in name]
negative_columns = [name for name in columns if '-' in name]
original_columns = [name for name in columns if not ('+' in name or '-' in name)]

In [16]:
# Keep only the rows of the training split. The explicit .copy() makes this an
# independent frame, so the column assignment below no longer writes to a slice
# of `data` (which is what raised SettingWithCopyWarning).
model_training_data = data[data['dataset'] == 'train'].copy()

# parse the 'datetime' column so the day of the month can be read from it later
model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])


In [17]:
test_period = 4
lag_period = 2
maximum_day = 20

# Day of the month of every training row, computed once with the vectorised
# datetime accessor instead of a Python-level lambda per row.
day_of_month = model_training_data['datetime'].dt.day

# 'negative' direction: fit on days lag_period+1 .. maximum_day-test_period-1
# (3..15 with the values above) and hold out days >= maximum_day-test_period.
negative_train_mask = (day_of_month > lag_period) & (day_of_month < maximum_day - test_period)
negative_test_mask = day_of_month >= maximum_day - test_period

negative_train_data = model_training_data.loc[negative_train_mask, original_columns + negative_columns].copy()
negative_test_data = model_training_data.loc[negative_test_mask, original_columns + negative_columns].copy()

# 'positive' direction: fit on days test_period+1 .. maximum_day-lag_period-1
# (5..17 with the values above) and hold out days <= test_period.
positive_train_mask = (day_of_month > test_period) & (day_of_month < maximum_day - lag_period)
positive_test_mask = day_of_month <= test_period

positive_train_data = model_training_data.loc[positive_train_mask, original_columns + positive_columns].copy()
positive_test_data = model_training_data.loc[positive_test_mask, original_columns + positive_columns].copy()

# Per direction: 'X' is the frame the pipelines are fitted on and 'y' is the
# hold-out frame they are evaluated on (both still contain the target columns).
training_data = {
    'positive': {
        'X': positive_train_data,
        'y': positive_test_data
        },
    'negative': {
        'X': negative_train_data,
        'y': negative_test_data
        }
    }

In [18]:
# inspect which columns ended up in the frame the 'negative' pipelines are fitted on
training_data['negative']['X'].columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'dataset', 'count_original',
       'registered_original', 'casual_original', 'dayofyear', 'weekofyear',
       'dayofweek', 'windspeed, -1', 'atemp, -1', 'humidity, -1',
       'holiday, -1', 'workingday, -1', 'weather, -1', 'temp, -1',
       'casual, -1', 'registered, -1', 'count, -1', 'windspeed, -2',
       'atemp, -2', 'humidity, -2', 'holiday, -2', 'workingday, -2',
       'weather, -2', 'temp, -2', 'casual, -2', 'registered, -2', 'count, -2'],
      dtype='object')

In [19]:
# none of the four split frames (fit / hold-out, both directions) may contain NaN
assert all(
    not frame.isnull().values.any()
    for frame in (negative_train_data, negative_test_data, positive_train_data, positive_test_data)
)

In [20]:
def create_pipelines(target_columns, directions, trees, hidden_layer_sizes, max_iter_no_change, rf, mlp, max_iter=2000, random_state=None):
    """Build one unfitted pipeline per (target, direction, model kind).

    Every pipeline is a StandardScaler followed by a regressor and is stored
    under the key ``'<direction>_<target name>_pipeline_<rf|mlp>'``, where the
    target name is the target column with its '_original' suffix removed.

    Parameters
    ----------
    target_columns : sequence of str
        Target column names; position ``i`` selects the i-th entry of the
        per-target lists below.
    directions : iterable of str
        Direction labels used as key prefix.
    trees : sequence of int
        ``n_estimators`` of the random forest for each target.
    hidden_layer_sizes : sequence of tuple
        ``hidden_layer_sizes`` of the MLP for each target.
    max_iter_no_change : sequence of int
        ``n_iter_no_change`` of the MLP for each target.
    rf, mlp : sequence of bool
        Whether a random-forest / MLP pipeline is created for each target.
    max_iter : int, default 2000
        ``max_iter`` of every MLP.
    random_state : int or None, default None
        Seed forwarded to both regressors. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer for reproducible runs.

    Returns
    -------
    dict
        Mapping from pipeline key to the unfitted ``Pipeline``.
    """
    pipelines = {}

    for i, target in enumerate(target_columns):

        # keys are built from the target name without the '_original' suffix
        if 'original' in target:
            target_name = target[:-len('_original')]
        else:
            target_name = target

        for direction in directions:

            if rf[i]:
                rf_key = f'{direction}_{target_name}_pipeline_rf'
                pipelines[rf_key] = Pipeline([
                    ('scaler', StandardScaler()),
                    ('regressor', RandomForestRegressor(n_estimators=trees[i], random_state=random_state)),
                ])
                # NOTE: also published as a module-level variable, as before, for
                # backward compatibility; prefer the returned dictionary.
                globals()[rf_key] = pipelines[rf_key]

            if mlp[i]:
                mlp_key = f'{direction}_{target_name}_pipeline_mlp'
                pipelines[mlp_key] = Pipeline([
                    ('scaler', StandardScaler()),
                    ('regressor', MLPRegressor(max_iter=max_iter, hidden_layer_sizes=hidden_layer_sizes[i], verbose=True, n_iter_no_change=max_iter_no_change[i], random_state=random_state)),
                ])
                # same backward-compatible module-level alias as for the forest
                globals()[mlp_key] = pipelines[mlp_key]

    return pipelines

In [21]:
def fit_pipelines(pipelines, train_data, target_columns, columns_not_to_use, directions, rf, mlp):
    """Fit the requested pipelines on the 'X' frame of every direction.

    For the target at position ``i`` the features are the columns of
    ``train_data[direction]['X']`` minus ``columns_not_to_use``; entries of
    ``columns_not_to_use`` that match the (suffix-stripped) name of a target
    listed before position ``i`` are kept as features. Pipelines are fitted in
    place and returned keyed by ``'<direction>_<target name>_pipeline_<rf|mlp>'``.
    """
    suffix = '_original'
    fitted_pipelines = {}

    for position, target in enumerate(target_columns):
        print(f'Fitting pipelines for {target}')

        if 'original' in target:
            target_name = target[:-len(suffix)]
            stripped_names = [name[:-len(suffix)] for name in target_columns]
        else:
            target_name = target
            stripped_names = target_columns

        # targets that come earlier in the list remain available as features
        kept_as_features = stripped_names[:position]
        drop_columns = [name for name in columns_not_to_use if name not in kept_as_features]

        for direction in directions:
            frame = train_data[direction]['X']
            labels = frame[target]
            features = frame.drop(drop_columns, axis=1)

            # random forest first, then MLP; a kind is skipped when its flag is falsy
            for flags, kind in ((rf, 'rf'), (mlp, 'mlp')):
                if not flags[position]:
                    continue
                key = f'{direction}_{target_name}_pipeline_{kind}'
                model = pipelines[key]
                model.fit(features, labels)
                fitted_pipelines[key] = model

    return fitted_pipelines

In [22]:
def store_prediction_with_lags(df, mask, direction, target, prediction, train_columns):
    """Write one batch of predictions into `df` and into its lag columns.

    The predictions for the rows selected by `mask` are written into the target
    column(s), and then copied into the lag columns of the rows whose timestamp
    is one or two days away from the predicted rows.

    Parameters
    ----------
    df : DataFrame
        Frame to update; it is modified in place and also returned. Must have a
        'datetime' column and the column named by `target`.
    mask : boolean Series
        Selects the rows the predictions belong to.
    direction : str
        'positive' writes to the '<name>, +1' / '<name>, +2' columns of rows
        1 / 2 days earlier; any other value writes to '<name>, -1' /
        '<name>, -2' of rows 1 / 2 days later.
    target : str
        Column the predictions were made for.
    prediction : array-like
        One value per row selected by `mask`.
        NOTE(review): indexed below with a boolean Series, so presumably a NumPy
        array -- confirm against the caller.
    train_columns : iterable of str
        Names compared against `target` to decide which columns receive the
        prediction.

    Returns
    -------
    DataFrame
        The same `df` object, updated.
    """
    saved_target = target
    # Decide which columns receive the prediction:
    #   * target equals a train column containing 'original' -> the name with the
    #     '_original' suffix cut off, and the full name
    #   * target equals a train column without 'original'    -> that column only
    #   * target is a substring of a train column            -> both names
    #   * a train column is a substring of target            -> both names
    # `target` may be reassigned inside the loop, so later train columns are
    # compared against the updated name. If no train column matches,
    # `cols_to_insert_prediction` is never bound and the write loop below fails.
    for col in train_columns:
        if target == col:
            if 'original' in col:
                target = col[:-len('_original')]
                cols_to_insert_prediction = [target, col]
            else:
                cols_to_insert_prediction = [target]
        elif target in col:
            cols_to_insert_prediction = [col, target]
        elif col in target:
            target = col
            col = saved_target
            cols_to_insert_prediction = [col, target]
    
    
    # start from the current values of the target column and overwrite only the
    # rows selected by `mask` with the new predictions
    prediction_array = np.array(df[saved_target])
    prediction_array[mask] = prediction

    # store the prediction array in the dataframe
    for col in cols_to_insert_prediction:
        df[col] = prediction_array
        
    # default: lag columns carry a '-' sign and live 1 / 2 days after the prediction
    lags = [1, 2]
    sign = '-'

    # 'positive': lag columns carry a '+' sign and live 1 / 2 days before it
    if direction == 'positive':
        sign = '+'
        lags = [-1, -2]

    # all timestamps of the frame, and the timestamps of the predicted rows
    datetime = df['datetime']
    datetime_masked = df[mask]['datetime']

    for lag in lags:
        # timestamps of the rows whose lag column should receive this prediction
        lagged_datetime = datetime_masked + pd.DateOffset(days=lag)

        # which shifted timestamps actually exist in the frame
        lagged_mask = lagged_datetime.isin(datetime)

        # which rows of the frame are hit by a shifted timestamp
        datetime_mask =  datetime.isin(lagged_datetime)

        # both selections must have the same number of rows
        assert lagged_mask.sum() == datetime_mask.sum()

        if lagged_mask.sum() > 0:
            
            # predictions whose shifted timestamp exists in the frame
            prediction_to_store = prediction[lagged_mask]

            # e.g. 'casual, -1' or 'casual, +2'
            lagged_col = f'{target}, '+sign+str(abs(lag))
            # NOTE(review): the values are assigned positionally, which assumes the
            # selected rows and the kept predictions are in the same order -- confirm.
            df.loc[datetime_mask, lagged_col] = prediction_to_store

    return df

In [23]:
def _model_enabled(flags, position):
    """Return whether a model kind is switched on for the target at `position`.

    `flags` is either a single bool that applies to every target or a sequence
    with one entry per target.
    """
    if isinstance(flags, (bool, np.bool_)):
        return bool(flags)
    return bool(flags[position])


def predict_pipelines(fitted_pipelines, training_data,
                        train_columns, test_columns,
                        columns_not_to_use, test_period,
                        maximum_day, directions, rf=True, mlp=False):
    """Predict every target on the hold-out frames, one day of the month at a time.

    Works on copies of ``training_data['positive']['y']`` and
    ``training_data['negative']['y']``. For each target and direction the target
    column is blanked with NaN and then filled day by day: the 'negative'
    direction starts at day ``maximum_day - test_period`` and moves forward, the
    'positive' direction starts at day ``test_period`` and moves backward, for
    ``test_period`` days in total. After each day the predictions are written
    back with ``store_prediction_with_lags``. When both model kinds are enabled
    for a target, their predictions are averaged.

    Parameters
    ----------
    fitted_pipelines : dict
        Fitted pipelines keyed by ``'<direction>_<target name>_pipeline_<rf|mlp>'``.
    training_data : dict
        Must provide ``training_data[direction]['y']`` for 'positive' and
        'negative'; each frame needs a datetime64 'datetime' column.
    train_columns, test_columns : sequence of str
        Targets used for fitting / to predict. For the target at position ``i``,
        entries of ``columns_not_to_use`` matching the (suffix-stripped) name of
        a train column before position ``i`` are kept as features.
    columns_not_to_use : sequence of str
        Columns dropped before calling ``predict``.
    test_period, maximum_day : int
        Define the first predicted day and the number of predicted days.
    directions : iterable of str
        Each entry must be 'positive' or 'negative'.
    rf, mlp : bool or sequence of bool
        Which model kinds to use, either globally or per target.

    Returns
    -------
    (dict, dict)
        ``predictions`` maps ``'<direction>_<target name>'`` to the filled target
        column, ``dataframes`` maps the same key to the working frame of that
        direction. The same working frame is shared by all targets of a
        direction.

    Raises
    ------
    ValueError
        If neither model kind is enabled for a target, or a direction is unknown.
    """
    print('\nPredicting pipelines...')

    predictions = {}
    dataframes = {}

    df_positive = training_data['positive']['y'].copy()
    df_negative = training_data['negative']['y'].copy()

    for i, target in enumerate(test_columns):
        print(f'Predicting pipelines for {target}')

        if 'original' in target:
            target_name = target[:-len('_original')]
            target_name_columns = [col[:-len('_original')] for col in train_columns]
        else:
            target_name_columns = train_columns
            target_name = target

        use_rf = _model_enabled(rf, i)
        use_mlp = _model_enabled(mlp, i)
        if not (use_rf or use_mlp):
            # previously this fell through to an unbound or stale `prediction`
            raise ValueError(f'No model enabled for target {target!r}')

        # identical for both directions, so computed once per target
        drop_columns = [col for col in columns_not_to_use if col not in target_name_columns[:i]]

        for direction in directions:

            if direction == 'positive':
                df = df_positive
                day, step = test_period, -1
            elif direction == 'negative':
                df = df_negative
                day, step = maximum_day - test_period, 1
            else:
                raise ValueError(f'Unknown direction {direction!r}')

            # blank the target so only freshly predicted values end up in it
            df[target] = np.nan

            # the pipelines do not change while iterating over the days
            models = []
            if use_rf:
                models.append(fitted_pipelines[f'{direction}_{target_name}_pipeline_rf'])
            if use_mlp:
                models.append(fitted_pipelines[f'{direction}_{target_name}_pipeline_mlp'])

            days_predicted = 0

            while days_predicted < test_period:
                # rows whose day of the month equals the day being predicted
                mask = df['datetime'].dt.day == day

                df_days = df[mask].drop(drop_columns, axis=1)

                # one model: its prediction; two models: their element-wise mean
                prediction = np.mean([model.predict(df_days) for model in models], axis=0)

                assert len(prediction) == np.sum(mask)

                # write the values into the target and lag columns so the next
                # day can use them as features
                df = store_prediction_with_lags(df, mask, direction, target, prediction, train_columns)

                day += step
                days_predicted += 1

            prediction = df[target]

            if prediction.isna().sum() > 0:
                # print the dates with missing values
                print('Dates with missing values: ', df[df[target].isna()]['datetime'])

            # every hold-out row must have been predicted
            assert not prediction.isna().values.any()

            predictions[f'{direction}_{target_name}'] = prediction
            dataframes[f'{direction}_{target_name}'] = df

    return predictions, dataframes

In [24]:
def evaluate_pipelines(predictions, training_data, target_columns, directions):
    """Compute MSE and MAE of the predictions against the hold-out frames.

    For every target and direction, the column ``target`` of
    ``training_data[direction]['y']`` is compared with
    ``predictions['<direction>_<target name>']`` (target name = target without
    its '_original' suffix). The resulting table has one row per key with the
    columns 'mse' and 'mae'; it is printed and returned.
    """
    suffix = '_original'
    evaluation = pd.DataFrame()

    for target in target_columns:
        target_name = target[:-len(suffix)] if 'original' in target else target

        for direction in directions:
            row_label = f'{direction}_{target_name}'

            actual = training_data[direction]['y'][target]
            predicted = predictions[row_label]

            evaluation.loc[row_label, 'mse'] = mean_squared_error(actual, predicted)
            evaluation.loc[row_label, 'mae'] = mean_absolute_error(actual, predicted)

    # show the metric table
    print(evaluation)

    return evaluation

In [25]:
def evaluate_pipelines2(predictions, training_data, target_columns, directions):
    """Score the predictions and return the data with targets replaced by them.

    For every target and direction the column ``target`` of
    ``training_data[direction]['y']`` is compared with
    ``predictions['<direction>_<target name>']`` (target name = target without
    its '_original' suffix), and MSE / MAE are stored in the returned table.
    In the returned copy of the data that column is then overwritten with the
    prediction, so it can be fed to a following modelling stage.

    The input ``training_data`` is left untouched. Note that when the input
    already comes from an earlier call, a target substituted there is scored
    against that substitution rather than against the ground truth.

    Returns
    -------
    (DataFrame, dict)
        The metric table (rows ``'<direction>_<target name>'``, columns 'mse'
        and 'mae') and the copied, substituted data.
    """
    evaluation = pd.DataFrame()

    # Copy frame by frame: dict.copy() alone is shallow, so the substitution
    # below would also overwrite the hold-out targets held by the caller.
    data = {
        direction: {split: frame.copy() for split, frame in splits.items()}
        for direction, splits in training_data.items()
    }

    for target in target_columns:
        print(f'Evaluating pipelines for {target}')

        if 'original' in target:
            target_name = target[:-len('_original')]
        else:
            target_name = target

        for direction in directions:
            print(f'Direction: {direction}')

            key = f'{direction}_{target_name}'

            # reference values, taken before the substitution
            y = data[direction]['y'][target].copy()
            print(f'Mean of y: {y.mean()}')

            prediction = predictions[key]
            print(f'Mean of prediction: {prediction.mean()}')

            evaluation.loc[key, 'mse'] = mean_squared_error(y, prediction)
            evaluation.loc[key, 'mae'] = mean_absolute_error(y, prediction)

            # substitute the target column with the prediction (index-aligned)
            data[direction]['y'][target] = prediction

            # sanity check: the stored column must now equal the prediction
            substituted = data[direction]['y'][target].copy()
            assert mean_squared_error(substituted, prediction) == 0
            assert mean_absolute_error(substituted, prediction) == 0

            print('')

    # print the evaluation metrics
    print(evaluation)
    return evaluation, data

## Predict Casual

In [26]:
# Stage 1: model 'casual_original' only. Every smoothed and every raw target
# column is excluded from the features.
smoothed_columns = ['casual', 'registered', 'count']
# Kept under a name of its own: rebinding the notebook-level `original_columns`
# (built when the CSV is loaded) would change what the train / hold-out split
# cell selects if that cell were run again.
original_target_columns = ['casual_original', 'registered_original', 'count_original']

columns_not_to_use = ['datetime', 'dataset', 'day', 'month'] + smoothed_columns + original_target_columns

directions = ['negative', 'positive']
# per-target switches and hyper-parameters; position i belongs to train_columns[i]
rf = [True, True, True]
mlp = [False, False, False]
trees = [25, 50, 50]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]
test_period = 4
lag_period = 2
maximum_day = 20


train_columns = ['casual_original']
test_columns = ['casual_original']


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines
fitted_pipelines = fit_pipelines(pipelines, training_data, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the hold-out days
predictions, dataframes = predict_pipelines(fitted_pipelines, training_data, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# score the predictions; the second return value is the data with the predicted
# targets substituted, used as input of the 'registered' stage
evaluation, training_data_registered = evaluate_pipelines2(predictions, training_data, test_columns, directions)

Fitting pipelines for casual_original

Predicting pipelines...
Predicting pipelines for casual_original
Evaluating pipelines for casual_original
Direction: negative
Mean of y: 37.272965879265094
Mean of prediction: 25.906561679790027

Direction: positive
Mean of y: 36.62788671023965
Mean of prediction: 22.790309368191718

                         mse        mae
negative_casual  1361.708559  17.631741
positive_casual  1361.055647  17.716200


In [27]:
# put the predictions into the training data
# NOTE: the whole loop below is wrapped in a triple-quoted string, so none of it
# is executed; the cell only evaluates to that string (which is why its text is
# echoed as the cell output).
"""
for direction in directions:
    for target in test_columns:

        saved_target = target
        
        if 'original' in target:
            target = target[:-len('_original')]
        
        df = training_data[direction]['y'].copy()

        # find the key of the prediction
        for key in predictions.keys():
            if direction in key and target in key:
                saved_key = key

        # get the prediction
        prediction = predictions[saved_key]

        # calculate the mse and mae
        mse = mean_squared_error(df[target], prediction)
        mae = mean_absolute_error(df[target], prediction)
        
        print(f'{direction}_{target} mse: {mse}')
        print(f'{direction}_{target} mae: {mae}')

        # store the prediction
        df[target] = prediction

        # store the dataframe
        training_data[direction]['y'] = df
"""

"\nfor direction in directions:\n    for target in test_columns:\n\n        saved_target = target\n        \n        if 'original' in target:\n            target = target[:-len('_original')]\n        \n        df = training_data[direction]['y'].copy()\n\n        #\xa0find the key of the prediction\n        for key in predictions.keys():\n            if direction in key and target in key:\n                saved_key = key\n\n        #\xa0get the prediction\n        prediction = predictions[saved_key]\n\n        # calculate the mse and mae\n        mse = mean_squared_error(df[target], prediction)\n        mae = mean_absolute_error(df[target], prediction)\n        \n        print(f'{direction}_{target} mse: {mse}')\n        print(f'{direction}_{target} mae: {mae}')\n\n        #\xa0store the prediction\n        df[target] = prediction\n\n        #\xa0store the dataframe\n        training_data[direction]['y'] = df\n"

## Predict Registered

In [28]:
# Stage 2: model 'casual_original' and 'registered_original'. 'casual' is not in
# the exclusion list any more, so the smoothed casual column is used as a feature.
smoothed_columns = ['registered', 'count']
# Kept under a name of its own: rebinding the notebook-level `original_columns`
# (built when the CSV is loaded) would change what the train / hold-out split
# cell selects if that cell were run again.
original_target_columns = ['registered_original', 'count_original', 'casual_original']

columns_not_to_use = ['datetime', 'dataset', 'day', 'month'] + smoothed_columns + original_target_columns

directions = ['negative', 'positive']
# per-target switches and hyper-parameters; position i belongs to train_columns[i]
rf = [True, True, True]
mlp = [False, False, False]
trees = [50, 50, 50]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]
test_period = 4
lag_period = 2
maximum_day = 20

train_columns = ['casual_original', 'registered_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines on the output of the 'casual' stage
fitted_pipelines = fit_pipelines(pipelines, training_data_registered, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the hold-out days
predictions, dataframes = predict_pipelines(fitted_pipelines, training_data_registered, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# score the predictions; the substituted data feeds the 'count' stage
# (the spelling `triaining_data_count` is kept because later cells refer to it)
evaluation, triaining_data_count = evaluate_pipelines2(predictions, training_data_registered, test_columns, directions)

Fitting pipelines for casual_original


KeyboardInterrupt: 

## Predict Count

In [None]:
# Stage 3: model all three targets. Only the smoothed 'count' column is still
# excluded, so the smoothed 'casual' and 'registered' columns are used as features.
smoothed_columns = ['count']
# Kept under a name of its own: rebinding the notebook-level `original_columns`
# (built when the CSV is loaded) would change what the train / hold-out split
# cell selects if that cell were run again.
original_target_columns = ['registered_original', 'count_original', 'casual_original']

columns_not_to_use = ['datetime', 'dataset', 'day', 'month'] + smoothed_columns + original_target_columns

directions = ['negative', 'positive']
# per-target switches and hyper-parameters; position i belongs to train_columns[i]
rf = [True, True, True]
mlp = [False, False, False]
trees = [50, 50, 50]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]
test_period = 4
lag_period = 2
maximum_day = 20

train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)


# fit the pipelines on the output of the 'registered' stage
fitted_pipelines = fit_pipelines(pipelines, triaining_data_count, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the hold-out days
predictions, dataframes = predict_pipelines(fitted_pipelines, triaining_data_count, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# score the predictions; the substituted data feeds the next run
evaluation, triaining_data_count2 = evaluate_pipelines2(predictions, triaining_data_count, test_columns, directions)

Fitting pipelines for casual_original
Fitting pipelines for registered_original
Fitting pipelines for count_original

Predicting pipelines...
Predicting pipelines for casual_original
Predicting pipelines for registered_original
Predicting pipelines for count_original
Evaluating pipelines for casual_original
Direction: negative
Mean of y: 9.433035870516184
Mean of prediction: 9.484663167104111

Direction: positive
Mean of y: 10.896165577342048
Mean of prediction: 11.124331154684095

Evaluating pipelines for registered_original
Direction: negative
Mean of y: 82.34912510936132
Mean of prediction: 44.67061242344707

Direction: positive
Mean of y: 93.62872331154684
Mean of prediction: 43.26994335511982

Evaluating pipelines for count_original
Direction: negative
Mean of y: 195.501312335958
Mean of prediction: 17.48771653543307

Direction: positive
Mean of y: 188.6575163398693
Mean of prediction: 16.892941176470586

                              mse         mae
negative_casual          0.716

In [None]:
# Same three targets as the previous run, but no smoothed column is excluded any
# more: 'casual', 'registered' and 'count' are all used as features.
smoothed_columns = []
# Kept under a name of its own: rebinding the notebook-level `original_columns`
# (built when the CSV is loaded) would change what the train / hold-out split
# cell selects if that cell were run again.
original_target_columns = ['registered_original', 'count_original', 'casual_original']

columns_not_to_use = ['datetime', 'dataset', 'day', 'month'] + smoothed_columns + original_target_columns

directions = ['negative', 'positive']
# per-target switches and hyper-parameters; position i belongs to train_columns[i]
rf = [True, True, True]
mlp = [False, False, False]
trees = [50, 50, 50]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]
test_period = 4
lag_period = 2
maximum_day = 20

train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines on the output of the previous run
fitted_pipelines = fit_pipelines(pipelines, triaining_data_count2, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the hold-out days
predictions, dataframes = predict_pipelines(fitted_pipelines, triaining_data_count2, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# score the predictions; the substituted data feeds the next run
evaluation, triaining_data_count3 = evaluate_pipelines2(predictions, triaining_data_count2, test_columns, directions)

Fitting pipelines for casual_original
Fitting pipelines for registered_original
Fitting pipelines for count_original

Predicting pipelines...
Predicting pipelines for casual_original
Predicting pipelines for registered_original
Predicting pipelines for count_original
Evaluating pipelines for casual_original
Direction: negative
Mean of y: 9.484663167104111
Mean of prediction: 9.560034995625546

Direction: positive
Mean of y: 11.124331154684095
Mean of prediction: 11.113673202614377

Evaluating pipelines for registered_original
Direction: negative
Mean of y: 44.67061242344707
Mean of prediction: 68.79012248468943

Direction: positive
Mean of y: 43.26994335511982
Mean of prediction: 62.652514161220054

Evaluating pipelines for count_original
Direction: negative
Mean of y: 17.48771653543307
Mean of prediction: 52.95359580052494

Direction: positive
Mean of y: 16.892941176470586
Mean of prediction: 52.89325490196078

                             mse        mae
negative_casual         0.4952

In [None]:
# Same configuration as the previous run, with the MLP switched on for every
# target as well: predict_pipelines averages the random-forest and MLP predictions.
smoothed_columns = []
# Kept under a name of its own: rebinding the notebook-level `original_columns`
# (built when the CSV is loaded) would change what the train / hold-out split
# cell selects if that cell were run again.
original_target_columns = ['registered_original', 'count_original', 'casual_original']

columns_not_to_use = ['datetime', 'dataset', 'day', 'month'] + smoothed_columns + original_target_columns

directions = ['negative', 'positive']
# per-target switches and hyper-parameters; position i belongs to train_columns[i]
rf = [True, True, True]
mlp = [True, True, True]
trees = [50, 50, 50]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]
test_period = 4
lag_period = 2
maximum_day = 20

train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines on the output of the previous run
fitted_pipelines = fit_pipelines(pipelines, triaining_data_count3, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the hold-out days
predictions, dataframes = predict_pipelines(fitted_pipelines, triaining_data_count3, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# score the predictions and keep the substituted data
evaluation, triaining_data_count4 = evaluate_pipelines2(predictions, triaining_data_count3, test_columns, directions)

Fitting pipelines for casual_original
Iteration 1, loss = 1515.71950226
Iteration 2, loss = 378.54602594
Iteration 3, loss = 169.93477080
Iteration 4, loss = 88.50403649
Iteration 5, loss = 57.77562759
Iteration 6, loss = 49.76716620
Iteration 7, loss = 41.30608738
Iteration 8, loss = 37.82606256
Iteration 9, loss = 35.03817171
Iteration 10, loss = 33.64426181
Iteration 11, loss = 32.15309476
Iteration 12, loss = 31.39154987
Iteration 13, loss = 30.80423806
Iteration 14, loss = 30.26691577
Iteration 15, loss = 29.62264900
Iteration 16, loss = 28.85791981
Iteration 17, loss = 29.10130329
Iteration 18, loss = 28.68707639




Iteration 1, loss = 1408.97681063
Iteration 2, loss = 381.90818877
Iteration 3, loss = 185.77418445
Iteration 4, loss = 96.52703790
Iteration 5, loss = 59.47654574
Iteration 6, loss = 45.82443427
Iteration 7, loss = 39.74118483
Iteration 8, loss = 36.11733120
Iteration 9, loss = 34.20884750
Iteration 10, loss = 33.83678625
Iteration 11, loss = 31.78521374
Iteration 12, loss = 31.28817444
Iteration 13, loss = 30.64297642
Iteration 14, loss = 30.02090689
Iteration 15, loss = 29.46912304
Iteration 16, loss = 29.09051181
Iteration 17, loss = 28.55743750
Iteration 18, loss = 28.66613423
Iteration 19, loss = 28.25934011
Iteration 20, loss = 27.74353709
Iteration 21, loss = 28.58940348
Iteration 22, loss = 28.18600182
Iteration 23, loss = 27.47133133
Iteration 24, loss = 27.63400919
Iteration 25, loss = 27.62702478
Iteration 26, loss = 26.89816492
Iteration 27, loss = 27.13628337
Iteration 28, loss = 27.09675909
Iteration 29, loss = 27.02964138
Iteration 30, loss = 26.99039582
Iteration 31, l

In [None]:
# Show the count predictions of the 'negative' direction from the most recent
# predict_pipelines run (keys have the form '<direction>_<target name>').
print(predictions['negative_count'])

348      34.98
349      21.86
350      15.92
351       8.84
352       3.64
         ...  
17088     7.60
17089     7.64
17090     7.60
17091     7.42
17092     6.48
Name: count_original, Length: 2286, dtype: float64
