In [874]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA

In [875]:
# open final data
data = pd.read_csv('../../data/processed/final_data.csv')

# get the list of columns
columns = data.columns
positive_columns = [col for col in columns if '+' in col]
negative_columns = [col for col in columns if '-' in col]
original_columns = [col for col in columns if '+' not in col and '-' not in col]

In [876]:
# use only the training data
model_training_data = data[data['dataset'] == 'train']
model_testing_data = data[data['dataset'] == 'test']

# convert the datetime columns to datetime
model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])
model_testing_data['datetime'] = pd.to_datetime(model_testing_data['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_training_data['datetime'] = pd.to_datetime(model_training_data['datetime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_testing_data['datetime'] = pd.to_datetime(model_testing_data['datetime'])


In [877]:
print(model_testing_data.isnull().sum())

datetime            0
season              0
holiday             0
workingday          0
weather             0
                 ... 
casual, -2        238
registered, +2    286
registered, -2    238
count, +2         286
count, -2         238
Length: 63, dtype: int64


In [878]:
test_period = 0
lag_period = 2
maximum_day = 20

# mask the data from day 3 to 15 of each month using 'datetime' column
negative_train_mask = model_training_data['datetime'].apply(lambda x: x.day > lag_period and x.day < maximum_day - test_period)

# get the negative training data
negative_train_data = model_training_data[negative_train_mask][original_columns + negative_columns].copy()

# maske the data from day 5 to 17 of each month using 'datetime' column
positive_train_mask = model_training_data['datetime'].apply(lambda x: x.day > test_period and x.day < maximum_day - lag_period)

# get the positive training data
positive_train_data = model_training_data[positive_train_mask][original_columns + positive_columns].copy()

# save the data into a dictionary
training_data = {
    'positive': {
        'X': positive_train_data,
        'y': model_testing_data[original_columns + positive_columns]
        },
    'negative': {
        'X': negative_train_data,
        'y': model_testing_data[original_columns + negative_columns]
        }
    }

In [879]:
training_data['negative']['X'].columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'dataset', 'count_original',
       'registered_original', 'casual_original', 'dayofyear', 'weekofyear',
       'dayofweek', 'windspeed, -1', 'atemp, -1', 'humidity, -1',
       'holiday, -1', 'workingday, -1', 'weather, -1', 'temp, -1',
       'casual, -1', 'registered, -1', 'count, -1', 'windspeed, -2',
       'atemp, -2', 'humidity, -2', 'holiday, -2', 'workingday, -2',
       'weather, -2', 'temp, -2', 'casual, -2', 'registered, -2', 'count, -2'],
      dtype='object')

In [880]:
# for each train or test assert there are no NaN values
assert not training_data['positive']['X'].isna().values.any()
assert not training_data['negative']['X'].isna().values.any()


In [881]:
def create_pipelines(target_columns, directions, trees, hidden_layer_sizes, max_iter_no_change, rf, mlp, max_iter=2000):
    # create a dictionary to store the pipelines
    pipelines = {}

    for i, target in enumerate(target_columns):
        
        if 'original' in target:
            target_name = target[:-len('_original')]
        else:
            target_name = target

        for direction in directions:
                
            if rf[i]:
                # create random forest pipeline for the target and direction
                globals()[f'{direction}_{target_name}_pipeline_rf'] = Pipeline([
                    ('scaler', StandardScaler()),
                    ('regressor', RandomForestRegressor(n_estimators=trees[i])),
                ])

                # save the pipeline
                pipelines[f'{direction}_{target_name}_pipeline_rf'] = globals()[f'{direction}_{target_name}_pipeline_rf']
            
            if mlp[i]:
                # create MLP pipeline for the target and direction
                globals()[f'{direction}_{target_name}_pipeline_mlp'] = Pipeline([
                    ('scaler', StandardScaler()),
                    ('regressor', MLPRegressor(max_iter=max_iter, hidden_layer_sizes=hidden_layer_sizes[i], verbose=True, n_iter_no_change=max_iter_no_change[i])),
                ])

                # save the pipeline
                pipelines[f'{direction}_{target_name}_pipeline_mlp'] = globals()[f'{direction}_{target_name}_pipeline_mlp']

    return pipelines

In [882]:
def fit_pipelines(pipelines, train_data, target_columns, columns_not_to_use, directions, rf, mlp):

    #print('\n\nFitting pipelines...')
    # create a dictionary to store the fitted pipelines
    fitted_pipelines = {}

    for i, target in enumerate(target_columns):
        print(f'Fitting pipelines for {target}')
        
        if 'original' in target:
            target_name = target[:-len('_original')]
            target_name_columns = [col[:-len('_original')] for col in target_columns]
        else:
            target_name_columns = target_columns
            target_name = target

        print(f'Maximum of the target: {train_data['positive']['X'][target].max()}')
        print(f'Avergae of the target: {train_data['positive']['X'][target].mean()}')
        print('')

        for direction in directions:

            df = train_data[direction]['X'].copy()
            target_data = df[target].copy()
            #drop_columns = [col for col in columns_not_to_use if col not in target_name_columns[:i]]
            drop_columns = columns_not_to_use
            

            df = df.drop(drop_columns, axis=1)
            
            #print(f'Columns used: {df.columns}')
            
            # get the pipelines
            if rf[i]:
                pipeline_rf = pipelines[f'{direction}_{target_name}_pipeline_rf']
                pipeline_rf.fit(df, target_data)

                # save the fitted pipeline
                fitted_pipelines[f'{direction}_{target_name}_pipeline_rf'] = pipeline_rf

                
                # create a dataframe to store the feature importances
                feature_importances = pd.DataFrame({
                    'feature': df.columns,
                    'importance': pipeline_rf.named_steps['regressor'].feature_importances_
                })

                # sort the features by importance
                feature_importances = feature_importances.sort_values('importance', ascending=False)

                # print the feature importances
                #print(f'{direction}_{target}_pipeline_rf')
                #print(feature_importances)
                

            if mlp[i]:
                pipeline_mlp = pipelines[f'{direction}_{target_name}_pipeline_mlp']
                
                # fit the pipeline
                pipeline_mlp.fit(df, target_data)

                # save the fitted pipeline
                fitted_pipelines[f'{direction}_{target_name}_pipeline_mlp'] = pipeline_mlp

    return fitted_pipelines

In [883]:
def store_prediction_with_lags2(df, mask, direction, target, prediction, train_columns):

    initial_length = len(df)

    if 'original' in target:
        target_name = target[:-len('_original')]
        target_name_columns = [col[:-len('_original')] for col in train_columns]

    else:
        target_name = target
        target_name_columns = train_columns

    data_to_store = df.copy()
    # find the unique days in the data
    days = data_to_store['datetime'][mask].dt.day.unique()
    assert len(days) == 1
    day = days[0]

    prediction_to_insert = np.array(data_to_store[target_name])
    prediction_to_insert[mask] = prediction

    data_to_store.loc[:, target_name] = prediction_to_insert

    assert not data_to_store[mask][target_name].isna().any()

    lags = [1, 2]
    sign = '-' if direction == 'negative' else '+'


    for lag in lags:

        time_delta = pd.Timedelta(days=lag) if direction == 'negative' else pd.Timedelta(days=-lag)
        lagged_dates = data_to_store['datetime'][mask] + time_delta
        column_name = f'{target_name}, {sign}{lag}'


        # create a pandas df
        lagged_data = pd.DataFrame({
            'datetime': lagged_dates,
            'new_col': prediction
        })

        # merge the data with the lagged data using datetime as index, if the datetime is not in the data, do not add it
        data_to_store = data_to_store.merge(lagged_data, on='datetime', how='left')

        missing_mask = data_to_store[column_name].isna()
        new_values = ~ data_to_store['new_col'].isna()

        merge_mask = missing_mask & new_values

        #print('Dates to merge:', data_to_store['datetime'][merge_mask])

        data_to_store.loc[merge_mask, column_name] = data_to_store['new_col'][merge_mask].copy()
        data_to_store = data_to_store.drop(columns = ['new_col'])
        
        new_day = day + lag if direction == 'negative' else day - lag
        new_day_mask = data_to_store['datetime'].dt.day == new_day

        if data_to_store[column_name][new_day_mask].isna().sum() > 0:
            #print(f'Missing values: ', data_to_store[column_name][new_day_mask].isna().sum())
            interpolated_values = data_to_store[column_name][new_day_mask].interpolate(method='linear')
            data_to_store.loc[new_day_mask, column_name] = interpolated_values

        assert data_to_store[column_name][new_day_mask].isna().sum() == 0
        assert len(data_to_store) == initial_length

        columns_to_print = [col for col in data_to_store.columns if 'casual' in col]
        columns_to_print.append('datetime')
        
        indices_top_print = np.arange(6327, 6331)

        data_to_store = data_to_store.reset_index(drop = True)

    # reset the index
    data_to_store = data_to_store.reset_index(drop = True)

    assert len(data_to_store) == initial_length

    return data_to_store

In [884]:
def store_prediction_with_lags(df, mask, direction, target, prediction, train_columns):
    saved_target = target
    #print('target: ', saved_target)
    # find the column in which the prediction is stored
    for col in train_columns:
        if target == col:
            if 'original' in col:
                target = col[:-len('_original')]
                cols_to_insert_prediction = [target]
            else:
                cols_to_insert_prediction = [target]
        elif target in col:
            cols_to_insert_prediction = [target]
        elif col in target:
            target = col
            col = saved_target
            cols_to_insert_prediction = [col]
    
    
    #print('train columns: ', train_columns)
    #print('cols to insert prediction: ', cols_to_insert_prediction)
    #print('new target: ', target)

    prediction_array = np.array(df[saved_target])
    #print('length of prediction array: ', len(prediction_array))
    prediction_array[mask] = prediction
    #print('Nans in prediction array: ', np.isnan(prediction_array).sum())

    #print('prediction array: ', prediction_array[:5], prediction_array[-5:], '\n')

    # store the prediction array in the dataframe
    for col in cols_to_insert_prediction:
        df[col] = prediction_array
        #print('Nans in column: ', col, df[col].isnull().sum())
        
    lags = [1, 2]
    sign = '-'

    if direction == 'positive':
        sign = '+'
        lags = [-1, -2]

    # get the 'datetime' column for the masked data
    datetime = df['datetime']
    #print('datetime: ', datetime[:5], datetime[-5:], '\n')
    datetime_masked = df[mask]['datetime']

    for lag in lags:
        #print('lag: ', lag)
        # get the lagged 'datetime' column
        lagged_datetime = datetime_masked + pd.DateOffset(days=lag)
        print('lagged datetime: ', lagged_datetime[:5], lagged_datetime[-5:], '\n')
        print('datetime masked: ', datetime_masked[:5], datetime_masked[-5:], '\n')

        # get the mask for lagged time
        lagged_mask = lagged_datetime.isin(datetime)
        #print('lagged mask: ', lagged_mask[:5], lagged_mask[-5:])
        #print('total lagged mask: ', lagged_mask.sum(), '\n')

        # get the mask for the lagged 'datetime' column
        datetime_mask =  datetime.isin(lagged_datetime)
        #print('datetime mask: ', datetime_mask[:5], datetime_mask[-5:])
        #print('total datetime mask: ', datetime_mask.sum(), '\n')

        #print('subset of lagged datetime: ', lagged_datetime[lagged_mask][:5], lagged_datetime[lagged_mask][-5:], '\n')
        #print('subset of datetime: ', datetime[datetime_mask][:5], datetime[datetime_mask][-24:18], '\n')

        # assert the number of elements in the lagged mask is equal to the number of elements in the datetime mask
        assert lagged_mask.sum() == datetime_mask.sum()

        if lagged_mask.sum() > 0:
            
            print('Inserting prediction for lag: ', lag, ' in coumns: ', cols_to_insert_prediction)
            prediction_to_store = prediction[lagged_mask]
            #print(f'Predictions inserted from datetime: {lagged_datetime[lagged_mask].iloc[0]} to {lagged_datetime[lagged_mask].iloc[-1]}')
            print('prediction to store: ', prediction_to_store[:5], prediction_to_store[-5:], '\n')
            # insert the prediction into the dataframe subsetted by the lagged mask

            lagged_col = f'{target}, '+sign+str(abs(lag))

            #print('lagged col: ', lagged_col)
            # print df.loc with datetime column too
            #print(df.loc[datetime_mask, [col for col in df.columns if 'datetime' in col or target in col]])
            len_prediction = len(prediction_to_store)
            len_space_to_insert = len(df.loc[datetime_mask, lagged_col])

            # assert the length of the prediction to store is equal to the length of the space to insert
            assert len_prediction == len_space_to_insert
            
            df.loc[datetime_mask, lagged_col] = prediction_to_store
            #print(df.loc[datetime_mask, [col for col in df.columns if 'datetime' in col or target in col]])

            sample_index = 17215
            #print(df.loc[sample_index, [col for col in df.columns if 'datetime' in col or target in col]])

            assert df.loc[datetime_mask, lagged_col].isnull().sum() == 0
            print('\n\n')

    return df

In [885]:
def predict_pipelines(fitted_pipelines, training_data, 
                        train_columns, test_columns, 
                        columns_not_to_use, test_period, 
                        maximum_day, directions, rf=True, mlp=False):
    
    print('\nPredicting pipelines...')

    # create a dictionary to store the predictions
    predictions = {}
    dataframes = {}

    for direction in directions:

        for i, target in enumerate(test_columns):
            if 'original' in target:
                target_name = target[:-len('_original')]
                target_name_columns = [col[:-len('_original')] for col in train_columns]
            else:
                target_name_columns = train_columns
                target_name = target

        df = training_data[direction]['y'].copy()
        #drop_columns =  [col for col in columns_not_to_use if col not in target_name_columns[:i]]
        drop_columns = [col for col in columns_not_to_use if col != 'datetime']

        # set the target column to NaN
        df[target] = np.nan
        #print('Initial nan values: ', df[target].isna().sum())

        if direction == 'negative':
            start_day = 20
        elif direction == 'positive':
            start_day = 31

        day = start_day
        days_predicted = 0

        while days_predicted < test_period:

            # mask the data to select all rows corresponding to a day of the month equal to start_day
            mask = df['datetime'].apply(lambda x: x.day == day)

            # convert the mask to a numpy array
            mask = mask.values

            # get the data
            df_days = df[mask]

            #print('Number of rows: ', df_days.shape[0])
            df_days = df_days.drop(drop_columns, axis=1)

            #assert 'datetime' not in drop_columns
            
            #df_days.interpolate(method='linear', axis=0, inplace=True)
            #print('Day: ', day)
            
            """try:
                assert not df_days.isna().values.any()
            except:
                # print the rows and columns with NaN values
                subset = df_days[df_days.isna().any(axis=1)]
                columns_with_nan = subset.columns[subset.isna().any()].tolist()

                print(f'subset: {subset[columns_with_nan + ['datetime']]}')
                raise ValueError('There are NaN values in the data')"""

            
            df_days = df_days.drop(columns = ['datetime'])
    
            #print('Columns used: ', df_days.columns)
            #print(f'Missing values: {df_days.isna().sum()}')

            for i, target in enumerate(test_columns):
                #print(target)
                
                if day == start_day:
                    df[target] = np.nan
                
                if 'original' in target:
                    target_name = target[:-len('_original')]
                    target_name_columns = [col[:-len('_original')] for col in train_columns]
                else:
                    target_name_columns = train_columns
                    target_name = target

                if rf[i]:
                    pipeline1 = fitted_pipelines[f'{direction}_{target_name}_pipeline_rf']
                if mlp[i]:
                    pipeline2 = fitted_pipelines[f'{direction}_{target_name}_pipeline_mlp']

                # take the mean of the predictions if both pipelines are used
                if rf[i] and mlp[i]:
                    prediction1 = pipeline1.predict(df_days)
                    prediction2 = pipeline2.predict(df_days)

                    # use target statistics to scale the predictions
                    prediction = (prediction1 + prediction2) / 2
                elif rf[i]:
                    prediction = pipeline1.predict(df_days)
                elif mlp[i]:
                    prediction = pipeline2.predict(df_days)

                df = store_prediction_with_lags2(df, mask, direction, target, prediction, train_columns)

            #print('')

            if direction == 'negative':
                day += 1
            elif direction == 'positive':
                day -= 1
            days_predicted += 1

        for col in target_name_columns:
            prediction = df[col].copy()

            original_col = col + '_original'
            original_data = training_data[direction]['X'][original_col].copy()

            original_mean = original_data.mean()
            original_std = original_data.std()

            standardised_prediction = (prediction - prediction.mean()) / prediction.std()
            prediction = standardised_prediction * original_std + original_mean

            df.loc[:, col] = prediction

            assert not prediction.isna().any()
            predictions[f'{direction}_{col}'] = prediction

        dataframes[direction] = df

    return predictions, dataframes

In [886]:
"""def predict_pipelines(fitted_pipelines, training_data, 
                        train_columns, test_columns, 
                        columns_not_to_use, test_period, 
                        maximum_day, directions, rf=True, mlp=False):
    
    print('\nPredicting pipelines...')

    # create a dictionary to store the predictions
    predictions = {}
    dataframes = {}

    df_positive = training_data['positive']['y'].copy()
    df_negative = training_data['negative']['y'].copy()
    

    for i, target in enumerate(test_columns):
        print(f'Predicting pipelines for {target}')

        if 'original' in target:
            target_name = target[:-len('_original')]
            target_name_columns = [col[:-len('_original')] for col in train_columns]
        else:
            target_name_columns = train_columns
            target_name = target
        for direction in directions:
            #print(f'Direction: {direction}')

            # get the y data
            if direction == 'positive':
                df = df_positive
            elif direction == 'negative':
                df = df_negative
            
            drop_columns = drop_columns = [col for col in columns_not_to_use if col not in target_name_columns[:i]]
            drop_columns = columns_not_to_use

            # set the target column to NaN
            df[target] = np.nan
            #print('Initial nan values: ', df[target].isna().sum())

            if direction == 'negative':
                start_day = 20
            elif direction == 'positive':
                start_day = 31

            day = start_day
            days_predicted = 0

            while days_predicted < test_period:
                # mask the data to select all rows corresponding to a day of the month equal to start_day
                mask = df['datetime'].apply(lambda x: x.day == day)
                print('Day: ', day)

                # get the data
                df_days = df[mask].copy()
                #print('Number of rows: ', df_days.shape[0])
                df_days = df_days.drop(drop_columns, axis=1)
                #print('Columns used: ', df_days.columns)

                print(f'Missing values: {df_days.isna().sum()}')
                assert not df_days.isna().values.any()
                
                # get the pipeline
                if rf[i]:
                    pipeline1 = fitted_pipelines[f'{direction}_{target_name}_pipeline_rf']
                if mlp[i]:
                    pipeline2 = fitted_pipelines[f'{direction}_{target_name}_pipeline_mlp']

                # take the mean of the predictions if both pipelines are used
                if rf[i] and mlp[i]:
                    prediction1 = pipeline1.predict(df_days)
                    prediction2 = pipeline2.predict(df_days)

                    # use target statistics to scale the predictions
                    prediction = (prediction1 + prediction2) / 2
                elif rf[i]:
                    prediction = pipeline1.predict(df_days)
                elif mlp[i]:
                    prediction = pipeline2.predict(df_days)

                # assert the lenght of the prediction is equal to the lenght of the mask
                #print('Length of prediction: ', len(prediction))
                assert len(prediction) == np.sum(mask)

                # store the prediction
                df = store_prediction_with_lags(df, mask, direction, target, prediction, train_columns)
            
                if direction == 'negative':
                    day += 1

                elif direction == 'positive':
                    day -= 1

                days_predicted += 1
                #print('current nan values: ', df[target].isna().sum())
                #print('')

            prediction = df[target]
            
            if prediction.isna().sum() > 0:
                # print the dates with missing values
                print('Dates with missing values: ', df[df[target].isna()]['datetime'])

            # assert there are no missing values
            assert not prediction.isna().values.any()
            #print('\n\n')
            # store the predictions
            predictions[f'{direction}_{target_name}'] = prediction

            # store the dataframe
            dataframes[f'{direction}_{target_name}'] = df

    return predictions, dataframes"""

"def predict_pipelines(fitted_pipelines, training_data, \n                        train_columns, test_columns, \n                        columns_not_to_use, test_period, \n                        maximum_day, directions, rf=True, mlp=False):\n    \n    print('\nPredicting pipelines...')\n\n    #\xa0create a dictionary to store the predictions\n    predictions = {}\n    dataframes = {}\n\n    df_positive = training_data['positive']['y'].copy()\n    df_negative = training_data['negative']['y'].copy()\n    \n\n    for i, target in enumerate(test_columns):\n        print(f'Predicting pipelines for {target}')\n\n        if 'original' in target:\n            target_name = target[:-len('_original')]\n            target_name_columns = [col[:-len('_original')] for col in train_columns]\n        else:\n            target_name_columns = train_columns\n            target_name = target\n        for direction in directions:\n            #print(f'Direction: {direction}')\n\n            # get the y da

In [887]:
def substitute_training_columns(fitted_pipelines, training_data, train_columns, 
                                test_columns, columns_not_to_use, directions, rf, mlp):
    print('\nSubstituting training columns...')

    df_positive = training_data['positive']['X'].copy()
    df_negative = training_data['negative']['X'].copy()

    for i, target in enumerate(test_columns):
        print(f'Substituting training columns for {target}')

        if 'original' in target:
            target_name = target[:-len('_original')]
            target_name_columns = [col[:-len('_original')] for col in train_columns]
        else:
            target_name_columns = train_columns
            target_name = target

        for direction in directions:

            if direction == 'positive':
                df = df_positive.copy()
            elif direction == 'negative':
                df = df_negative.copy()

            drop_columns = [col for col in columns_not_to_use if col not in target_name_columns[:i]]
            drop_columns = columns_not_to_use

            # get the pipeline
            if rf[i]:
                pipeline1 = fitted_pipelines[f'{direction}_{target_name}_pipeline_rf']
            if mlp[i]:
                pipeline2 = fitted_pipelines[f'{direction}_{target_name}_pipeline_mlp']

            # get the data
            df = df.drop(drop_columns, axis=1)

            if rf[i] and mlp[i]:
                prediction1 = pipeline1.predict(df)
                prediction2 = pipeline2.predict(df)

                prediction = (prediction1 + prediction2) / 2

            elif rf[i]:
                prediction = pipeline1.predict(df)
            elif mlp[i]:
                prediction = pipeline2.predict(df)

            print(f'Maximum of the prediction: {prediction.max()}')
            print(f'Avergae of the prediction: {prediction.mean()}')

            # store the prediction in the df_positive or df_negative
            if direction == 'positive':
                df_positive[target_name] = prediction
            elif direction == 'negative':
                df_negative[target_name] = prediction
    
    # put everything back into the training data
    training_data['positive']['X'] = df_positive
    training_data['negative']['X'] = df_negative

    return training_data

In [888]:
def merge_predictions(predictions, training_data, target_columns, directions):

    print('\nMerging predictions...')
    
    data = training_data.copy()

    for target in target_columns:

        if 'original' in target:
            target_name = target[:-len('_original')]
        else:
            target_name = target

        target_prediction = np.zeros(data['positive']['y'].shape[0])

        for i, direction in enumerate(directions):
            print(f'Merging predictions for {direction}_{target_name}')

            # get the predictions
            prediction = predictions[f'{direction}_{target_name}']
            print('maximum value: ', prediction.max())
            print('average value: ', prediction.mean())
            #print('prediction: ', prediction[:5], prediction[-5:], '\n')

            # plot the prediction
            #plt.figure(figsize=(10, 5))
            #plt.plot(prediction)
            #plt.title(f'{direction}_{target_name}')
            #plt.show()
            #plt.pause(0.1)

            df = data[direction]['y'].copy()
            print('shape of df: ', df.shape)

            # find all unique months in the 'datetime' column, with same months in different years counting as different months
            df['month_year'] = list(zip(df['datetime'].dt.year, df['datetime'].dt.month))

            months = df['month_year'].unique()

            weights = np.zeros(df.shape[0])
            
            for month in months:
                # mask the data to select all rows corresponding to a month
                mask = df['month_year'] == month
                if i == 0:
                    weights[mask] = np.linspace(1, 0, np.sum(mask))
                else:
                    weights[mask] = np.linspace(0, 1,  np.sum(mask))

            #print('weights: ', weights[:5], weights[-5:], '\n')
            #print('shape of prediction: ', prediction.shape)
            print('shape of total prediction: ', target_prediction.shape)
            print('shape of weights: ', weights.shape)
            
            # apply the weights to the prediction
            target_prediction += prediction * weights

        # plot the target prediction
        #plt.figure(figsize=(10, 5))
        #plt.plot(target_prediction)
        #plt.title(f'{target_name}')
        #plt.show()
        #plt.pause(0.1)

        print('Maximum of the target prediction: ', target_prediction.max())
        print('Average of the target prediction: ', target_prediction.mean())
        print('')
        #print('target prediction: ', target_prediction[:5], target_prediction[-5:], '\n\n')
        # subsitute the target column with the prediction
        for direction in directions:
            # insert the prediction into the dataframe
            #print('target prediction type: ', type(target_prediction))
            data[direction]['y'].loc[:, target_name] = np.copy(target_prediction)
            #print('Nans in the target prediction: ', np.sum(target_prediction == pd.NA))
            
            #data[direction]['y'][target] = target_prediction
            # assert there are no missing values
            #print('Nans in the target prediction: ', data[direction]['y'][target_name].isna().sum())
            # print the dates with missing values
            #print('Dates with missing values: ', data[direction]['y'][target_name][data[direction]['y'][target_name].isna()])
            assert not data[direction]['y'][target_name].isna().values.any()

    # print the evaluation metrics
    return data

In [889]:
def evaluate_pipelines2(predictions, training_data, target_columns, directions):

    data = training_data.copy()

    for target in target_columns:

        if 'original' in target:
            target_name = target[:-len('_original')]
        else:
            target_name = target

        for direction in directions:

            # get the y data
            y = data[direction]['y'][target].copy()

            # get the predictions
            prediction = predictions[f'{direction}_{target_name}']

            # subsitute the target column with the prediction
            data[direction]['y'][target] = prediction

    return data

## Predict Casual

In [890]:
columns_not_to_use = ['datetime', 'dataset', 'day', 'month', 'weather', 'year']
smoothed_columns = ['casual', 'registered', 'count']
original_columns = ['casual_original', 'registered_original', 'count_original']

columns_not_to_use = columns_not_to_use + smoothed_columns + original_columns

directions = ['negative', 'positive']
rf = [True, True, True]
mlp = [False, False, False]
n = 20
trees = [n, n, n]
max_iter = 2000
hidden_layer_sizes = [(150, 150, 150), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]
test_period = 12
lag_period = 2
maximum_day = 20


train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines
fitted_pipelines = fit_pipelines(pipelines, training_data, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the pipelines
predictions, dataframes = predict_pipelines(fitted_pipelines, training_data, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# evaluate the pipelines
training_data_registered = merge_predictions(predictions, training_data, test_columns, directions)

# TODO: solve the prediction order doing for the first day all features, for the second day all features 

Fitting pipelines for casual_original
Maximum of the target: 367.0
Avergae of the target: 36.023899887167914



## Predict Registered

In [None]:
columns_not_to_use = ['datetime', 'dataset', 'day', 'month']
smoothed_columns = ['registered', 'count']
original_columns = ['casual_original', 'registered_original', 'count_original']

columns_not_to_use = columns_not_to_use + smoothed_columns + original_columns

directions = ['negative', 'positive']
rf = [True, True, True]
mlp = [False, False, False]
trees = [n, n, n]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]


train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines
fitted_pipelines = fit_pipelines(pipelines, training_data_registered, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the pipelines
predictions, dataframes = predict_pipelines(fitted_pipelines, training_data_registered, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# evaluate the pipelines
triaining_data_count = merge_predictions(predictions, training_data_registered, test_columns, directions)

Fitting pipelines for casual_original
Maximum of the target: 367.0
Avergae of the target: 36.023899887167914

Fitting pipelines for registered_original
Maximum of the target: 886.0
Avergae of the target: 155.4472253564468

Fitting pipelines for count_original
Maximum of the target: 977.0
Avergae of the target: 191.47112524361472


Predicting pipelines...

Merging predictions...
Merging predictions for negative_casual
maximum value:  600.1718074440588
average value:  36.3763606490039
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Merging predictions for positive_casual
maximum value:  590.8011794270951
average value:  36.023899887167914
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Maximum of the target prediction:  591.062381601784
Average of the target prediction:  36.164151759792425

Merging predictions for negative_registered
maximum value:  1566.677395816826
average value:  156.31238447319782
shape o

# Predict count

In [None]:
columns_not_to_use = ['datetime', 'dataset', 'day', 'month']
smoothed_columns = ['count']
original_columns = ['casual_original', 'registered_original', 'count_original']

columns_not_to_use = columns_not_to_use + smoothed_columns + original_columns

directions = ['negative', 'positive']
rf = [True, True, True]
mlp = [False, False, False]
trees = [n, n, n]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]


train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)


# fit the pipelines
fitted_pipelines = fit_pipelines(pipelines, triaining_data_count, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the pipelines
predictions, dataframes = predict_pipelines(fitted_pipelines, triaining_data_count, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# evaluate the pipelines
triaining_data_count2 = merge_predictions(predictions, triaining_data_count, test_columns, directions)

Fitting pipelines for casual_original
Maximum of the target: 367.0
Avergae of the target: 36.023899887167914

Fitting pipelines for registered_original
Maximum of the target: 886.0
Avergae of the target: 155.4472253564468

Fitting pipelines for count_original
Maximum of the target: 977.0
Avergae of the target: 191.47112524361472


Predicting pipelines...

Merging predictions...
Merging predictions for negative_casual
maximum value:  595.1910871489043
average value:  36.3763606490039
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Merging predictions for positive_casual
maximum value:  597.9489203906179
average value:  36.023899887167914
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Maximum of the target prediction:  597.6758036063699
Average of the target prediction:  36.20944562978634

Merging predictions for negative_registered
maximum value:  1628.4128975726244
average value:  156.31238447319782
shape 

In [None]:
# extract the predictions for count from the training data
count_predictions = triaining_data_count2['positive']['y']['count']

# save the predictions into a csv file with the datetime column
count_predictions = pd.concat([triaining_data_count2['positive']['y']['datetime'], count_predictions], axis=1)

count_predictions.to_csv('../../data/processed/count_predictions1.csv', index=False)

In [None]:
columns_not_to_use = ['datetime', 'dataset', 'day', 'month']
smoothed_columns = []
original_columns = ['casual_original', 'registered_original', 'count_original']

columns_not_to_use = columns_not_to_use + smoothed_columns + original_columns

directions = ['negative', 'positive']
rf = [True, True, True]
mlp = [False, False, False]
trees = [n, n, n]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]


train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines
fitted_pipelines = fit_pipelines(pipelines, triaining_data_count2, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the pipelines
predictions, dataframes = predict_pipelines(fitted_pipelines, triaining_data_count2, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# evaluate the pipelines
triaining_data_count3 = merge_predictions(predictions, triaining_data_count2, test_columns, directions)

Fitting pipelines for casual_original
Maximum of the target: 367.0
Avergae of the target: 36.023899887167914

Fitting pipelines for registered_original
Maximum of the target: 886.0
Avergae of the target: 155.4472253564468

Fitting pipelines for count_original
Maximum of the target: 977.0
Avergae of the target: 191.47112524361472


Predicting pipelines...

Merging predictions...
Merging predictions for negative_casual
maximum value:  616.8108209010074
average value:  36.37636064900389
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Merging predictions for positive_casual
maximum value:  605.615059169547
average value:  36.02389988716791
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Maximum of the target prediction:  604.3006003738587
Average of the target prediction:  36.097107704135894

Merging predictions for negative_registered
maximum value:  1963.1388748866925
average value:  156.31238447319777
shape 

In [None]:
# extract the predictions for count from the training data
count_predictions = triaining_data_count3['positive']['y']['count']

# save the predictions into a csv file with the datetime column
count_predictions = pd.concat([triaining_data_count3['positive']['y']['datetime'], count_predictions], axis=1)

count_predictions.to_csv('../../data/processed/count_predictions2.csv', index=False)

In [None]:
columns_not_to_use = ['datetime', 'dataset']
smoothed_columns = []
original_columns = ['casual_original', 'registered_original', 'count_original']

columns_not_to_use = columns_not_to_use + smoothed_columns + original_columns

directions = ['negative', 'positive']
rf = [True, True, True]
mlp = [False, False, False]
trees = [n, n, n]
max_iter = 2000
hidden_layer_sizes = [(100, 150, 50), (250, 350, 350, 150), (1, 1)]
max_iter_no_change = [10, 10, 10]


train_columns = ['casual_original', 'registered_original', 'count_original']
test_columns = train_columns


# create the pipelines
pipelines = create_pipelines(train_columns, directions, 
                             trees, hidden_layer_sizes,
                             max_iter_no_change, 
                             rf=rf, mlp=mlp, max_iter=max_iter)

# fit the pipelines
fitted_pipelines = fit_pipelines(pipelines, triaining_data_count3, 
                                 train_columns, columns_not_to_use,
                                 directions, rf=rf, mlp=mlp)

# predict the pipelines
predictions, dataframes = predict_pipelines(fitted_pipelines, triaining_data_count3, 
                                            train_columns, test_columns, 
                                            columns_not_to_use, test_period, 
                                            maximum_day, directions, rf=rf, mlp=mlp)

# evaluate the pipelines
triaining_data_count4 = merge_predictions(predictions, triaining_data_count3, test_columns, directions)

Fitting pipelines for casual_original
Maximum of the target: 367.0
Avergae of the target: 36.023899887167914

Fitting pipelines for registered_original
Maximum of the target: 886.0
Avergae of the target: 155.4472253564468

Fitting pipelines for count_original
Maximum of the target: 977.0
Avergae of the target: 191.47112524361472


Predicting pipelines...

Merging predictions...
Merging predictions for negative_casual
maximum value:  600.9441522906674
average value:  36.3763606490039
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Merging predictions for positive_casual
maximum value:  600.6332816290715
average value:  36.023899887167914
shape of df:  (6493, 43)
shape of total prediction:  (6493,)
shape of weights:  (6493,)
Maximum of the target prediction:  599.7796933576753
Average of the target prediction:  36.09767144803554

Merging predictions for negative_registered
maximum value:  1914.9551595089033
average value:  156.31238447319777
shape 

In [None]:
# extract the predictions for count from the training data
count_predictions = triaining_data_count4['positive']['y']['count']

# save the predictions into a csv file with the datetime column
count_predictions = pd.concat([triaining_data_count4['positive']['y']['datetime'], count_predictions], axis=1)

count_predictions.to_csv('../../data/processed/count_predictions3.csv', index=False)

In [None]:
count_predictions.head(50)

Unnamed: 0,datetime,count
431,2011-01-20 00:00:00,78.547512
432,2011-01-20 01:00:00,84.036977
433,2011-01-20 02:00:00,85.203285
434,2011-01-20 03:00:00,81.654897
435,2011-01-20 04:00:00,60.270158
436,2011-01-20 05:00:00,52.389292
437,2011-01-20 06:00:00,79.683771
438,2011-01-20 07:00:00,112.608661
439,2011-01-20 08:00:00,163.883517
440,2011-01-20 09:00:00,170.969929


In [None]:
count_predictions.head(50)

Unnamed: 0,datetime,count
431,2011-01-20 00:00:00,78.547512
432,2011-01-20 01:00:00,84.036977
433,2011-01-20 02:00:00,85.203285
434,2011-01-20 03:00:00,81.654897
435,2011-01-20 04:00:00,60.270158
436,2011-01-20 05:00:00,52.389292
437,2011-01-20 06:00:00,79.683771
438,2011-01-20 07:00:00,112.608661
439,2011-01-20 08:00:00,163.883517
440,2011-01-20 09:00:00,170.969929


In [None]:
len(count_predictions)

6493