# Ridership Prediction Monthly Model

## Importing packages

In [1]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import math
from pandas import read_csv
from pandas import read_excel
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error      

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVR

### Reading data

In [1]:
path = './Features Data v2/'


def dateparse(dates):
    """Convert an iterable of 'YYYY-MM-DD' strings to pandas datetimes."""
    # pd.to_datetime replaces the element-by-element strptime loop; it does
    # not rely on the `pd.datetime` alias, which newer pandas no longer ships.
    return pd.to_datetime(dates, format='%Y-%m-%d')


def dateparse2(dates):
    """Convert an iterable of 'MM/DD/YYYY' strings to pandas datetimes."""
    return pd.to_datetime(dates, format='%m/%d/%Y')


def _read_dated_csv(filename, parser):
    """Read ``path + filename`` and parse its 'Date' column with *parser*.

    The column is converted after loading instead of through read_csv's
    ``date_parser`` argument, which is deprecated in recent pandas releases.
    """
    frame = read_csv(filepath_or_buffer=(path + filename))
    frame['Date'] = parser(frame['Date'])
    return frame


usage = _read_dated_csv('usage_v4_3.csv', dateparse2)
school_status = _read_dated_csv('school_status_0_1.csv', dateparse)
student_count = _read_dated_csv('students_out_of_school_count.csv', dateparse)

# Left merge: every row of student_count is kept; dates without a matching
# row in usage get NaN in the usage columns.
df = student_count.merge(usage, on='Date', how='left')

# dayofweek is 0 (Monday) .. 6 (Sunday); shift it to 1..7.
df['day_of_week'] = df.Date.dt.dayofweek + 1
df['month'] = df.Date.dt.month

In [None]:
# Creating dummies for categorical data
def to_dummies(df):
    """Return *df* with every text/object column replaced by indicator columns.

    Columns whose dtype is object or a string dtype are one-hot encoded with
    ``pd.get_dummies``. The indicator columns are appended after the remaining
    columns, in the order the categorical columns appeared, and the source
    columns are dropped. The input frame is not modified; when it has no such
    columns it is returned as is.
    """
    # Local import keeps the function self-contained; `np.object`, used
    # previously for this check, no longer exists in current NumPy.
    from pandas.api.types import is_object_dtype, is_string_dtype

    categorical = [
        column for column in df.columns
        if is_object_dtype(df[column].dtype) or is_string_dtype(df[column].dtype)
    ]
    if not categorical:
        return df

    # Build all indicator blocks first and concatenate once, rather than
    # copying the whole frame for every categorical column.
    pieces = [df.drop(categorical, axis=1)]
    pieces.extend(pd.get_dummies(df[column]) for column in categorical)
    return pd.concat(pieces, axis=1)

# Rows with no holiday name get the explicit label 'Regular' so that they
# end up in an indicator column of their own.
holiday = df['Holiday']
df['Holiday'] = Series(np.where(holiday.isnull(), 'Regular', holiday))

# Turn the numeric calendar fields into labelled text ('month 3',
# 'Weekday 5', ...) so that to_dummies() one-hot encodes them as well.
for column, prefix in (('month', 'month '), ('day_of_week', 'Weekday ')):
    df[column] = prefix + df[column].astype('str')

df = to_dummies(df)

In [None]:
# Load the additional predictor table and attach it to the feature frame.
df_all = read_csv(filepath_or_buffer=(path + 'static_df_all_predictors_LYLY.csv'))

# Dates in this file use MM/DD/YYYY. They are parsed after loading because
# read_csv's `date_parser` argument is deprecated in recent pandas releases.
df_all['Date'] = pd.to_datetime(df_all['Date'], format='%m/%d/%Y')

# Left merge: every existing row of df is kept; dates missing from df_all
# get NaN in the added columns.
df = df.merge(df_all, on='Date', how='left')

### Class object for monthly predictive model

In [None]:
class model_fit:
    """Train and evaluate a ridership regression for one calendar month.

    The frame passed in must contain a datetime 'Date' column and a
    'Ridership' column (the response); every other column is used as a
    predictor.

    Attributes set by ``predict``:
        month   -- month number of the latest run
        df_test -- rows of that month used for testing
        y_test  -- observed ridership for those rows
        y_pred  -- predicted ridership for those rows
    """

    def __init__(self, df, features=10):
        """Store the feature frame and the number of features to select.

        df       -- DataFrame with 'Date', 'Ridership' and predictor columns
        features -- how many features ``feature_selection`` returns
        """
        self.df = df
        self.k = features
        # Filled in by predict(); defined here so the attributes always exist.
        self.month = None
        self.df_test = None
        self.y_test = None
        self.y_pred = None

    def get_Xy(self, df):
        """Split *df* into predictors (all but Date/Ridership) and response."""
        return df.drop(['Date', 'Ridership'], axis=1), df.loc[:, 'Ridership']

    def performance(self, y_test, y_pred):
        """Print MAE, RMSE, R2 and MAPE for the given actuals and predictions."""
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print('============== Month ' + str(self.month) + ' ==============')

        # The mean absolute error
        print("Mean absolute error: %.2f"
              % mean_absolute_error(y_test, y_pred))

        # The root mean squared error
        print("Root Mean squared error: %.2f"
              % math.sqrt(mean_squared_error(y_test, y_pred)))

        # R2 score: 1 is perfect prediction
        print('Variance score(R2): %.2f' % r2_score(y_test, y_pred))

        # Mean Absolute Percentage Error. predict() drops zero-ridership days
        # before calling this, so the division does not hit zero there.
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        print('Mean Absolute Percentage Error: {0:.2f}'.format(mape))
        print('-------------------------------------\n')

    def feature_selection(self):
        """Return the names of the ``self.k`` predictors with the lowest p-values.

        P-values come from a univariate ``f_regression`` test of each
        predictor in ``self.df`` against 'Ridership'. The result is a NumPy
        array of column names ordered from the highest to the lowest p-value
        among the selected features.
        """
        from sklearn.feature_selection import SelectKBest, f_regression

        k = self.k
        X, y = self.get_Xy(self.df)

        # Only the p-values are used below; k is capped at the number of
        # predictors so the selector is never asked for more columns than exist.
        selector = SelectKBest(f_regression, k=min(k, X.shape[1]))
        selector.fit(X, y)

        ranking = pd.DataFrame(list(zip(X.columns, selector.pvalues_)))
        # Highest p-value first, so the tail holds the most significant
        # features. NaN p-values are placed first so they cannot land in it.
        ranking = ranking.sort_values(by=1, ascending=False, na_position='first')
        return ranking.tail(k)[0].values

    def predict(self, regr=None, month=1, train_year=None, test_year=None):
        """Fit *regr* on the training years of *month* and score the test years.

        regr       -- estimator with fit/predict; a new LinearRegression is
                      created when omitted
        month      -- calendar month (1-12) to model
        train_year -- list of years to train on; defaults depend on the month
        test_year  -- list of years to test on; defaults depend on the month

        Stores month, df_test, y_test and y_pred on the instance and prints
        the performance report. Returns None.
        """
        if regr is None:
            # Created per call instead of once as a shared default argument.
            regr = linear_model.LinearRegression()

        df = self.df
        self.month = month

        # Month-specific default split.
        if month in (1, 2, 3, 4, 5, 6, 7, 8):
            default_train, default_test = [2015, 2016], [2017]
        elif month == 9:
            default_train, default_test = [2014, 2015, 2016], [2017]
        else:
            default_train, default_test = [2014, 2015], [2016]

        # Each argument falls back to its default independently, so a
        # caller-supplied test_year is honoured even without train_year.
        if not train_year:
            train_year = default_train
        if not test_year:
            test_year = default_test

        # Filtering for that particular month
        df_month = df[df.Date.dt.month == month]

        # Drop days with 0 ridership
        df_month = df_month[df_month.Ridership != 0]

        # Splitting rows into training and testing years
        df_train = df_month[df_month.Date.dt.year.isin(train_year)]
        df_test = df_month[df_month.Date.dt.year.isin(test_year)]
        self.df_test = df_test

        # Splitting each part into predictors and response
        X_train, y_train = self.get_Xy(df_train)
        X_test, y_test = self.get_Xy(df_test)
        self.y_test = y_test

        # Model fitting and prediction
        y_pred = regr.fit(X_train, y_train).predict(X_test)
        self.y_pred = y_pred

        # Print performance
        self.performance(y_test, y_pred)

    def get_pred_df(self):
        """Return the test rows with prediction, error and weekday columns.

        Must be called after ``predict``. 'Error' is prediction minus actual
        and '% Error' is that error as a percentage of actual ridership.
        Requested columns that the frame lacks come back filled with NaN.
        """
        # reset_index returns a new frame, so self.df_test is left untouched
        # and the 0..n-1 index lines up with the prediction values.
        df_pred = self.df_test.reset_index(drop=True)
        df_pred['Prediction'] = pd.Series(self.y_pred)
        df_pred['Error'] = df_pred['Prediction'] - df_pred['Ridership']
        df_pred['% Error'] = df_pred['Error'] * 100.0 / df_pred['Ridership']
        # day_name() replaces the .dt.weekday_name attribute, which current
        # pandas no longer provides.
        df_pred['Weekday'] = df_pred.Date.dt.day_name()

        # Keep only the reporting columns. reindex() tolerates absent labels
        # (e.g. 'LY2 Ridership' when the monthly frame omits it), whereas
        # .loc with a missing label raises KeyError on current pandas.
        wanted = [u'Date', u'Weekday', u'total_student_out_of_school_count',
                  u'Ridership', u'Prediction', u'Error', u'LY Ridership',
                  u'LY2 Ridership', u'% Error']
        return df_pred.reindex(columns=wanted)

    def sq_error(self):
        """Return the list of squared errors, one per test day of the last run."""
        return list((self.y_pred - self.y_test) ** 2)

### Feature selection for each month

In [None]:
# Every monthly frame has the same layout:
#   Date, student count, [Ridership, LY Ridership], month-specific event
#   columns, then the seven weekday indicators.
# 'LY2 Ridership' (all months) and 'SpringBreak' (March) are currently
# excluded from the predictor sets.
_LEAD_COLS = [u'Date', u'total_student_out_of_school_count']
_TARGET_COLS = [u'Ridership', u'LY Ridership']
_WEEKDAY_COLS = [u'Weekday 1', u'Weekday 2', u'Weekday 3', u'Weekday 4',
                 u'Weekday 5', u'Weekday 6', u'Weekday 7']


def _month_frame(events=(), before_target=()):
    """Select the standard columns plus the given event columns from df.

    events        -- event columns placed between 'LY Ridership' and the
                     weekday indicators
    before_target -- columns placed ahead of 'Ridership' (August only)
    """
    columns = (_LEAD_COLS + list(before_target) + _TARGET_COLS
               + list(events) + _WEEKDAY_COLS)
    return df.loc[:, columns]


df_jan = _month_frame([u'MLK Day', 'New Year\'s Day'])
df_feb = _month_frame([u"President's Day"])
df_march = _month_frame([u'Easter', u'Good Friday', u'Palm Sunday',
                         u"St. Patrick's Day"])
df_april = _month_frame([u'Easter', u'Good Friday'])
df_may = _month_frame([u'Mpls School Patrol', u'Suburban School Patrol Day 1'])
df_june = _month_frame()
df_july = _month_frame([u'Eid Al-Fitr - Day After'])
# August lists the game flag ahead of the ridership columns.
df_aug = _month_frame(before_target=[u'Vikings_Home_Game'])
df_sept = _month_frame([u'Vikings_Home_Game', u'Labor Day'])
df_oct = _month_frame()
df_nov = _month_frame([u'Black Friday', u'Thanksgiving'])
df_dec = _month_frame([u'ChristmasWeek', u"New Year's Eve"])

# Lookup keyed by the month number as a string: '1' .. '12'.
_frames_in_order = [df_jan, df_feb, df_march, df_april, df_may, df_june,
                    df_july, df_aug, df_sept, df_oct, df_nov, df_dec]
df_month = dict(
    (str(number), frame)
    for number, frame in enumerate(_frames_in_order, 1)
)

### Monthly Predictions

In [None]:
# Fit one model per calendar month and pool the per-day squared errors so a
# single RMSE can be reported over all twelve test periods.
se_list = []
for i in range(1, 13):

    model = model_fit(df_month[str(i)])
    regr = linear_model.LinearRegression()
    # Alternative regressors; uncomment one to replace the linear model:
    # regr = linear_model.Lasso(alpha=10)
    # regr = GradientBoostingRegressor(loss='lad', max_depth=3)
    # regr = DecisionTreeRegressor()
    # regr = RandomForestRegressor()
    # regr = AdaBoostRegressor()
    # regr = GradientBoostingRegressor()

    model.predict(regr=regr, month=i)
    # extend() appends in place instead of rebuilding the list each month.
    se_list.extend(model.sq_error())

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('RMSE for whole year = {0}'.format(np.sqrt(np.mean(se_list))))