## Problem Set 4

## BUSF-SHU 210: Business Analytics (Spring 2019)

## Question 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.preprocessing as prep
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler,robust_scale
from sklearn.model_selection import train_test_split, GridSearchCV , TimeSeriesSplit
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
#Drop one column from the dataset
#@param: df : dataframe to edit
#        String: the label
#@return: the edited dataframe
def dataDrop(df,String):
    x = df[df.columns.drop(String)]
    return x

In [3]:
#Eliminate outliers from the dataset
#here we set the outlier to be PM greater than 130. If any one would like to change it , simple change 130 to other values
#@param: df : dataframe to edit
#@return: the edited dataframe
def eliminateOutlier(data):
    outlier = np.where(data['pm2.5'].values > 130)[0]
    data.drop(data.index[outlier],inplace = True)
    return data

In [17]:
#Normalization method: robust scaler
#@param: X : the dataset
#@return: the normalized dataset
def normalize(X):
    normalized_X = robust_scale(X.reshape(-1, 1))
    return normalized_X

In [5]:
#preprocess the data
#change this method if want to change parameters
#@param: df : dataframe to edit
#@return: the edited dataframe
def dataPreprocessing(df):
    df['time'] = pd.to_datetime(df[['year','month','day','hour']])
    df['time'] = pd.to_datetime(df[['year','month','day','hour']])
    df['Day of the Week'] = df['time'].apply(lambda x: x.weekday())
    df = dataDrop(df,'No')
    df = dataDrop(df,'time')
    df = dataDrop(df,'month')
    df = dataDrop(df,'year')
    df = dataDrop(df,'day')
    return df

In [6]:
pm = pd.get_dummies(pd.read_csv("D:/Study/Business Analytics/LA/HW4/pm25.csv")).dropna()
pm = dataPreprocessing(pm)
pm = eliminateOutlier(pm)
pm = pd.get_dummies(pm,columns=['Day of the Week', 'hour'])
pm.head()

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
24,129.0,-16,-4.0,1020.0,1.79,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
29,109.0,-7,-6.0,1022.0,7.14,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
30,105.0,-7,-6.0,1023.0,8.93,4,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
31,124.0,-7,-5.0,1024.0,10.72,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
32,120.0,-8,-6.0,1024.0,12.51,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
#split the train-test data: 20% test data, shuffled split
#then process the data into X and y form
dftr, dfte = train_test_split(pm, test_size=0.2)
y = dftr[['pm2.5']].values.ravel()
print(y.shape)
X = dftr[pm.columns.drop('pm2.5')]
yte = dfte[['pm2.5']].values
print(yte.shape)
Xte = dfte[pm.columns.drop('pm2.5')]

(24458,)
(6115, 1)


# Train-Validate

## Polynomial

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

param_test = {
        'poly__degree':range(1,3)
    }

estimator = Pipeline([('poly', PolynomialFeatures()),
                     ('linear', LinearRegression(fit_intercept=False))])
gsearch = GridSearchCV(estimator , param_grid = param_test, cv=10,scoring='neg_mean_squared_error')
gsearch.fit(X,normalize(y))
gsearch.best_params_, gsearch.best_score_
print('best score is:',str(gsearch.best_score_))
print('best params are:',str(gsearch.best_params_))

best score is: -0.21062499433871615
best params are: {'poly__degree': 2}


## Linear

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lm = LinearRegression()
lm.fit(X,normalize(y))
y_pred = lm.predict(X)
score = mean_squared_error(normalize(y),y_pred)

print('best score is:',str(score))

best score is: 0.22464335188285936


## Ridge Regression

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

param_test = {
        'alpha':np.logspace(-3, 0)
    }

estimator = Ridge()
gsearch = GridSearchCV(estimator , param_grid = param_test, cv=10,scoring='neg_mean_squared_error')
gsearch.fit(X,normalize(y))
gsearch.best_params_, gsearch.best_score_
print('best score is:',str(gsearch.best_score_))
print('best params are:',str(gsearch.best_params_))

best score is: -0.2254683542443233
best params are: {'alpha': 1.0}


## Lasso Regression

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

param_test = {
        'alpha':np.logspace(-3, 0)
    }

estimator = Lasso()
gsearch = GridSearchCV(estimator , param_grid = param_test, cv=10,scoring='neg_mean_squared_error')
gsearch.fit(X,normalize(y))
gsearch.best_params_, gsearch.best_score_
print('best score is:',str(gsearch.best_score_))
print('best params are:',str(gsearch.best_params_))

best score is: -0.22608951975526076
best params are: {'alpha': 0.001}


## CART

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

param_test = {
        'max_depth':range(1,5)
    }

estimator = DecisionTreeRegressor()
gsearch = GridSearchCV(estimator , param_grid = param_test, cv=10,scoring='neg_mean_squared_error')
gsearch.fit(X,normalize(y))
gsearch.best_params_, gsearch.best_score_
print('best score is:',str(gsearch.best_score_))
print('best params are:',str(gsearch.best_params_))

best score is: -0.24186369783214337
best params are: {'max_depth': 4}


# Test
## With this CV, we choose polynomial of degree 2

In [30]:
poly = Pipeline([('poly', PolynomialFeatures(degree=2)),
                     ('linear', LinearRegression(fit_intercept=False))])
poly.fit(X,normalize(y))
y_pred = poly.predict(Xte)
# for de-normalization
q25, q75 = np.percentile(y, [25, 75])
#de-normalize
y_pred = y_pred * (q75 - q25) + q25
score = mean_squared_error(yte,y_pred)
print('mse score is:',str(score))

mse score is: 1544.1783630900404
