## Variable Selection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%pylab
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
data = pd.read_csv('data_norm_vif_v2.csv', sep=';', encoding='utf-8')

In [3]:
features = list(data.columns)
features.remove('stars')

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[features], data['stars'], test_size=0.25, random_state=42)

In [5]:
type(y_train), type(X_train)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

## Feature Selection (From Variable selection notebook)

In [6]:
data.columns

Index([u'district_1', u'district_10', u'district_11', u'district_12',
       u'district_13', u'district_14', u'district_15', u'district_16',
       u'district_17', u'district_18',
       ...
       u'WordCloudHighTri', u'WordCloudHighTriTail', u'WCSentAllTriTail',
       u'RevLen', u'Points', u'TriLowProb', u'TriMedProb', u'TriHighProb',
       u'funny', u'stars'],
      dtype='object', length=116)

In [7]:
# Personalize feature selection 
cols_sel =  ['Points', 'WCSentEng', 'NegCloud', 'RevLen', 'WordCloudHigh', 'WordCloudHighGer', 'WCSentGer', 'review_count', 'funny', 'TriLowProb', 'WordCloudHighTri', 'main_cat_Others', 'main_cat_Beauty & Spas', 'main_cat_Shopping', 'main_cat_Food', 'TriHighProb']
cols_sel, len(cols_sel)

(['Points',
  'WCSentEng',
  'NegCloud',
  'RevLen',
  'WordCloudHigh',
  'WordCloudHighGer',
  'WCSentGer',
  'review_count',
  'funny',
  'TriLowProb',
  'WordCloudHighTri',
  'main_cat_Others',
  'main_cat_Beauty & Spas',
  'main_cat_Shopping',
  'main_cat_Food',
  'TriHighProb'],
 16)

In [8]:
X_train2 = X_train[cols_sel]
X_test2 = X_test[cols_sel]

## Regression Model --------------------------------------------------------------------------------------------------

In [9]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [16]:
#linear_model.LinearRegression() with all columns 

LinReg = linear_model.LinearRegression().fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, LinReg.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, LinReg.predict(X_test)) 
print 'r2 Train:     ',LinReg.score(X_train,y_train)
print 'r2 Validation:',LinReg.score(X_test,y_test)

Mean Squared Error Train: 0.374269468314
Mean Squared Error Validation: 0.361941854399
r2 Train:      0.428263648605
r2 Validation: 0.415639161043


In [11]:
#linear_model.LinearRegression() with selected columns

LinReg = linear_model.LinearRegression().fit(X_train2, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, LinReg.predict(X_train2)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, LinReg.predict(X_test2)) 
print 'r2 Train:     ',LinReg.score(X_train2,y_train)
print 'r2 Validation:',LinReg.score(X_test2,y_test)

Mean Squared Error Train: 0.398353814096
Mean Squared Error Validation: 0.37845903661
r2 Train:      0.391472253236
r2 Validation: 0.38897190956


## Lasso Model --------------------------------------------------------------------------------------------------

In [14]:
vs = linear_model.Lasso(alpha=0.1, random_state=42)
vs.fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, vs.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, vs.predict(X_test)) 

Mean Squared Error Train: 0.438534906446
Mean Squared Error Validation: 0.417354519135


In [15]:
vs = linear_model.Lasso(alpha=0.001, random_state=42)
vs.fit(X_train2, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, vs.predict(X_train2)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, vs.predict(X_test2)) 

Mean Squared Error Train: 0.398451279398
Mean Squared Error Validation: 0.378227525385


In [16]:
vs = linear_model.Lasso(alpha=0.001, random_state=42)
vs.fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, vs.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, vs.predict(X_test)) 

Mean Squared Error Train: 0.383253503766
Mean Squared Error Validation: 0.363746073229


## Random Forest --------------------------------------------------------------------------------------------------------


In [61]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ParameterGrid, GridSearchCV

param_grid = {
   'n_estimators': [50,100,150],
   'max_features': [0.3,0.5,0.7,'log2','sqrt'],
   'max_depth': [10,14]
    }

lista = list(ParameterGrid(param_grid))
len(lista)
rfr=RandomForestRegressor(random_state=1)

gs = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
gs.fit(X_train, y_train)
gs.best_score_

-0.34233674341251225

In [62]:
gs.best_params_

{'max_depth': 14, 'max_features': 0.5, 'n_estimators': 150}

In [63]:
RandFor = RandomForestRegressor(random_state=1, max_depth= 14, max_features= 0.5, n_estimators= 150).fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, RandFor.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, RandFor.predict(X_test)) 

Mean Squared Error Train: 0.128306844718
Mean Squared Error Validation: 0.328762216347


In [68]:
RandFor = RandomForestRegressor(random_state=1, max_depth= 14, max_features= 0.5, n_estimators= 250).fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, RandFor.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, RandFor.predict(X_test)) 

Mean Squared Error Train: 0.127659655061
Mean Squared Error Validation: 0.328167981924


## Gradient Boosting Regressor ----------------------------------------------------------------------------------


## Gradient Boosting Regressor n_estimators: 100, max_depth:1, loss: ls

In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor 

In [18]:
# With all columns
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                 max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, est.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, est.predict(X_test)) 
print 'r2 Train:     ',est.score(X_train,y_train)
print 'r2 Validation:',est.score(X_test,y_test)

Mean Squared Error Train: 0.370504266024
Mean Squared Error Validation: 0.366803609199
r2 Train:      0.434015394878
r2 Validation: 0.407789781153


In [19]:
# With selected columns 
estColSels = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                 max_depth=1, random_state=0, loss='ls').fit(X_train2, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, estColSels.predict(X_train2)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, estColSels.predict(X_test2)) 
print 'r2 Train:     ',estColSels.score(X_train2,y_train)
print 'r2 Validation:',estColSels.score(X_test2,y_test)

Mean Squared Error Train: 0.371883837317
Mean Squared Error Validation: 0.367441166362
r2 Train:      0.431907953251
r2 Validation: 0.406760435046


## Gradient Boosting Regressor n_estimators: 200, max_depth:2, loss: ls

In [21]:
# With all columns
est200 = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                max_depth=2, random_state=0, loss='ls').fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, est200.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, est200.predict(X_test)) 
print 'r2 Train:     ',est.score(X_train,y_train)
print 'r2 Validation:',est.score(X_test,y_test)

Mean Squared Error Train: 0.313703044306
Mean Squared Error Validation: 0.34111327629
r2 Train:      0.434015394878
r2 Validation: 0.407789781153


In [22]:
#With selected columns

estColSels200 = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                 max_depth=2, random_state=0, loss='ls').fit(X_train2, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, estColSels200.predict(X_train2)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, estColSels200.predict(X_test2)) 
print 'r2 Train:     ',estColSels200.score(X_train2,y_train)
print 'r2 Validation:',estColSels200.score(X_test2,y_test)

Mean Squared Error Train: 0.316182292778
Mean Squared Error Validation: 0.345190842203
r2 Train:      0.516997976717
r2 Validation: 0.44268393473


## Gradient Boosting Regressor n_estimators: 100, max_depth:2, loss:huber

In [23]:
#With all columns
esthuber = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                 max_depth=2, random_state=0, loss='huber').fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, esthuber.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, esthuber.predict(X_test)) 
print 'r2 Train:     ',esthuber.score(X_train,y_train)
print 'r2 Validation:',esthuber.score(X_test,y_test)

Mean Squared Error Train: 0.339558831736
Mean Squared Error Validation: 0.349173076173
r2 Train:      0.481287831425
r2 Validation: 0.436254555108


## Gradient Boosting Regressor n_estimators: 200, max_depth:2, loss:huber, learning_rate=0.1

In [24]:
#With all columns
estColSels3200 = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                 max_depth=2, random_state=0, loss='huber').fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, estColSels3200.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, estColSels3200.predict(X_test)) 
print 'r2 Train:     ',estColSels3200.score(X_train,y_train)
print 'r2 Validation:',estColSels3200.score(X_test,y_test)

Mean Squared Error Train: 0.316816576076
Mean Squared Error Validation: 0.340251720851
r2 Train:      0.516029041634
r2 Validation: 0.450658224142


## Gradient Boosting Regressor n_estimators: 200, max_depth:3, loss:huber, learning_rate=0.09

In [41]:
#With all columns
estColSels3200 = GradientBoostingRegressor(n_estimators=200, learning_rate=0.09,
                                 max_depth=3, random_state=0, loss='huber').fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, estColSels3200.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, estColSels3200.predict(X_test)) 
print 'r2 Train:     ',estColSels3200.score(X_train,y_train)
print 'r2 Validation:',estColSels3200.score(X_test,y_test)

Mean Squared Error Train: 0.279515017557
Mean Squared Error Validation: 0.335877071132
r2 Train:      0.573011132813
r2 Validation: 0.457721165189


## Gradient Boosting Regressor n_estimators: 300, max_depth:3, loss:huber, learning_rate=0.08

In [26]:
estColSels3200 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.08,
                                max_depth=3, random_state=0, loss='huber').fit(X_train, y_train)
print 'Mean Squared Error Train:', mean_squared_error(y_train, estColSels3200.predict(X_train)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, estColSels3200.predict(X_test)) 
print 'r2 Train:     ',estColSels3200.score(X_train,y_train)
print 'r2 Validation:',estColSels3200.score(X_test,y_test)

Mean Squared Error Train: 0.267253735552
Mean Squared Error Validation: 0.330667641945
r2 Train:      0.591741542933
r2 Validation: 0.466131870867


## Ensemble Gradient Boosting and Linear Regression

In [85]:
# Setup matrix to merge both models
x = np.zeros((6012, 2))
xtest = np.zeros((2004, 2))

In [91]:
# Merge of both models and predict for each of them

estColSels3200.fit(X_train, y_train)
LinReg.fit(X_train, y_train)

x[:, 0] = estColSels3200.predict(X_train)
x[:, 1] = LinReg.predict(X_train)

x.shape

(6012L, 2L)

In [92]:
# Prediction for train
xtest[:, 0] = estColSels3200.predict(X_test)
xtest[:, 1] = LinReg.predict(X_test)

In [93]:
u = linear_model.LinearRegression().fit(x, y_train)

print metrics.r2_score(y_test,u.predict(xtest))
print 'Mean Squared Error Train:', mean_squared_error(y_train, u.predict(x)) 
print 'Mean Squared Error Validation:', mean_squared_error(y_test, u.predict(xtest)) 

0.445214564323
Mean Squared Error Train: 0.26101687767
Mean Squared Error Validation: 0.34362341895
