# Applied Linear Regression Models to Predict Engagement
* Engagement was calculated with (Comments + likes) + (Replies + likes)

In [2]:
import numpy as np
import pandas as pd
import time
import pickle

import matplotlib.pyplot as plt




# Model 1

* "Comment" column included and normalized

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression



with open('cleaneddata.pkl', 'r') as picklefile:
    data = pickle.load(picklefile)
datay = data['Engagement'].copy()
data3=data.drop(['Recipe_URL', 'Ingredients','Recipe_Name','Engagement'], axis =1).copy()
nc = ['Comments']
data3.ix[:, nc] = (data3.ix[:, nc] - data3.ix[:, nc].min())\
                   / (data3.ix[:, nc].max() - data3.ix[:, nc].min())


In [3]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
       data3, datay, test_size=0.30, random_state =101)

In [12]:
y = StandardScaler()
df7= y.fit_transform(data3)

In [53]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()

lm.fit(X_train, y_train)

print(lm.intercept_)

coeff_df = pd.DataFrame(lm.coef_.T,X_train.columns,columns=['Coefficient'])
coeff_df.sort(columns='Coefficient',ascending=False)


-2.18161588779




Unnamed: 0,Coefficient
"Kimchi Spice Blend (Gochugaru, Granulated Sugar, Garlic Powder, Ground Ginger, Black Sesame Seeds & White Sesame Seeds)\n\n",309.309945
Comments,271.102795
Round Pugliese Bread\n\n,195.397140
Cornish Game Hen\n\n,193.458052
Whole Rosemary\n\n,170.600693
Porcini Mushrooms\n\n,146.234839
"Mexican Spice Blend (Ancho Chile Powder, Sweet Paprika, Garlic Powder, Ground Cumin & Whole Mexican Oregano)\n\n",136.647429
Plum Tomato\n\n,128.345656
"Chicken Tarkari Spice Blend (Curry Powder, Ground Cumin, Ground Cinnamon & Ground )\n\n",125.117723
Endives\n\n,119.715804


In [65]:
predictions = lm.predict(X_test)
#r2 meansqaured error

In [66]:
print predictions[0:20]

[ 123.49082138   30.98272929   31.90738974 -144.17088059  335.61983025
   40.9795262   236.65548732  279.68535049  -46.23407815  120.83683338
   80.30645308   26.62597143   -2.34987714  209.87976501   92.50383419
   88.72446106  -22.17985993  228.45701958   60.81001215   55.74108856]


In [67]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test.values,predictions)

8002.1241261475834

## RMSE was 90 but this was misleading, the dataset did not have comments seperated out and comments are already factored into the engagement score

# Model 2 
* with out "Comments"

In [4]:
datay2 = data['Engagement'].copy()
data32 =data.drop(['Recipe_URL', 'Ingredients','Recipe_Name','Comments', 'Engagement'], axis =1).copy()


In [5]:

X3_train, X3_test, y3_train, y3_test = train_test_split(
       data32, datay, test_size=0.30, random_state =101)

In [82]:
df7= y.fit_transform(data32)


lm.fit(X3_train, y3_train)

print(lm.intercept_)

coeff_df2 = pd.DataFrame(lm.coef_.T,X_train.columns,columns=['Coefficient'])
coeff_df2.sort(columns='Coefficient',ascending=False)

predictions = lm.predict(X3_test)
mean_squared_error(y3_test.values,predictions)

180.804010558




18537.130659155184

## First Model ran with an RMSE of 137

# Model 3 Combined with Stacking regressor

* "Comment" column exclude since its intricately related to Engagement

In [70]:
datay2 = data['Engagement'].copy()
data32 =data.drop(['Recipe_URL', 'Ingredients','Recipe_Name','Comments', 'Engagement'], axis =1).copy()


In [5]:
from sklearn.cross_validation import train_test_split
X2_train, X2_test, y2_train, y2_test = train_test_split(
       data32, datay, test_size=0.30, random_state =101)

In [6]:
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.cross_validation import KFold
from sklearn import cross_validation




In [78]:
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')

stregr = StackingRegressor(regressors=[svr_lin, lr, ridge], 
                           meta_regressor=svr_rbf)

# Training the stacking classifier

stregr.fit(X2_train, y2_train)
stregr.predict(X2_test)

# Evaluate and visualize the fit

print("Mean Squared Error: %.4f"
      % np.mean((stregr.predict(X2_train) - y2_train) ** 2))
print('Variance Score: %.4f' % stregr.score(X2_train, y2_train))
'''
with plt.style.context(('seaborn-whitegrid')):
    plt.scatter(X, y, c='lightgray')
    plt.plot(X, stregr.predict(X), c='darkgreen', lw=2)

plt.show()'''

Mean Squared Error: 9812.2164
Variance Score: -0.2632


"\nwith plt.style.context(('seaborn-whitegrid')):\n    plt.scatter(X, y, c='lightgray')\n    plt.plot(X, stregr.predict(X), c='darkgreen', lw=2)\n\nplt.show()"

### RMSE

In [79]:
np.sqrt(9812.2164)

99.056632286788343

# Model 4
* ridge reggresion
* lasso
* gridsearch

In [85]:
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Lasso

# Initializing models

lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
lasso = Lasso(random_state=1)
svr_rbf = SVR(kernel='rbf')
regressors = [svr_lin, lr, ridge, lasso]
stregr = StackingRegressor(regressors=regressors, 
                           meta_regressor=svr_rbf)

params = {'lasso__alpha': [0.1, 1.0, 10.0],
          'ridge__alpha': [0.1, 1.0, 10.0],
          'svr__C': [0.1, 1.0, 10.0],
          'meta-svr__C': [0.1, 1.0, 10.0, 100.0],
          'meta-svr__gamma': [0.1, 1.0, 10.0]}

grid = GridSearchCV(estimator=stregr, 
                    param_grid=params, 
                    cv=5,
                    refit=True)
grid.fit(X2_train, y2_train)

for params, mean_score, scores in grid.grid_scores_:
        print("%0.3f +/- %0.2f %r"
              % (mean_score, scores.std() / 2.0, params))

  

-0.292 +/- 0.02 {'ridge__alpha': 0.1, 'meta-svr__C': 0.1, 'lasso__alpha': 0.1, 'svr__C': 0.1, 'meta-svr__gamma': 0.1}
Mean Squared Error: 7974.4175
-0.292 +/- 0.02 {'ridge__alpha': 0.1, 'meta-svr__C': 0.1, 'lasso__alpha': 0.1, 'svr__C': 1.0, 'meta-svr__gamma': 0.1}
Mean Squared Error: 7974.4175
-0.292 +/- 0.02 {'ridge__alpha': 0.1, 'meta-svr__C': 0.1, 'lasso__alpha': 0.1, 'svr__C': 10.0, 'meta-svr__gamma': 0.1}
Mean Squared Error: 7974.4175
-0.292 +/- 0.02 {'ridge__alpha': 1.0, 'meta-svr__C': 0.1, 'lasso__alpha': 0.1, 'svr__C': 0.1, 'meta-svr__gamma': 0.1}
Mean Squared Error: 7974.4175
-0.292 +/- 0.02 {'ridge__alpha': 1.0, 'meta-svr__C': 0.1, 'lasso__alpha': 0.1, 'svr__C': 1.0, 'meta-svr__gamma': 0.1}
Mean Squared Error: 7974.4175
-0.292 +/- 0.02 {'ridge__alpha': 1.0, 'meta-svr__C': 0.1, 'lasso__alpha': 0.1, 'svr__C': 10.0, 'meta-svr__gamma': 0.1}
Mean Squared Error: 7974.4175
-0.292 +/- 0.02 {'ridge__alpha': 10.0, 'meta-svr__C': 0.1, 'lasso__alpha': 0.1, 'svr__C': 0.1, 'meta-svr__gamm

NameError: name 'X' is not defined

In [86]:
      print("Mean Squared Error: %.4f"
      % np.mean((grid.predict(X2_test) - y2_test) ** 2))
print('Variance Score: %.4f' % grid.score(X2_test, y2_test))

Mean Squared Error: 7974.4175
Variance Score: -0.0148


# Model 5 previous models with crossvalidation

In [77]:
#X3_train, X3_test, y3_train, y3_test
from mlxtend.classifier import StackingCVClassifier
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')
lasso = Lasso(random_state=1)

stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, lasso], 
                           meta_regressor=svr_rbf)

stregr.fit(data32, datay2)
stregr.predict(data32)

print("Mean Squared Error: %.4f"
      % np.mean((stregr.predict(data32) - datay) ** 2))
print('Variance Score: %.4f' % stregr.score(data32, datay2))

print np.sqrt(np.mean((stregr.predict(data32) - datay) ** 2))

Mean Squared Error: 9804.9740
Variance Score: -0.2576
99.0200688571


# Model 6

In [7]:
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV

from sklearn.kernel_ridge import KernelRidge
svr = GridSearchCV(SVR(kernel='linear', gamma=0.1), cv=5,
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})
kr.fit(X2_train,y2_train)
svr.fit(X2_train,y2_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'gamma': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [8]:
from sklearn.metrics import mean_squared_error

predictionsvr = svr.predict(X2_test)
print np.sqrt(mean_squared_error(y2_test.values,predictionsvr))
predictionkr = kr.predict(X2_test)
print np.sqrt(mean_squared_error(y2_test.values,predictionkr))

81.7144958689
80.8309941391


### Best MSE with a score 82 and 81

# Models 7 & 8

In [10]:
svr2 = SVR(kernel='linear', gamma =0.1)
svr2.fit(X2_train,y2_train)
predictionsvr2 = svr2.predict(X2_test)
print np.sqrt(mean_squared_error(y2_test.values,predictionsvr2))

92.8105204745


In [11]:
svr3 = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
svr3.fit(X2_train,y2_train)
predictionsvr3 = svr3.predict(X2_test)
print np.sqrt(mean_squared_error(y2_test.values,predictionsvr3))

92.8105204745


### Additonal Models did not run as well