In [1]:
import pandas as pd
import numpy as np
from  sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import matplotlib.pyplot as plt
%matplotlib inline
import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
def root_mean_square_error(model,predictedval,actualval):
    print(model + ' Value ',np.sqrt(mean_squared_error(predictedval, actualval)))
    print(model + ' r2_score ',r2_score(predictedval,actualval))
    return np.sqrt(mean_squared_error(predictedval, actualval))

def runtime(t0):
    return time.time()-t0

In [3]:
train = pd.read_csv('PredictUpVoteCount/train_NIR5Yl1.csv')
train = train.drop(train[train.Views > 3000000].index)

labelencoder_X = LabelEncoder()
train['Tag'] = labelencoder_X.fit_transform(train['Tag'])
train.drop(['ID','Username'], axis=1,inplace =True)
target = train['Upvotes']

from sklearn.preprocessing import Binarizer
bn = Binarizer(threshold=7)
pd_watched = bn.transform([train['Answers']])[0]
train['pd_watched'] = pd_watched

In [4]:
feature_names = [x for x in train.columns if x not in ['Upvotes']]

x_train, x_val, y_train, y_val = train_test_split(train[feature_names], target,test_size = 0.22,random_state =205)
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_val = sc_X.transform(x_val)

poly_reg = PolynomialFeatures(degree = 4,interaction_only=False, include_bias=True)
X_poly = poly_reg.fit_transform(x_train)
poly_reg.fit(x_train, y_train)

x_val = poly_reg.fit_transform(x_val)

lin_reg_1 = linear_model.LassoLars(alpha=0.021,max_iter=150)
lin_reg_1.fit(X_poly, y_train)

# predicitng 
pred_val = lin_reg_1.predict(x_val)
root_mean_square_error('Lasso',pred_val,y_val)

Lasso Value  1031.2427124694427
Lasso r2_score  0.8919300198031876


1031.2427124694427

In [5]:
print(pred_val[0:10])

[ 91.18553616 101.86504551  43.73542342  28.52187173  40.49542904
 363.1722602   20.81405094  36.46497896 295.70219664  32.7740222 ]


In [6]:
print(y_val[0:10])

132133     58.0
280391     95.0
60588      22.0
73372      21.0
177614     10.0
113956    356.0
128235     10.0
139340     19.0
21479     440.0
40749       7.0
Name: Upvotes, dtype: float64


## Using GridSearchCV for Optimization

In [45]:
feature_names = [x for x in train.columns if x not in ['Upvotes']]

sc_y = StandardScaler()
X = StandardScaler().fit_transform(train[feature_names])
y = train['Upvotes'].values.reshape(-1,1)

x_train, x_val, y_train, y_val = train_test_split(X, y,test_size = 0.22,random_state =205)

poly_reg = PolynomialFeatures(degree = 4,interaction_only=False, include_bias=True)
X_poly = poly_reg.fit_transform(x_train)
poly_reg.fit(x_train, y_train)

x_val = poly_reg.fit_transform(x_val)

In [52]:
hyper_params = [
    {
        'alpha': (1e-04,0.001,0.01,0.021,0.1,1,),
        'normalize':[False]
    },
]

est=linear_model.LassoLars()

grid_clf = GridSearchCV(est,cv=10,param_grid=hyper_params,
                   verbose=0,n_jobs=1,scoring='r2')

t0 = time.time()
#fit model
grid_clf.fit(X_poly,y_train.ravel())
#get fit time
print('Time taken:',runtime(t0))

Time taken: 197.53719806671143


In [53]:
predval = grid_clf.best_estimator_.predict(x_val)
root_mean_square_error('Lasso',predval,y_val)

Lasso Value  1084.126024233726
Lasso r2_score  0.8822167303670048


1084.126024233726

In [54]:
grid_clf.best_estimator_

LassoLars(alpha=1, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,
     fit_path=True, max_iter=500, normalize=False, positive=False,
     precompute='auto', verbose=False)

In [7]:
est=linear_model.LassoLars(alpha=1, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,
     fit_path=True, max_iter=500, normalize=False, positive=False,
     precompute='auto', verbose=False)

est.fit(X_poly,y_train.ravel())

opredval = est.predict(x_val)
root_mean_square_error('Lasso',opredval,y_val)

Lasso Value  1084.1128450601204
Lasso r2_score  0.882218794620646


1084.1128450601204

In [8]:
print(opredval[0:10])

[235.31205151  20.30935546  44.13477657  27.29331949  12.84526146
 323.67588907  54.42431059  26.87823001 303.00505649 -19.75936277]


In [11]:
FinalPred = (pred_val+opredval)/2

root_mean_square_error('Lasso',FinalPred,y_val)

Lasso Value  1046.0078513679746
Lasso r2_score  0.8893064230727232


1046.0078513679746

## Test Submission

In [55]:
df_test = pd.read_csv('PredictUpVoteCount/test_8i3B3FC.csv')
ids = df_test['ID']
df_test.drop(['ID','Username'], axis=1,inplace =True)

labelencoder_X = LabelEncoder()
df_test['Tag'] = labelencoder_X.fit_transform(df_test['Tag'])

bn = Binarizer(threshold=7)
pd_watched = bn.transform([df_test['Answers']])[0]
df_test['pd_watched'] = pd_watched

df_test = sc_X.fit_transform(df_test)
df_test_poly = poly_reg.fit_transform(df_test)

#pred_test = lin_reg_1.predict(df_test_poly)
pred_test = grid_clf.best_estimator_.predict(df_test_poly)
pred_test=abs(pred_test)

In [56]:
submission = pd.DataFrame({'ID': ids,
                           'Upvotes':pred_test
                           })

submission.to_csv("final_sub7.csv",index=False)