In [50]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train_process_2.csv')
test = pd.read_csv('test_process_2.csv')

In [15]:
X_train = train.iloc[:,0:96]
y_train = train['averageRating']
X_test = test.iloc[:,0:96]
y_test = test['averageRating']
print('X_train: {}\nX_test: {}\ny_train:{}\ny_test:{}'.format(X_train.shape, X_test.shape, y_train.shape, y_test.shape))

X_train: (29760, 96)
X_test: (7440, 96)
y_train:(29760,)
y_test:(7440,)


# without text / regression / fine tune

## lightGBM 

In [16]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

  import pandas.util.testing as tm


In [17]:
param_grid = {'num_leaves': [27, 31, 35],
         'bagging_fraction': [1, 0.9, 0.8],
         'feature_fraction': [1, 0.9, 0.8]}

model_lgb = lgb.LGBMRegressor()
model_lgb_cv = GridSearchCV(model_lgb, param_grid, cv=5)
model_lgb_cv.fit(X_train, y_train)









GridSearchCV(cv=5, estimator=LGBMRegressor(),
             param_grid={'bagging_fraction': [1, 0.9, 0.8],
                         'feature_fraction': [1, 0.9, 0.8],
                         'num_leaves': [27, 31, 35]})

In [18]:
print('Best parameters are:\n', model_lgb_cv.best_params_)

Best parameters are:
 {'bagging_fraction': 1, 'feature_fraction': 0.8, 'num_leaves': 31}


In [19]:
from sklearn.metrics import mean_squared_error

y_pred = model_lgb_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned lightGBM regressor MSE: {}".format(mse))

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print("Tuned lightGBM regressor MAE: {}".format(mae))

Tuned lightGBM regressor MSE: 0.6972313631000047
Tuned lightGBM regressor MAE: 0.6301406093599041


## random forest 

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
param_grid = {'n_estimators': [90, 100, 110],
         'min_samples_split': [2, 4, 6, 8],
         'bootstrap': [True, False]}

model_rf = RandomForestRegressor()
model_rf_cv = GridSearchCV(model_rf, param_grid, cv=5)
model_rf_cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'bootstrap': [True, False],
                         'min_samples_split': [2, 4, 6, 8],
                         'n_estimators': [90, 100, 110]})

In [23]:
print('Best parameters are:\n', model_rf_cv.best_params_)

Best parameters are:
 {'bootstrap': True, 'min_samples_split': 8, 'n_estimators': 110}


In [24]:
y_pred = model_rf_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned random forest regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned random forest regressor MAE: {}".format(mae))

Tuned random forest regressor MSE: 0.7335679642763566
Tuned random forest regressor MAE: 0.6446698979365978


## xgboost 

In [25]:
import xgboost as xgb

In [27]:
param_grid = {'max_depth': [4, 6, 8, 10],
         'subsample': [1, 0.9, 0.8],
         'eta': [0.2, 0.3, 0.4]}

model_xgb = xgb.XGBRegressor()
model_xgb_cv = GridSearchCV(model_xgb, param_grid, cv=5)
model_xgb_cv.fit(X_train, y_train)







GridSearchCV(cv=5, estimator=XGBRegressor(),
             param_grid={'eta': [0.2, 0.3, 0.4], 'max_depth': [4, 6, 8, 10],
                         'subsample': [1, 0.9, 0.8]})

In [28]:
print('Best parameters are:\n', model_xgb_cv.best_params_)

Best parameters are:
 {'eta': 0.2, 'max_depth': 8, 'subsample': 0.8}


In [29]:
y_pred = model_xgb_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned random forest regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned random forest regressor MAE: {}".format(mae))

Tuned random forest regressor MSE: 0.6946319491710811
Tuned random forest regressor MAE: 0.6283762846710862


## Linear model

In [46]:
from sklearn.linear_model import ElasticNet

In [47]:
param_grid = {'l1_ratio':np.linspace(0,1,30)}

model_linear = ElasticNet()
model_linear_cv = GridSearchCV(model_linear, param_grid, cv=5)
model_linear_cv.fit(X_train, y_train)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'l1_ratio': array([0.        , 0.03448276, 0.06896552, 0.10344828, 0.13793103,
       0.17241379, 0.20689655, 0.24137931, 0.27586207, 0.31034483,
       0.34482759, 0.37931034, 0.4137931 , 0.44827586, 0.48275862,
       0.51724138, 0.55172414, 0.5862069 , 0.62068966, 0.65517241,
       0.68965517, 0.72413793, 0.75862069, 0.79310345, 0.82758621,
       0.86206897, 0.89655172, 0.93103448, 0.96551724, 1.        ])})

In [48]:
print('Best parameters are:\n', model_linear_cv.best_params_)

Best parameters are:
 {'l1_ratio': 0.0}


In [49]:
y_pred = model_linear_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned elastic net regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned elastic net regressor MAE: {}".format(mae))

Tuned elastic net regressor MSE: 1.2039824665943668
Tuned elastic net regressor MAE: 0.8435908569256084


In [42]:
from sklearn.linear_model import LinearRegression

In [43]:
param_grid = {'normalize': [True, False]}

model_linear = LinearRegression()
model_linear_cv = GridSearchCV(model_linear, param_grid, cv=5)
model_linear_cv.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid={'normalize': [True, False]})

In [44]:
print('Best parameters are:\n', model_linear_cv.best_params_)

Best parameters are:
 {'normalize': False}


In [45]:
y_pred = model_linear_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned linear regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned linear regressor MAE: {}".format(mae))

Tuned linear regressor MSE: 0.844070742498122
Tuned linear regressor MAE: 0.6987720809444304


## decision tree 

In [51]:
from sklearn import tree

In [70]:
param_grid = {'min_samples_split': [130, 140, 150, 160, 170, 180, 190, 200]}

model_tree = tree.DecisionTreeRegressor()
model_tree_cv = GridSearchCV(model_tree, param_grid, cv=5)
model_tree_cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'min_samples_split': [130, 140, 150, 160, 170, 180,
                                               190, 200]})

In [71]:
print('Best parameters are:\n', model_tree_cv.best_params_)

Best parameters are:
 {'min_samples_split': 180}


In [72]:
y_pred = model_tree_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned decision tree regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned decision tree regressor MAE: {}".format(mae))

Tuned decision tree regressor MSE: 0.7856169503951305
Tuned decision tree regressor MAE: 0.6687023308269576
