In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV



In [2]:
train = pd.read_csv('train_process_2.csv')
test = pd.read_csv('test_process_2.csv')

In [3]:
X_train = train.drop(columns=['averageRating'])
y_train = train['averageRating']
X_test = test.drop(columns=['averageRating'])
y_test = test['averageRating']
print('X_train: {}\nX_test: {}\ny_train:{}\ny_test:{}'.format(X_train.shape, X_test.shape, y_train.shape, y_test.shape))

X_train: (29760, 865)
X_test: (7440, 865)
y_train:(29760,)
y_test:(7440,)


# with text / regression / fine tune

##  xgboost

In [4]:
import xgboost as xgb

In [5]:
param_grid = {'max_depth': [4, 6, 8, 10],
         'subsample': [1, 0.9, 0.8]}

model_xgb = xgb.XGBRegressor()
model_xgb_cv = GridSearchCV(model_xgb, param_grid, cv=5)
model_xgb_cv.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=XGBRegressor(),
             param_grid={'max_depth': [4, 6, 8, 10],
                         'subsample': [1, 0.9, 0.8]})

In [6]:
print('Best parameters are:\n', model_xgb_cv.best_params_)

Best parameters are:
 {'max_depth': 6, 'subsample': 1}


In [7]:
y_pred = model_xgb_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned random forest regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned random forest regressor MAE: {}".format(mae))

Tuned random forest regressor MSE: 0.6323248020340614
Tuned random forest regressor MAE: 0.6063349912115322


## linear model 

In [13]:
from sklearn.linear_model import ElasticNet

In [14]:
param_grid = {'l1_ratio':np.linspace(0,1,30)}

model_linear = ElasticNet()
model_linear_cv = GridSearchCV(model_linear, param_grid, cv=5)
model_linear_cv.fit(X_train, y_train)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'l1_ratio': array([0.        , 0.03448276, 0.06896552, 0.10344828, 0.13793103,
       0.17241379, 0.20689655, 0.24137931, 0.27586207, 0.31034483,
       0.34482759, 0.37931034, 0.4137931 , 0.44827586, 0.48275862,
       0.51724138, 0.55172414, 0.5862069 , 0.62068966, 0.65517241,
       0.68965517, 0.72413793, 0.75862069, 0.79310345, 0.82758621,
       0.86206897, 0.89655172, 0.93103448, 0.96551724, 1.        ])})

In [15]:
print('Best parameters are:\n', model_linear_cv.best_params_)

Best parameters are:
 {'l1_ratio': 0.0}


In [16]:
y_pred = model_linear_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned elastic net regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned elastic net regressor MAE: {}".format(mae))

Tuned elastic net regressor MSE: 1.075296291681353
Tuned elastic net regressor MAE: 0.7945807113951325


In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
param_grid = {'normalize': [True, False]}

model_linear = LinearRegression()
model_linear_cv = GridSearchCV(model_linear, param_grid, cv=5)
model_linear_cv.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid={'normalize': [True, False]})

In [11]:
print('Best parameters are:\n', model_linear_cv.best_params_)

Best parameters are:
 {'normalize': True}


In [12]:
y_pred = model_linear_cv.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Tuned linear regressor MSE: {}".format(mse))

mae = mean_absolute_error(y_test, y_pred)
print("Tuned linear regressor MAE: {}".format(mae))

Tuned linear regressor MSE: 0.742434698835496
Tuned linear regressor MAE: 0.661129898563508
