In [139]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from hyperopt import fmin, tpe, hp, anneal, Trials

import numpy as np

from sklearn.model_selection import KFold, cross_val_score

random_state=42
num_folds=2
kf = KFold(n_splits=num_folds, random_state=random_state)

In [2]:
train_data = pd.read_csv('../data/train_final.csv')
val_data = pd.read_csv('../data/val_final.csv')
test_data = pd.read_csv('../data/test_final.csv')

In [3]:
data = pd.concat([train_data, val_data, test_data])

In [125]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=random_state, shuffle=True)

In [126]:
print(train_data.shape)
print(test_data.shape)
train_X = train_data.drop(columns=['posterID','imdb_score'])
test_X = test_data.drop(columns=['posterID','imdb_score'])

(15080, 5910)
(3770, 5910)


In [127]:
cols = list(map(str, range(len(train_X.columns))))

In [128]:
train_X.columns = cols
test_X.columns = cols

## Kaggle 

In [133]:
def gb_mse_cv(params, random_state=random_state, cv=kf, X=train_X, y=train_data['imdb_score']):
    # the function gets a set of variable parameters in "param"
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'learning_rate': params['learning_rate']}
    
    # we use this params to create a new LGBM Regressor
    model = lgb.LGBMRegressor(random_state=random_state, **params)
    
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error", n_jobs=-1).mean()

    return score

In [136]:
%%time
n_iter = 10
# possible values of parameters
space={'n_estimators': hp.quniform('n_estimators', 100, 2000, 1),
       'max_depth' : hp.quniform('max_depth', 2, 20, 1),
       'learning_rate': hp.loguniform('learning_rate', -5, 0)
      }

# trials will contain logging information
trials = Trials()

best=fmin(fn=gb_mse_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )

# computing the score on the test set
model = lgb.LGBMRegressor(random_state=random_state, n_estimators=int(best['n_estimators']),
                      max_depth=int(best['max_depth']),learning_rate=best['learning_rate'])
model.fit(train_X, train_data['imdb_score'])
tpe_test_score=mean_squared_error(test_data['imdb_score'], model.predict(test_X))

print("Best MSE {:.3f} params {}".format( gb_mse_cv(best), best))

100%|██████████| 10/10 [04:12<00:00, 25.24s/trial, best loss: 0.883686585070359]
Best MSE 0.884 params {'learning_rate': 0.009516355818546441, 'max_depth': 17.0, 'n_estimators': 964.0}
CPU times: user 1min 22s, sys: 3.76 s, total: 1min 26s
Wall time: 4min 47s


In [137]:
y_pred = model.predict(test_X)
print('The mse of prediction on test is: ', mean_squared_error(test_data['imdb_score'], y_pred))
print('r2 score on test is: ', r2_score(test_data['imdb_score'], y_pred))

The mse of prediction on test is:  0.8067169142666126
r2 score on test is:  0.4513371218065787


In [140]:
model.booster_.save_model('lightgbm_model.txt')

<lightgbm.basic.Booster at 0x7fec499232d0>

In [141]:
reg = lgb.Booster(model_file='lightgbm_model.txt')

In [142]:
y_pred = reg.predict(test_X)
print('The mse of prediction on test is: ', mean_squared_error(test_data['imdb_score'], y_pred))
print('r2 score on test is: ', r2_score(test_data['imdb_score'], y_pred))

The mse of prediction on test is:  0.8067169142666126
r2 score on test is:  0.4513371218065787


In [143]:
y_pred1 = model.predict(train_X)
print('The mse of prediction on train is: ', mean_squared_error(train_data['imdb_score'], y_pred1))
print('r2 score on train is: ', r2_score(train_data['imdb_score'], y_pred1))

The mse of prediction on train is:  0.648812785414066
r2 score on train is:  0.5592180572657477
