In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [34]:
class ModifiedSimpleImputer(SimpleImputer):
    """SimpleImputer variant whose ``transform`` result is flattened.

    Fitting is inherited unchanged from ``SimpleImputer``; only the shape of
    the transformed output differs.
    """

    def transform(self, X):
        """Impute ``X`` with the parent class, then return ``.flatten()`` of the result."""
        imputed = super().transform(X)
        return imputed.flatten()

def getRegressResults(regress, parameters, data):
  """Grid-search a regressor inside the preprocessing pipeline and report its scores.

  Parameters
  ----------
  regress : estimator
      Unfitted regressor used as the final 'model' step of the pipeline.
  parameters : dict or list of dict
      Forwarded to GridSearchCV as ``param_grid``. The searched estimator is
      the whole pipeline, whose steps are named 'preparation' and 'model'.
  data : pandas.DataFrame
      Must contain 'averageRating' (the target) plus the 'genres',
      'directors_name', 'actors' and 'actress' columns. Every remaining
      non-object column is handled as a numeric feature.

  Returns
  -------
  list
      ``[mean of cv_results_['mean_fit_time'], best CV r2, r2 on the held-out
      20 %, best_params_]``; the three numbers are rounded to 4 decimals.
  """

  # PREPROCESSING
  # Rows without a rating are removed from the frame itself, so that X and y
  # always describe the same rows (dropping them from y alone leaves X longer
  # than y and makes the split fail).
  labelled = data.dropna(subset=['averageRating'])
  y = labelled['averageRating']
  X = labelled.drop('averageRating', axis=1)

  # Name columns: fill gaps with the most frequent value, then target-encode.
  transfo_name = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='most_frequent')),
      ('transf', TargetEncoder()),
  ])

  # Numeric columns: median imputation followed by robust scaling.
  transfo_num = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='median')),
      ('scaling', RobustScaler())
  ])

  # Genres: bag-of-words counts over the genre string.
  transfo_genres = Pipeline(steps=[
      ('vec', CountVectorizer())
  ])

  preparation = ColumnTransformer(
      transformers=[
          ('data_num', transfo_num, X.select_dtypes(exclude=['object']).columns),
          # Scalar column name on purpose: CountVectorizer needs a 1-D sequence
          # of documents, and a list selector would hand it a 2-D frame.
          ('genres', transfo_genres, 'genres'),
          ('names', transfo_name, ['directors_name', 'actors', 'actress'])
      ])

  # train-test-split
  # NOTE(review): stratify=y treats every distinct rating value as its own
  # class; scikit-learn rejects the split if some value occurs only once.
  # Confirm this still holds when the function is used on new data.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=2, stratify=y)

  # Pipeline and Model
  model = Pipeline(steps=[('preparation', preparation),
                          ('model', regress)])

  # Gridsearch
  grid = GridSearchCV(estimator=model, param_grid=parameters, scoring='r2',
                      cv=5, n_jobs=-1, verbose=0)

  # Fit
  grid.fit(X_train, y_train)

  # Predict
  y_pred = grid.predict(X_test)
  test_score = metrics.r2_score(y_test, y_pred)

  # Results
  # Builtin round() is used instead of the .round() method because r2_score
  # may return a plain Python float, which has no such method; round() works
  # for both builtin and NumPy floats.
  regress_results = [
      round(grid.cv_results_['mean_fit_time'].mean(), 4),
      round(grid.best_score_, 4),
      round(test_score, 4),
      grid.best_params_,
  ]

  return regress_results

def train_model(regress, data):
  """Fit the preprocessing + regression pipeline on 80 % of ``data``.

  Parameters
  ----------
  regress : estimator
      Unfitted regressor used as the final 'model' step of the pipeline.
  data : pandas.DataFrame
      Must contain 'averageRating' (the target) plus the 'genres',
      'directors_name', 'actors' and 'actress' columns. Every remaining
      non-object column is handled as a numeric feature.

  Returns
  -------
  sklearn.pipeline.Pipeline
      The fitted pipeline (steps 'preparation' and 'model').
  """

  # PREPROCESSING
  # Rows without a rating cannot be learned from; dropping them from the frame
  # keeps X and y aligned and matches what getRegressResults does.
  labelled = data.dropna(subset=['averageRating'])
  y = labelled['averageRating']
  X = labelled.drop('averageRating', axis=1)

  # Name columns: fill gaps with the most frequent value, then target-encode.
  transfo_name = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='most_frequent')),
      ('transf', TargetEncoder()),
  ])

  # Numeric columns: median imputation followed by robust scaling.
  transfo_num = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='median')),
      ('scaling', RobustScaler())
  ])

  # Genres: bag-of-words counts over the genre string.
  transfo_genres = Pipeline(steps=[
      ('vec', CountVectorizer())
  ])

  preparation = ColumnTransformer(
      transformers=[
          ('data_num', transfo_num, X.select_dtypes(exclude=['object']).columns),
          # Scalar column name: CountVectorizer expects a 1-D sequence of documents.
          ('genres', transfo_genres, 'genres'),
          ('names', transfo_name, ['directors_name', 'actors', 'actress'])
      ])

  # train-test-split
  # Only the training part is used here; the 20 % hold-out is discarded so the
  # fit sees the same rows as the training split in getRegressResults.
  # NOTE(review): stratify=y treats every distinct rating value as its own
  # class; scikit-learn rejects the split if some value occurs only once.
  X_train, _, y_train, _ = train_test_split(
      X, y, test_size=0.2, random_state=2, stratify=y)

  # Pipeline and Model
  model = Pipeline(steps=[('preparation', preparation),
                          ('model', regress)])
  return model.fit(X_train, y_train)

In [35]:
# Load the regression dataset, drop the 'tconst' column, turn the
# comma-separated genre list into space-separated words, and keep only
# complete rows.
df = (
    pd.read_csv('datasets/data_regression.csv', index_col=0)
    .drop('tconst', axis=1)
    .assign(genres=lambda frame: frame['genres'].str.replace(',', ' '))
    .dropna()
)
df

Unnamed: 0,averageRating,decade,runtimeMinutes,genres,directors_name,actors,actress
1,6.1,191.0,48.0,Comedy,Ernst Lubitsch,Harry Liedtke,Agda Nilsson
8,6.7,192.0,120.0,Adventure Fantasy,Joe May,Olaf Fønss,Mia May
9,6.7,192.0,100.0,Adventure Fantasy,Joe May,Olaf Fønss,Mia May
10,5.7,191.0,80.0,Horror,Joe May,Hermann Picha,Mia May
11,6.0,192.0,67.0,Romance,Fritz Lang,Hans Marr,Mia May
...,...,...,...,...,...,...,...
211617,6.2,202.0,101.0,Drama,Morgan Ingari,Michael Judson Berry,Molly Bernard
211618,6.3,201.0,91.0,Drama War,José Luis Rugeles,Carlos Clavijo,Carmenza González
211619,4.0,196.0,83.0,Horror,Joseph Adler,Ross Harris,Eugenie Wingate
211620,7.7,201.0,91.0,Thriller,Jon-Claire Lee,Asif Khan,Lucy Quill


In [36]:
# Optional grid-search setup, currently disabled (uncomment to use).
# model = GradientBoostingRegressor(n_estimators=500, max_depth=4)
# parameters = {}

In [37]:
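# Disabled grid-search run. Note: getRegressResults returns a list of scores
# and the best parameters, not a fitted model.
# model = getRegressResults(model, parameters, data=df)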

In [38]:
# Fit the final pipeline. random_state is fixed so that re-running the notebook
# rebuilds the same model (the seed matches the one used for the data split).
gbr = GradientBoostingRegressor(n_estimators=500, max_depth=4, random_state=2)
model = train_model(gbr, data=df)

In [None]:
# Persist the fitted pipeline. The context manager closes the file even when
# pickle.dump raises, so no open handle is left behind.
with open("model_reg.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)