In [169]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

In [170]:
def getRegressResults(regress, parameters, data):
  """Grid-search a regressor on `data` and summarise timing and R² scores.

  The target is the 'averageRating' column; every other column is a feature.
  Object-dtype columns are imputed (most frequent value), one-hot encoded and
  reduced with a default-configured TruncatedSVD; all remaining columns are
  imputed (median) and scaled with RobustScaler.

  Parameters
  ----------
  regress : estimator
      Unfitted regressor placed in the final pipeline step, named 'model'.
  parameters : dict
      Parameter grid passed to GridSearchCV. Keys address pipeline steps,
      e.g. 'model__loss'.
  data : pandas.DataFrame
      Must contain an 'averageRating' column. Rows whose target is missing
      are ignored; the caller's frame is not modified.

  Returns
  -------
  list
      [mean fit time over all candidates, best cross-validated R²,
       R² on the held-out test split, best parameter dict].
      The three numbers are rounded to 4 decimals.
  """
  target = 'averageRating'

  # PREPROCESSING
  # Remove rows with a missing target from the whole frame, not only from y,
  # so that X and y keep the same rows and the same length.
  data = data.dropna(subset=[target])
  y = data[target]
  X = data.drop(columns=target)

  # Categorical branch: impute -> one-hot -> dimensionality reduction.
  transfo_cat = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='most_frequent')),
      ('onehot', OneHotEncoder(handle_unknown='ignore')),
      ('dr', TruncatedSVD())
  ])

  # Numerical branch: impute -> outlier-robust scaling.
  transfo_num = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='median')),
      ('scaling', RobustScaler())
  ])

  preparation = ColumnTransformer(
      transformers=[
          ('data_cat', transfo_cat, X.select_dtypes(include=['object']).columns),
          ('data_num', transfo_num, X.select_dtypes(exclude=['object']).columns)
      ])

  # Train/test split.
  # NOTE(review): stratify=y treats every distinct rating value as a class;
  # the split is expected to fail if some rating occurs only once - confirm
  # this is intended for a continuous target.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=2, stratify=y)

  # Preprocessing lives inside the pipeline so it is re-fitted on each CV fold.
  model = Pipeline(steps=[('preparation', preparation),
                          ('model', regress)])

  # Grid search
  grid = GridSearchCV(estimator=model, param_grid=parameters, scoring='r2',
                      cv=5, n_jobs=-1, verbose=0)

  # Fit
  grid.fit(X_train, y_train)

  # Predict on the held-out split
  y_pred = grid.predict(X_test)
  test_score = metrics.r2_score(y_test, y_pred)

  # Results: convert to builtin float before rounding so this works whether
  # the scores are NumPy scalars or plain Python floats.
  return [
      round(float(grid.cv_results_['mean_fit_time'].mean()), 4),
      round(float(grid.best_score_), 4),
      round(float(test_score), 4),
      grid.best_params_,
  ]

In [171]:
# Load the regression dataset (first CSV column becomes the index), discard
# the cast-name columns and keep a single row per `tconst`.
df = (
    pd.read_csv('data_regression.csv', index_col=0)
      .drop(columns=['actor_name', 'actress_name'])
      .drop_duplicates(subset='tconst')
)
df

Unnamed: 0,tconst,genres,runtimeMinutes,averageRating,director_name
0,tt19730260,Musical,154.0,9.0,Haricheth
7,tt14691678,Drama,147.0,8.3,S. Mahendar
8,tt12716284,"Comedy,Romance",,5.1,Dinesh Babu
10,tt9130460,"Action,Comedy",,4.7,Om Sai Prakash
11,tt14333040,Drama,153.0,4.3,Om Sai Prakash
...,...,...,...,...,...
900014,tt19712468,Drama,70.0,8.9,S.S. Jishnu Dev
900017,tt1971393,"Action,Drama,Thriller",92.0,5.3,Fernando A. Mico
900020,tt19715754,Documentary,,8.5,Ali Necati Kumcuoglu
900030,tt19727878,"Horror,Mystery,Thriller",118.0,4.8,Any Gacha


In [176]:
# Linear baseline trained by stochastic gradient descent.
# Seeded so that re-running the notebook reproduces the same scores; the
# value matches the train/test split seed used in getRegressResults.
model = SGDRegressor(random_state=2)
parameters = {
    "model__loss": ['squared_error']
}

In [177]:
# Evaluate the SGD model; the returned list is
# [mean fit time, best CV R², test R², best params].
getRegressResults(regress=model, parameters=parameters, data=df)

[3.9679, -0.0203, 0.0402, {'model__loss': 'squared_error'}]

In [178]:
# Random forest candidate.
# Seeded so that re-running the notebook reproduces the same scores; the
# value matches the train/test split seed used in getRegressResults.
model = RandomForestRegressor(random_state=2)
parameters = {
    # range(50, 150, 50) yields 50 and 100 trees.
    "model__n_estimators": list(range(50, 150, 50))
}

In [179]:
# Evaluate the random forest; the returned list is
# [mean fit time, best CV R², test R², best params].
getRegressResults(regress=model, parameters=parameters, data=df)

[29.1322, 0.0355, 0.0414, {'model__n_estimators': 100}]