In [55]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

In [None]:
def _dense_one_hot_encoder():
  """Build a dense-output OneHotEncoder that ignores unseen categories.

  scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
  later releases removed the old name, so the new keyword is tried first and
  the old one is used as a fallback for older installations.
  """
  try:
    return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
  except TypeError:
    return OneHotEncoder(handle_unknown='ignore', sparse=False)


def getRegressResults(regress, parameters, data):
  """Grid-search a regressor on ``data`` and summarise the outcome.

  The function builds a preprocessing + model pipeline, tunes it with a
  5-fold ``GridSearchCV`` scored with R², and evaluates the refitted best
  estimator on a 20% hold-out split.

  Parameters
  ----------
  regress : estimator
      Regressor placed in the pipeline step named ``'model'``.
  parameters : dict or list of dict
      Grid passed to ``GridSearchCV(param_grid=...)``. Because the grid is
      applied to the whole pipeline, regressor hyperparameters need the
      ``model__`` prefix (e.g. ``{'model__alpha': [...]}``).
  data : pandas.DataFrame
      Must contain the target column ``'averageRating'``; every other column
      is used as a feature. Object-dtype columns are one-hot encoded (dense),
      so the caller should drop identifier-like / very high-cardinality
      columns beforehand.

  Returns
  -------
  list
      ``[mean fit time over all candidates, best mean CV R², hold-out R²,
      best parameter dict]`` with the three numbers rounded to 4 decimals.
  """
  # PREPROCESSING
  # Rows without a target are removed from the whole frame (not only from y)
  # so that features and target stay row-aligned.
  labelled = data.dropna(subset=['averageRating'])

  # Target
  y = labelled['averageRating']

  # Features preprocessing
  X = labelled.drop(columns='averageRating')
  transfo_cat = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='most_frequent')),
      ('onehot', _dense_one_hot_encoder())
  ])

  transfo_num = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='median')),
      ('scaling', RobustScaler())
  ])

  preparation = ColumnTransformer(
      transformers=[
          ('data_cat', transfo_cat, X.select_dtypes(include=['object']).columns),
          ('data_num', transfo_num, X.select_dtypes(exclude=['object']).columns)
      ])

  # train-test-split
  # No stratification: the target is continuous, and stratifying on it fails
  # as soon as one rating value occurs only once.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=2)

  # Pipeline and Model
  model = Pipeline(steps=[('preparation', preparation),
                          ('model', regress)])

  # Gridsearch
  # R² is a regression metric and matches the hold-out metric used below;
  # 'f1_weighted' is a classification score and cannot handle a continuous y.
  grid = GridSearchCV(estimator=model, param_grid=parameters, scoring='r2',
                      cv=5, n_jobs=-1, verbose=0)

  # Fit
  grid.fit(X_train, y_train)

  # Predict
  y_pred = grid.predict(X_test)
  test_score = r2_score(y_test, y_pred)

  # Results
  # round(float(...)) works whether the value is a NumPy scalar or a plain
  # Python float (which has no .round() method).
  regress_results = [
      round(float(grid.cv_results_['mean_fit_time'].mean()), 4),
      round(float(grid.best_score_), 4),
      round(float(test_score), 4),
      grid.best_params_,
  ]

  return regress_results

In [49]:
# Load the regression dataset; the first CSV column is used as the row index.
df = pd.read_csv('data_regression.csv', index_col=0)
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900040 entries, 0 to 900039
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          900040 non-null  object 
 1   genres          862127 non-null  object 
 2   runtimeMinutes  786137 non-null  float64
 3   averageRating   900040 non-null  float64
 4   director_name   900040 non-null  object 
 5   actor_name      900040 non-null  object 
 6   actress_name    900040 non-null  object 
dtypes: float64(2), object(5)
memory usage: 54.9+ MB


In [50]:
# Target column of the regression; the bare name below displays it.
y = df['averageRating']
y

0         9.0
1         9.0
2         9.0
3         9.0
4         9.0
         ... 
900035    4.8
900036    4.8
900037    6.1
900038    6.1
900039    6.1
Name: averageRating, Length: 900040, dtype: float64

In [51]:
# Features = every column except the target.
X = df.drop(columns='averageRating')

# Float-typed features.
X_num = X.select_dtypes(include=[float])

# Object-typed features, with the 'tconst' column left out.
X_cat = X.select_dtypes(include=[object]).drop(columns='tconst')

In [57]:
# Alphabetical list of the parameter names returned by SGDRegressor.get_params().
sorted(SGDRegressor().get_params())

['alpha',
 'average',
 'early_stopping',
 'epsilon',
 'eta0',
 'fit_intercept',
 'l1_ratio',
 'learning_rate',
 'loss',
 'max_iter',
 'n_iter_no_change',
 'penalty',
 'power_t',
 'random_state',
 'shuffle',
 'tol',
 'validation_fraction',
 'verbose',
 'warm_start']