In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import RobustScaler
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
class ModifiedSimpleImputer(SimpleImputer):
    def transform(self, X):
        return super().transform(X).flatten()

def getRegressResults(regress, parameters, data):

  # PREPROCESSING
  # Target
  y = data['averageRating'].dropna()

  # Features preprocessing
  X = data.drop('averageRating', axis=1)
  transfo_name = Pipeline(steps=[
        ('imputation', SimpleImputer(strategy='most_frequent')),
        ('transf', TargetEncoder()),
  ])

  transfo_num = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='median')),
      ('scaling', RobustScaler())
  ])
  
  transfo_genres = Pipeline(steps=[
      ('imputation', ModifiedSimpleImputer(strategy='most_frequent')),
      ('vec', CountVectorizer())
  ])

  preparation = ColumnTransformer(
      transformers=[
          ('data_num', transfo_num , X.select_dtypes(exclude=['object']).columns),
          ('genres', transfo_genres, ['genres']),
          ('names', transfo_name, ['directors_name', 'actors', 'actress'])
          
      ])

  # train-test-split	
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)	

  # Pipeline and Model
  model = Pipeline(steps=[('preparation', preparation),
                          ('model', regress)])

  # Gridsearch
  grid = GridSearchCV(estimator = model, param_grid = parameters, scoring = 'r2', cv = 5, n_jobs =-1, verbose = 0)
  #grid = GridSearchCV(estimator = model, param_grid = parameters, scoring = 'r2', cv = 5, n_jobs =-1, verbose = 0)

  # Fit
  grid.fit(X_train, y_train)

  # Predict
  y_pred = grid.predict(X_test)
  test_score = metrics.r2_score(y_test, y_pred)

  # Results
  regress_results = []
  regress_results.append(grid.cv_results_['mean_fit_time'].mean().round(4))
  regress_results.append(grid.best_score_.round(4))
  regress_results.append(test_score.round(4))
  regress_results.append(grid.best_params_)

  return(regress_results)

def getRandomRegressResults(regress, parameters, data):

  # PREPROCESSING
  # Target
  y = data['averageRating'].dropna()

  # Features preprocessing
  X = data.drop('averageRating', axis=1)
  transfo_name = Pipeline(steps=[
        ('imputation', SimpleImputer(strategy='most_frequent')),
        ('transf', TargetEncoder()),
  ])

  transfo_num = Pipeline(steps=[
      ('imputation', SimpleImputer(strategy='median')),
      ('scaling', RobustScaler())
  ])
  
  transfo_genres = Pipeline(steps=[
      ('imputation', ModifiedSimpleImputer(strategy='most_frequent')),
      ('vec', CountVectorizer())
  ])

  preparation = ColumnTransformer(
      transformers=[
          ('data_num', transfo_num , X.select_dtypes(exclude=['object']).columns),
          ('genres', transfo_genres, ['genres']),
          ('names', transfo_name, ['directors_name', 'actors', 'actress'])
          
      ])

  # train-test-split	
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)	

  # Pipeline and Model
  model = Pipeline(steps=[('preparation', preparation),
                          ('model', regress)])

  # Gridsearch
  random = RandomizedSearchCV(estimator = model, param_distributions= parameters, scoring = 'r2', n_jobs =-1, cv = 5, verbose = 0)
  #grid = GridSearchCV(estimator = model, param_grid = parameters, scoring = 'r2', cv = 5, n_jobs =-1, verbose = 0)

  # Fit
  random.fit(X_train, y_train)

  # Predict
  y_pred = random.predict(X_test)
  test_score = metrics.r2_score(y_test, y_pred)

  # Results
  regress_results = []
  regress_results.append(random.cv_results_['mean_fit_time'].mean().round(4))
  regress_results.append(random.best_score_.round(4))
  regress_results.append(test_score.round(4))
  regress_results.append(random.best_params_)

  return(regress_results)

In [5]:
# Creation du dataset
df_all = pd.read_csv('../datasets/data_regression_sylvine.csv', index_col=0)
df_all = df_all.drop('tconst', axis=1)
df_all.head()

Unnamed: 0,actors,averageRating,numVotes,decade,runtimeMinutes,genres,directors,actress
0,"nm0585785,nm0707778,nm0166547,nm0473134",6.2,31,191.0,45.0,,nm0300487,"nm0631025,nm0003425,nm1270556,nm1495634"
1,"nm0846894,nm1431224,nm3002376",6.0,808,190.0,70.0,"Action,Adventure,Biography",nm0846879,nm0846887
2,"nm0064953,nm0085066,nm0169878,nm0299757,nm1834...",4.6,19,191.0,,Crime,nm0375839,nm0630641
3,"nm0104856,nm0251436,nm0526190,nm5188470,nm5217890",3.5,17,190.0,,"Drama,War",nm0022607,"nm1834127,nm0280746,nm0762935"
4,nm0735618,3.8,21,191.0,58.0,"Adventure,Drama","nm0063413,nm0550220",nm0699807


In [6]:
df_3000 = df_all[df_all['numVotes'] > 3000]
df_3000 = df_3000.drop('numVotes', axis=1).dropna(subset="averageRating")

df_all = df_all.drop('numVotes', axis=1).dropna(subset="averageRating")
df_all.head()

Unnamed: 0,actors,averageRating,decade,runtimeMinutes,genres,directors,actress
0,"nm0585785,nm0707778,nm0166547,nm0473134",6.2,191.0,45.0,,nm0300487,"nm0631025,nm0003425,nm1270556,nm1495634"
1,"nm0846894,nm1431224,nm3002376",6.0,190.0,70.0,"Action,Adventure,Biography",nm0846879,nm0846887
2,"nm0064953,nm0085066,nm0169878,nm0299757,nm1834...",4.6,191.0,,Crime,nm0375839,nm0630641
3,"nm0104856,nm0251436,nm0526190,nm5188470,nm5217890",3.5,190.0,,"Drama,War",nm0022607,"nm1834127,nm0280746,nm0762935"
4,nm0735618,3.8,191.0,58.0,"Adventure,Drama","nm0063413,nm0550220",nm0699807


In [7]:
class ModifiedSimpleImputer(SimpleImputer):
    def transform(self, X):
        return super().transform(X).flatten()

In [8]:
# PREPROCESSING
#choix modele
regress = AdaBoostRegressor(n_estimators=175)
data = df_3000

# Target
y = data['averageRating']

# Features preprocessing
X = data.drop('averageRating', axis=1)

transfo_num = Pipeline(steps=[
    ('imputation', ModifiedSimpleImputer(strategy='median')),
    ('scaling', RobustScaler())
])

transfo_cv = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('vec', CountVectorizer(tokenizer=lambda x: x.split(',')))
])

preparation = ColumnTransformer(
    transformers=[
        ('data_num', transfo_num , X.select_dtypes(exclude=['object']).columns),
        ('cv', transfo_cv, ['genres','actors','directors','actress']),    
    ])

preparation2 = ColumnTransformer(
    transformers=[
        ('cv', transfo_cv, ['genres']),    
    ])    

# train-test-split	
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)	

# Pipeline and Model
model = Pipeline(steps=[('preparation', preparation),
                        ('model', regress)])

model2 = Pipeline(steps=[('preparation', preparation2)]
                        )                        



In [11]:
imputer = SimpleImputer(strategy='most_frequent')
res = imputer.fit_transform(data['genres'])

res.head()
# coun_vect = CountVectorizer(tokenizer=lambda x: x.split(','))
# count_matrix = coun_vect.fit_transform(data['genres'])
# count_matrix

AttributeError: 'Series' object has no attribute 'reshape'

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

# TESTS DE DAVID

In [4]:
model = GradientBoostingRegressor(n_estimators=500, max_depth=4)
parameters = {}

In [6]:
getRegressResults(model, parameters, data=df)

[226.3668, 0.1753, 0.1922, {}]

In [7]:
model = AdaBoostRegressor(n_estimators=175)
parameters = {}

In [8]:
getRegressResults(model, parameters, data=df)

[24.2343, 0.0948, 0.1091, {}]

In [9]:
model = RandomForestRegressor()
parameters = {}

In [10]:
getRegressResults(model, parameters, data=df)

KeyboardInterrupt: 