In [112]:
import numpy as np
import pandas as pd
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.selection import DropFeatures

from catboost import CatBoostRegressor

# MODELE JP BOX OFFICE

## Importation du pickle

In [113]:
data = pd.read_pickle("input_datasets/dataset-jp.pkl")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4533 entries, 0 to 4534
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   acteurs                   4533 non-null   object        
 1   budget                    4533 non-null   int64         
 2   compositeur               655 non-null    object        
 3   date                      4533 non-null   datetime64[ns]
 4   entrees_premiere_semaine  4533 non-null   int64         
 5   franchise                 4533 non-null   category      
 6   genre                     4533 non-null   category      
 7   pays                      4533 non-null   object        
 8   producteur                585 non-null    object        
 9   realisateur               1926 non-null   object        
 10  remake                    4533 non-null   category      
 11  studio                    4533 non-null   object        
 12  titre                    

## Création des sets

In [114]:
X = data.drop("entrees_premiere_semaine", axis=1)
y = data.entrees_premiere_semaine

In [115]:
cols_drop = ["acteurs", "compositeur", "date", "pays", "producteur", "realisateur", "titre"]

X = X.drop(cols_drop, axis=1)

display(X.info())
display(X.head())

<class 'pandas.core.frame.DataFrame'>
Index: 4533 entries, 0 to 4534
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   budget          4533 non-null   int64   
 1   franchise       4533 non-null   category
 2   genre           4533 non-null   category
 3   remake          4533 non-null   category
 4   studio          4533 non-null   object  
 5   is_compositeur  4533 non-null   category
 6   annee           4533 non-null   category
 7   origine         4533 non-null   category
dtypes: category(6), int64(1), object(1)
memory usage: 133.4+ KB


None

Unnamed: 0,budget,franchise,genre,remake,studio,is_compositeur,annee,origine
0,130000000,1,Fantasy,0,Warner Bros.,1,2004,Etats-Unis
1,150000000,1,Fantasy,0,Warner Bros.,0,2007,Etats-Unis
2,27800000,0,Comédie,0,Pathé,0,2018,France
3,225000000,1,Aventure - Action,0,Walt Disney Pictures,1,2006,Etats-Unis
4,100000000,1,Fantasy,0,Warner Bros.,1,2002,Etats-Unis


In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42)

## Preprocessing

In [117]:
object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["annee"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
year_col = ["annee"]

# Ordinal encoding for ApprovalFY
unique_years = sorted(data["annee"].unique())

In [118]:
preprocessing = ColumnTransformer([
        ("onehot", OneHotEncoder(), cat_cols),
        ("frequency", CountFrequencyEncoder(encoding_method="frequency", missing_values="ignore"), object_cols),
        ("scaler", StandardScaler(), num_cols),
        ("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=2000), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

### Obtention des indices de colonnes post processing pour les features catégorielles

In [119]:
pre_fit = preprocessing.fit(X_train)
fit_cols = pre_fit.get_feature_names_out()

cat_indices = []
for i, col_name in enumerate(fit_cols):
    if col_name in cat_cols:
        cat_indices.append(i)

## Pipeline modèle Catboost

### Création pipeline et fitting

In [120]:
catb = CatBoostRegressor(one_hot_max_size=70, verbose=0, cat_features=cat_indices, random_state=42)

pipe_cb = make_pipeline(preprocessing, catb)

In [121]:
pipe_cb.fit(X_train, y_train)

### Métriques

In [122]:
pred_test = pipe_cb.predict(X_test)

score = pipe_cb.score(X_test, y_test)
r2 = r2_score(y_test, pred_test)
mse = mean_squared_error(y_test, pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred_test)

print("Score :", score)
print("Score R2 :", r2)
print("Score MSE :", mse)
print("Score RMSE", rmse)
print("Score MAE :", mae)

Score : 0.5434074386098526
Score R2 : 0.5434074386098526
Score MSE : 80296897701.55061
Score RMSE 283367.07236648124
Score MAE : 155231.8914519661




#### Features importance

In [123]:
preprocessed_features = pipe_cb.named_steps['columntransformer'].get_feature_names_out()
cb_model = pipe_cb.named_steps['catboostregressor']

feat_imp = pd.DataFrame({'features': preprocessed_features, 'score': cb_model.feature_importances_})
feat_imp.sort_values(by=['score'], ascending=False, inplace=True)
display(feat_imp)

Unnamed: 0,features,score
28,budget,31.546383
26,origine_France,10.566321
29,annee,10.364965
1,franchise_1,9.985147
0,franchise_0,9.752177
27,studio,9.560517
10,genre_Fantasy,3.2327
6,genre_Comédie,2.932094
22,is_compositeur_0,2.473699
2,genre_Animation,1.668116


### Exportation model catb

In [124]:
with open("models/model_cb.pkl", "wb") as f:
  pickle.dump(pipe_cb, f)
f.close()

## Pipeline modèle Lasso

In [125]:
lasso_model = Lasso(alpha=1, random_state=42)

pipe_lasso = make_pipeline(preprocessing, lasso_model)

display(pipe_lasso)

In [126]:
pipe_lasso.fit(X_train, y_train)

In [127]:
# pipe_lasso.score(X_test, y_test)

## Pipeline modèle Linear Regression

In [128]:
lr_model = LinearRegression()

pipe_lr = make_pipeline(preprocessing, lr_model)

display(pipe_lr)

In [129]:
pipe_lr.fit(X_train, y_train)

In [130]:
# test_score = pipe_lr.score(X_test, y_test)

## Pipeline modèle XGBoost

In [131]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from category_encoders.count import CountEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb

X = data.drop("entrees_premiere_semaine", axis=1)
y = data.entrees_premiere_semaine

# cols_drop = ["acteurs", "compositeur", "date", "pays", "producteur", "realisateur", "titre"]
# Liste des colonnes à garder
cols_keep = ["budget", "franchise", "genre", "remake", "studio", "is_compositeur", "annee", "origine"]
X = X[cols_keep]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42)

object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["annee"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
year_col = ["annee"]

# Ordinal encoding for ApprovalFY
unique_years = sorted(data["annee"].unique())
preprocessing = ColumnTransformer([
        ("onehot", OneHotEncoder(), cat_cols),
        ("frequency", CountEncoder(), object_cols),
        ("scaler", StandardScaler(), num_cols),
        ("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=2000), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

xgb_model = xgb.XGBRegressor(random_state=42)

# Définition de la grille des hyperparamètres à tester
# {'xgbregressor__colsample_bytree': 0.9, 'xgbregressor__gamma': 0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 
#'xgbregressor__min_child_weight': 0.5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_alpha': 0.05, 'xgbregressor__reg_lambda': 0.01,
# 'xgbregressor__scale_pos_weight': 1, 'xgbregressor__subsample': 1.0}

param_grid = {
    'xgbregressor__n_estimators': [100],#[50, 100, 150],
    'xgbregressor__learning_rate': [0.1],#[0.5, 0.1, 1.5],
    'xgbregressor__max_depth': [4],#[3.4, 4, 4.5],
    'xgbregressor__min_child_weight': [0.5],#[0.5, 1, 2],
    'xgbregressor__gamma': [0],#[0.0, 0.01, 0.02],
    'xgbregressor__subsample': [1.0],#[0.8, 0.9, 1.0],
    'xgbregressor__colsample_bytree': [0.9],#[0.8, 0.9, 1.0],
    'xgbregressor__reg_alpha': [0.05],#[0, 0.001, 0.005, 0.01, 0.05],
    'xgbregressor__reg_lambda': [0.01],#[0, 0.001, 0.005, 0.01, 0.05],
    'xgbregressor__scale_pos_weight': [0],#[1, 2, 3, 4, 5],

    # Ajoutez d'autres hyperparamètres à tester selon vos besoins
}

# Création du pipeline
pipe_xgb = make_pipeline(preprocessing, xgb_model)

# Recherche par grille des meilleurs hyperparamètres
grid_search = GridSearchCV(pipe_xgb, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Affichage des meilleurs hyperparamètres
print("Meilleurs hyperparamètres trouvés :")
print(grid_search.best_params_)

# Prédiction sur l'ensemble de test avec les meilleurs paramètres
pred_test = grid_search.predict(X_test)

# Calcul des métriques d'évaluation
score = grid_search.score(X_test, y_test)
r2 = r2_score(y_test, pred_test)
mse = mean_squared_error(y_test, pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred_test)

print("Score :", score)
print("Score R2 :", r2)
print("Score MSE :", mse)
print("Score RMSE", rmse)
print("Score MAE :", mae)

Meilleurs hyperparamètres trouvés :
{'xgbregressor__colsample_bytree': 0.9, 'xgbregressor__gamma': 0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 0.5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_alpha': 0.05, 'xgbregressor__reg_lambda': 0.01, 'xgbregressor__scale_pos_weight': 0, 'xgbregressor__subsample': 1.0}
Score : -76483406297.7184
Score R2 : 0.5650921095966052
Score MSE : 76483406297.7184
Score RMSE 276556.33476331434
Score MAE : 157417.90915761524


In [132]:
display(X)

Unnamed: 0,budget,franchise,genre,remake,studio,is_compositeur,annee,origine
0,130000000,1,Fantasy,0,Warner Bros.,1,2004,Etats-Unis
1,150000000,1,Fantasy,0,Warner Bros.,0,2007,Etats-Unis
2,27800000,0,Comédie,0,Pathé,0,2018,France
3,225000000,1,Aventure - Action,0,Walt Disney Pictures,1,2006,Etats-Unis
4,100000000,1,Fantasy,0,Warner Bros.,1,2002,Etats-Unis
...,...,...,...,...,...,...,...,...
4530,74982618,1,Comédie,0,Pathé,0,2008,France
4531,50300000,1,Comédie,0,Pathé,0,2002,France
4532,260000000,0,Aventure - Action,0,Walt Disney Pictures,1,2019,Etats-Unis
4533,125000000,1,Fantasy,0,Warner Bros.,1,2011,Etats-Unis


## Pipeline modele Catboost

In [138]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
# from category_encoders import CountFrequencyEncoder

# Vos données X et y
X = data.drop("entrees_premiere_semaine", axis=1)
y = data.entrees_premiere_semaine
cols_drop = ["acteurs", "compositeur", "date", "pays", "producteur", "realisateur", "titre"]
X = X.drop(cols_drop, axis=1)

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42)

# Définition des colonnes catégorielles, numériques et année
object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["annee"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
year_col = ["annee"]

# Définition du prétraitement des données
unique_years = sorted(data["annee"].unique())
preprocessing = ColumnTransformer([
        ("onehot", OneHotEncoder(), cat_cols),
        ("frequency", CountFrequencyEncoder(encoding_method="frequency", missing_values="ignore"), object_cols),
        ("scaler", StandardScaler(), num_cols),
        ("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=2000), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Création du pipeline
pipe_cb = make_pipeline(preprocessing, CatBoostRegressor(one_hot_max_size=70, verbose=0, random_state=42))

# Définition de la grille d'hyperparamètres à rechercher
# {'catboostregressor__depth': 7, 'catboostregressor__l2_leaf_reg': 0.6, 'catboostregressor__learning_rate': 0.055, 
#  'catboostregressor__n_estimators': 154, 'catboostregressor__random_strength': 0.31}

param_grid = {
    "catboostregressor__learning_rate": [0.045, 0.05, 0.055],
    "catboostregressor__depth": [6.5, 7, 7.5],
    "catboostregressor__l2_leaf_reg": [0.5, 0.5, 0.6],
    "catboostregressor__n_estimators": [145, 149, 154],
    "catboostregressor__random_strength": [0.29, 0.30, 0.31],
}

# Recherche des meilleurs hyperparamètres avec validation croisée
grid_search = GridSearchCV(pipe_cb, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Affichage des meilleurs hyperparamètres et de la performance
print("Meilleurs hyperparamètres trouvés:")
print(grid_search.best_params_)
print("Meilleur score sur les données d'entraînement:", grid_search.best_score_)

# Prédiction sur les données de test avec les meilleurs paramètres
pred_test = grid_search.best_estimator_.predict(X_test)

# Calcul des métriques de performance
score = grid_search.best_estimator_.score(X_test, y_test)
r2 = r2_score(y_test, pred_test)
mse = mean_squared_error(y_test, pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred_test)

print("Score :", score)
print("Score R2 :", r2)
print("Score MSE :", mse)
print("Score RMSE", rmse)
print("Score MAE :", mae)


810 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **l

Meilleurs hyperparamètres trouvés:
{'catboostregressor__depth': 7, 'catboostregressor__l2_leaf_reg': 0.6, 'catboostregressor__learning_rate': 0.055, 'catboostregressor__n_estimators': 154, 'catboostregressor__random_strength': 0.31}
Meilleur score sur les données d'entraînement: -81314361268.16324
Score : 0.5635199796011344
Score R2 : 0.5635199796011344
Score MSE : 76759882903.10925
Score RMSE 277055.7397043224
Score MAE : 156103.25067787166




## Pipeline XGBoost par pays: Etats-Unis, France, other

In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from category_encoders.count import CountEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb

X = data.drop("entrees_premiere_semaine", axis=1)
y = data.entrees_premiere_semaine

# cols_drop = ["acteurs", "compositeur", "date", "pays", "producteur", "realisateur", "titre"]
# Liste des colonnes à garder
cols_keep = ["budget", "franchise", "genre", "remake", "studio", "is_compositeur", "annee", "origine"]
X = X[cols_keep]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42)

object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["annee"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
year_col = ["annee"]

# Ordinal encoding for ApprovalFY
unique_years = sorted(data["annee"].unique())
preprocessing = ColumnTransformer([
        ("onehot", OneHotEncoder(), cat_cols),
        ("frequency", CountEncoder(), object_cols),
        ("scaler", StandardScaler(), num_cols),
        ("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=2000), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

xgb_model = xgb.XGBRegressor(random_state=42)

# Définition de la grille des hyperparamètres à tester
# {'xgbregressor__colsample_bytree': 0.9, 'xgbregressor__gamma': 0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 
#'xgbregressor__min_child_weight': 0.5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_alpha': 0.05, 'xgbregressor__reg_lambda': 0.01,
# 'xgbregressor__scale_pos_weight': 1, 'xgbregressor__subsample': 1.0}

param_grid = {
    'xgbregressor__n_estimators': [100],#[50, 100, 150],
    'xgbregressor__learning_rate': [0.1],#[0.5, 0.1, 1.5],
    'xgbregressor__max_depth': [4],#[3.4, 4, 4.5],
    'xgbregressor__min_child_weight': [0.5],#[0.5, 1, 2],
    'xgbregressor__gamma': [0],#[0.0, 0.01, 0.02],
    'xgbregressor__subsample': [1.0],#[0.8, 0.9, 1.0],
    'xgbregressor__colsample_bytree': [0.9],#[0.8, 0.9, 1.0],
    'xgbregressor__reg_alpha': [0.05],#[0, 0.001, 0.005, 0.01, 0.05],
    'xgbregressor__reg_lambda': [0.01],#[0, 0.001, 0.005, 0.01, 0.05],
    'xgbregressor__scale_pos_weight': [0],#[1, 2, 3, 4, 5],

    # Ajoutez d'autres hyperparamètres à tester selon vos besoins
}

# Création du pipeline
pipe_xgb = make_pipeline(preprocessing, xgb_model)

# Séparation des données par origine
X_usa = X_train[X_train['origine'] == 'Etats-Unis']
y_usa = y_train[X_train['origine'] == 'Etats-Unis']
X_france = X_train[X_train['origine'] == 'France']
y_france = y_train[X_train['origine'] == 'France']
X_other = X_train[(X_train['origine'] != 'Etats-Unis') & (X_train['origine'] != 'France')]
y_other = y_train[(X_train['origine'] != 'Etats-Unis') & (X_train['origine'] != 'France')]

# Entraînement du modèle pour les films originaires des États-Unis
grid_search_usa = GridSearchCV(pipe_xgb, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_usa.fit(X_usa, y_usa)
print("Meilleurs hyperparamètres pour les films originaires des États-Unis:")
print(grid_search_usa.best_params_)

# Entraînement du modèle pour les films originaires de France
grid_search_france = GridSearchCV(pipe_xgb, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_france.fit(X_france, y_france)
print("Meilleurs hyperparamètres pour les films originaires de France:")
print(grid_search_france.best_params_)

# Entraînement du modèle pour les films originaires d'autres pays
grid_search_other = GridSearchCV(pipe_xgb, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_other.fit(X_other, y_other)
print("Meilleurs hyperparamètres pour les films originaires d'autres pays:")
print(grid_search_other.best_params_)

# Prédictions pour les films originaires des États-Unis
pred_usa = grid_search_usa.predict(X_test[X_test['origine'] == 'Etats-Unis'])

# Prédictions pour les films originaires de France
pred_france = grid_search_france.predict(X_test[X_test['origine'] == 'France'])

# Prédictions pour les films originaires d'autres pays
pred_other = grid_search_other.predict(X_test[(X_test['origine'] != 'Etats-Unis') & (X_test['origine'] != 'France')])

print(" ")
# Calcul des métriques pour chaque groupe
mse_usa = mean_squared_error(y_test[X_test['origine'] == 'Etats-Unis'], pred_usa)
r2_usa = r2_score(y_test[X_test['origine'] == 'Etats-Unis'], pred_usa)
print("Score pour les films originaires des États-Unis :", r2_usa)
print("Score RMSE pour les films originaires des États-Unis :", np.sqrt(mse_usa))

mse_france = mean_squared_error(y_test[X_test['origine'] == 'France'], pred_france)
r2_fr = r2_score(y_test[X_test['origine'] == 'France'], pred_france)
print("Score pour les films originaires de France :", r2_fr)
print("Score RMSE pour les films originaires de France :", np.sqrt(mse_france))

mse_other = mean_squared_error(y_test[(X_test['origine'] != 'Etats-Unis') & (X_test['origine'] != 'France')], pred_other)
r2_other = r2_score(y_test[(X_test['origine'] != 'Etats-Unis') & (X_test['origine'] != 'France')], pred_other)
print("Score pour les films originaires d'autres pays :", r2_other)
print("Score RMSE pour les films originaires d'autres pays :", np.sqrt(mse_other))

Meilleurs hyperparamètres pour les films originaires des États-Unis:
{'xgbregressor__colsample_bytree': 0.9, 'xgbregressor__gamma': 0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 0.5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_alpha': 0.05, 'xgbregressor__reg_lambda': 0.01, 'xgbregressor__scale_pos_weight': 0, 'xgbregressor__subsample': 1.0}


Traceback (most recent call last):
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
  File "/Users/cyriljulliard/simplon/Movie_Popularity_Prediction/env/lib/python3.10/site-packages/sklearn/utils/_response.py", line 238, in _get

Meilleurs hyperparamètres pour les films originaires de France:
{'xgbregressor__colsample_bytree': 0.9, 'xgbregressor__gamma': 0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 0.5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_alpha': 0.05, 'xgbregressor__reg_lambda': 0.01, 'xgbregressor__scale_pos_weight': 0, 'xgbregressor__subsample': 1.0}
Meilleurs hyperparamètres pour les films originaires d'autres pays:
{'xgbregressor__colsample_bytree': 0.9, 'xgbregressor__gamma': 0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 0.5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_alpha': 0.05, 'xgbregressor__reg_lambda': 0.01, 'xgbregressor__scale_pos_weight': 0, 'xgbregressor__subsample': 1.0}
 
Score pour les films originaires des États-Unis : 0.5560266162947329
Score RMSE pour les films originaires des États-Unis : 280711.0643700005
Score pour les films originaires de Fran