In [66]:
import pandas as pd
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.selection import DropFeatures

from catboost import CatBoostRegressor

# MODELE JP BOX OFFICE

## Importation du pickle

In [67]:
data = pd.read_pickle("datasets/dataset-jp.pkl")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4533 entries, 0 to 4534
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   acteurs                   4533 non-null   object        
 1   budget                    4533 non-null   int64         
 2   compositeur               655 non-null    object        
 3   date                      4533 non-null   datetime64[ns]
 4   entrees_premiere_semaine  4533 non-null   int64         
 5   franchise                 4533 non-null   category      
 6   genre                     4533 non-null   category      
 7   pays                      4533 non-null   object        
 8   producteur                585 non-null    object        
 9   realisateur               1926 non-null   object        
 10  remake                    4533 non-null   category      
 11  studio                    4533 non-null   object        
 12  titre                    

## Création des sets

In [68]:
X = data.drop("entrees_premiere_semaine", axis=1)
y = data.entrees_premiere_semaine

In [69]:
cols_drop = ["acteurs", "compositeur", "date", "pays", "producteur", "realisateur", "titre"]

X = X.drop(cols_drop, axis=1)

display(X.info())
display(X.head())

<class 'pandas.core.frame.DataFrame'>
Index: 4533 entries, 0 to 4534
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   budget          4533 non-null   int64   
 1   franchise       4533 non-null   category
 2   genre           4533 non-null   category
 3   remake          4533 non-null   category
 4   studio          4533 non-null   object  
 5   is_compositeur  4533 non-null   category
 6   annee           4533 non-null   category
 7   origine         4533 non-null   category
dtypes: category(6), int64(1), object(1)
memory usage: 133.4+ KB


None

Unnamed: 0,budget,franchise,genre,remake,studio,is_compositeur,annee,origine
0,130000000,1,Fantasy,0,Warner Bros.,1,2004,Etats-Unis
1,150000000,1,Fantasy,0,Warner Bros.,0,2007,Etats-Unis
2,27800000,0,Comédie,0,Pathé,0,2018,France
3,225000000,1,Aventure - Action,0,Walt Disney Pictures,1,2006,Etats-Unis
4,100000000,1,Fantasy,0,Warner Bros.,1,2002,Etats-Unis


In [70]:
X_train, y_train, X_test, y_test = train_test_split(X, y, shuffle=True, test_size=0.15, random_state=42, stratify=X["genre"])

## Preprocessing

In [71]:
object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["annee"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
year_col = ["annee"]

# Ordinal encoding for ApprovalFY
unique_years = sorted(data["annee"].unique())

In [72]:
preprocessing = ColumnTransformer([
        ("onehot", OneHotEncoder(), cat_cols),
        ("frequency", CountFrequencyEncoder(encoding_method="frequency", missing_values="ignore"), object_cols),
        ("scaler", StandardScaler(), num_cols),
        ("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=2000), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

### Obtention des indices de colonnes post processing pour les features catégorielles

In [73]:
pre_fit = preprocessing.fit(X_train)
fit_cols = pre_fit.get_feature_names_out()

cat_indices = []
for i, col_name in enumerate(fit_cols):
    if col_name in cat_cols:
        cat_indices.append(i)

## Pipeline modèle Catboost

In [74]:
catb = CatBoostRegressor(one_hot_max_size=70, verbose=0, cat_features=cat_indices, random_state=42)

pipeline_cb = make_pipeline(preprocessing, catb)

In [75]:
pipeline_cb.fit(X_train, y_train)

CatBoostError: Length of label=680 and length of data=3853 is different.

## Pipeline modèle Lasso

In [None]:
lasso_model = Lasso(alpha=1, random_state=42)

pipe_lasso = make_pipeline(preprocessing, lasso_model)

display(pipe_lasso)

In [None]:
pipe_lasso.fit(X_train, y_train)

ValueError: could not convert string to float: 'Science Fiction'

In [None]:
pipe_lasso.score(X_test, y_test)

## Pipeline modèle Linear Regression

In [64]:
lr_model = LinearRegression()

pipe_lr = make_pipeline(preprocessing, lr_model)

display(pipe_lr)

In [65]:
pipe_lr.fit(X_train, y_train)

ValueError: could not convert string to float: 'Science Fiction'