In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, root_mean_squared_error, mean_absolute_error
import plotly.express as px
import plotly.graph_objects as go
import optuna
import xgboost as xgb
from catboost import CatBoostRegressor

  from .autonotebook import tqdm as notebook_tqdm


## Carga de datos procesados

In [2]:
df = pd.read_csv("../data/processed/df_model.csv")

In [3]:
df.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,Action,Adventure,Animation,Comedy,...,Germany,Other_country,United Kingdom,United States of America,year,month,day,weekday,day_of_year,gdp
0,237000000,150.437577,2787965087,162.0,7.2,11800,1,1,0,0,...,0,0,1,1,2009,12,10,3,344,14478067
1,300000000,139.082615,961000000,169.0,6.9,4500,1,1,0,0,...,0,0,0,1,2007,5,19,5,139,14474228
2,245000000,107.376788,880674609,148.0,6.3,4466,1,1,0,0,...,0,0,1,1,2015,10,26,0,299,18295019
3,250000000,112.31295,1084939099,165.0,7.6,9106,1,0,0,0,...,0,0,0,1,2012,7,16,0,198,16253970
4,260000000,43.926995,284139100,132.0,6.1,2124,1,1,0,0,...,0,0,0,1,2012,3,7,2,67,16253970


## Modelos

### 1. RandomForest con todas las variables

In [38]:
target = "revenue"
df_X = df.drop(columns=target)
df_y = df[[target]]

In [39]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [40]:
# Initialize the RandomForestRegressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_test)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [41]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"mse: {mse}")
print(f"r2: {r2}")
print(f"mape: {mape}")

mse: 1.560167791023581e+16
r2: 0.7163564310238448
mape: 8.911110962183297


In [42]:
test_preds_df = pd.DataFrame({'y_real': y_test['revenue'], 'y_pred': y_pred})
test_preds_df['mape'] = abs(test_preds_df['y_real']-test_preds_df['y_pred'])/test_preds_df['y_real']*100
test_preds_df

Unnamed: 0,y_real,y_pred,mape
548,170128460,6.838939e+07,59.801320
704,201634991,1.130789e+08,43.919033
244,384335608,2.590680e+08,32.593299
552,74597643,5.331019e+07,28.536366
1164,15992615,4.727641e+07,195.614013
...,...,...,...
808,131282949,1.583716e+08,20.633817
88,938212738,6.930777e+08,26.127871
1144,35564473,2.555845e+07,28.134872
298,219417255,1.906093e+08,13.129305


In [43]:
# Extract feature importances
importances = rf_regressor.feature_importances_

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [None]:
rf_regressor.fit(df_X, df_y)

# Predict on the test set
y_pred = rf_regressor.predict(df_X)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [None]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(df_X[['year', 'month', 'day']]).reset_index(drop=True)

In [None]:
df_y['date'] = pd.to_datetime(df_X[['year', 'month', 'day']]).reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = df_y.groupby(df_y["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)

Depende demasiado de variables de votos que en la vida real no se tendrían hasta ya sacada la película

### 2. Random Forest quitando columnas de votos

In [51]:
df.columns

Index(['budget', 'popularity', 'revenue', 'runtime', 'vote_average',
       'vote_count', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science_Fiction', 'Thriller',
       'War', 'Western', 'Other_language', 'en_language', 'es_language',
       'fr_language', 'ja_language', 'zh_language', 'Columbia Pictures',
       'Dune Entertainment', 'New Line Cinema', 'Other_company',
       'Paramount Pictures', 'Relativity Media',
       'Twentieth Century Fox Film Corporation', 'Universal Pictures',
       'Village Roadshow Pictures', 'Walt Disney Pictures', 'Warner Bros.',
       'Canada', 'France', 'Germany', 'Other_country', 'United Kingdom',
       'United States of America', 'year', 'month', 'day', 'weekday',
       'day_of_year', 'gdp'],
      dtype='object')

In [54]:
target = "revenue"
df_X = df.drop(columns=[target, "vote_average",'vote_count'])
df_y = df[[target]]

In [64]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=42)

In [65]:
# Initialize the RandomForestRegressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_test)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [66]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"mse: {mse}")
print(f"r2: {r2}")
print(f"mape: {mape}")

mse: 1.8092837474587284e+16
r2: 0.6492481525817935
mape: 12.41894695648598


In [67]:
test_preds_df = pd.DataFrame({'y_real': y_test['revenue'], 'y_pred': y_pred})
test_preds_df['mape'] = abs(test_preds_df['y_real']-test_preds_df['y_pred'])/test_preds_df['y_real']*100
test_preds_df

Unnamed: 0,y_real,y_pred,mape
548,170128460,8.043431e+07,52.721426
704,201634991,1.258930e+08,37.563906
244,384335608,2.256567e+08,41.286548
552,74597643,6.319412e+07,15.286707
1164,15992615,5.217624e+07,226.252081
...,...,...,...
906,75700498,5.999040e+07,20.752970
273,294804195,3.861980e+08,31.001531
427,361366633,2.468854e+08,31.680067
362,56681566,1.208190e+08,113.153885


In [68]:
# Extract feature importances
importances = rf_regressor.feature_importances_*100

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [69]:
rf_regressor.fit(df_X, df_y)

# Predict on the test set
y_pred = rf_regressor.predict(df_X)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [70]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)

In [71]:
df_y['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [72]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = df_y.groupby(df_y["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)

### 3. Random Forest quitando columnas de votos y con optimización de hiperparámetros

In [81]:
target = "revenue"
df_X = df.drop(columns=[target, "vote_average",'vote_count'])
df_y = df[[target]]

In [82]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=42)

In [73]:
def objective(trial):
    # Definir el espacio de búsqueda de hiperparámetros
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 5, 30)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

    # Inicializar el modelo con los hiperparámetros seleccionados
    rf_regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )
    
    # Dividir los datos
    X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    
    # Entrenar el modelo
    rf_regressor.fit(X_train_split, y_train_split)
    
    # Hacer predicciones en el conjunto de validación
    y_pred = rf_regressor.predict(X_valid_split)
    
    # Calcular el error cuadrático medio
    mse = mean_squared_error(y_valid_split, y_pred)
    return mse

# Crear el estudio de Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Mostrar los mejores hiperparámetros
print("Best parameters:", study.best_params)
print("Best MSE:", study.best_value)



[I 2024-11-19 19:21:28,653] A new study created in memory with name: no-name-c544d8a8-c6e9-42f2-a52f-65759aafc359

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

[I 2024-11-19 19:21:28,918] Trial 0 finished with value: 1.7708406674195636e+16 and parameters: {'n_estimators': 177, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 1.7708406674195636e+16.

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

[I 2024-11-19 19:21:29,236] Trial 1 finished with value: 1.867547080985515e+16 and parameters: {'n_estimators': 252, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 1.7708406674195636e+16.

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_sampl

Best parameters: {'n_estimators': 220, 'max_depth': 23, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best MSE: 1.66055397009914e+16


In [74]:
# Entrenar el modelo final con los mejores hiperparámetros
best_params = study.best_params
rf_best = RandomForestRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    max_features=best_params["max_features"],
    random_state=42
)

rf_best.fit(X_train, y_train)
y_pred = rf_best.predict(X_test)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [75]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"mse: {mse}")
print(f"r2: {r2}")
print(f"mape: {mape}")

mse: 1.5861348309927924e+16
r2: 0.6925083072201877
mape: 27.100276782519497


In [76]:
test_preds_df = pd.DataFrame({'y_real': y_test['revenue'], 'y_pred': y_pred})
test_preds_df['mape'] = abs(test_preds_df['y_real']-test_preds_df['y_pred'])/test_preds_df['y_real']*100
test_preds_df

Unnamed: 0,y_real,y_pred,mape
548,170128460,5.906062e+07,65.284691
704,201634991,1.453280e+08,27.925221
244,384335608,2.148293e+08,44.103722
552,74597643,1.633782e+08,119.012596
1164,15992615,3.698187e+07,131.243393
...,...,...,...
906,75700498,6.481959e+07,14.373633
273,294804195,2.517075e+08,14.618749
427,361366633,3.015515e+08,16.552472
362,56681566,1.444819e+08,154.901054


In [77]:
# Extract feature importances
importances = rf_best.feature_importances_*100

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [83]:
rf_best.fit(df_X, df_y)

# Predict on the test set
y_pred = rf_best.predict(df_X)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [84]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)

In [85]:
df_y['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [86]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = df_y.groupby(df_y["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)

### 4. xgboost tuning

In [87]:
target = "revenue"
df_X = df.drop(columns=[target, "vote_average",'vote_count', "day"])
df_y = df[[target]]

In [88]:
X_train, X_test_val, y_train, y_test_val = train_test_split(df_X, df_y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)
print("X_train: ",X_train.shape, ", y_train: ",y_train.shape)
print("X_test: ",X_test.shape, ", y_test: ",y_test.shape)
print("X_val: ",X_val.shape, ", y_val: ",y_val.shape)

X_train:  (861, 50) , y_train:  (861, 1)
X_test:  (184, 50) , y_test:  (184, 1)
X_val:  (185, 50) , y_val:  (185, 1)


XGBoost model with parameter tuning:

In [89]:
# XGBoost
def objective(trial,data,target,data_val,target_val):
    
    param = {
        # #'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        # 'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        # 'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        # 'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # 'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        # 'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        # 'n_estimators': 10000,
        # 'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        # 'random_state': trial.suggest_categorical('random_state', [2020]),
        # 'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),

        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "objective": "reg:squarederror",
        "n_estimators": 10000,
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
        "seed": 124,
        "n_jobs": 8,
    }
    model = xgb.XGBRegressor(**param)
    
    model.fit(data,target,eval_set=[(data_val,target_val)],verbose=False)
    
    preds = model.predict(data_val)
    
    rmse = root_mean_squared_error(target_val, preds)
    
    return rmse

In [90]:
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial,X_train,y_train,X_val,y_val), n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-11-19 19:23:26,544] A new study created in memory with name: no-name-2b5b8081-87af-4f38-9d89-c25b4e0425a3

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optu

Number of finished trials: 30
Best trial: {'max_depth': 11, 'learning_rate': 0.00658569112378145, 'colsample_bytree': 0.23040102456303838, 'subsample': 0.7484996737939773, 'alpha': 1.1865070295407734, 'lambda': 4.2087852501025315e-06, 'min_child_weight': 47.05523698454085}


In [91]:
param = study.best_trial.params

model_xgb = xgb.XGBRegressor(**param)
model_xgb.fit(X_train, y_train)
pred_xgb = model_xgb.predict(X_test)
print('XGB MAE:', mean_absolute_error(y_test, pred_xgb))
print('XGB MAPE:', mean_absolute_percentage_error(y_test, pred_xgb))
print('XGB RMSE:', root_mean_squared_error(y_test, pred_xgb))

XGB MAE: 133860519.31521739
XGB MAPE: 96.15955318025672
XGB RMSE: 199571785.84802416


### 5. Catboost

In [8]:
target = "revenue"
df_X = df.drop(columns=[target, "vote_average",'vote_count'])
df_y = df[[target]]

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [6]:
# Initialize the RandomForestRegressor model
rf_regressor = CatBoostRegressor(random_state=42, verbose=False)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_test)

In [7]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"mse: {mse}")
print(f"r2: {r2}")
print(f"mape: {mape}")
print(f"mae: {mae}")

mse: 1.5567157152287674e+16
r2: 0.7169840296093629
mape: 6.6002033535997775
mae: 77057838.01880844


In [30]:
test_preds_df = pd.DataFrame({'y_real': y_test['revenue'], 'y_pred': y_pred})
test_preds_df['mape'] = abs(test_preds_df['y_real']-test_preds_df['y_pred'])/test_preds_df['y_real']*100
test_preds_df

Unnamed: 0,y_real,y_pred,mape
548,170128460,4.651892e+07,72.656590
704,201634991,1.141011e+08,43.412052
244,384335608,2.523849e+08,34.332160
552,74597643,7.151566e+07,4.131470
1164,15992615,2.671366e+07,67.037460
...,...,...,...
808,131282949,1.420385e+08,8.192633
88,938212738,6.200312e+08,33.913585
1144,35564473,3.123450e+07,12.175003
298,219417255,2.658917e+08,21.180872


In [31]:
# Extract feature importances
importances = rf_regressor.feature_importances_

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [32]:
rf_regressor.fit(df_X, df_y)

# Predict on the test set
y_pred = rf_regressor.predict(df_X)

In [36]:
# Extract feature importances
importances = rf_regressor.feature_importances_

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [33]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)

In [34]:
df_y['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [35]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = df_y.groupby(df_y["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)

### 6. Catboost tuning

In [14]:
target = "revenue"
df_X = df.drop(columns=[target, "vote_average",'vote_count'])
df_y = df[[target]]

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=42)

In [16]:
# Función objetivo para Optuna
def objective(trial):
    # Definir el espacio de búsqueda de hiperparámetros
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }
    
    # Dividir los datos en entrenamiento y validación
    X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    
    # Inicializar el modelo
    model = CatBoostRegressor(**params, random_state=42, verbose=False)
    
    # Entrenar el modelo
    model.fit(X_train_split, y_train_split)
    
    # Realizar predicciones en el conjunto de validación
    y_pred = model.predict(X_valid_split)
    
    # Calcular el error cuadrático medio
    mse = mean_squared_error(y_valid_split, y_pred)
    return mse

# Crear el estudio de Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Mostrar los mejores hiperparámetros
print("Best parameters:", study.best_params)
print("Best MSE:", study.best_value)

# Entrenar el modelo final con los mejores hiperparámetros
best_params = study.best_params
final_model = CatBoostRegressor(**best_params, random_state=42, verbose=False)

final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

[I 2024-11-19 19:34:02,140] A new study created in memory with name: no-name-e349ea0e-00f1-4bbe-88de-5f4edca46a19
[I 2024-11-19 19:34:08,166] Trial 0 finished with value: 2.5648061064873332e+16 and parameters: {'iterations': 970, 'depth': 7, 'learning_rate': 0.001152480312371461, 'l2_leaf_reg': 6.231090998151867, 'bagging_temperature': 0.6244750135268542, 'random_strength': 9.601970578529249, 'border_count': 90}. Best is trial 0 with value: 2.5648061064873332e+16.
[I 2024-11-19 19:34:17,656] Trial 1 finished with value: 1.7616202870997768e+16 and parameters: {'iterations': 660, 'depth': 9, 'learning_rate': 0.0037643248969995487, 'l2_leaf_reg': 2.8288831118477664, 'bagging_temperature': 0.19601873464043817, 'random_strength': 9.968052797716759, 'border_count': 160}. Best is trial 1 with value: 1.7616202870997768e+16.
[I 2024-11-19 19:34:36,366] Trial 2 finished with value: 1.79865548982126e+16 and parameters: {'iterations': 938, 'depth': 10, 'learning_rate': 0.011389651178916622, 'l2_le

Best parameters: {'iterations': 243, 'depth': 7, 'learning_rate': 0.2484312139335508, 'l2_leaf_reg': 7.7500972509738, 'bagging_temperature': 0.9313586337372671, 'random_strength': 8.86255955916866, 'border_count': 212}
Best MSE: 1.5954500368498484e+16


In [17]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"mse: {mse}")
print(f"r2: {r2}")
print(f"mape: {mape}")

mse: 1.6508878687959422e+16
r2: 0.6799551365705885
mape: 23.514817358757295


In [18]:
test_preds_df = pd.DataFrame({'y_real': y_test['revenue'], 'y_pred': y_pred})
test_preds_df['mape'] = abs(test_preds_df['y_real']-test_preds_df['y_pred'])/test_preds_df['y_real']*100
test_preds_df

Unnamed: 0,y_real,y_pred,mape
548,170128460,2.258799e+07,86.722978
704,201634991,1.254862e+08,37.765675
244,384335608,2.708084e+08,29.538554
552,74597643,2.481653e+07,66.732823
1164,15992615,4.808604e+07,200.676520
...,...,...,...
906,75700498,4.402521e+07,41.842905
273,294804195,3.469122e+08,17.675475
427,361366633,2.416133e+08,33.139004
362,56681566,1.648544e+08,190.843071


In [19]:
# Extract feature importances
importances = rf_regressor.feature_importances_

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [20]:
rf_regressor.fit(df_X, df_y)

# Predict on the test set
y_pred = rf_regressor.predict(df_X)

In [21]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)

In [22]:
df_y['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = df_y.groupby(df_y["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)