In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, root_mean_squared_error, mean_absolute_error
import plotly.express as px
import plotly.graph_objects as go
import optuna
import xgboost as xgb

In [18]:
df = pd.read_csv("../data/processed/df_model.csv")

In [19]:
df.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,Action,Adventure,Animation,Comedy,...,Germany,Other_country,United Kingdom,United States of America,year,month,day,weekday,day_of_year,real_gdp
0,237000000,150.437577,2787965087,162.0,7.2,11800,1,1,0,0,...,0,0,1,1,2009,12,10,3,344,16349100
1,300000000,139.082615,961000000,169.0,6.9,4500,1,1,0,0,...,0,0,0,1,2007,5,19,5,139,16762400
2,245000000,107.376788,880674609,148.0,6.3,4466,1,1,0,0,...,0,0,1,1,2015,10,26,0,299,18799600
3,250000000,112.31295,1084939099,165.0,7.6,9106,1,0,0,0,...,0,0,0,1,2012,7,16,0,198,17442800
4,260000000,43.926995,284139100,132.0,6.1,2124,1,1,0,0,...,0,0,0,1,2012,3,7,2,67,17442800


In [75]:
target = "revenue"
df_X = df.drop(columns=target)
df_y = df[[target]]

In [76]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [77]:
# Initialize the RandomForestRegressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_test)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [80]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"mse: {mse}")
print(f"r2: {r2}")
print(f"mape: {mape}")

mse: 1.5571457990414312e+16
r2: 0.7169058389760959
mape: 9.416478607792484


In [93]:
df_X[df_X.index==1074]

Unnamed: 0,budget,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Comedy,Crime,...,Germany,Other_country,United Kingdom,United States of America,year,month,day,weekday,day_of_year,real_gdp
1074,6000000,34.917447,108.0,7.2,837,1,0,0,0,1,...,0,0,0,1,1999,1,22,4,22,13543800


In [79]:
test_preds_df = pd.DataFrame({'y_real': y_test['revenue'], 'y_pred': y_pred})
test_preds_df['mape'] = abs(test_preds_df['y_real']-test_preds_df['y_pred'])/test_preds_df['y_real']*100
test_preds_df

Unnamed: 0,y_real,y_pred,mape
548,170128460,6.489577e+07,61.854844
704,201634991,1.135467e+08,43.687009
244,384335608,2.533689e+08,34.076139
552,74597643,5.169058e+07,30.707483
1164,15992615,4.932078e+07,208.397217
...,...,...,...
808,131282949,1.600983e+08,21.949035
88,938212738,6.848676e+08,27.002957
1144,35564473,2.950633e+07,17.034251
298,219417255,1.844142e+08,15.952752


In [29]:
# Extract feature importances
importances = rf_regressor.feature_importances_

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [13]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(X_test[['year', 'month', 'day']]).reset_index(drop=True)

In [15]:
y_test['date'] = pd.to_datetime(X_test[['year', 'month', 'day']]).reset_index(drop=True)

In [16]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = y_test.groupby(y_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)

In [8]:
rf_regressor.fit(df_X, df_y)

# Predict on the test set
y_pred = rf_regressor.predict(df_X)

  return fit_method(estimator, *args, **kwargs)


In [9]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(df_X[['year', 'month', 'day']]).reset_index(drop=True)

In [10]:
df_y['date'] = pd.to_datetime(df_X[['year', 'month', 'day']]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_y['date'] = pd.to_datetime(df_X[['year', 'month', 'day']]).reset_index(drop=True)


In [11]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = df_y.groupby(df_y["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)

#### Quitando columnas de votos, y algunas de fechas

In [120]:
df.columns

Index(['budget', 'popularity', 'revenue', 'runtime', 'vote_average',
       'vote_count', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science_Fiction', 'Thriller',
       'War', 'Western', 'Other_language', 'en_language', 'es_language',
       'fr_language', 'ja_language', 'zh_language', 'Columbia Pictures',
       'Dune Entertainment', 'New Line Cinema', 'Other_company',
       'Paramount Pictures', 'Relativity Media',
       'Twentieth Century Fox Film Corporation', 'Universal Pictures',
       'Village Roadshow Pictures', 'Walt Disney Pictures', 'Warner Bros.',
       'Canada', 'France', 'Germany', 'Other_country', 'United Kingdom',
       'United States of America', 'year', 'month', 'day', 'weekday',
       'day_of_year', 'real_gdp'],
      dtype='object')

In [101]:
target = "revenue"
df_X = df.drop(columns=[target, "vote_average",'vote_count', "day"])
df_y = df[[target]]

In [102]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [103]:
# Initialize the RandomForestRegressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_test)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [104]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"mse: {mse}")
print(f"r2: {r2}")
print(f"mape: {mape}")

mse: 1.8592924999891696e+16
r2: 0.6619745879245914
mape: 12.345118966662602


In [105]:
test_preds_df = pd.DataFrame({'y_real': y_test['revenue'], 'y_pred': y_pred})
test_preds_df['mape'] = abs(test_preds_df['y_real']-test_preds_df['y_pred'])/test_preds_df['y_real']*100
test_preds_df

Unnamed: 0,y_real,y_pred,mape
548,170128460,6.562693e+07,61.425074
704,201634991,1.584979e+08,21.393659
244,384335608,2.670424e+08,30.518435
552,74597643,5.273621e+07,29.305802
1164,15992615,4.164977e+07,160.431270
...,...,...,...
808,131282949,1.515745e+08,15.456370
88,938212738,6.292818e+08,32.927601
1144,35564473,2.646171e+07,25.595099
298,219417255,2.048990e+08,6.616730


In [106]:
# Extract feature importances
importances = rf_regressor.feature_importances_*100

# Create a DataFrame for easy visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = go.Figure(go.Bar(
    x=importance_df['Feature'],
    y=importance_df['Importance'],
    text=importance_df['Importance'].round(3),  # Show importance values on bars
    textposition='auto'  # Display text directly on top of bars
))

# Customize layout
fig.update_layout(
    title="Feature Importances from Random Forest",
    xaxis_title="Feature",
    yaxis_title="Importance",
    template="plotly_white",
)

# Show plot
fig.show()

In [49]:
rf_regressor.fit(df_X, df_y)

# Predict on the test set
y_pred = rf_regressor.predict(df_X)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [51]:
df_pred_test = pd.DataFrame(y_pred)
df_pred_test.columns = ["revenue"]
df_pred_test['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)

In [52]:
df_y['date'] = pd.to_datetime(df[['year', 'month', 'day']]).reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [53]:
df_aux = df_pred_test.groupby(df_pred_test["date"].dt.year)['revenue'].mean().reset_index()[:-1]
df_aux2 = df_y.groupby(df_y["date"].dt.year)['revenue'].mean().reset_index()[:-1]


fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_aux['date'], y=df_aux['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='steelblue'),
    name = 'Revenue'
))
fig.add_trace(go.Scatter(
    x=df_aux2['date'], y=df_aux2['revenue'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5, color='firebrick'),
    name = 'Revenue real'
))

fig.update_layout(
    title = 'Evolución Temporal',
    xaxis_title = 'Año',
    yaxis_title = 'Ventas (Millones)',
    legend=dict(
        bgcolor = 'white',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01),
    font=dict(size=8),
    # margin=dict(
    #     l=0,
    #     r=0,
    #     b=0,
    #     t=25,
    #     pad=4
    # )
)

#### xgboost tuning

In [None]:
target = "revenue"
df_X = df.drop(columns=[target, "vote_average",'vote_count', "day"])
df_y = df[[target]]

In [108]:
X_train, X_test_val, y_train, y_test_val = train_test_split(df_X, df_y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)
print("X_train: ",X_train.shape, ", y_train: ",y_train.shape)
print("X_test: ",X_test.shape, ", y_test: ",y_test.shape)
print("X_val: ",X_val.shape, ", y_val: ",y_val.shape)

X_train:  (861, 50) , y_train:  (861, 1)
X_test:  (184, 50) , y_test:  (184, 1)
X_val:  (185, 50) , y_val:  (185, 1)


XGBoost model with parameter tuning:

In [114]:
# XGBoost
def objective(trial,data,target,data_val,target_val):
    
    param = {
        # #'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        # 'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        # 'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        # 'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # 'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        # 'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        # 'n_estimators': 10000,
        # 'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        # 'random_state': trial.suggest_categorical('random_state', [2020]),
        # 'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),

        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "objective": "reg:squarederror",
        "n_estimators": 10000,
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
        "seed": 124,
        "n_jobs": 8,
    }
    model = xgb.XGBRegressor(**param)
    
    model.fit(data,target,eval_set=[(data_val,target_val)],verbose=False)
    
    preds = model.predict(data_val)
    
    rmse = root_mean_squared_error(target_val, preds)
    
    return rmse

In [115]:
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial,X_train,y_train,X_val,y_val), n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-11-16 17:06:47,116] A new study created in memory with name: no-name-00ed4dde-7e8c-4d6c-83a4-a4de8ff49225

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optu

Number of finished trials: 30
Best trial: {'max_depth': 5, 'learning_rate': 0.005253768412247795, 'colsample_bytree': 0.3756708375703581, 'subsample': 0.6379225675331539, 'alpha': 0.6249702499344099, 'lambda': 1.1948615973713107e-08, 'min_child_weight': 29.839185123016218}


In [116]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,150019900.0,2024-11-16 17:06:47.121413,2024-11-16 17:06:53.176154,0 days 00:00:06.054741,0.815096,0.512677,2.910896e-05,0.019023,7,45.343207,0.447488,COMPLETE
1,1,142908700.0,2024-11-16 17:06:53.178151,2024-11-16 17:07:04.636169,0 days 00:00:11.458018,0.096257,0.282567,2.027915e-08,0.014688,11,21.885608,0.498893,COMPLETE
2,2,166043100.0,2024-11-16 17:07:04.639169,2024-11-16 17:07:12.090681,0 days 00:00:07.451512,0.323854,0.206605,0.001310248,0.028421,10,184.990886,0.467533,COMPLETE
3,3,142533600.0,2024-11-16 17:07:12.093688,2024-11-16 17:07:23.441799,0 days 00:00:11.348111,2.771746,0.567418,0.03965429,0.012583,10,41.398423,0.741648,COMPLETE
4,4,214749100.0,2024-11-16 17:07:23.443799,2024-11-16 17:07:28.613533,0 days 00:00:05.169734,0.152862,0.227435,0.003319713,0.009257,12,914.617177,0.704942,COMPLETE
5,5,142995600.0,2024-11-16 17:07:28.614536,2024-11-16 17:07:40.271069,0 days 00:00:11.656533,0.110846,0.305387,0.0001777806,0.015629,11,23.974108,0.512023,COMPLETE
6,6,214749100.0,2024-11-16 17:07:40.274068,2024-11-16 17:07:44.969062,0 days 00:00:04.694994,0.038758,0.460282,0.4755777,0.006605,7,873.956847,0.732181,COMPLETE
7,7,214749100.0,2024-11-16 17:07:44.971061,2024-11-16 17:07:50.696401,0 days 00:00:05.725340,0.152764,0.403941,2.053767,0.021588,5,713.725051,0.554681,COMPLETE
8,8,147115100.0,2024-11-16 17:07:50.697401,2024-11-16 17:08:00.099605,0 days 00:00:09.402204,0.220656,0.361897,0.02606291,0.017117,12,20.477887,0.518222,COMPLETE
9,9,146759500.0,2024-11-16 17:08:00.101606,2024-11-16 17:08:08.031136,0 days 00:00:07.929530,0.010693,0.402043,5.006451e-07,0.022064,11,49.160421,0.523808,COMPLETE


In [119]:
# North American Sales Optimized Parameters:
param = study.best_trial.params

model_xgb = xgb.XGBRegressor(**param)
model_xgb.fit(X_train, y_train)
pred_xgb = model_xgb.predict(X_test)
print('XGB MAE:', mean_absolute_error(y_test, pred_xgb))
print('XGB MAPE:', mean_absolute_percentage_error(y_test, pred_xgb))
print('XGB RMSE:', root_mean_squared_error(y_test, pred_xgb))

XGB MAE: 131396231.85869566
XGB MAPE: 92.5839272228366
XGB RMSE: 195474978.15932322
