In [1]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

In [23]:
model_dir = 'models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# load and clean data
path = "input/dati_cinema_updated.csv"
df = pd.read_csv(path, encoding="UTF-8", sep=",", dayfirst=True, on_bad_lines="skip")

target = "total"  # si può modificare a piacimento con "full_price" o "reduced"

colonne_da_escludere = [
    "full_price", "reduced", "free", "total",       # biglietti
    "title", "date", "time", "datetime", "date_str"  # metadati
    #"cast", "director", "keywords", "genres"          # testo non gestito
]

df["weekend"] = df["weekend"].astype(int)

# Riempie eventuali NaN nei numerici
num_cols = ["temp_max", "temp_min", "precip_mm", "vote_average", "popularity"]
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

df.sample(3)

Unnamed: 0,date,time,title,full_price,reduced,free,total,datetime,giorno_settimana,mese,...,date_str,temp_max,temp_min,precip_mm,genres,keywords,cast,director,vote_average,popularity
190,2022-11-05,20:30,Black Adam,12.0,27.0,1.0,40.0,2022-11-05 20:30:00,sabato,11,...,2022-11-05,14.7,3.5,0.0,"['Action', 'Adventure', 'Science Fiction']","['lightning', 'superhero', 'anti hero', 'based...","['Dwayne Johnson', 'Aldis Hodge', 'Noah Centin...",Jaume Collet-Serra,6.857,11.1397
302,2023-05-21,20:30,Il Sol dell'Avvenire,18.0,16.0,0.0,34.0,2023-05-21 20:30:00,domenica,5,...,2023-05-21,23.7,12.6,1.6,['Comedy'],"['melancholy', 'filmmaking', 'critical']","['Nanni Moretti', 'Margherita Buy', 'Silvio Or...",Nanni Moretti,7.0,0.6298
85,2020-01-26,20:30,Pinocchio,25.0,28.0,0.0,53.0,2020-01-26 20:30:00,domenica,1,...,2020-01-26,9.3,1.5,0.0,"['Adventure', 'Drama', 'Fantasy']",['musical'],"['Bob Hoskins', 'Robbie Kay', 'Luciana Littizz...",Alberto Sironi,6.0,0.7359


In [24]:
# partiamo dal tuo X “grezzo”, cioè senza target né colonne da escludere
X = df.drop(columns=colonne_da_escludere + [target])

# 1️⃣ Categorie nominali
categorical_cols = ["fascia_oraria", "giorno_settimana", "stagione", "festività",'genres', 'keywords', 'cast', 'director']

# 2️⃣ Numeriche “pure”
# scartiamo eventuali colonne non numeriche o già encodate:
all_nums = X.select_dtypes(include="number").columns.tolist()
# fra queste, togliamo quelle che non vogliamo log-transform o imputare
numeric_cols = [
    c for c in all_nums
    if c not in ["weekend"]  # weekend è già 0/1, lo lasciamo tra numeriche ma senza log
]

# 3️⃣ (Facoltativo) Cicliche: mese, ora
# se vuoi trasformarle in sin/cos, considera di metterle in una lista a parte
cyclic_cols = ["mese", "ora", "giorno_settimana"]  # es. 0–6 per i giorni


In [26]:
# ------------------ 🧪 Log-transform delle feature sbilanciate ------------------
X = df.drop(columns=colonne_da_escludere + [target])

numeriche = X.select_dtypes(include=np.number).columns.tolist()
skewness = X[numeriche].skew().sort_values(ascending=False)
log_cols = skewness[skewness > 1].index.tolist()

print(f"\n📈 Variabili trasformate con log1p (skew > 1): {log_cols}")
X[log_cols] = X[log_cols].apply(np.log1p)

# Salva log-colonne trasformate per futura inference
joblib.dump(log_cols, os.path.join(model_dir,f"log_transformed_features_{target}.pkl"))

# ------------------ 🎯 Target (solo log su reduced)
if target == "reduced":
    y = np.log1p(df[target])
else:
    y = df[target]

# ------------------ 🔣 Categorical encoding
cat_cols = ["fascia_oraria", "giorno_settimana", "stagione", "festività",'genres', 'keywords', 'cast', 'director']
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# ------------------ 🔀 Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print("✔️ Preprocessing completato. Pronto per il modello!")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


📈 Variabili trasformate con log1p (skew > 1): ['popularity', 'precip_mm']
✔️ Preprocessing completato. Pronto per il modello!
Train shape: (482, 972), Test shape: (121, 972)


In [27]:
# ------------------ 📦 Modelli con RFE ------------------
modelli = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting (Tuned)": GradientBoostingRegressor(random_state=42),
    "CatBoost (Tuned)": CatBoostRegressor(silent=True, random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "Ridge Regression": Ridge(),
    "SVR": make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100, gamma='scale', epsilon=0.1))
}

results = []

In [28]:
print(X_train.info())
# or to see only the object columns:
print(X_train.select_dtypes(include=['object']).columns)

<class 'pandas.core.frame.DataFrame'>
Index: 482 entries, 9 to 102
Columns: 972 entries, mese to director_İlker Çatak
dtypes: bool(963), float64(5), int64(4)
memory usage: 490.9 KB
None
Index([], dtype='object')


In [29]:
for nome, modello in modelli.items():
    print(f"\n🔍 Selezione feature per: {nome}")

    if nome == "SVR":
        rfe = RFE(estimator=LinearSVR(max_iter=10000), n_features_to_select=8)
    else:
        rfe = RFE(estimator=modello, n_features_to_select=8)

    rfe.fit(X_train, y_train)
    selected_features = X_train.columns[rfe.support_].tolist()
    print(f"➡️ Features selezionate: {selected_features}")

    X_train_sel = X_train[selected_features]
    X_test_sel = X_test[selected_features]

    # ------------------ 🔧 Tuning + Salvataggio ------------------
    filename_base = nome.lower().replace(" ", "_").replace("(", "").replace(")", "")

    if nome == "SVR":
        modello.fit(X_train_sel, y_train)
        pred = modello.predict(X_test_sel)


    elif "CatBoost" in nome:
        param_grid = {
            'iterations': [200, 300, 500],
            'learning_rate': [0.01, 0.05, 0.1],
            'depth': [4, 6, 8],
            'l2_leaf_reg': [1, 3, 5, 7],
            'bagging_temperature': [0.0, 0.2, 0.5, 1.0]
        }
        search = RandomizedSearchCV(modello, param_grid, n_iter=20, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, random_state=42)
        search.fit(X_train_sel, y_train)
        best_model = search.best_estimator_
        pred = best_model.predict(X_test_sel)

    elif "Gradient Boosting" in nome:
        param_grid = {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 4, 5]
        }
        search = RandomizedSearchCV(modello, param_grid, n_iter=10, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, random_state=42)
        search.fit(X_train_sel, y_train)
        best_model = search.best_estimator_
        pred = best_model.predict(X_test_sel)


    elif "XGBoost" in nome:
        param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 4, 5],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
        search = RandomizedSearchCV(modello, param_grid, n_iter=10, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, random_state=42)
        search.fit(X_train_sel, y_train)
        best_model = search.best_estimator_
        pred = best_model.predict(X_test_sel)


    else:
        modello.fit(X_train_sel, y_train)
        pred = modello.predict(X_test_sel)


    results.append({
        "Modello": nome,
        "MAE": mean_absolute_error(y_test, pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, pred)),
        "R2": r2_score(y_test, pred),
        "Features": selected_features,
        "ModelObject": best_model if 'best_model' in locals() else modello,
        "FilenameBase": filename_base + f"_{target}"
    })


🔍 Selezione feature per: Linear Regression
➡️ Features selezionate: ["genres_['Comedy', 'Drama', 'Family']", "keywords_['moon', 'sequel', 'based on video game', 'psychotic', 'dual role', 'aftercreditsstinger', 'duringcreditsstinger', 'hedgehog', 'live action and animation', 'grandfather grandson relationship', 'animal human friendship', 'anthropomorphic animal', 'loss and grief', 'cliché', 'complicated', 'sceptical']", "cast_['Luca Marinelli', 'Alessandro Borghi', 'Lupo Barbiero']", 'director_David G. Derrick Jr.', 'director_Jon Favreau', 'director_Kelsey Mann', 'director_Maura Delpero', 'director_Paola Cortellesi']

🔍 Selezione feature per: Random Forest
➡️ Features selezionate: ['mese', 'anno', 'temp_max', 'temp_min', 'precip_mm', 'vote_average', 'popularity', "genres_['Drama', 'Comedy', 'History']"]

🔍 Selezione feature per: Gradient Boosting (Tuned)
➡️ Features selezionate: ['anno', 'temp_max', 'temp_min', 'vote_average', 'popularity', "keywords_['sadness', 'disgust', 'sequel', 'c

ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\sklearn.py", line 1222, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\sklearn.py", line 628, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 1614, in __init__
    self._init(
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 1678, in _init
    it.reraise()
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 572, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 553, in _handle_exception
    return fn()
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 640, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\data.py", line 1654, in next
    input_data(**self.kwargs)
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 629, in input_data
    self.proxy.set_info(
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 975, in set_info
    self.feature_names = feature_names
  File "c:\Users\mfab9\.conda\envs\cinema-env\lib\site-packages\xgboost\core.py", line 1364, in feature_names
    raise ValueError(
ValueError: feature_names must be string, and may not contain [, ] or <


In [32]:
# ------------------ 📋 Riepilogo finale e salvataggio top 3 ------------------
results_df = pd.DataFrame(results)
top3 = results_df.sort_values(by="R2", ascending=False).head(3)

print(f"\n📋 Riepilogo finale dei top 3 modelli per previsione '{target}':")
print(top3[["Modello", "MAE", "RMSE", "R2", "Features"]].to_string(index=False))



📋 Riepilogo finale dei top 3 modelli per previsione 'total':
                  Modello       MAE      RMSE       R2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Features
         CatBoost (Tuned) 32.124847 43.025852 0.309929                                                                                                                                                                                                                                                                                                                                                  