In [2]:
# @title
import ast
import itertools
import os
import pickle
import statistics  as stat
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tensorflow as tf
from google.colab import drive
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
#from scikeras.wrappers import KerasRegressor
from tqdm import tqdm
# Mount Google Drive at /content/drive (uses google.colab, so this cell only runs on Colab).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# @title
# Load the raw ticket export and derive the calendar / time-slot features.
path=""
df = pd.read_csv(path,sep=";",encoding='unicode_escape')

# Calendar features; dates that do not match dd/mm/YYYY become NaT (errors='coerce').
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y', errors='coerce')
df['weekday'] = df['date'].dt.weekday + 1  # 1=Monday, 7=Sunday
df['month'] = df['date'].dt.month

# Drop rows with missing time. .copy() makes the filtered frame independent,
# so the column assignments below do not raise SettingWithCopyWarning.
df = df.dropna(subset=['time']).copy()
df['hour'] = pd.to_datetime(df['time'], format='%H:%M', errors='coerce')


def _time_slot(h):
    """Map an hour of day to the slot label '8' (06-11), '17' (12-17) or '20' (everything else).

    NOTE(review): hours 0-5 and unparseable times (NaN after coercion) also fall
    into '20' because both range checks are False for them -- confirm intended.
    """
    if 6 <= h < 12:
        return '8'
    if 12 <= h < 18:
        return '17'
    return '20'


df['time_slot'] = df['hour'].dt.hour.apply(_time_slot)

# Remove every row that contains the placeholder "-" in any column.
df = df[~df.isin(['-']).any(axis=1)].copy()

# Ticket counts arrive as text; values that cannot be parsed become NaN.
for col in ['full_price', 'reduced', 'free']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df['total'] = df['full_price'] + df['reduced'] + df['free']  # recompute total

# ISO-formatted date strings sort lexicographically in chronological order.
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
df = df.drop(["hour"],axis=1)
df = df.sort_values(by="date")
df.head()

# API CALL

In [None]:
# @title
# --- TMDb Data Enrichment: genres + metadata ---
import requests, pandas as pd
from tqdm import tqdm

def _search_tmdb_movie(title, api_key):
    """Return the first TMDb search hit for ``title`` as a dict, or None if there is none.

    Network errors, non-200 responses and undecodable bodies are all treated as
    "no hit". Only the exception class is reported, because request error
    messages can contain the full URL including the API key.
    """
    try:
        r = requests.get(
            "https://api.themoviedb.org/3/search/movie",
            params={"api_key": api_key, "query": title, "language": "it-IT"},
            timeout=10,
        )
        if r.status_code != 200:
            return None
        payload = r.json()
    except (requests.RequestException, ValueError) as exc:
        tqdm.write(f"TMDb search failed for {title!r}: {type(exc).__name__}")
        return None
    results = payload.get("results") if isinstance(payload, dict) else None
    return results[0] if results else None


def get_genre_ids(df, api_key=None):
    """Add TMDb search metadata to ``df`` and return only the rows that were matched.

    For every row the ``title`` column is searched on TMDb (after stripping the
    " - Piacere del Cinema" suffix) and the first hit supplies ``genre_ids``,
    ``original_language``, ``popularity``, ``rating`` and ``id``. Each distinct
    title is requested once and the answer is reused for repeated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column. The five metadata columns are added to
        this frame in place.
    api_key : str, optional
        TMDb API key. Defaults to the ``TMDB_API_KEY`` environment variable.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows with non-empty ``genre_ids`` and
        ``original_language``.

    Raises
    ------
    ValueError
        If no API key is supplied and ``TMDB_API_KEY`` is not set.
    """
    api_key = api_key or os.environ.get("TMDB_API_KEY")
    if not api_key:
        raise ValueError(
            "TMDb API key missing: pass api_key=... or set the TMDB_API_KEY environment variable"
        )

    suffix = " - Piacere del Cinema"
    hits_by_title = {}  # cleaned title -> first search hit (or None)
    genres, languages, popularity, rating, ids = [], [], [], [], []

    for title in tqdm(df["title"], total=len(df)):
        hit = None
        if isinstance(title, str):  # NaN / non-text titles cannot be searched
            query = title[:-len(suffix)] if title.endswith(suffix) else title
            if query not in hits_by_title:
                hits_by_title[query] = _search_tmdb_movie(query, api_key)
            hit = hits_by_title[query]

        if hit:
            genres.append(list(hit.get("genre_ids") or []))  # fresh list per row
            languages.append(hit.get("original_language") or "")
            popularity.append(hit.get("popularity", 0.0))
            rating.append(hit.get("vote_average", 0.0))
            ids.append(hit.get("id", None))
        else:
            # Placeholders for unmatched rows; these rows are filtered out below.
            genres.append([])
            languages.append("")
            popularity.append(None)
            rating.append(None)
            ids.append(None)

    df["genre_ids"] = genres
    df["original_language"] = languages
    df["popularity"] = popularity
    df["rating"] = rating
    df["id"] = ids

    matched = df[df["genre_ids"].apply(lambda x: len(x) > 0)]
    matched = matched[matched["original_language"].apply(lambda x: len(x) > 0)]
    # Return a copy so callers can add columns without SettingWithCopyWarning.
    return matched.copy()

def _tmdb_get_json(url, params):
    """GET ``url`` and return the decoded JSON object, or {} on any failure.

    Only the exception class is reported, because request error messages can
    contain the full URL including the API key.
    """
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code != 200:
            return {}
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        tqdm.write(f"TMDb request failed for {url}: {type(exc).__name__}")
        return {}
    return payload if isinstance(payload, dict) else {}


def _fetch_movie_details(movie_id, api_key):
    """Collect cast, director, budget, production company, keywords and collection for one film."""
    base_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    localized = {"api_key": api_key, "language": "it-IT"}

    # --- Credits: top-3 cast + director ---
    credits = _tmdb_get_json(f"{base_url}/credits", localized)
    top3 = [a.get("name") for a in (credits.get("cast") or [])[:3]]
    top3 += [None] * (3 - len(top3))  # pad to 3
    directors = [m.get("name") for m in (credits.get("crew") or []) if m.get("job") == "Director"]

    # --- Movie details: budget, production, collection ---
    details = _tmdb_get_json(base_url, localized)
    companies = details.get("production_companies") or []

    # --- Keywords ---
    keywords = _tmdb_get_json(f"{base_url}/keywords", {"api_key": api_key})

    return {
        "cast": top3,
        "director": directors[0] if directors else None,
        "budget": details.get("budget"),
        "production_company": companies[0].get("name") if companies else None,
        "keywords": [kw.get("name") for kw in (keywords.get("keywords") or [])],
        "belongs_to_collection": details.get("belongs_to_collection"),
    }


def get_movie_details(df, api_key=None):
    """Add per-film TMDb details to ``df`` (in place) and return it.

    Reads the ``id`` column and adds ``cast`` (list of three names, padded with
    None), ``director``, ``budget``, ``production_company``, ``keywords`` (list
    of names) and ``belongs_to_collection`` (the raw TMDb value). Each distinct
    id is fetched once and reused for repeated rows. A failed request leaves the
    corresponding fields as None / empty list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column with TMDb movie ids.
    api_key : str, optional
        TMDb API key. Defaults to the ``TMDB_API_KEY`` environment variable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the six columns added.

    Raises
    ------
    ValueError
        If no API key is supplied and ``TMDB_API_KEY`` is not set.
    """
    api_key = api_key or os.environ.get("TMDB_API_KEY")
    if not api_key:
        raise ValueError(
            "TMDb API key missing: pass api_key=... or set the TMDB_API_KEY environment variable"
        )

    empty = {
        "cast": [None, None, None],
        "director": None,
        "budget": None,
        "production_company": None,
        "keywords": [],
        "belongs_to_collection": None,
    }
    details_by_id = {}  # movie id -> fetched record
    records = []

    for tmdb_id in tqdm(df["id"], total=len(df)):
        try:
            movie_id = int(tmdb_id)  # also turns 123.0 into 123 for the URL
        except (TypeError, ValueError, OverflowError):
            records.append(empty)  # missing / invalid id: nothing to request
            continue
        if movie_id not in details_by_id:
            details_by_id[movie_id] = _fetch_movie_details(movie_id, api_key)
        records.append(details_by_id[movie_id])

    # Add the columns to the DataFrame (list values are copied so rows do not share objects).
    df["cast"] = [list(rec["cast"]) for rec in records]
    df["director"] = [rec["director"] for rec in records]
    df["budget"] = [rec["budget"] for rec in records]
    df["production_company"] = [rec["production_company"] for rec in records]
    df["keywords"] = [list(rec["keywords"]) for rec in records]
    df["belongs_to_collection"] = [rec["belongs_to_collection"] for rec in records]
    return df

def extract_collection_name(row):
    """Return the collection's ``name`` ("Unknown" if absent), or "No Collection" for a missing value."""
    if pd.isna(row):
        return "No Collection"
    return row.get("name", "Unknown")
def belongs_to_collection_binary(row):
    """Return True when ``row`` holds a value and False when it is missing (None / NaN)."""
    is_missing = pd.isna(row)
    return not is_missing

# USAGE: enrich the screening rows with TMDb metadata and persist the result.
selected_cols = ["title", "month", "total", "weekday", "time_slot"]
df_titles = df.loc[:, selected_cols].copy()

# Search hits first (drops unmatched titles), then per-film details.
df_tmdb = get_movie_details(get_genre_ids(df_titles))

# Derive a readable collection name and a boolean membership flag.
collection_col = df_tmdb["belongs_to_collection"]
df_tmdb["collection_name"] = collection_col.apply(extract_collection_name)
df_tmdb["in_collection"] = collection_col.apply(belongs_to_collection_binary)

# NOTE(review): `path` is the same variable the raw CSV was read from earlier in
# this notebook; unless it is reassigned in between, this overwrites the raw file.
df_tmdb.to_csv(path, index=False)
df_tmdb.head()

# FEATURE ENGINEERING

In [None]:
# Feature engineering: cyclic calendar features, one-hot time slot, scaled numerics.
df = pd.read_csv(path, sep=",")

# Cyclic encoding for month (period 12) and weekday (period 7); both are 1-based.
month_angle = 2*np.pi * (df['month']-1) / 12
weekday_angle = 2*np.pi * (df['weekday']-1) / 7
df = df.assign(
    month_sin=np.sin(month_angle),
    month_cos=np.cos(month_angle),
    wd_sin=np.sin(weekday_angle),
    wd_cos=np.cos(weekday_angle),
)

# One-hot encode time_slot (assuming there are only the 3 slots '8', '17', '20').
df = pd.get_dummies(df, prefix='ts', columns=['time_slot'])

# Standardise the numerical features.
# NOTE(review): the scaler is fitted on every row of df; no train/test split is visible before this point.
num_cols = ['popularity','rating','budget']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

In [32]:
# Keep the 10 most frequent directors / production companies; everything else
# (including missing values) becomes 'Other'. original_language is left as is.
for col in ['director','production_company']:
    frequencies = df[col].value_counts()
    top = frequencies.nlargest(10).index
    is_top = df[col].isin(top)
    df[col] = df[col].mask(~is_top, 'Other')

# One-hot encode the three categorical columns with short prefixes.
df = pd.get_dummies(
    df,
    columns=['original_language','director','production_company'],
    prefix=['lang','dir','prod'],
)

In [None]:
from collections import Counter
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# 1) Determine top labels
def _as_label_list(value):
    """Turn one cell of a list column into a clean list of string labels.

    After the CSV round trip the lists are stored as text such as "[28, 12]",
    so strings are parsed with ast.literal_eval. Missing cells become [],
    None entries (cast padding) are dropped, and every label is converted to
    str so the labels stay a single comparable type once 'Other' is added.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    if not isinstance(value, (list, tuple, set)):
        return []  # NaN / None cell -> no labels
    return [str(item) for item in value if item is not None]


# Without this step the comprehensions below would iterate over the
# characters of each string instead of over genre ids / actor names.
df['genre_ids'] = df['genre_ids'].apply(_as_label_list)
df['cast'] = df['cast'].apply(_as_label_list)

all_genres = [g for lst in df['genre_ids'] for g in lst]
top_genres = [g for g,_ in Counter(all_genres).most_common(15)]
all_cast = [c for lst in df['cast'] for c in lst]
top_cast = [c for c,_ in Counter(all_cast).most_common(40)]
#all_kw = [k for lst in df['keywords'] for k in lst]
#top_kw = [k for k,_ in Counter(all_kw).most_common(50)]

# 2) Filter lists, add 'Other' for any out-of-top labels
def filter_top(lst, top_list):
    """Keep the items of ``lst`` found in ``top_list`` (order preserved).

    A single trailing 'Other' is appended when at least one item of ``lst``
    is not in ``top_list``.
    """
    kept = []
    has_outsider = False
    for item in lst:
        if item in top_list:
            kept.append(item)
        else:
            has_outsider = True
    if has_outsider:
        kept.append('Other')
    return kept

# Reduce each list to its top labels (+ 'Other'); keywords are currently not used.
for list_col, allowed in (('genre_ids', top_genres), ('cast', top_cast)):
    df[list_col] = df[list_col].apply(lambda lst, allowed=allowed: filter_top(lst, allowed))
#df['keywords']  = df['keywords'].apply(lambda lst: filter_top(lst, top_kw))

# 3) Multi-hot encode filtered lists
mlb_genre = MultiLabelBinarizer()
mlb_cast  = MultiLabelBinarizer()
#mlb_kw    = MultiLabelBinarizer()

genre_matrix = mlb_genre.fit_transform(df['genre_ids'])
genre_df = pd.DataFrame(
    genre_matrix,
    index=df.index,
    columns=[f"genre_{g}" for g in mlb_genre.classes_],
)

cast_matrix = mlb_cast.fit_transform(df['cast'])
cast_df = pd.DataFrame(
    cast_matrix,
    index=df.index,
    columns=[f"cast_{c}" for c in mlb_cast.classes_],
)

# Keyword encoding is disabled; kept here for reference:
# kw_df = pd.DataFrame(
#     mlb_kw.fit_transform(df['keywords']),
#     columns=[f"kw_{k}" for k in mlb_kw.classes_],
#     index=df.index
# )

# 4) Concatenate the indicator columns and drop the original list columns
df = pd.concat([df, genre_df, cast_df], axis=1).drop(columns=['genre_ids','cast'])

df.head(2)

In [None]:
# Summary of the encoded frame: dtype counts, then columns per feature-family prefix.
print(df.dtypes.value_counts())
prefixes = ['genre_','cast_','ts_','lang_','dir_','prod_','coll_']
column_names = list(df.columns)
for p in prefixes:
    n = len([name for name in column_names if name.startswith(p)])
    print(f"{p:<8} → {n} columns")
print(f"Total columns: {df.shape[1]}")

In [36]:
# Persist the encoded frame. DataFrame.to_csv returns None when given a path,
# so the result must not be assigned back to `df` (that would replace df with None).
df.to_csv("/content/drive/MyDrive/Colab Notebooks/csv/df_coded.csv", index=False)