In [1]:
from supervised_utils import *
from tabulate import tabulate

In [2]:
# Ignore warnings of category RuntimeWarning (other warning categories are still shown).
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def print_info(df):
    """
    Print the data type of every column as a table, followed by the row count.
    """
    rows = []
    for name, dtype in df.dtypes.items():
        # Each table row pairs the column name with its dtype rendered as text.
        rows.append((name, str(dtype)))
    table = tabulate(rows, headers=["Column", "Type"], tablefmt="pretty")
    print(table)
    print(f"# rows: {len(df)}")

### Task di regressione su `score`

#### Sul dataset iniziale `movies_adj.csv`

In [4]:
# Load the initial dataset and print its column dtypes and row count.
df = pd.read_csv("../data/movies_adj.csv")
print_info(df)

+----------+---------+
|  Column  |  Type   |
+----------+---------+
|    id    |  int64  |
|  title   | object  |
|  rating  | object  |
|  genre   | object  |
|   year   |  int64  |
|  score   | float64 |
|  votes   | float64 |
| director | object  |
|   star   | object  |
| country  | object  |
|  budget  | float64 |
|  gross   | float64 |
| company  | object  |
| runtime  | float64 |
+----------+---------+


In [5]:
# Columns for training: per-key column lists handed to tune_and_test_models
# together with the initial dataset (regression on `score`).
cols = dict(
    target="score",
    drop=["id", "title", "company", "country", "director", "star"],
    dummies=["rating", "genre"],
    labels=[],
    standardize=["runtime"],
    minmax=["votes", "budget", "gross"],
)

In [None]:
# Tuning and testing (regression on the initial dataset).
# NOTE(review): session_name="raw" is also used by the classification run on the same
# dataset further down; if tune_and_test_models persists results under session_name,
# confirm that it also separates them by task so one run cannot overwrite the other.
tune_and_test_models(df, cols, task="regression", session_name="raw")

#### Sul dataset derivato `movies_features.csv`

In [23]:
# Load the derived dataset and print its column dtypes and row count.
df = pd.read_csv("../data/movies_features.csv")
print_info(df)

+----------------------------+---------+
|           Column           |  Type   |
+----------------------------+---------+
|             id             |  int64  |
|           title            | object  |
|           rating           | object  |
|           genre            | object  |
|          runtime           | object  |
|            age             |  int64  |
|         popularity         | object  |
|           score            | float64 |
|        is_acclaimed        |  bool   |
|         is_panned          |  bool   |
|     budget_efficiency      | object  |
|       is_blockbuster       |  bool   |
|          is_indie          |  bool   |
|   director_age_in_movie    | float64 |
|    director_experience     |  int64  |
|   director_is_acclaimed    |  bool   |
|     director_is_panned     |  bool   |
| director_budget_efficiency | float64 |
|     star_age_in_movie      | float64 |
|      star_experience       |  int64  |
|     star_is_acclaimed      |  bool   |
|       star_is_

In [24]:
# Columns for training: per-key column lists handed to tune_and_test_models
# together with the derived dataset (regression on `score`).
cols = dict(
    target="score",
    drop=["id", "title", "is_acclaimed", "is_panned"],
    dummies=["rating", "genre", "runtime", "popularity", "budget_efficiency"],
    labels=[],
    standardize=["director_age_in_movie", "star_age_in_movie"],
    minmax=["director_experience", "star_experience"],
)

In [None]:
# Tuning and testing (regression on the derived dataset).
# NOTE(review): session_name="der" is also used by the classification run on the same
# dataset further down; if tune_and_test_models persists results under session_name,
# confirm that it also separates them by task so one run cannot overwrite the other.
tune_and_test_models(df, cols, task="regression", session_name="der")

### Task di classificazione su `budget_efficiency`

#### Sul dataset iniziale `movies_adj.csv`

In [9]:
# Attach the classification target `budget_efficiency` (taken from the derived dataset)
# to the columns of the initial dataset, matching rows on `id`.
adj_df = pd.read_csv("../data/movies_adj.csv")
features_df = pd.read_csv("../data/movies_features.csv")
# validate="one_to_one" makes pandas raise MergeError if `id` is duplicated on either side,
# instead of silently multiplying rows before the data reaches training.
df = adj_df.merge(
    features_df[["id", "budget_efficiency"]],
    on="id",
    how="inner",
    validate="one_to_one",
)

# Keep the initial dataset's column order, with the target as the last column.
df = df[list(adj_df.columns) + ["budget_efficiency"]]
print_info(df)

+-------------------+---------+
|      Column       |  Type   |
+-------------------+---------+
|        id         |  int64  |
|       title       | object  |
|      rating       | object  |
|       genre       | object  |
|       year        |  int64  |
|       score       | float64 |
|       votes       | float64 |
|     director      | object  |
|       star        | object  |
|      country      | object  |
|      budget       | float64 |
|       gross       | float64 |
|      company      | object  |
|      runtime      | float64 |
| budget_efficiency | object  |
+-------------------+---------+


In [10]:
# Columns for training: per-key column lists handed to tune_and_test_models
# together with the initial dataset (classification on `budget_efficiency`).
cols = dict(
    target="budget_efficiency",
    drop=["id", "title", "company", "country", "director", "star", "gross"],
    dummies=["rating", "genre"],
    labels=[],
    standardize=["runtime", "score"],
    minmax=["votes", "budget"],
)

In [None]:
# Tuning and testing (classification on the initial dataset, with resample=False).
# NOTE(review): session_name="raw" is also used by the regression run on the initial
# dataset earlier in the notebook; if tune_and_test_models persists results under
# session_name, confirm that it also separates them by task to avoid overwriting.
tune_and_test_models(df, cols, task="classification", resample=False, session_name="raw")

#### Sul dataset derivato `movies_features.csv`

In [25]:
# Reload the derived dataset (rebinding `df`) and print its column dtypes and row count.
df = pd.read_csv("../data/movies_features.csv")
print_info(df)

+----------------------------+---------+
|           Column           |  Type   |
+----------------------------+---------+
|             id             |  int64  |
|           title            | object  |
|           rating           | object  |
|           genre            | object  |
|          runtime           | object  |
|            age             |  int64  |
|         popularity         | object  |
|           score            | float64 |
|        is_acclaimed        |  bool   |
|         is_panned          |  bool   |
|     budget_efficiency      | object  |
|       is_blockbuster       |  bool   |
|          is_indie          |  bool   |
|   director_age_in_movie    | float64 |
|    director_experience     |  int64  |
|   director_is_acclaimed    |  bool   |
|     director_is_panned     |  bool   |
| director_budget_efficiency | float64 |
|     star_age_in_movie      | float64 |
|      star_experience       |  int64  |
|     star_is_acclaimed      |  bool   |
|       star_is_

In [26]:
# Columns for training: per-key column lists handed to tune_and_test_models
# together with the derived dataset (classification on `budget_efficiency`).
cols = dict(
    target="budget_efficiency",
    drop=["id", "title"],
    dummies=["rating", "genre", "runtime", "popularity"],
    labels=[],
    standardize=["score", "director_age_in_movie", "star_age_in_movie"],
    minmax=["director_experience", "star_experience"],
)

In [None]:
# Tuning and testing (classification on the derived dataset, with resample=False).
# NOTE(review): session_name="der" is also used by the regression run on the derived
# dataset earlier in the notebook; if tune_and_test_models persists results under
# session_name, confirm that it also separates them by task to avoid overwriting.
tune_and_test_models(df, cols, task="classification", resample=False, session_name="der")