In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate

# Select seaborn's 'whitegrid' plot style.
sns.set_style('whitegrid')

In [15]:
# Preprocessing helper: encodes categoricals, splits into train/test and scales numeric columns.
def prepare_data(df, target_col, drop_cols=None,
                 dummies_cols=None, labels_cols=None,
                 standardize_cols=None, log_standardize_cols=None):
    """Turn a raw DataFrame into train/test NumPy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    target_col : str
        Name of the column used as the target ``y``.
    drop_cols : list of str, optional
        Extra columns removed from the features (the target is always removed).
    dummies_cols : list of str, optional
        Columns one-hot encoded with ``pd.get_dummies``.
    labels_cols : list of str, optional
        Columns replaced by integer codes from ``LabelEncoder``.
    standardize_cols : list of str, optional
        Columns standardized with ``StandardScaler``.
    log_standardize_cols : list of str, optional
        Columns transformed with ``log(1 + x)`` and then standardized.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)`` from a fixed 80/20 split
        (``test_size=.2``, ``random_state=42``).
    """
    # None defaults instead of shared mutable lists; normalise everything to fresh lists.
    drop_cols, dummies_cols, labels_cols, standardize_cols, log_standardize_cols = (
        [] if cols is None else list(cols)
        for cols in (drop_cols, dummies_cols, labels_cols,
                     standardize_cols, log_standardize_cols)
    )

    X = df.drop(columns=[target_col] + drop_cols)
    y = df[target_col]

    if dummies_cols:
        # Request float dummies explicitly: recent pandas versions emit bool columns,
        # and a frame mixing bool and float columns converts to an object array.
        X = pd.get_dummies(X, columns=dummies_cols, dtype=float)
    for col in labels_cols:
        # The encoder is fitted on the full column, i.e. before the train/test split.
        X[col] = LabelEncoder().fit_transform(X[col])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
    # Work on explicit copies so the column assignments below never write into
    # a slice of X (this is what can trigger pandas' SettingWithCopyWarning).
    X_train, X_test = X_train.copy(), X_test.copy()

    # Scalers are fitted on the training split only and then applied to the test split.
    if standardize_cols:
        scaler = StandardScaler()
        X_train[standardize_cols] = scaler.fit_transform(X_train[standardize_cols])
        X_test[standardize_cols] = scaler.transform(X_test[standardize_cols])
    if log_standardize_cols:
        scaler = StandardScaler()
        # log1p(x) is log(1 + x), computed accurately for small x.
        X_train[log_standardize_cols] = scaler.fit_transform(np.log1p(X_train[log_standardize_cols]))
        X_test[log_standardize_cols] = scaler.transform(np.log1p(X_test[log_standardize_cols]))

    return X_train.to_numpy(), X_test.to_numpy(), y_train.to_numpy(), y_test.to_numpy()

In [16]:
# Helper that draws the cross-validation results as a line chart.
def plot_cv_results(param_range, train_scores, test_scores,
                    xlabel, ylabel, title=''):
    """Plot train and validation scores against the values of one parameter.

    ``param_range`` supplies the x values; ``train_scores`` and ``test_scores``
    are drawn as two labelled lines on a new 10x6 figure. ``xlabel``,
    ``ylabel`` and ``title`` are applied to the axes. Nothing is returned.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    curves = (
        ('Train score', train_scores),
        ('Validation score', test_scores),
    )
    for curve_label, curve_values in curves:
        ax.plot(param_range, curve_values, label=curve_label)

    ax.legend()
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

In [17]:
# Full tuning pipeline: optional grid search followed by per-parameter validation curves.
def tune_model(model, model_name, X, y,
               grid_params=None, grid_scoring=None, cv=5,
               cv_params=None, cv_scoring=None, is_regression=True,
               verbose=True, plot=True, xlabel='', ylabel=''):
    """Tune an estimator and optionally plot validation curves.

    Parameters
    ----------
    model : estimator
        scikit-learn estimator to tune.
    model_name : str
        Name used in the printed report and in plot titles.
    X, y : array-like
        Training features and target.
    grid_params : dict, optional
        Parameter grid for ``GridSearchCV``; the grid search is skipped when empty.
    grid_scoring : optional
        ``scoring`` argument forwarded to ``GridSearchCV`` (``None`` means the
        estimator's default scorer).
    cv : int or CV splitter
        Cross-validation strategy used by both the grid search and the curves.
    cv_params : dict, optional
        Maps a parameter name to the values to sweep; one curve is produced per key.
    cv_scoring : optional
        ``scoring`` argument forwarded to ``cross_validate``. It must be a single
        metric, because the ``'train_score'`` / ``'test_score'`` keys are read.
    is_regression : bool
        When True the mean CV scores are negated before plotting, which suits
        negated-error scorers (e.g. one built with ``greater_is_better=False``).
    verbose : bool
        Print the grid-search report.
    plot : bool
        Draw one figure per swept parameter with ``plot_cv_results``.
    xlabel, ylabel : str
        Axis labels passed to every figure.

    Returns
    -------
    estimator
        ``grid_search.best_estimator_`` when a grid search ran, otherwise ``model``.
    """
    best_model = model
    if grid_params:
        grid_search = GridSearchCV(model, grid_params, scoring=grid_scoring, cv=cv)
        grid_search.fit(X, y)
        best_model = grid_search.best_estimator_

        if verbose:
            print(f'Results for {model_name}:')
            print(f'Best parameters: {grid_search.best_params_}')
            print(f'Best score: {grid_search.best_score_}')

    sign = -1 if is_regression else 1

    for param, param_range in (cv_params or {}).items():
        train_scores, test_scores = [], []

        for val in param_range:
            # Sweep on a fresh clone: calling set_params on best_model itself would
            # overwrite the tuned parameters (and the caller's estimator) and leave
            # the last value of this sweep in place for the next parameter.
            cv_model = clone(best_model)
            cv_model.set_params(**{param: val})
            cv_results = cross_validate(cv_model, X, y, scoring=cv_scoring, cv=cv,
                                        return_train_score=True)

            train_scores.append(sign * cv_results['train_score'].mean())
            test_scores.append(sign * cv_results['test_score'].mean())

        if plot:
            plot_cv_results(param_range, train_scores, test_scores, xlabel, ylabel,
                            title=f'{model_name} - {param}')

    # Hand the tuned estimator back so it can be evaluated or reused by the caller.
    return best_model

## Task di Regressione sullo 'score'

### Addestramento dei modelli con la prima versione dei dataset

In [20]:
# Load the movies_final.csv dataset.
df = pd.read_csv('../dataset/movies_final.csv')

# Build the train/test arrays used for model fitting and tuning; 'score' is the target.
X_train, X_test, y_train, y_test = prepare_data(
    df,
    'score',
    drop_cols=['id', 'title', 'director', 'star', 'year', 'name_x', 'name_y'],
    dummies_cols=['genre', 'rating'],
    labels_cols=['country', 'company'],
    standardize_cols=['success_index', 'cult_index', 'profit_index', 'runtime', 'age'],
    log_standardize_cols=['budget', 'gross', 'votes'],
)


In [21]:
# Linear regression; the grid only toggles whether an intercept is fitted.
model = LinearRegression()

grid = dict(fit_intercept=[True, False])

# greater_is_better=False makes the scorer report the negated MSE.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

tune_model(
    model, 'Linear Regression', X_train, y_train,
    grid_params=grid,
    grid_scoring=mse_scorer,
    cv=5,
    is_regression=True,
    verbose=True,
)

Results for Linear Regression:
Best parameters: {'fit_intercept': True}
Best score: -0.017686562208212967
