# Imports

In [None]:
import string

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer, OneHotEncoder, StandardScaler, MaxAbsScaler, Normalizer, OrdinalEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer

In [None]:
from scipy.stats import randint, uniform

# Functions

In [None]:
def inspect_df(df : pd.DataFrame):
    """Print a quick overview of ``df``.

    The report contains, in order: the number of rows and columns, the
    per-column count of missing values (or a single ``0`` line when nothing
    is missing) and the dtype of every column.

    Args:
        df: The DataFrame to summarise.
    """
    separator = "-" * 30
    shape = df.shape

    print(f'Rows:       {shape[0]}')
    print(f'Columns:    {shape[1]}')
    print(separator)

    # One entry per column: how many cells are NaN/None in that column.
    missing_per_column = df.isna().sum()
    if missing_per_column.sum() != 0:
        print('Null values:')
        print(missing_per_column)
    else:
        print('Null values: 0')

    print(separator)
    print('Feature data types:')
    print(df.dtypes)
    print("")

In [None]:
def check_balance(df : pd.DataFrame, target_col : str):
    """Print the percentage share of every distinct value in ``target_col``.

    Alongside the observed shares, the share each value would have under a
    perfectly uniform distribution (100 divided by the number of distinct
    values) is printed as a reference.

    Args:
        df: DataFrame containing the column to inspect.
        target_col: Name of the column whose value distribution is reported.
    """
    # Relative frequencies expressed as percentages with two decimals.
    class_shares = df[target_col].value_counts(normalize=True)
    class_shares = (class_shares * 100).round(2)

    # Share every distinct value would have if all were equally frequent.
    uniform_share = 100 / len(class_shares)

    print(f'Value counts for {target_col}:')
    print(class_shares)
    print("-" * 30)
    print(f'The dataset is balanced if the value counts are close to {uniform_share:.2f}%')

In [None]:
def plot_scatter(df : pd.DataFrame, x_feature : str, y_feature : str):
    """Show a scatter plot of one DataFrame column against another.

    Args:
        df: DataFrame holding both columns.
        x_feature: Column plotted on the x axis; also used as the x label.
        y_feature: Column plotted on the y axis; also used as the y label.
    """
    x_values, y_values = df[x_feature], df[y_feature]

    plt.scatter(x_values, y_values)
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.show()

In [None]:
def evaluate_classifier(model, x_test, y_test, title : str, f1_average : str='binary', f1_pos_label=1):
    """Predict on ``x_test``, print accuracy and F1, and show a confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Features passed to ``model.predict``.
        y_test: Ground-truth labels for ``x_test``.
        title: Label printed above the metrics and used as the plot title
            (rendered through ``str.title()``).
        f1_average: Forwarded to ``f1_score`` as ``average``.
        f1_pos_label: Forwarded to ``f1_score`` as ``pos_label``.

    Returns:
        The predictions produced by ``model.predict(x_test)``.
    """
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=f1_average, pos_label=f1_pos_label)

    print("")
    print(f"{title.title()}:")
    print(f"Accuracy:                   {accuracy:.4f}")
    print(f"F1 score ({f1_average}):        {f1:.4f}")

    # from_predictions expects (y_true, y_pred). The arguments used to be
    # swapped, which transposed the matrix and made normalize='true'
    # normalise over predicted labels instead of true labels.
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred,
        cmap=plt.cm.Blues,
        normalize='true'    # each row (true label) sums to 1
    )
    disp.ax_.set_title(title.title())
    plt.show()

    return y_pred

In [None]:
def evaluate_regressor(model, x_test, y_test, binary=True):
    """Predict on ``x_test`` and print MAE, MSE, MSLE and R2 against ``y_test``.

    MSLE is only computed when every target and every prediction is greater
    than -1; otherwise an "N/A" placeholder is printed in its place.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Features passed to ``model.predict``.
        y_test: Ground-truth target values for ``x_test``.
        binary: Not used by this function; kept so existing calls that pass
            it keep working.

    Returns:
        The predictions produced by ``model.predict(x_test)``.
    """
    y_pred = model.predict(x_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # np.asarray makes the guard work for plain lists and DataFrames too,
    # and np.all reduces to a single boolean so a plain `and` is safe.
    targets_in_domain = bool(np.all(np.asarray(y_test) > -1))
    predictions_in_domain = bool(np.all(np.asarray(y_pred) > -1))
    if targets_in_domain and predictions_in_domain:
        msle = mean_squared_log_error(y_test, y_pred)
    else:
        # Either side can trip the guard, so the message names both.
        msle = "N/A (y_test o y_pred contiene valori <= -1)"
    r2 = r2_score(y_test, y_pred)

    print("")
    print(f"MAE:    {mae}")
    print(f"MSE:    {mse}")
    print(f"MSLE:   {msle}")
    print(f"R2:     {r2}")
    print("")
    print("Remember: R2 score is in [-inf, 1]. R2<0 --> BAD MODEL.")

    return y_pred

In [None]:
def get_categorical_features(df : pd.DataFrame):
    """Return the labels of the columns of ``df`` whose dtype is ``object``.

    Args:
        df: DataFrame to inspect.

    Returns:
        The column index of the ``object``-dtype sub-frame.
    """
    object_columns = df.select_dtypes(include=['object'])
    return object_columns.columns

In [None]:
def get_k_correlated(df : pd.DataFrame, y_name, k=5):
    """Return the ``k`` columns most correlated (in absolute value) with ``y_name``.

    Args:
        df: DataFrame containing the target column and the candidate features.
        y_name: Name of the (numeric) target column.
        k: Maximum number of column names to return.

    Returns:
        A list of at most ``k`` column names, ordered by decreasing absolute
        correlation with ``y_name``. The target itself is never included.
    """
    # numeric_only=True skips string/object columns, which would otherwise
    # make DataFrame.corr raise on recent pandas versions.
    target_corr = df.corr(numeric_only=True)[y_name]

    # Drop the target by label rather than slicing off the first sorted entry:
    # a feature with |corr| == 1 can tie with the target and sort ahead of it.
    ranked = target_corr.drop(labels=y_name).abs().sort_values(ascending=False)
    return ranked.head(k).index.tolist()

In [None]:
def pfi(model, x_test, y_test):
    """Plot the permutation feature importance of ``model`` as a bar chart.

    Bar heights are the mean importances and the error bars are their
    standard deviations, both taken from ``permutation_importance``.

    Args:
        model: Fitted estimator to evaluate.
        x_test: Features the importances are computed on. When it exposes a
            ``columns`` attribute (e.g. a DataFrame), those names label the bars.
        y_test: Targets matching ``x_test``.
    """
    # Use a distinct local name: rebinding `pfi` here shadowed the function
    # inside its own body.
    result = permutation_importance(model, x_test, y_test, n_jobs=-1)

    # Label bars with feature names when available; index=None falls back to
    # the positional 0..n-1 labels used previously.
    feature_names = getattr(x_test, 'columns', None)
    importances = pd.Series(result['importances_mean'], index=feature_names)

    fig, ax = plt.subplots()
    importances.plot.bar(yerr=result['importances_std'], ax=ax)
    ax.set_title("Permutation Feature Importance")
    ax.set_ylabel('Importance')
    fig.tight_layout()
    plt.show()

In [None]:
def clean_text(text : str):
    """Normalise ``text`` for simple text processing.

    The text is lower-cased, newlines/tabs/carriage returns become spaces,
    every character in ``string.punctuation`` is removed, and runs of
    whitespace are collapsed to single spaces with no leading or trailing
    whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # A single table does both jobs: map line-break/tab characters to a space
    # and delete ASCII punctuation.
    table = str.maketrans('\n\t\r', '   ', string.punctuation)
    lowered = text.lower()
    tokens = lowered.translate(table).split()
    return ' '.join(tokens)

# Exam

# Utils

In [None]:
"""
------ Cross-Validation
cv = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"\nMedia Accuracy 5-Fold CV: {cv.mean():.4f}")
#cv è il numero di fold, accuracy la metrica da valutare.
#Confronto: la CV è solitamente più rappresentativa di una valutazione su un singolo split train/test (ad es. una sola Confusion Matrix), perché testa il modello su diverse porzioni del dataset e rende la stima delle prestazioni meno dipendente da uno specifico split di test.


------ Cross-Validation e Pipeline
# 1. Creiamo la pipeline: prima scala i dati, poi applica la LR
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('log_reg', log_reg) # log_reg è il modello definito in precedenza
])
# 2. Passo la pipeline alla cross_val_score invece del singolo modello
cv = cross_val_score(pipeline, X, y, cv=10, scoring='accuracy')
print(f"\nMedia Accuracy 10-Fold CV (con Scaling): {cv.mean():.4f}")


------ Train-Test split
X = dataset.drop('target', axis=1).drop(get_categorical_features(dataset), axis=1)
y = dataset['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/5, random_state=0
)


------ Scaler
# Dovresti preoccuparti dello scaling (e quindi del MaxAbsScaler o StandardScaler) solo quando usi: Logistic Regression (per far convergere il solutore); SVM / KNN (perché si basano sulle distanze); Reti Neurali (per la stabilità del gradiente); PCA (perché si basa sulla varianza).
scaler = StandardScaler()
#oppure
# scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


------ Allenare Logistic Regression
# Nota: se hai applicato lo scaler, allena su X_train_scaled e valuta/predici su X_test_scaled.
log_reg = LogisticRegression(max_iter=1000,random_state=42)
log_reg.fit(X_train, y_train)


------- Allenare Decision Tree
dec_tree = DecisionTreeClassifier(random_state=42)  
dec_tree.fit(X_train, y_train)


------- Allenare Random Forest
rand_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rand_forest.fit(X_train, y_train)


------- Allenare Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)


------- Grid Search CV
# 1. Definizione dei parametri da testare
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10, 20]
}
# 2. Creazione del classificatore base
dt_base = DecisionTreeClassifier(random_state=0)
# 3. Configurazione della GridSearchCV (cv=10 per coerenza con il punto precedente)
grid_search = GridSearchCV(estimator=dt_base, param_grid=param_grid, cv=10, scoring='accuracy')
# 4. Ricerca sui dati di train
grid_search.fit(X_train, y_train)
# 5. Risultati migliori:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Migliori parametri trovati: {best_params}")
print(f"Accuracy media in CV (Migliore): {best_score:.4f}")

"""