In [12]:
import pandas as pd

import os

# Load the dataset.
# The CSV location can be overridden through the BEER_REVIEWS_CSV environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'BEER_REVIEWS_CSV',
    'C:\\Users\\CRAIA-AREA EDUCATORI\\Documents\\Università\\Machine Learning\\beer_data\\beer_reviews.csv',
)
data = pd.read_csv(DATA_PATH)


In [13]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score


In [14]:
# Helper used to build the categorical column derived from review_overall
def categorize_overall(score, low_max=2.5, mid_max=4):
    """Map a numeric overall score to a coarse rating label.

    Parameters
    ----------
    score : float or None
        The overall review score to classify.
    low_max : float, optional
        Inclusive upper bound of the 'Basso' band (default 2.5).
    mid_max : float, optional
        Inclusive upper bound of the 'Medio' band (default 4).

    Returns
    -------
    str or None
        'Basso' if ``score <= low_max``, 'Medio' if ``score <= mid_max``,
        'Alto' otherwise. ``None`` is returned for a missing score
        (``None`` or NaN) so that it is not mistaken for a real rating.
        Callers should drop or handle these before encoding the labels.
    """
    # NaN compares False against everything, so without this guard a missing
    # score would fall through both checks and be labelled 'Alto'.
    # `score != score` is True only for NaN.
    if score is None or score != score:
        return None
    if score <= low_max:
        return 'Basso'
    if score <= mid_max:
        return 'Medio'
    return 'Alto'

# Label every review by passing its overall score through categorize_overall
data['overall_category'] = data['review_overall'].map(categorize_overall)

# Show how many reviews fall into each category
print(data['overall_category'].value_counts())


overall_category
Medio    1050225
Alto      415705
Basso     120684
Name: count, dtype: int64


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Feature matrix: the four component review scores (all numeric)
X = data[[
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
]]

# Target: the derived category, encoded as integer class ids.
# `le` is kept so the original class names can be recovered via le.classes_.
le = LabelEncoder()
y = le.fit_transform(data['overall_category'])

# Hold out 20% of the rows as a test set; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Per-class sample counts (indexed by encoded class id) in each split
print("Distribuzione delle classi nel training set:", np.bincount(y_train))
print("Distribuzione delle classi nel test set:", np.bincount(y_test))


Distribuzione delle classi nel training set: [333017  96648 839626]
Distribuzione delle classi nel test set: [ 82688  24036 210599]


In [16]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so the test data never influences fitting.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train the baseline logistic regression classifier
logistic_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predict the class of every test-set row
y_pred = logistic_model.predict(X_test)

# Per-class precision/recall/F1, labelled with the original class names
print(
    "Classification Report:",
    classification_report(y_test, y_pred, target_names=le.classes_),
    sep="\n",
)

# Confusion matrix of true vs predicted encoded classes
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

        Alto       0.70      0.53      0.61     82688
       Basso       0.78      0.62      0.69     24036
       Medio       0.80      0.89      0.84    210599

    accuracy                           0.78    317323
   macro avg       0.76      0.68      0.71    317323
weighted avg       0.77      0.78      0.77    317323

Confusion Matrix:
[[ 44210    121  38357]
 [    77  14833   9126]
 [ 18612   3956 188031]]


In [18]:
# Logistic regression again, this time with class_weight='balanced' to
# counter the class imbalance seen in the training set
logistic_weighted = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
logistic_weighted.fit(X_train, y_train)

# Test-set predictions of the weighted model
y_pred_weighted = logistic_weighted.predict(X_test)

# Evaluation: classification report first, then the confusion matrix
print(
    "Classification Report (Weighted Logistic Regression):",
    classification_report(y_test, y_pred_weighted, target_names=le.classes_),
    sep="\n",
)
print(
    "Confusion Matrix (Weighted Logistic Regression):",
    confusion_matrix(y_test, y_pred_weighted),
    sep="\n",
)


Classification Report (Weighted Logistic Regression):
              precision    recall  f1-score   support

        Alto       0.58      0.74      0.65     82688
       Basso       0.46      0.89      0.60     24036
       Medio       0.86      0.67      0.75    210599

    accuracy                           0.71    317323
   macro avg       0.63      0.77      0.67    317323
weighted avg       0.76      0.71      0.72    317323

Confusion Matrix (Weighted Logistic Regression):
[[ 61182    520  20986]
 [   170  21292   2574]
 [ 43866  24908 141825]]


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Build a Random Forest with default hyperparameters (seeded) and train it
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Test-set predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluation: classification report first, then the confusion matrix
print(
    "Classification Report (Random Forest):",
    classification_report(y_test, y_pred_rf, target_names=le.classes_),
    sep="\n",
)
print(
    "Confusion Matrix (Random Forest):",
    confusion_matrix(y_test, y_pred_rf),
    sep="\n",
)


Classification Report (Random Forest):
              precision    recall  f1-score   support

        Alto       0.70      0.54      0.61     82688
       Basso       0.77      0.64      0.70     24036
       Medio       0.80      0.89      0.84    210599

    accuracy                           0.78    317323
   macro avg       0.76      0.69      0.72    317323
weighted avg       0.77      0.78      0.77    317323

Confusion Matrix (Random Forest):
[[ 44533    141  38014]
 [    89  15290   8657]
 [ 18799   4452 187348]]


In [20]:
# Random Forest with class_weight='balanced' to counter the class imbalance
rf_balanced = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)
rf_balanced.fit(X_train, y_train)

# Test-set predictions of the balanced forest
y_pred_rf_balanced = rf_balanced.predict(X_test)

# Evaluation: classification report first, then the confusion matrix
print(
    "Classification Report (Random Forest Balanced):",
    classification_report(y_test, y_pred_rf_balanced, target_names=le.classes_),
    sep="\n",
)
print(
    "Confusion Matrix (Random Forest Balanced):",
    confusion_matrix(y_test, y_pred_rf_balanced),
    sep="\n",
)


Classification Report (Random Forest Balanced):
              precision    recall  f1-score   support

        Alto       0.56      0.78      0.65     82688
       Basso       0.50      0.87      0.64     24036
       Medio       0.87      0.66      0.75    210599

    accuracy                           0.71    317323
   macro avg       0.64      0.77      0.68    317323
weighted avg       0.76      0.71      0.72    317323

Confusion Matrix (Random Forest Balanced):
[[ 64572    581  17535]
 [   214  20927   2895]
 [ 51512  19967 139120]]


In [22]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Maximum number of training rows used for the hyperparameter search.
# Evaluating 81 candidates x 3 folds on the full training set (about 1.27M
# rows) did not finish in practice, so the search runs on a class-stratified
# subsample and only the winning configuration is refitted on all the
# training data. Raise this value for a more thorough (and slower) search.
TUNING_SAMPLE_SIZE = 100_000

# Hyperparameter grid: 3 * 3 * 3 * 3 * 1 = 81 combinations
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced']
}

# Draw the tuning subsample, preserving the class proportions of y_train
if len(y_train) > TUNING_SAMPLE_SIZE:
    tuning_split = train_test_split(
        X_train,
        y_train,
        train_size=TUNING_SAMPLE_SIZE,
        stratify=y_train,
        random_state=42,
    )
    # train_test_split returns [X_kept, X_rest, y_kept, y_rest]
    X_tune, y_tune = tuning_split[0], tuning_split[2]
else:
    X_tune, y_tune = X_train, y_train

# Cross-validated grid search optimising macro-averaged F1.
# A fresh estimator is passed inline so the already fitted `rf_model` from the
# earlier Random Forest cell is not overwritten by an unfitted one.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=1,  # one summary line instead of a log line per fit
    n_jobs=-1
)

# Run the search on the tuning subsample
grid_search.fit(X_tune, y_tune)

# Best parameters found
print("Migliori parametri trovati:")
print(grid_search.best_params_)

# Final model: the best configuration refitted on the full training set
best_rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **grid_search.best_params_
)
best_rf.fit(X_train, y_train)

# Evaluation on the held-out test set
y_pred_best_rf = best_rf.predict(X_test)

# Classification report
print("Classification Report (Best Random Forest):")
print(classification_report(y_test, y_pred_best_rf, target_names=le.classes_))

print("Confusion Matrix (Best Random Forest):")
print(confusion_matrix(y_test, y_pred_best_rf))


Fitting 3 folds for each of 81 candidates, totalling 243 fits


KeyboardInterrupt: 