In [2]:
import pandas as pd

# Carica il dataset
data = pd.read_csv('/home/sagemaker-user/beer_reviews.csv')

In [3]:
import numpy as np

In [4]:
# Creazione della colonna categoriale basata su review_overall
def categorize_overall(score):
    if score <= 2.5:
        return 'Basso'
    elif score <= 4:
        return 'Medio'
    else:
        return 'Alto'

data['overall_category'] = data['review_overall'].apply(categorize_overall)

# Controlliamo la distribuzione delle classi
print(data['overall_category'].value_counts())

overall_category
Medio    1050225
Alto      415705
Basso     120684
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Definiamo X (feature) e y (target)
X = data[['review_aroma', 'review_appearance', 'review_palate', 'review_taste']]  # Le feature numeriche
y = data['overall_category']

# Convertiamo il target in valori numerici
le = LabelEncoder()
y = le.fit_transform(y)

# Suddividiamo in training e test set (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Controlliamo la distribuzione delle classi nel training set
print("Distribuzione delle classi nel training set:", np.bincount(y_train))
print("Distribuzione delle classi nel test set:", np.bincount(y_test))



Distribuzione delle classi nel training set: [333017  96648 839626]
Distribuzione delle classi nel test set: [ 82688  24036 210599]


In [6]:
from sklearn.preprocessing import StandardScaler

# Applichiamo lo standard scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Addestramento del modello
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train, y_train)

# Predizioni sul test set
y_pred = logistic_model.predict(X_test)

# Valutazione del modello
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

        Alto       0.70      0.53      0.61     82688
       Basso       0.78      0.62      0.69     24036
       Medio       0.80      0.89      0.84    210599

    accuracy                           0.78    317323
   macro avg       0.76      0.68      0.71    317323
weighted avg       0.77      0.78      0.77    317323

Confusion Matrix:
[[ 44210    121  38357]
 [    77  14833   9126]
 [ 18612   3956 188031]]


In [8]:
# Logistic Regression con class weights
logistic_weighted = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
logistic_weighted.fit(X_train, y_train)

# Predizioni sul test set
y_pred_weighted = logistic_weighted.predict(X_test)

# Valutazione
print("Classification Report (Weighted Logistic Regression):")
print(classification_report(y_test, y_pred_weighted, target_names=le.classes_))

print("Confusion Matrix (Weighted Logistic Regression):")
print(confusion_matrix(y_test, y_pred_weighted))


Classification Report (Weighted Logistic Regression):
              precision    recall  f1-score   support

        Alto       0.58      0.74      0.65     82688
       Basso       0.46      0.89      0.60     24036
       Medio       0.86      0.67      0.75    210599

    accuracy                           0.71    317323
   macro avg       0.63      0.77      0.67    317323
weighted avg       0.76      0.71      0.72    317323

Confusion Matrix (Weighted Logistic Regression):
[[ 61182    520  20986]
 [   170  21292   2574]
 [ 43866  24908 141825]]


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Creazione e addestramento del modello Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predizioni sul test set
y_pred_rf = rf_model.predict(X_test)

# Valutazione
print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))

print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))


Classification Report (Random Forest):
              precision    recall  f1-score   support

        Alto       0.70      0.54      0.61     82688
       Basso       0.77      0.64      0.70     24036
       Medio       0.80      0.89      0.84    210599

    accuracy                           0.78    317323
   macro avg       0.76      0.69      0.72    317323
weighted avg       0.77      0.78      0.77    317323

Confusion Matrix (Random Forest):
[[ 44533    141  38014]
 [    89  15290   8657]
 [ 18799   4452 187348]]


In [10]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Definizione della griglia ridotta
param_dist = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced']
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=30,  # Numero di combinazioni casuali da testare
    scoring='f1_macro',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Avvio della ricerca
random_search.fit(X_train, y_train)

# Migliori parametri trovati
print("Migliori parametri trovati:")
print(random_search.best_params_)


Fitting 3 folds for each of 30 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Migliori parametri trovati:
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10, 'class_weight': 'balanced'}
[CV] END class_weight=balanced, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.7min
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.6min
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 3.1min
[CV] END class_weight=balanced, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 3.8min
[CV] END class_weight=balanced, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 3.8min
[CV] END class_weight=balanced, max_depth=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total tim

In [10]:
# Addestramento del modello finale
best_rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=2,
    max_depth=10,
    class_weight='balanced',
    random_state=42
)
best_rf.fit(X_train, y_train)

# Valutazione sul test set
y_pred_test = best_rf.predict(X_test)

# Metriche di valutazione
from sklearn.metrics import classification_report, confusion_matrix
print("Classification Report (Miglior RF):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (Miglior RF):")
print(confusion_matrix(y_test, y_pred_test))


Classification Report (Miglior RF):
              precision    recall  f1-score   support

           0       0.55      0.78      0.65     82688
           1       0.52      0.87      0.65     24036
           2       0.87      0.66      0.75    210599

    accuracy                           0.71    317323
   macro avg       0.65      0.77      0.68    317323
weighted avg       0.76      0.71      0.72    317323


Confusion Matrix (Miglior RF):
[[ 64677    560  17451]
 [   205  20890   2941]
 [ 51662  19062 139875]]


In [14]:
pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4
Note: you may need to restart the kernel to use updated packages.


In [12]:
# Installazione del pacchetto necessario (se non già installato)
# !pip install imbalanced-learn

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Applichiamo SMOTE al set di addestramento
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Controlla le dimensioni dopo il resampling
print("Distribuzione delle classi prima di SMOTE:", pd.Series(y_train).value_counts())
print("Distribuzione delle classi dopo SMOTE:", pd.Series(y_train_resampled).value_counts())

# Addestramento del modello con i dati bilanciati
best_rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=2,
    max_depth=10,
    class_weight=None,  # Rimuoviamo il bilanciamento interno per evitare ridondanza
    random_state=42
)
best_rf.fit(X_train_resampled, y_train_resampled)

# Valutazione sul test set
y_pred_test = best_rf.predict(X_test)

# Metriche di valutazione
print("Classification Report (RF con SMOTE):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (RF con SMOTE):")
print(confusion_matrix(y_test, y_pred_test))


Distribuzione delle classi prima di SMOTE: 2    839626
0    333017
1     96648
Name: count, dtype: int64
Distribuzione delle classi dopo SMOTE: 0    839626
2    839626
1    839626
Name: count, dtype: int64
Classification Report (RF con SMOTE):
              precision    recall  f1-score   support

           0       0.56      0.78      0.65     82688
           1       0.50      0.87      0.64     24036
           2       0.87      0.66      0.75    210599

    accuracy                           0.71    317323
   macro avg       0.64      0.77      0.68    317323
weighted avg       0.76      0.71      0.72    317323


Confusion Matrix (RF con SMOTE):
[[ 64502    563  17623]
 [   204  21008   2824]
 [ 51257  20099 139243]]
