In [11]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import matplotlib.pyplot as plt

In [3]:
wine_quality = fetch_ucirepo(id=186)

# Pre-processing
df = wine_quality.data.original

def categorize_quality(quality):
    if quality < 7:
        return 'low'
    else:
        return 'high'


df['quality_category'] = df['quality'].apply(categorize_quality)

red_wine_df = df[df['color'] == 'red']
white_wine_df = df[df['color'] == 'white']

df_array = [red_wine_df, white_wine_df, df]

In [13]:

rf = RandomForestClassifier(random_state=42)
df_names = ['Red Wine', 'White Wine', 'Combined Wine']

def confusion_matrix_plot(y_test, y_pred, clf, model):
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)

    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix for DataFrame {model.__class__.__name__}')
    plt.show()

def optimized_folds(model, X, y):
    best_score = 0
    best_folds = 0
    
    for folds in np.arange(2, 11):
        scores = cross_val_score(model, X, y, cv=folds, scoring='f1_macro')
        if scores.mean() > best_score:
            best_score = scores.mean()
            best_folds = folds
    
    print(f"Best Folds: {best_folds} | Best Score: {best_score}", end=" | ")
    return best_folds




In [None]:
for i, df in enumerate(df_array):
    print("=" * 40)
    print(f"Processing {df_names[i]} DataFrame")
    print("=" * 40)
    
    X = df.drop(['quality', 'quality_category', 'color'], axis=1)
    y = df['quality_category']

    fold_options = np.arange(5, 15)
    best_folds = optimized_folds(rf, X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Randomized Search CV (faster alternative to Grid Search)
    param_dist = {
        'n_estimators': np.arange(50, 300, 50),
        'max_depth': [None] + list(np.arange(5, 50, 5)),
        'min_samples_split': np.arange(2, 20, 2),
        'min_samples_leaf': np.arange(1, 10, 1),
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False]
    }

    rf = RandomForestClassifier(random_state=42)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        cv=best_folds,
        n_jobs=-1,  # Use all available cores
        scoring='accuracy',
        random_state=42,
        verbose=1 # how much output to show during the search
    )

    random_search.fit(X_train, y_train)

    print(f"Best parameters: {random_search.best_params_}")

    # Evaluation metrics
    y_pred = random_search.predict(X_test)

    print(classification_report(y_test, y_pred))
    confusion_matrix_plot(y_test, y_pred, random_search, rf)


Processing Red Wine DataFrame
Best Folds: 9 | Best Score: 0.6854182647328406 | Fitting 9 folds for each of 10 candidates, totalling 90 fits
