In [11]:
import numpy as np
import pandas as pd
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Source URLs of the red and white wine-quality CSV files hosted in the
# UCI machine-learning-databases archive.
DATASET_RED_WINE = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
DATASET_WHITE_WINE = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

In [4]:
def download_data():
    """Download the red and white wine-quality CSVs into the working directory.

    The files are saved as ``winequality-red.csv`` and
    ``winequality-white.csv``. Both downloads are completed before either
    file is written, so a failed request never leaves a half-updated pair
    on disk.

    Raises:
        requests.HTTPError: if a server answers with an error status.
        requests.RequestException: on connection failures or timeouts.
    """
    sources = (
        (DATASET_RED_WINE, 'winequality-red.csv'),
        (DATASET_WHITE_WINE, 'winequality-white.csv'),
    )

    payloads = []
    for url, filename in sources:
        # A timeout keeps the notebook from hanging forever on a stalled server.
        response = requests.get(url, timeout=30)
        # Fail loudly instead of saving an HTML error page as a ".csv".
        response.raise_for_status()
        payloads.append((filename, response.content))

    for filename, content in payloads:
        # Write the raw bytes: no charset guessing and no newline translation,
        # so the file on disk is exactly what the server sent.
        with open(filename, 'wb') as f:
            f.write(content)

In [5]:
def load_data():
    """Read both wine-quality CSVs and stack them into one DataFrame.

    Reads ``winequality-red.csv`` and ``winequality-white.csv`` from the
    current working directory (semicolon-separated) and concatenates them,
    red rows first.

    Returns:
        pandas.DataFrame: all rows of both files with a fresh ``0..n-1``
        index.

    Raises:
        FileNotFoundError: if either CSV file is missing.
    """
    red_wine_df = pd.read_csv('winequality-red.csv', sep=';')
    white_wine_df = pd.read_csv('winequality-white.csv', sep=';')
    # ignore_index=True renumbers the rows; otherwise each file contributes
    # its own 0..k labels and the combined index contains duplicates, which
    # makes label-based selection (df.loc[i]) ambiguous.
    return pd.concat([red_wine_df, white_wine_df], ignore_index=True)

In [6]:
def preprocess_data(df):
    """Return a copy of ``df`` with a binary ``quality_label`` column added.

    ``quality_label`` is 0 where ``quality <= 5`` and 1 otherwise. The input
    frame is not modified, so re-running the cell that calls this function
    always starts from the same data.

    Args:
        df: DataFrame containing a numeric ``quality`` column.

    Returns:
        pandas.DataFrame: a new frame with the extra ``quality_label`` column.

    Raises:
        KeyError: if ``df`` has no ``quality`` column.
    """
    is_low_quality = df['quality'] <= 5
    # Label 1 is "everything that is not <= 5", so a missing quality value
    # (for which the comparison is False) is labelled 1.
    # to_numpy() assigns by position, which sidesteps index alignment when
    # the frame carries duplicate row labels.
    labels = (~is_low_quality).astype('int64').to_numpy()
    return df.assign(quality_label=labels)

In [9]:
def main():
    """Download the wine data, tune a random forest and print test metrics.

    Steps:
      1. Download and load the red and white wine CSVs, then add the binary
         ``quality_label`` target.
      2. Hold out 20% of the rows as a test set (``random_state=42``).
      3. Grid-search a ``StandardScaler`` + ``RandomForestClassifier``
         pipeline with 5-fold cross-validation.
      4. Print accuracy and the classification report of the best model on
         the held-out test set.
    """
    download_data()
    df = load_data()
    df = preprocess_data(df)

    # Split the data
    X = df.drop(['quality', 'quality_label'], axis=1)
    y = df['quality_label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scaling is part of the pipeline so that GridSearchCV refits the scaler
    # on the training portion of every fold. Scaling the whole training set
    # up front would let each validation fold influence the statistics used
    # to transform it.
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=42)),
    ])

    # Hyperparameter grid; the "clf__" prefix routes each setting to the
    # random-forest step of the pipeline.
    params = {
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [None, 10, 20, 30],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4]
    }
    grid_search = GridSearchCV(model, params, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Evaluate the model; the best pipeline scales X_test with the scaler it
    # was refit with on the full training set.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [12]:
# Run the full workflow only when this code executes as the top-level module
# (__name__ is '__main__'), not when it is imported from elsewhere.
if __name__ == '__main__':
    main()

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Accuracy: 0.8146153846153846
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.71      0.73       451
           1       0.85      0.87      0.86       849

    accuracy                           0.81      1300
   macro avg       0.80      0.79      0.79      1300
weighted avg       0.81      0.81      0.81      1300

