# Wine Classification — Extended Tuning & XGBoost Comparison

This notebook extends the previous work by adding RandomizedSearchCV for faster hyperparameter search, training and comparing an XGBoost classifier with Random Forest, and saving the best models. It also includes evaluation, plots, and notes for deployment.


In [1]:
# Imports
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint, uniform as sp_uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
print("Libraries imported.")

Libraries imported.


In [2]:
# Read the wine data from a CSV path relative to the working directory, then preview it.
df = pd.read_csv('wine_dataset.csv')
preview_rows = df.head()
display(preview_rows)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [3]:
# Detect target and prepare features.
# The first well-known label column that exists wins; if none is present,
# fall back to the last column of the frame.
known_targets = ['target', 'class', 'Type']
target = next((name for name in known_targets if name in df.columns), df.columns[-1])

# Labels are the chosen column; features are every other numeric column.
y = df[target]
X = df.drop(columns=[target]).select_dtypes(include=[np.number])
print('Feature shape:', X.shape, 'Target distribution:\n', y.value_counts())

Feature shape: (178, 13) Target distribution:
 target
1    71
0    59
2    48
Name: count, dtype: int64


In [4]:
# Hold out 20% of the rows as a test set, stratified on the label so both
# parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Shuffled, stratified 5-fold splitter reused by the cross-validation and
# hyperparameter searches in the following cells.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Baseline Random Forest CV

In [5]:
# Baseline: an untuned Random Forest, cross-validated on the training split only.
rf = RandomForestClassifier(random_state=42)
rf_scores = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=cv)
rf_mean, rf_std = rf_scores.mean(), rf_scores.std()
print('RF CV mean accuracy: {:.4f} ± {:.4f}'.format(rf_mean, rf_std))

RF CV mean accuracy: 0.9791 ± 0.0277


## GridSearchCV for Random Forest (detailed grid)

In [6]:
# Exhaustive search: 3 * 4 * 3 * 3 = 108 Random Forest settings, each scored
# by accuracy with the shared 5-fold splitter.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,  # use every available core
    verbose=1,
)
grid_rf.fit(X_train, y_train)
print('Best RF params:', grid_rf.best_params_)
print('Best RF CV score:', grid_rf.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


## RandomizedSearchCV for Random Forest (faster, wider)

In [None]:
# Randomized search: draw 30 Random Forest configurations instead of
# enumerating a full grid. `sp_randint` is imported in the notebook's import
# cell; its upper bound is exclusive, so the ranges below are 50-300 trees,
# 2-10 samples per split and 1-4 samples per leaf.
param_dist = {
    'n_estimators': sp_randint(50, 301),
    'max_depth': [None, 5, 10, 20, 30],  # lists are sampled uniformly
    'min_samples_split': sp_randint(2, 11),
    'min_samples_leaf': sp_randint(1, 5),
}
rand_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=30,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,  # fixes which configurations get sampled
    verbose=1,
)
rand_rf.fit(X_train, y_train)
print('Best Randomized RF params:', rand_rf.best_params_)
print('Best Randomized RF CV score:', rand_rf.best_score_)

## XGBoost baseline and tuning

In [None]:
# Baseline: XGBoost with default hyperparameters, cross-validated the same way
# as the Random Forest baseline.
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases —
# confirm against the installed version before removing it.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_scores = cross_val_score(xgb, X_train, y_train, scoring='accuracy', cv=cv)
xgb_mean, xgb_std = xgb_scores.mean(), xgb_scores.std()
print('XGB CV mean accuracy: {:.4f} ± {:.4f}'.format(xgb_mean, xgb_std))

In [None]:
# RandomizedSearchCV for XGBoost (recommended)
# Uses the same scipy.stats aliases as the Random Forest search (imported in
# the notebook's import cell) rather than re-importing them under new names.
# sp_randint(low, high) excludes `high`; sp_uniform(loc, scale) spans
# [loc, loc + scale].
param_dist_xgb = {
    'n_estimators': sp_randint(50, 301),       # 50 to 300 trees
    'max_depth': sp_randint(3, 11),            # depth 3 to 10
    'learning_rate': sp_uniform(0.01, 0.3),    # 0.01 to 0.31
    'subsample': sp_uniform(0.5, 0.5),         # 0.5 to 1.0
    'colsample_bytree': sp_uniform(0.5, 0.5),  # 0.5 to 1.0
}
rand_xgb = RandomizedSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    param_distributions=param_dist_xgb,
    n_iter=40,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,  # fixes which configurations get sampled
    verbose=1,
)
rand_xgb.fit(X_train, y_train)
print('Best XGB params:', rand_xgb.best_params_)
print('Best XGB CV score:', rand_xgb.best_score_)

## Evaluate best models on test set

In [None]:
# Choose the Random Forest from whichever fitted search reached the higher
# cross-validated accuracy. grid_rf is listed first, so it wins ties.
fitted_rf_searches = [s for s in (grid_rf, rand_rf) if hasattr(s, 'best_estimator_')]
if not fitted_rf_searches:
    raise RuntimeError('Fit grid_rf or rand_rf before evaluating on the test set.')
best_rf_search = max(fitted_rf_searches, key=lambda s: s.best_score_)
best_rf = best_rf_search.best_estimator_

# Use the tuned XGBoost model when the search has been fitted. Otherwise fall
# back to the baseline, which must be fitted here: cross_val_score only fits
# clones, so the `xgb` object itself has never been trained.
if hasattr(rand_xgb, 'best_estimator_'):
    best_xgb = rand_xgb.best_estimator_
else:
    best_xgb = xgb.fit(X_train, y_train)

# Report held-out performance for each selected model.
for name, model in [('RandomForest', best_rf), ('XGBoost', best_xgb)]:
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"{name} Test Accuracy: {acc:.4f}, F1-weighted: {f1:.4f}")
    print(classification_report(y_test, y_pred))

## Save best models

In [None]:
# Persist both selected models to the working directory with joblib.
for fitted_model, out_path in ((best_rf, 'best_rf.joblib'), (best_xgb, 'best_xgb.joblib')):
    joblib.dump(fitted_model, out_path)
print('Saved best_rf.joblib and best_xgb.joblib')

## Plots (feature importances, confusion matrix example)

In [None]:
# Feature importance for RF: bars sorted from most to least important.
importances = best_rf.feature_importances_
idx = np.argsort(importances)[::-1]  # indices in descending order of importance
positions = range(len(importances))

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(positions, importances[idx])
ax.set_xticks(positions)
ax.set_xticklabels(X.columns[idx], rotation=90)  # vertical labels so names do not overlap
ax.set_title('Random Forest Feature Importances')
fig.tight_layout()
plt.show()