In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [6]:
# Part A: Load Cleaned Data
# Reads the cleaned wine-quality CSV into `df`. If the file is missing, the
# error message points at the notebook that should be run first.
file_path = "data/processed/wine_quality_cleaned.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Chain explicitly so the original error is reported as the direct cause
    # instead of as an incidental "during handling of the above exception".
    raise FileNotFoundError(
        f"Could not find file at {file_path}. "
        "Please run 01_data_cleaning_eda.ipynb first."
    ) from err
else:
    # Only reached when the read succeeded.
    print("Cleaned data loaded successfully.")


Cleaned data loaded successfully.


In [7]:
# Part B: Target Engineering
# --------------------------------------------------------
# Binary classification target "good quality wine":
# 1 when the quality score is 7 or higher, otherwise 0.
is_good = df["quality"].ge(7)
df["good_quality"] = is_good.astype(int)

In [8]:
# Part C: Feature Engineering

# 1. Acid ratio: fixed acidity divided by volatile acidity.
#    The small constant keeps the denominator from being exactly zero.
volatile_safe = df["volatile acidity"].add(1e-6)
df["acid_ratio"] = df["fixed acidity"].div(volatile_safe)

# 2. Interaction term: elementwise product of density and alcohol.
df["density_alcohol_interaction"] = df["density"].mul(df["alcohol"])

# Preview both targets next to the two engineered columns.
preview_cols = ["quality", "good_quality", "acid_ratio", "density_alcohol_interaction"]
print("\nNew features created:")
print(df[preview_cols].head())



New features created:
   quality  good_quality  acid_ratio  density_alcohol_interaction
0        5             0   10.571413                      9.37932
1        5             0    8.863626                      9.76864
2        5             0   10.263144                      9.77060
3        6             0   39.999857                      9.78040
4        5             0   11.212104                      9.37932


In [11]:
# Part D: Train/Test Split
# Every column except the two targets is used as a feature.
target_columns = ["quality", "good_quality"]
X = df.drop(columns=target_columns)
y_reg = df["quality"]       # regression target
y_cls = df["good_quality"]  # classification target

# Both splits hold out 30% of the rows and use the same fixed seed so the
# partition is reproducible from run to run.
split_options = {"test_size": 0.3, "random_state": 42}

# Regression split (no stratification).
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, **split_options
)

# Classification split, stratified on the label because the classes are
# imbalanced.
# NOTE(review): the two splits are drawn independently, so the *_reg and
# *_cls sets are not guaranteed to contain the same rows — confirm before
# reusing anything fitted on one split with the other split's targets.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X, y_cls, stratify=y_cls, **split_options
)

print("\nData splitting complete.")
print(f"Regression → Train: {X_train_reg.shape}, Test: {X_test_reg.shape}")
print(f"Classification → Train: {X_train_cls.shape}, Test: {X_test_cls.shape}")


Data splitting complete.
Regression → Train: (3724, 14), Test: (1596, 14)
Classification → Train: (3724, 14), Test: (1596, 14)


In [12]:
# Check stratification: compare the relative class frequencies of the
# classification target in the train and test sets.
train_class_share = y_train_cls.value_counts(normalize=True)
test_class_share = y_test_cls.value_counts(normalize=True)

print("\nClass distribution (train vs test):")
print("Train:\n", train_class_share)
print("Test:\n", test_class_share)



Class distribution (train vs test):
Train:
 good_quality
0    0.810419
1    0.189581
Name: proportion, dtype: float64
Test:
 good_quality
0    0.81015
1    0.18985
Name: proportion, dtype: float64


In [None]:
# Part E: Baseline Models

# 1. Regression baseline: DummyRegressor with strategy="mean", fitted on the
#    regression training split and used to predict the test split.
dummy_reg = DummyRegressor(strategy="mean").fit(X_train_reg, y_train_reg)
y_pred_reg_dummy = dummy_reg.predict(X_test_reg)

# RMSE is the square root of the mean squared error on the test split.
mse_dummy = mean_squared_error(y_test_reg, y_pred_reg_dummy)
rmse_dummy = np.sqrt(mse_dummy)
print(f"\nRegression Baseline RMSE: {rmse_dummy:.2f} "
      "(≈ typical error in quality points)")


Regression Baseline RMSE: 0.88 (≈ typical error in quality points)


In [15]:
# 2. Classification baseline: DummyClassifier with strategy="most_frequent",
#    fitted on the classification training split.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train_cls, y_train_cls)
y_pred_cls_dummy = dummy_clf.predict(X_test_cls)

# Score the baseline on the held-out classification split.
f1_dummy = f1_score(y_test_cls, y_pred_cls_dummy)
acc_dummy = accuracy_score(y_test_cls, y_pred_cls_dummy)

print(f"\nClassification Baseline Accuracy: {acc_dummy:.2%}")
print(f"Classification Baseline F1-Score: {f1_dummy:.2f}")
print("Note: High accuracy, but fails to detect good wines (positive class).")


Classification Baseline Accuracy: 81.02%
Classification Baseline F1-Score: 0.00
Note: High accuracy, but fails to detect good wines (positive class).


In [16]:
# Part F: Preprocessing Pipeline
# --------------------------------------------------------
# Partition the feature columns by dtype: numeric columns on one side,
# every other dtype (treated as categorical) on the other.
numeric_columns = X_train_cls.select_dtypes(include=np.number).columns
non_numeric_columns = X_train_cls.select_dtypes(exclude=np.number).columns

numeric_features = numeric_columns.tolist()
categorical_features = non_numeric_columns.tolist()

print(f"\nDetected {len(numeric_features)} numeric features.")
print(f"Detected {len(categorical_features)} categorical features.")


Detected 13 numeric features.
Detected 1 categorical features.


In [17]:
# Transformers
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with handle_unknown="ignore" so categories not seen during fit do not raise
# at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Each entry is (step name, transformer, columns it applies to).
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]

# ColumnTransformer routes each column group to its transformer; any column
# not listed in a step is dropped.
preprocessor = ColumnTransformer(transformers=column_steps, remainder="drop")

In [19]:
# Fit on train, transform train & test
# The preprocessor is fitted on the classification training features only and
# then applied, unchanged, to the classification test features.
X_train_processed = preprocessor.fit_transform(X_train_cls)
X_test_processed = preprocessor.transform(X_test_cls)

# Sanity checks: the processed rows must still line up one-to-one with the
# classification labels, and both matrices must have the same column count.
assert X_train_processed.shape[0] == len(y_train_cls)
assert X_test_processed.shape[0] == len(y_test_cls)
assert X_train_processed.shape[1] == X_test_processed.shape[1]

# Plain string: there is nothing to interpolate in this message.
print("\nPreprocessing complete.")
print(f"Processed train shape: {X_train_processed.shape}")
print(f"Processed test shape: {X_test_processed.shape}")


Preprocessing complete.
Processed train shape: (3724, 15)
Processed test shape: (1596, 15)
