In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from scipy import stats

# Step 2: Load Red Wine dataset
# The file is parsed as semicolon-delimited (sep=";").
wine = pd.read_csv("winequality-red.csv", sep=";")
print("Initial Shape:", wine.shape)
print("\nFirst 5 rows:\n", wine.head())

# Step 3: Check info & missing values
print("\nInfo:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
wine.info()
print("\nMissing values:\n", wine.isnull().sum())

# Step 4: Handle missing values (if any)
# Fill gaps in numeric columns with that column's median; columns without a
# computed median (non-numeric ones) are left untouched.
wine = wine.fillna(wine.median(numeric_only=True))

# Step 5: Remove duplicates
# Exact duplicate rows are dropped; the first occurrence is kept.
wine = wine.drop_duplicates()
print("After removing duplicates:", wine.shape)

# Step 6: Outlier detection & removal (Z-score method)
# numeric_cols keeps every numeric column because later steps reuse it.
numeric_cols = wine.select_dtypes(include=[np.number]).columns
# Screen only the predictors: including the target ('quality') would discard
# rows because of their label, wiping out the rarest quality scores.
outlier_cols = numeric_cols.drop('quality', errors='ignore')
z_scores = np.abs(stats.zscore(wine[outlier_cols]))
# Keep rows whose every screened column lies within 3 standard deviations.
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
wine = wine[(z_scores < 3).all(axis=1)].copy()
print("After outlier removal:", wine.shape)

# Step 7: Feature Engineering (create quality categories)
# pd.cut builds right-closed bins here:
#   (0, 4] -> Low, (4, 6] -> Medium, (6, 10] -> High
wine['quality_label'] = pd.cut(
    wine['quality'],
    bins=[0, 4, 6, 10],
    labels=['Low', 'Medium', 'High']
)

# A score outside (0, 10] falls in no bin and becomes NaN. The astype(str)
# below would silently turn that into a bogus "nan" class, so fail loudly.
unlabelled = wine['quality_label'].isna()
if unlabelled.any():
    raise ValueError(
        f"{int(unlabelled.sum())} row(s) have a 'quality' value outside (0, 10] "
        "and cannot be assigned a quality label"
    )

# Step 8: Encode categorical target
wine['quality_label'] = wine['quality_label'].astype(str)
# Keep the fitted encoder so the integer codes can be mapped back to names
# later (label_encoder.classes_ / label_encoder.inverse_transform).
label_encoder = LabelEncoder()
wine['quality_label_enc'] = label_encoder.fit_transform(wine['quality_label'])
# LabelEncoder sorts the class names, so the mapping is deterministic.
# With all three classes present: High=0, Low=1, Medium=2.

# Step 10: Prepare features (X) and target (y)
# Drop the raw score and both label columns so nothing derived from the target
# remains among the predictors.
X = wine.drop(['quality', 'quality_label', 'quality_label_enc'], axis=1)
y = wine['quality_label_enc']

# Step 11: Train-test split
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 9: Scale numeric features
# Done AFTER the split on purpose: the scaler learns mean/std from the training
# rows only and then applies those statistics to the test rows. Fitting on the
# full dataset would leak test-set information into training.
# Only numeric columns that are actually predictors are scaled; the target
# column 'quality' is not part of X and is left unscaled in `wine`.
scale_cols = [col for col in numeric_cols if col in X_train.columns]
scaler = StandardScaler()
# Work on independent copies so the column assignment below cannot write into
# (or warn about) the frame the splits were taken from.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

print("\nFinal Dataset Shapes:")
print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("y_train:", y_train.shape, "y_test:", y_test.shape)

print("\nTarget distribution in train set:\n", y_train.value_counts(normalize=True))


Initial Shape: (1599, 12)

First 5 rows:
    fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
