In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2


In [None]:
# --------------------------
# 1. Load dataset
# --------------------------
# Fetch the wine data as pandas objects, then split the bundle into the
# feature frame (X) and the class-label series (y).
data = load_wine(as_frame=True)
X, y = data.data, data.target

# Sanity check: dimensions of the feature frame and the distinct class labels.
class_labels = np.unique(y)
print("Shape of dataset:", X.shape)
print("Classes:", class_labels)

Shape of dataset: (178, 13)
Classes: [0 1 2]


In [None]:

# --------------------------
# 2. Feature Scaling
# --------------------------
# By default the scalers return plain NumPy arrays, so each result is wrapped
# back into a DataFrame. Passing X's index together with its columns keeps the
# scaled rows label-aligned with X and y even when X does not carry a default
# RangeIndex (e.g. after filtering or shuffling rows).

# (a) Min-Max Scaling: rescales each feature to the scaler's default [0, 1] range.
minmax_scaler = MinMaxScaler()
X_minmax = pd.DataFrame(
    minmax_scaler.fit_transform(X), index=X.index, columns=X.columns
)

# (b) Standardization: centers each feature and scales it to unit variance.
std_scaler = StandardScaler()
X_std = pd.DataFrame(
    std_scaler.fit_transform(X), index=X.index, columns=X.columns
)

print("\nScaled Feature Sample (Standardized):")
print(X_std.head())



Scaled Feature Sample (Standardized):
    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0  1.518613   -0.562250  0.232053          -1.169593   1.913905   
1  0.246290   -0.499413 -0.827996          -2.490847   0.018145   
2  0.196879    0.021231  1.109334          -0.268738   0.088358   
3  1.691550   -0.346811  0.487926          -0.809251   0.930918   
4  0.295700    0.227694  1.840403           0.451946   1.281985   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.808997    1.034819             -0.659563         1.224884   
1       0.568648    0.733629             -0.820719        -0.544721   
2       0.808997    1.215533             -0.498407         2.135968   
3       2.491446    1.466525             -0.981875         1.032155   
4       0.808997    0.663351              0.226796         0.401404   

   color_intensity       hue  od280/od315_of_diluted_wines   proline  
0         0.251717  0.362177                      1.847920  

In [None]:
# --------------------------
# 3. Dimensionality Reduction (PCA)
# --------------------------
# PCA is fitted on the standardized frame (X_std) rather than the raw features.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_std)

# Report the component count from the fitted model instead of repeating the
# literal, so the message stays correct if n_components above is changed.
explained_ratio = pca.explained_variance_ratio_
print(f"\nExplained Variance Ratio by {pca.n_components_} components:", explained_ratio)
print("Total variance explained:", np.sum(explained_ratio))


Explained Variance Ratio by 3 components: [0.36198848 0.1920749  0.11123631]
Total variance explained: 0.6652996889318524


In [None]:
# --------------------------
# 4. Feature Selection
# --------------------------
CORR_THRESHOLD = 0.85  # absolute correlation above which a feature is flagged
TOP_K_FEATURES = 5     # number of features kept by the chi-square ranking

# (a) Correlation filter
corr_matrix = X.corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# feature pair is inspected once and self-correlation is ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)
# A column is flagged when it correlates above the threshold with any column
# that precedes it; masked (NaN) entries compare as False.
flagged = (upper_tri > CORR_THRESHOLD).any(axis=0)
high_corr_features = flagged[flagged].index.tolist()

print(f"\nHighly correlated features (corr > {CORR_THRESHOLD}):", high_corr_features)

# The flagged names come from X's own correlation matrix, so they always exist in X.
X_filtered = X.drop(columns=high_corr_features)

# (b) Chi-square ranking
# chi2 needs non-negative inputs, hence the min-max-scaled frame.
# NOTE(review): the ranking runs on ALL scaled features, independently of the
# correlation filter above, so a feature flagged as highly correlated can still
# be selected here — confirm this is intended.
selector = SelectKBest(score_func=chi2, k=TOP_K_FEATURES)
X_chi2 = selector.fit_transform(X_minmax, y)
selected_features = X_minmax.columns[selector.get_support()].tolist()

print(f"\nTop {TOP_K_FEATURES} features selected by Chi-square test:", selected_features)


Highly correlated features (corr > 0.85): ['flavanoids']

Top 5 features selected by Chi-square test: ['alcohol', 'flavanoids', 'color_intensity', 'od280/od315_of_diluted_wines', 'proline']


In [None]:
# Recap of the results produced by the earlier cells (X, X_filtered,
# selected_features and the fitted pca must already exist).
# The PCA component count is read from the fitted model rather than
# hard-coded, so the summary cannot drift from the PCA configuration.
explained_pct = np.sum(pca.explained_variance_ratio_) * 100

print("\n--- SUMMARY ---")
print(f"Original feature count: {X.shape[1]}")
print(f"After correlation filter: {X_filtered.shape[1]}")
print(f"Top chi2 features: {selected_features}")
print(f"PCA components: {pca.n_components_}, explaining {explained_pct:.2f}% variance")


--- SUMMARY ---
Original feature count: 13
After correlation filter: 12
Top chi2 features: ['alcohol', 'flavanoids', 'color_intensity', 'od280/od315_of_diluted_wines', 'proline']
PCA components: 3, explaining 66.53% variance
