# Practical 1

Perform feature engineering and selection on a dataset: apply feature scaling techniques (Min-Max scaling and standardisation), use PCA for dimensionality reduction, and implement at least two feature selection methods, such as correlation-based filtering and the chi-square test.

In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2

In [9]:
# --- Load Dataset ---
# load_wine() returns a Bunch; its entries are read by key here.
data = load_wine()

# Feature matrix as a labelled DataFrame, class labels as a named Series.
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data=data["target"], name="target")

In [10]:
# Report the dimensions of the raw feature matrix, then preview its first rows.
print(f"Original shape: {X.shape}")
display(X.head(n=5))  # Colab-friendly preview

Original shape: (178, 13)


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [11]:
# --- Feature Scaling ---


def _fit_to_frame(scaler):
    """Fit `scaler` on X and return the scaled values as a DataFrame labelled with X's columns."""
    scaled_values = scaler.fit_transform(X)
    return pd.DataFrame(scaled_values, columns=X.columns)


# (a) Min-Max Scaling
minmax_scaler = MinMaxScaler()
X_minmax = _fit_to_frame(minmax_scaler)
print("\nAfter Min-Max Scaling:", X_minmax.shape)
display(X_minmax.head())

# (b) Standardization (Z-score Scaling)
standard_scaler = StandardScaler()
X_standard = _fit_to_frame(standard_scaler)
print("\nAfter Standardization Scaling:", X_standard.shape)
display(X_standard.head())


After Min-Max Scaling: (178, 13)


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,0.842105,0.1917,0.572193,0.257732,0.619565,0.627586,0.57384,0.283019,0.59306,0.372014,0.455285,0.970696,0.561341
1,0.571053,0.205534,0.417112,0.030928,0.326087,0.575862,0.510549,0.245283,0.274448,0.264505,0.463415,0.78022,0.550642
2,0.560526,0.320158,0.700535,0.412371,0.336957,0.627586,0.611814,0.320755,0.757098,0.375427,0.447154,0.695971,0.646933
3,0.878947,0.23913,0.609626,0.319588,0.467391,0.989655,0.664557,0.207547,0.55836,0.556314,0.308943,0.798535,0.857347
4,0.581579,0.365613,0.807487,0.536082,0.521739,0.627586,0.495781,0.490566,0.444795,0.259386,0.455285,0.608059,0.325963



After Standardization Scaling: (178, 13)


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


In [12]:
# --- Dimensionality Reduction using PCA ---
# PCA is fitted on the z-score standardized frame (X_standard).
N_COMPONENTS = 5  # Keep top 5 principal components
pca = PCA(n_components=N_COMPONENTS)
X_pca = pca.fit_transform(X_standard)

print("\nExplained variance ratio:", pca.explained_variance_ratio_)
print("Shape after PCA:", X_pca.shape)

# Build the PC labels from the actual width of the projection rather than
# repeating the literal 5, so changing N_COMPONENTS can never cause a
# column-count mismatch when the DataFrame is constructed.
pc_labels = [f'PC{i+1}' for i in range(X_pca.shape[1])]
pca_df = pd.DataFrame(X_pca, columns=pc_labels)
# Assignment aligns on index; pca_df and y both carry the default 0..n-1 index.
pca_df['target'] = y
display(pca_df.head())


Explained variance ratio: [0.36198848 0.1920749  0.11123631 0.0706903  0.06563294]
Shape after PCA: (178, 5)


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,target
0,3.316751,1.443463,-0.165739,-0.215631,0.693043,0
1,2.209465,-0.333393,-2.026457,-0.291358,-0.257655,0
2,2.51674,1.031151,0.982819,0.724902,-0.251033,0
3,3.757066,2.756372,-0.176192,0.567983,-0.311842,0
4,1.008908,0.869831,2.026688,-0.409766,0.298458,0


In [13]:
# --- Feature Selection ---

# (a) Correlation-based Filtering
# A column is dropped when its absolute correlation with any column that
# appears before it exceeds `threshold`.
threshold = 0.9
corr = X_standard.corr().abs()

# Boolean mask that is True strictly above the diagonal, so each pair of
# features is examined once and self-correlations are ignored.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)

# Masked-out cells are NaN and never compare greater than the threshold.
exceeds_threshold = (upper > threshold).any(axis=0)
to_drop = upper.columns[exceeds_threshold].tolist()
X_corr_filtered = X_standard.drop(columns=to_drop, errors='ignore')

print("\nDropped due to high correlation:", to_drop)
print("Features after correlation filtering:", X_corr_filtered.shape)

# (b) Chi-square Test (requires non-negative features)
# The raw features are Min-Max scaled first so that every value passed to
# chi2 is non-negative (zeros are allowed; standardized values would not be,
# since they can be negative).
# NOTE(review): chi2 is typically applied to counts/frequencies; on scaled
# continuous features the scores are best read as a ranking heuristic.
chi2_input_scaler = MinMaxScaler()
chi2_input_scaler.fit(X)
X_chi2_scaled = chi2_input_scaler.transform(X)

# Keep the 5 features with the highest chi-square score against the target.
chi2_selector = SelectKBest(score_func=chi2, k=5)
chi2_selector.fit(X_chi2_scaled, y)
X_chi2_selected = chi2_selector.transform(X_chi2_scaled)

# Map the selector's boolean support mask back to the original column names.
support_mask = chi2_selector.get_support()
selected_features = X.columns[support_mask]

print("Top 5 features by Chi-square test:", list(selected_features))


Dropped due to high correlation: []
Features after correlation filtering: (178, 13)
Top 5 features by Chi-square test: ['alcohol', 'flavanoids', 'color_intensity', 'od280/od315_of_diluted_wines', 'proline']


In [14]:
# --- Summary ---
# Collect the headline numbers from each stage and emit them as one report.

summary_lines = [
    "\nSummary:",
    f"Original features: {X.shape[1]}",
    f"After PCA: {X_pca.shape[1]}",
    f"After correlation filtering: {X_corr_filtered.shape[1]}",
    f"Top Chi-square features: {list(selected_features)}",
]
print("\n".join(summary_lines))


Summary:
Original features: 13
After PCA: 5
After correlation filtering: 13
Top Chi-square features: ['alcohol', 'flavanoids', 'color_intensity', 'od280/od315_of_diluted_wines', 'proline']
