### Normalisation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer 

# Placeholder: assign the real feature DataFrame here before running this cell
df = "Dataframe to be filled"

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Normalizer rescales each sample (row) to unit norm independently of the
# others, so fitting learns nothing from the data. fit_transform/transform are
# still used so this cell follows the usual scikit-learn train/test workflow.
norm = Normalizer()
X_train_norm = norm.fit_transform(X_train)
X_test_norm = norm.transform(X_test)




### Standardization


In [None]:
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then standardise them
sc = StandardScaler()
sc.fit(X_train)
X_Train_stzd = sc.transform(X_train)

# Apply the statistics learned from the training rows to the test rows (no refit)
X_test_stzd = sc.transform(X_test)

### Feature Selection

In [7]:
# Setup and Data Loading

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Load the bundled Iris dataset and unpack the pieces used by the later cells
iris = load_iris()
X, Y = iris.data, iris.target  # feature matrix, target variable
feature_names = iris.feature_names

# Quick sanity check of what was loaded
print("Feature Names:", feature_names)
print("X shape:", X.shape)



Feature Names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
X shape: (150, 4)


In [8]:
from sklearn.feature_selection import VarianceThreshold

# VarianceThreshold drops every feature whose variance does not exceed the
# threshold; with threshold=0 only constant features are removed.
vt = VarianceThreshold(threshold=0)
vt.fit(X)
X_vt = vt.transform(X)

# Compare the column count before and after filtering
print("Original shape:", X.shape)
print("Shape after VarianceThreshold:", X_vt.shape)

Original shape: (150, 4)
Shape after VarianceThreshold: (150, 4)


In [9]:
# Univariate Feature Selection with SelectKBest

from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature against the target with the ANOVA F-test and keep the top 2
selector = SelectKBest(score_func=f_classif, k=2)
selector.fit(X, Y)
X_kbest = selector.transform(X)

# Map the surviving column positions back to readable feature names
selected_indices = selector.get_support(indices=True)
selected_features = [feature_names[idx] for idx in selected_indices]

print("Selected feature indices:", selected_indices)
print("Selected features:", selected_features)

Selected feature indices: [2 3]
Selected features: ['petal length (cm)', 'petal width (cm)']


In [11]:
# Wrapper Methods

# Recursive Feature Elimination (RFE)
# RFE removes features recursively and builds a model on the remaining attributes

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base estimator that RFE refits at each elimination step to rank the features
model = LogisticRegression(solver='liblinear', max_iter=200)

# Recursively eliminate features until only the top 2 remain
rfe = RFE(estimator=model, n_features_to_select=2)
rfe.fit(X, Y)

# Translate the kept column positions into feature names
rfe_selected = rfe.get_support(indices=True)
rfe_features = [feature_names[idx] for idx in rfe_selected]

print("RFE selected feature indices:", rfe_selected)
print("RFE selected features:", rfe_features)

RFE selected feature indices: [1 3]
RFE selected features: ['sepal width (cm)', 'petal width (cm)']


In [12]:
# RFECV: RFE with Cross Validation

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Let 5-fold stratified cross-validation choose how many features to keep,
# eliminating one feature per step and scoring each candidate subset by accuracy.
# `model` is the LogisticRegression estimator created in the RFE cell.
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=StratifiedKFold(n_splits=5),
    scoring='accuracy',
)
rfecv.fit(X, Y)

# Translate the kept column positions into feature names
rfecv_selected = rfecv.get_support(indices=True)
rfecv_features = [feature_names[idx] for idx in rfecv_selected]

print("Optimal number of features:", rfecv.n_features_)
print("RFECV selected features:", rfecv_features)

Optimal number of features: 4
RFECV selected features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a 100-tree random forest (seeded for repeatability) to obtain feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, Y)

# Wrap the already-fitted forest (prefit=True); with no explicit threshold the
# default cut-off is the mean feature importance.
sfm = SelectFromModel(rf, prefit=True)
X_sfm = sfm.transform(X)

# Translate the kept column positions into feature names
sfm_selected = sfm.get_support(indices=True)
sfm_features = [feature_names[idx] for idx in sfm_selected]

print("SelectFromModel chosen feature indices:", sfm_selected)
print("SelectFromModel chosen features:", sfm_features)


SelectFromModel chosen feature indices: [2 3]
SelectFromModel chosen features: ['petal length (cm)', 'petal width (cm)']
