# Importing dataset and libraries



In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [None]:
# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

# Pull out the feature matrix, the target labels and the feature (column) names.
X, y, feature_names = data.data, data.target, data.feature_names


In [None]:
# Wrap the raw feature matrix in a labelled DataFrame for easier inspection.
df = pd.DataFrame(data=X, columns=feature_names)

# Preview the first five rows (bare last expression -> rich notebook display).
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Splitting data

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


# Chi-square

In [None]:
# chi2 scoring needs non-negative inputs, so rescale every feature with min-max scaling.
# The scaler learns its min/max from the training split only and is then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Score each feature with the chi-squared statistic against the labels and keep the top 10.
chi2_selector = SelectKBest(k=10, score_func=chi2).fit(X_train_scaled, y_train)

# Apply the selection learned on the training split to both splits.
X_train_chi2 = chi2_selector.transform(X_train_scaled)
X_test_chi2 = chi2_selector.transform(X_test_scaled)

# get_support() is a boolean mask over the columns; index the names with it
# to recover which features were kept.
chi2_features = feature_names[chi2_selector.get_support()]
print("Selected Chi-Square Features:\n", chi2_features)


Selected Chi-Square Features:
 ['mean radius' 'mean perimeter' 'mean area' 'mean concavity'
 'mean concave points' 'worst radius' 'worst perimeter' 'worst area'
 'worst concavity' 'worst concave points']


# Information gain

In [None]:
def _seeded_mutual_info(features, labels):
    """Score features by mutual information with a fixed random seed.

    mutual_info_classif adds a small amount of random noise to continuous
    features, so without a seed the scores (and possibly the set of selected
    features) can change from one run to the next.
    """
    return mutual_info_classif(features, labels, random_state=42)


# Keep the 10 features that share the most mutual information with the labels.
ig_selector = SelectKBest(score_func=_seeded_mutual_info, k=10)
X_train_ig = ig_selector.fit_transform(X_train, y_train)
X_test_ig = ig_selector.transform(X_test)

# Map the boolean support mask back to the feature names that were kept.
ig_features = feature_names[ig_selector.get_support()]
print("Selected Information Gain Features:\n", ig_features)


Selected Information Gain Features:
 ['mean radius' 'mean perimeter' 'mean area' 'mean concavity'
 'mean concave points' 'area error' 'worst radius' 'worst perimeter'
 'worst area' 'worst concave points']


# PCA

In [None]:

# Standardise the features using statistics learned from the training split only,
# then apply the same transformation to the test split.
std_scaler = StandardScaler().fit(X_train)
X_train_std = std_scaler.transform(X_train)
X_test_std = std_scaler.transform(X_test)


In [None]:
# Reduce the standardised data to its 10 leading principal components.
# The projection is learned on the training split and re-used for the test split.
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Report the share of variance captured by each component and by all 10 together.
variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", variance_ratio)
print("Total Variance Explained:", variance_ratio.sum())


Explained Variance Ratio: [0.43502782 0.19500007 0.09781519 0.06486409 0.05253378 0.041128
 0.0223559  0.01647952 0.01380052 0.01205262]
Total Variance Explained: 0.9510575059327271


# t-SNE

In [None]:
# Embed the standardised training split in two dimensions with t-SNE (seeded).
# NOTE(review): X_tsne is not referenced by any later cell visible here —
# confirm whether it is meant to be plotted.
tsne = TSNE(random_state=42, n_components=2)
X_tsne = tsne.fit_transform(X_train_std)


In [None]:
# Baseline: a seeded random forest trained on all original (unscaled) features.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred = rf.predict(X_test)
baseline_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Baseline Accuracy:", baseline_acc)


Baseline Accuracy: 0.9649122807017544


# Checking accuracy

In [None]:
# Train a dedicated forest on the chi-square-selected features rather than
# refitting the shared `rf`: refitting would overwrite the baseline model, which
# could then no longer predict on the full feature set. The seed matches the
# baseline so the comparison stays like-for-like.
rf_chi2 = RandomForestClassifier(random_state=42)
rf_chi2.fit(X_train_chi2, y_train)
y_pred_chi2 = rf_chi2.predict(X_test_chi2)

# Accuracy on the held-out test split, restricted to the same selected columns.
chi2_acc = accuracy_score(y_test, y_pred_chi2)
print("Chi-Square Accuracy:", chi2_acc)


Chi-Square Accuracy: 0.956140350877193


In [None]:
# Train a dedicated forest on the information-gain-selected features rather than
# refitting the shared `rf`: refitting would overwrite the baseline model, which
# could then no longer predict on the full feature set. The seed matches the
# baseline so the comparison stays like-for-like.
rf_ig = RandomForestClassifier(random_state=42)
rf_ig.fit(X_train_ig, y_train)
y_pred_ig = rf_ig.predict(X_test_ig)

# Accuracy on the held-out test split, restricted to the same selected columns.
ig_acc = accuracy_score(y_test, y_pred_ig)
print("Information Gain Accuracy:", ig_acc)


Information Gain Accuracy: 0.956140350877193


In [None]:
# Train a dedicated forest on the PCA-projected features rather than refitting
# the shared `rf`: refitting would overwrite the baseline model, which could
# then no longer predict on the full feature set. The seed matches the baseline
# so the comparison stays like-for-like.
rf_pca = RandomForestClassifier(random_state=42)
rf_pca.fit(X_train_pca, y_train)
y_pred_pca = rf_pca.predict(X_test_pca)

# Accuracy on the held-out test split, projected with the same fitted PCA.
pca_acc = accuracy_score(y_test, y_pred_pca)
print("PCA Accuracy:", pca_acc)


PCA Accuracy: 0.9473684210526315


In [None]:
# Collect the test accuracy of every feature set into one summary table.
accuracy_rows = [
    ("All Features", baseline_acc),
    ("Chi-Square", chi2_acc),
    ("Information Gain", ig_acc),
    ("PCA", pca_acc),
]
comparison = pd.DataFrame(accuracy_rows, columns=["Method", "Accuracy"])

print(comparison)


             Method  Accuracy
0      All Features  0.964912
1        Chi-Square  0.956140
2  Information Gain  0.956140
3               PCA  0.947368


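Feature selection using Chi-Square and Information Gain reduced the number of input features from 30 to 10 while maintaining competitive model performance. In this run both methods reached the same test accuracy (0.956), so neither outperformed the other, although Information Gain (mutual information) can in principle capture non-linear relationships between features and the target variable. PCA successfully reduced dimensionality and retained about 95% of the variance, but a slight drop in accuracy (0.947) was observed, most likely because the discarded components carried some class-relevant information; the resulting components are also harder to interpret than the original features. t-SNE is useful for visualizing class separation in two dimensions but is not suitable for direct model training, because it cannot project unseen test samples into an existing embedding. Overall, feature selection methods preserved interpretability, while PCA helped in reducing dimensionality efficiently.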