# Import Dependencies

In [70]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

## Load the Dataset with High Dimentionality

In [56]:
data = pd.read_csv('winequality-red.csv')
X = data.drop('quality', axis=1)
y = data['quality']
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


### Spliting the dataset into train and test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# PCA
Principal component analysis, or PCA, is a dimensionality reduction method that is often used to reduce the dimensionality of large data sets, by transforming a large set of variables into a smaller one that still contains most of the information in the large set.

![local](https://builtin.com/sites/www.builtin.com/files/inline-images/national/Principal%2520Component%2520Analysis%2520second%2520principal.gif)

In [7]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

# LDA
Linear Discriminant Analysis (LDA), also known as Normal Discriminant Analysis or Discriminant Function Analysis, is a dimensionality reduction technique primarily utilized in supervised classification problems. It facilitates the modeling of distinctions between groups, effectively separating two or more classes. LDA operates by projecting features from a higher-dimensional space into a lower-dimensional one. In machine learning, LDA serves as a supervised learning algorithm specifically designed for classification tasks, aiming to identify a linear combination of features that optimally segregates classes within a dataset.

In [9]:
lda = LDA(n_components=2)
X_lda = lda.fit_transform(X_train, y_train)
X_lda_test = lda.transform(X_test)

# TSNE
t-SNE (t-distributed Stochastic Neighbor Embedding) is an unsupervised non-linear dimensionality reduction technique for data exploration and visualizing high-dimensional data. Non-linear dimensionality reduction means that the algorithm allows us to separate data that cannot be separated by a straight line.

![Local](https://images.datacamp.com/image/upload/v1718119884/image_bf48957586.gif)

In [11]:
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_train)
X_tsne_test = tsne.fit_transform(X_test)

# ICA
Independent Component Analysis (ICA) is a statistical and computational technique used in machine learning to separate a multivariate signal into its independent non-Gaussian components. The goal of ICA is to find a linear transformation of the data such that the transformed data is as close to being statistically independent as possible.



In [13]:
ica = FastICA(n_components=2)
X_ica = ica.fit_transform(X_train)
X_ica_test = ica.transform(X_test)

# K-Best
SelectKBest is a feature selection technique in machine learning that is available in the scikit-learn library (sklearn), which is a popular Python library for machine learning and data analysis. Feature selection is the process of choosing a subset of the most relevant features (variables or attributes) from a dataset to use in a machine learning model. This can be useful for improving model performance, reducing overfitting, and speeding up training times.

In [15]:
select_k_best = SelectKBest(chi2, k=10)
X_kbest = select_k_best.fit_transform(X_train, y_train)
X_kbest_test = select_k_best.transform(X_test)

# RFE
Recursive Feature Elimination (RFE) is a method to iteratively remove less significant features, focusing on those that enhance predictive accuracy. RFE ranks feature importance, removes the least important, and rebuilds the model until a desired feature subset is obtained.

In [17]:
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=10)
X_rfe = rfe.fit_transform(X_train, y_train)
X_rfe_test = rfe.transform(X_test)

In [19]:
def evaluate_model(X_train, X_test, y_train, y_test):
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

**From here you can see the difference between the original dataset evaluation and with one of the dimention reduction tech evaluation**

In [23]:
print("Original Data")
evaluate_model(X_train, X_test, y_train, y_test)

Original Data
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        17
           5       0.72      0.75      0.73       195
           6       0.62      0.69      0.65       200
           7       0.56      0.46      0.50        61
           8       0.50      0.17      0.25         6

    accuracy                           0.65       480
   macro avg       0.40      0.34      0.36       480
weighted avg       0.63      0.65      0.64       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
print("\nPCA")
evaluate_model(X_pca, X_pca_test, y_train, y_test)


PCA
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.22      0.12      0.15        17
           5       0.60      0.69      0.64       195
           6       0.60      0.59      0.60       200
           7       0.50      0.38      0.43        61
           8       0.50      0.17      0.25         6

    accuracy                           0.58       480
   macro avg       0.40      0.32      0.34       480
weighted avg       0.57      0.58      0.57       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
print("\nLDA")
evaluate_model(X_lda, X_lda_test, y_train, y_test)


LDA
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.67      0.12      0.20        17
           5       0.70      0.76      0.73       195
           6       0.60      0.60      0.60       200
           7       0.44      0.44      0.44        61
           8       1.00      0.17      0.29         6

    accuracy                           0.62       480
   macro avg       0.57      0.35      0.38       480
weighted avg       0.62      0.62      0.61       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
print("\nT-SNE")
evaluate_model(X_tsne, X_tsne_test, y_train, y_test)


T-SNE
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        17
           5       0.45      0.48      0.47       195
           6       0.44      0.49      0.46       200
           7       0.11      0.08      0.09        61
           8       0.00      0.00      0.00         6

    accuracy                           0.41       480
   macro avg       0.17      0.18      0.17       480
weighted avg       0.38      0.41      0.40       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
print("\nICA")
evaluate_model(X_ica, X_ica_test, y_train, y_test)


ICA
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.09      0.06      0.07        17
           5       0.55      0.64      0.59       195
           6       0.57      0.55      0.56       200
           7       0.51      0.41      0.45        61
           8       0.33      0.17      0.22         6

    accuracy                           0.54       480
   macro avg       0.34      0.30      0.32       480
weighted avg       0.53      0.54      0.54       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
print("\nSelectKBest")
evaluate_model(X_kbest, X_kbest_test, y_train, y_test)


SelectKBest
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        17
           5       0.70      0.80      0.75       195
           6       0.64      0.66      0.65       200
           7       0.54      0.44      0.49        61
           8       0.50      0.17      0.25         6

    accuracy                           0.66       480
   macro avg       0.40      0.34      0.36       480
weighted avg       0.63      0.66      0.64       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
print("\nRFE")
evaluate_model(X_rfe, X_rfe_test, y_train, y_test)


RFE
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        17
           5       0.72      0.76      0.74       195
           6       0.62      0.69      0.65       200
           7       0.57      0.48      0.52        61
           8       0.50      0.17      0.25         6

    accuracy                           0.66       480
   macro avg       0.40      0.35      0.36       480
weighted avg       0.63      0.66      0.64       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Let's Try for another example

In [83]:
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

df = pd.DataFrame(X, columns=feature_names)
df['target'] = y
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

lda = LDA(n_components=1)
X_lda = lda.fit_transform(X_train, y_train)
X_lda_test = lda.transform(X_test)

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_train)
X_tsne_test = tsne.fit_transform(X_test)

ica = FastICA(n_components=2)
X_ica = ica.fit_transform(X_train)
X_ica_test = ica.transform(X_test)

select_k_best = SelectKBest(chi2, k=10)
X_kbest = select_k_best.fit_transform(X_train, y_train)
X_kbest_test = select_k_best.transform(X_test)

rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=10)
X_rfe = rfe.fit_transform(X_train, y_train)
X_rfe_test = rfe.transform(X_test)

def evaluate_model(X_train, X_test, y_train, y_test):
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

print("Original Data")
evaluate_model(X_train, X_test, y_train, y_test)

print("\nPCA")
evaluate_model(X_pca, X_pca_test, y_train, y_test)

print("\nLDA")
evaluate_model(X_lda, X_lda_test, y_train, y_test)

print("\nT-SNE")
evaluate_model(X_tsne, X_tsne_test, y_train, y_test)

print("\nICA")
evaluate_model(X_ica, X_ica_test, y_train, y_test)

print("\nSelectKBest")
evaluate_model(X_kbest, X_kbest_test, y_train, y_test)

print("\nRFE")
evaluate_model(X_rfe, X_rfe_test, y_train, y_test)

Original Data
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171


PCA
              precision    recall  f1-score   support

           0       0.94      0.95      0.94        63
           1       0.97      0.96      0.97       108

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171


LDA
              precision    recall  f1-score   support

           0       0.91      0.97      0.94        63
           1       0.98      0.94      0.96       108

    accuracy                           0.95       171
   macro avg       0.95      0.96      0.95       171
weighted avg       0.95      0.95      0.95       