# Dimensionality reduction

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import multiprocessing

# Seed reused by the train/test split and the PCA estimator so results are repeatable.
random_state = 42
# CPU count as reported by multiprocessing; the bare name on the last line displays it.
num_cpu = multiprocessing.cpu_count()
num_cpu

16

In [2]:
def read_data(path='data/fifa.csv'):
    """Load the FIFA match statistics and split them into train/test sets.

    The target is True where the 'Man of the Match' column equals "Yes".
    Only int64 columns are kept as features; columns of any other dtype
    (text, floats, ...) are dropped.

    Parameters
    ----------
    path : str or path-like, default 'data/fifa.csv'
        Location of the CSV file to read.

    Returns
    -------
    list
        [X_train, X_test, y_train, y_test] as produced by train_test_split,
        seeded with the notebook-level ``random_state``.
    """
    df = pd.read_csv(path)
    y = df['Man of the Match'] == "Yes"
    # select_dtypes keeps the original column order, same as filtering by name.
    X = df.select_dtypes(include=[np.int64])

    return train_test_split(X, y, random_state=random_state)

## Principal Component Analysis

Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower-dimensional space.

* If the data is not linearly separable, use `KernelPCA`.
* If the data is sparse, use `TruncatedSVD`.

In [3]:
# Load the FIFA match data and split it into train/test features and labels.
X_train, X_test, y_train, y_test = read_data()

In [4]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO
126,4,39,8,6,1,1,2,1,14,1,75,271,99,14,2,0,0,0
24,3,61,15,6,7,2,9,1,21,2,89,544,102,17,3,0,0,0
67,1,39,8,1,6,1,2,3,8,5,82,357,106,16,2,0,0,0
111,1,51,16,2,9,5,7,2,24,3,82,571,143,13,2,0,0,4
89,1,46,11,2,5,4,7,0,12,3,83,462,80,8,0,0,0,0


In [5]:
# Per-column summary statistics; the columns sit on very different scales,
# which is why they are standardized before PCA.
X_train.describe()

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO
count,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0
mean,1.302083,50.708333,12.354167,3.760417,5.15625,3.375,4.78125,1.333333,15.197917,2.541667,82.75,475.90625,106.78125,13.5625,1.729167,0.010417,0.020833,0.239583
std,1.134342,9.817189,5.131901,2.145962,2.522856,2.367544,2.313362,1.092237,4.800482,1.90797,5.395905,150.822522,12.223535,4.414837,1.260569,0.102062,0.143576,0.879531
min,0.0,25.0,3.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,67.0,194.0,80.0,5.0,0.0,0.0,0.0,0.0
25%,0.75,43.0,8.0,2.0,3.0,1.75,3.0,1.0,11.0,1.0,79.0,363.75,101.0,10.75,1.0,0.0,0.0,0.0
50%,1.0,51.5,12.0,3.0,5.0,3.0,5.0,1.0,15.0,2.0,83.0,475.0,104.0,13.5,2.0,0.0,0.0,0.0
75%,2.0,58.0,15.0,5.0,7.0,4.0,6.0,2.0,18.0,3.25,87.0,559.0,108.0,16.0,2.0,0.0,0.0,0.0
max,6.0,75.0,26.0,12.0,11.0,10.0,11.0,4.0,26.0,9.0,94.0,1137.0,148.0,24.0,6.0,1.0,1.0,4.0


In [6]:
# Standardize the features so PCA is not dominated by the large-scale columns.
# The scaler is fitted on the training split only, then the same scaling is
# applied to the test split so both splits live in the same feature space.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [7]:
# Project the standardized training features onto the first 5 principal components.
# NOTE(review): random_state presumably only matters for the randomized/arpack
# solvers — confirm against the installed scikit-learn version.
pca = PCA(n_components=5, random_state=random_state)
principal_components = pca.fit_transform(X_train)

In [8]:
# Fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

array([0.26592701, 0.12373013, 0.08229868, 0.08046827, 0.07463512])

These components shouldn't be used because we lost too much information: together the five components explain only about 63% of the variance.

In [9]:
# Load the iris dataset, using the 'Id' column as the index, and preview it.
df = pd.read_csv('data/iris.csv', index_col='Id')
df.head()

Unnamed: 0_level_0,Sepal Length (cm),Sepal Width (cm),Petal Length (cm),Petal Width (cm),Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [10]:
# Measurement columns used as model inputs; 'Species' is kept aside as the target.
features = ['Sepal Length (cm)', 'Sepal Width (cm)', 'Petal Length (cm)', 'Petal Width (cm)']

In [11]:
# Hold out a test split. The notebook-level seed is passed so the split, and
# every output derived from it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df["Species"], random_state=random_state
)

In [12]:
# Learn the scaling statistics from the training split only, then apply
# that same scaling to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [13]:
# Reduce the standardized iris features to two principal components.
# The projection is fitted on the training split only and reused for the test split.
pca = PCA(n_components=2, random_state=random_state)
principal_components = pca.fit_transform(X_train)
# NOTE: X_test is overwritten with its 2-D projection, so this cell cannot be
# re-run on its own; re-run the split and scaling cells first.
X_test = pca.transform(X_test)
principal_df = pd.DataFrame(data=principal_components, columns=["PC1", "PC2"])
principal_df.head()

Unnamed: 0,PC1,PC2
0,1.0069,-1.392834
1,2.728089,0.740575
2,1.998298,0.589342
3,1.129815,-0.6934
4,0.914315,0.021563


In [14]:
# First five test samples after projection onto the two principal components.
X_test[:5, :]

array([[ 0.22098316, -0.36652901],
       [-2.19184006, -0.65935506],
       [ 1.58127854,  0.66490586],
       [-2.53443298,  1.86207622],
       [ 0.24558718, -0.58112351]])

In [15]:
# Fraction of the total variance captured by each of the two components.
pca.explained_variance_ratio_

array([0.72618385, 0.23325304])

This is much more representative of the data: the two components together explain about 96% of the variance, so we didn't lose much information.

We can now use this lower-dimensional data for training a model.