<a href="https://colab.research.google.com/github/datascience-uniandes/data-analysis-tutorial/blob/master/fifa/dim-reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dimensionality Reduction for high-dimensional data visualization

MINE-4101: Applied Data Science  
Universidad de los Andes  
  
**Dataset:** FIFA
  
Last update: August, 2024

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Let pandas display up to 100 rows and 100 columns before truncating the output
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 100)

### Load the data

In [None]:
# Read the FIFA players table from the CSV stored next to the notebook
fifa_csv_path = './data/fifa_clean.csv'
fifa_df = pd.read_csv(fifa_csv_path)

In [None]:
# (number of rows, number of columns) of the table as loaded
fifa_df.shape

In [None]:
# Column dtypes as inferred by read_csv; 'object' columns hold strings or mixed values
fifa_df.dtypes

In [None]:
# Preview the first rows of the table
fifa_df.head()

### Filter the data

In [None]:
# Relative frequency of each preferred position (missing values counted as their own
# category), accumulated from the most frequent position downwards
position_shares = fifa_df['Preferred Position'].value_counts(dropna=False, normalize=True)
position_frecs = position_shares.cumsum()
position_frecs

In [None]:
# Keep only the players whose preferred position is among the 10 most frequent ones.
# position_frecs comes from value_counts(), which orders labels by descending frequency,
# so its first 10 index labels are the 10 most common positions.
top_positions = position_frecs.index[:10].tolist()

# .copy() makes the filtered frame independent of the original one, so the column
# assignments done later on fifa_df do not raise SettingWithCopyWarning.
fifa_df = fifa_df.loc[fifa_df['Preferred Position'].isin(top_positions)].copy()

In [None]:
# Shape of fifa_df at this point, i.e. after the preferred-position filter above
fifa_df.shape

### Feature selection and data preparation

In [None]:
# Select the attribute columns by position: columns 12 (inclusive) to 46 (exclusive).
# NOTE(review): this depends on the column order of the CSV -- confirm the slice still
# matches the attribute columns if the file layout changes.
first_attribute_idx, end_attribute_idx = 12, 46
player_attributes = fifa_df.columns[first_attribute_idx:end_attribute_idx]

In [None]:
# Inspect the names of the columns selected as player attributes
player_attributes

In [None]:
# The selected features are stored as strings like '80+9' or '70-3', so they must be
# turned into plain integers before they can be scaled and fed to PCA.

def attribute2int(x):
    """Evaluate an attribute string such as '80+9' or '70-3' as signed integer arithmetic.

    Parameters
    ----------
    x : str or any
        A string made of integer terms joined by '+' or '-' (e.g. '80+9', '70-3', '75'),
        or any non-string value.

    Returns
    -------
    int or any
        The signed sum of the terms for string input ('80+9' -> 89, '70-3' -> 67).
        Non-string values (numbers, NaN) are returned unchanged.

    Raises
    ------
    ValueError
        If a term is not an integer literal (e.g. empty string, leading sign, text).
    """
    if not isinstance(x, str):
        return x
    # Rewrite 'a-b' as 'a+-b' so that splitting on '+' keeps the minus sign attached to
    # its term; the modifier is then subtracted instead of being added.
    return sum(int(term) for term in x.replace('-', '+-').split('+'))

# Convert every selected attribute column in place, one column at a time
for column_name in player_attributes:
    print('Transforming', column_name)
    converted_column = fifa_df[column_name].apply(attribute2int)
    fifa_df[column_name] = converted_column

### Build principal components

In [None]:
# Chain the preprocessing and the projection so both are fitted with a single call:
# first standardize every attribute, then keep the first 5 principal components.
scaling_step = ('scaler', StandardScaler())
projection_step = ('pca', PCA(n_components=5, random_state=42))
pipeline = Pipeline(steps=[scaling_step, projection_step])

In [None]:
# Fit the scaler and the PCA on the attribute columns and project the rows onto the components
attribute_matrix = fifa_df[player_attributes]
fifa_components = pipeline.fit_transform(attribute_matrix)

In [None]:
# (number of rows projected, number of principal components kept)
fifa_components.shape

In [None]:
# Inspect the projected coordinates: one row per input row, one column per component
fifa_components

In [None]:
# Fraction of the total variance explained by each component, followed by the
# fraction explained by all the retained components together
variance_ratios = pipeline['pca'].explained_variance_ratio_
print(variance_ratios)
print(variance_ratios.sum())

In [None]:
# After transposing: one row per input attribute, one column per principal component
pipeline['pca'].components_.T.shape

In [None]:
# Tabulate the weight of every attribute in every principal component:
# rows are the original attributes, columns are PC1..PCn.
fitted_pca = pipeline['pca']
component_names = [f'PC{i+1}' for i in range(fitted_pca.n_components_)]
loadings = fitted_pca.components_.T
loadings_df = pd.DataFrame(
    loadings,
    index=fifa_df[player_attributes].columns,
    columns=component_names,
)

In [None]:
# Inspect which attributes weigh the most in each component
loadings_df

In [None]:
# Scatter the rows on the first two principal components, colored by preferred position.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=fifa_components[:, 0],
    y=fifa_components[:, 1],
    hue=fifa_df['Preferred Position'],
    # `s` is the marker size forwarded to matplotlib. seaborn's `size` argument is a
    # semantic grouping variable, so a constant there does not set the marker size.
    s=10,
    ax=ax,
)
ax.set(
    xlabel='PC1',
    ylabel='PC2',
    title='FIFA players projected onto the first two principal components',
)
# Reposition the legend seaborn created instead of rebuilding it with plt.legend(),
# which discards seaborn's legend title and may find no labeled artists to show.
sns.move_legend(ax, 'upper right')
plt.show()