In [46]:
import pandas as pd
import sys
from IPython.core import ultratb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Route uncaught exceptions through IPython's verbose traceback formatter,
# using the "Linux" color scheme and with call_pdb disabled ----
verbose_tb_handler = ultratb.FormattedTB(call_pdb=False,
                                         color_scheme="Linux",
                                         mode="Verbose")
sys.excepthook = verbose_tb_handler

# load necessary data ----
# The "Unnamed: 0" column is dropped immediately after reading the CSV.
fifa_raw_df = (pd.read_csv("write_data/all_players.csv")
               .drop(columns="Unnamed: 0"))

# store relevant columns ----
relevant_columns = ['volleys', 'dribbling', 'curve',
                    'fkaccuracy', 'longpassing', 'ballcontrol',
                    'acceleration', 'sprintspeed', 'agility',
                    'reactions', 'balance', 'shotpower',
                    'jumping', 'stamina', 'strength',
                    'longshots', 'aggression', 'interceptions',
                    'positioning', 'vision', 'penalties',
                    'composure', 'marking', 'standingtackle',
                    'slidingtackle', 'gkdiving', 'gkhandling',
                    'gkkicking', 'gkpositioning', 'gkreflexes']

# normalise column names (lower case, spaces -> underscores) ----
# rename() returns a new frame, so the raw frame is left untouched.
fifa_renamed_df = fifa_raw_df.rename(
    columns=lambda name: name.lower().replace(" ", "_"))

# report requested columns that are absent ----
# DataFrame.filter(items=...) silently skips labels it cannot find, so
# without this check a naming mismatch quietly shrinks the feature set
# that is later scaled and fed to the PCA.
missing_columns = [name for name in relevant_columns
                   if name not in fifa_renamed_df.columns]
if missing_columns:
    print("WARNING: {} of {} relevant columns are missing from the data "
          "and were skipped: {}".format(len(missing_columns),
                                        len(relevant_columns),
                                        missing_columns))

# clean up data frame to only contain continuous variables ----
# Rows are labelled by player_name; rows with any missing value are dropped.
fifa_df = (fifa_renamed_df
           .set_index("player_name")
           .filter(items=relevant_columns)
           .dropna())

print(fifa_df.head())

# standardise every feature column with StandardScaler ----
# fit_transform() is the one-call equivalent of fit(...).transform(...).
# Despite its name, the result is a NumPy array rather than a DataFrame.
scaler = StandardScaler()
fifa_scaled_df = scaler.fit_transform(fifa_df)

print(fifa_scaled_df)

# instantiate a two-component PCA with a fixed random_state ----
pca = PCA(random_state=10, n_components=2)

# fit the PCA on the scaled array; fit() returns the estimator itself,
# so the cell displays its repr ----
pca.fit(X=fifa_scaled_df)


                              volleys  dribbling  acceleration  agility  \
player_name                                                               
Lionel Messi                       86         97            92       91   
C. Ronaldo dos Santos Aveiro       87         90            89       87   
Kevin De Bruyne                    82         88            79       79   
Luis Suárez                        88         88            87       82   
Sergio Ramos García                67         73            77       79   

                              reactions  balance  jumping  stamina  strength  \
player_name                                                                    
Lionel Messi                         95       95       69       73        59   
C. Ronaldo dos Santos Aveiro         96       70       95       88        79   
Kevin De Bruyne                      92       77       63       91        75   
Luis Suárez                          94       83       69       91        

  interactivity=interactivity, compiler=compiler, result=result)


PCA(copy=True, iterated_power='auto', n_components=2, random_state=10,
    svd_solver='auto', tol=0.0, whiten=False)

In [47]:
# Inspect the fitted principal axes: one row per component, one weight per
# input feature.
pca.components_

array([[-0.29631008, -0.34170037, -0.26481403, -0.28040284, -0.19095987,
        -0.24633445, -0.11908536, -0.29498207, -0.03261581, -0.22710584,
        -0.17181444, -0.32060497, -0.27709773, -0.28635554, -0.26791707,
        -0.18120732],
       [-0.16168626, -0.04988822, -0.21231598, -0.24290601,  0.15910535,
        -0.24855616,  0.18238552,  0.14082615,  0.42346465,  0.36937443,
         0.42590518, -0.15143829, -0.12282207, -0.1315252 ,  0.12739544,
         0.39798726]])

In [48]:
# Peek at the first row of the scaled array (one scaled value per feature).
fifa_scaled_df[0]

array([ 2.39134725,  1.87715538,  1.81145112,  1.8114871 ,  3.63647645,
        2.16745969,  0.26890088,  0.56499377, -0.58768917, -0.49967924,
       -1.22615468,  2.26918662,  2.88583259,  1.6698235 ,  3.34219873,
       -0.70649289])

In [49]:
# Confirm the dimensions of the scaled array as (rows, columns).
fifa_scaled_df.shape

(15397, 16)

In [51]:
# Scores on the first principal component: multiply every scaled row by the
# first row of pca.components_.
fifa_scaled_df @ pca.components_[0]

array([-6.19409971, -6.22944653, -6.24189154, ...,  4.15953485,
        3.16518076,  2.0487057 ])

In [52]:
# Project the scaled data onto the fitted principal components.
# pca.transform() applies the centering (pca.mean_) learned during fit(),
# whereas multiplying by pca.components_ by hand silently assumes the input
# already has zero mean.
pc_scores = pca.transform(fifa_scaled_df)

# One column per fitted component (pc1, pc2, ...). The index is reused from
# fifa_df so every row stays labelled by player_name.
pc_df = pd.DataFrame(pc_scores,
                     columns=["pc{}".format(i + 1)
                              for i in range(pc_scores.shape[1])],
                     index=fifa_df.index)



In [53]:
# Display the principal-component scores, indexed by player_name.
pc_df

Unnamed: 0_level_0,pc1,pc2
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Lionel Messi,-6.194100,-2.865224
C. Ronaldo dos Santos Aveiro,-6.229447,-0.737926
Kevin De Bruyne,-6.241892,0.461547
Luis Suárez,-6.610756,0.237446
Sergio Ramos García,-4.590720,3.192977
...,...,...
Shandon Baptiste,2.631306,-0.738212
David Norman Jr.,2.299992,-0.244658
Noah Christoffersson,4.159535,-1.076543
Kieron Olsen,3.165181,-0.950134


In [44]:
# Save the component scores; index=True keeps the player_name index as a column.
# NOTE(review): "pc3" in the file name does not match the two-component PCA
# configured earlier in this notebook — confirm the intended name.
pc_df.to_csv(path_or_buf="write_data/pc3_all_players.csv", index=True)

In [45]:
# Share of the total variance captured by each fitted component.
# NOTE(review): the saved output lists three ratios although `pca` is built
# with n_components=2, and this cell's execution count (45) predates the
# fitting cell (46) — the output looks stale; re-run after fitting to confirm.
pca.explained_variance_ratio_

array([0.47101448, 0.1819183 , 0.09180653])