In [None]:
%matplotlib widget

import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from IPython.display import display, Math


## Lectura y estandarización de datos

In [None]:
# Load the raw dataset; the first CSV row holds the column names.
data = pd.read_csv('./dataset/europe.csv', header=0)

# Keep only the feature columns: 'Country' is the row label, not a feature.
data_no_countries = data.drop(columns=['Country'])
# (placeholder) plot of the unscaled data

# Standardize every feature column; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_no_countries)
# (placeholder) plot of the scaled data

# Sanity check: print the per-column mean and standard deviation after scaling.
print("Mean:", scaled_data.mean(axis=0), sep="\n")

print("Standard deviation:", scaled_data.std(axis=0), sep="\n")

## PCA

In [None]:
# Fit a PCA with the default settings on the standardized data.
pca = PCA()
pca.fit(scaled_data)

# Per scikit-learn, each row of `components_` is one principal axis
# expressed in the original feature space.
components = pca.components_

# First principal axis as a one-row frame whose columns are the feature names.
first_component = pd.DataFrame(
    [components[0]],
    columns=data_no_countries.columns,
    index=['First component'],
)
display(first_component)

# Explained-variance table. The row labels are generated from the actual
# number of components instead of a hard-coded list of seven, so the cell
# keeps working if the dataset gains or loses feature columns.
explained_ratio = pca.explained_variance_ratio_
variance_ratio = pd.DataFrame(
    {
        'Variance ratio': explained_ratio,
        'Cumulative variance': explained_ratio.cumsum(),
    },
    index=[f'λ{i}' for i in range(1, len(explained_ratio) + 1)],
)
display(variance_ratio)

# Standardized data as a frame that shares the feature-column labels, so the
# dot product below aligns on feature names.
scaled_data_df = pd.DataFrame(
    scaled_data,
    columns=data_no_countries.columns,
    index=data_no_countries.index,
)

# Projection of every row onto the first principal axis, labelled by country.
pc1_alignment = scaled_data_df.dot(first_component.T)
pc1_alignment.index = data['Country'].to_numpy()

# Left panel: PC1 score per country. Right panel: PC1 loading per feature.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 12))
pc1_alignment.plot.barh(y='First component', ax=axes[0])
axes[0].set_title('PC1 score by country')
axes[0].set_xlabel('PC1')
first_component.T.plot.barh(y='First component', ax=axes[1])
axes[1].set_title('PC1 loading by feature')
axes[1].set_xlabel('Loading')

plt.show()



## Biplot

In [None]:
# Second principal axis (1-D array) and the projection of every row onto it.
# `.dot` with a 1-D array yields a Series, which is then labelled by country.
second_component = components[1]
pc2_alignment = scaled_data_df.dot(second_component)
pc2_alignment.index = data['Country'].to_numpy()

# Flatten both score sets to plain 1-D arrays once, so the plotting calls get
# scalar coordinates (pc1_alignment is a single-column DataFrame).
pc1_scores = pc1_alignment.to_numpy().ravel()
pc2_scores = pc2_alignment.to_numpy()

biplot_fig, biplot_ax = plt.subplots(figsize=(20, 12))
biplot_ax.scatter(pc1_scores, pc2_scores)
biplot_ax.set_xlabel('PC1')
biplot_ax.set_ylabel('PC2')
biplot_ax.set_title('Biplot: PC1 vs PC2')

# Label each point with its country name.
for country, x, y in zip(data['Country'].to_numpy(), pc1_scores, pc2_scores):
    biplot_ax.annotate(country, (x, y))

# Draw one arrow per original feature from the origin to its
# (PC1 loading, PC2 loading) coordinates and label it with the feature name.
for x, y, col_name in zip(first_component.to_numpy()[0], second_component, first_component.columns):
    biplot_ax.arrow(0, 0, x, y)
    biplot_ax.annotate(col_name, (x, y))

plt.show()