# Import dependencies

In [None]:
# To mute annoying warnings in notebook
import warnings
import numpy as np

# For Data science
import pandas as pd

# For visualization
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import (
    preprocessing,
    decomposition,
)

# Silence every warning for the rest of the session so cell outputs stay readable
warnings.filterwarnings(action="ignore")

# Getting data, observations
## Get dataset

In [None]:
# Load the dataset from the space-separated file into a DataFrame
data = pd.read_csv("../data/SouthGermanCredit.asc", sep=" ")

In [None]:
# Print pandas' summary of the loaded DataFrame to check that it was parsed as expected
data.info()

# Preprocessing

## Scaling

In [None]:
# Standardize the data: center each column on its mean and scale it to unit variance
# NOTE(review): every column of `data` is scaled, including any target/label
# column the dataset may contain — confirm this is intended.
scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)

# Learn the per-column statistics and apply the scaling in a single pass
X_scaled = scaler.fit_transform(data)

# Processing

## Decomposition

In [None]:
# Get principal component analyzer (keeps all components by default)
pca = decomposition.PCA()

# Fit scaled data
# NOTE: `fit` returns the fitted estimator itself, so `X_pca` refers to the
# same PCA object as `pca`, not to projected data. Use `pca.transform(X_scaled)`
# if the projection is needed.
X_pca = pca.fit(X_scaled)

# Get explained variance (fraction of total variance explained by each component)
explained_variance_ratio = pca.explained_variance_ratio_

# Get cumulative explained variance for retained features
cumulative_explained_variance_ratio = np.cumsum(explained_variance_ratio)

# 1-based component counts for the x-axis: element i of the arrays above
# corresponds to keeping i + 1 components, so plotting against the raw array
# index would shift every point one component to the left.
component_counts = np.arange(1, len(explained_variance_ratio) + 1)

# Plot explained variance per component (left y-axis)
sns.lineplot(
    x=component_counts,
    y=explained_variance_ratio,
    label="variance",
    color="g",
    marker="o",
)

plt.xlabel("Number of principal components")
plt.ylabel("explained_variance_ratio")

# Second y-axis sharing the same x-axis for the cumulative curve
axis_2 = plt.gca().twinx()

# Plot cumulative explained variance (right y-axis), drawn explicitly on the twin axis
sns.lineplot(
    x=component_counts,
    y=cumulative_explained_variance_ratio,
    label="cumulative variance",
    color="r",
    marker="s",
    ax=axis_2,
)

plt.title("Explained variance by principal components")
# Trailing semicolon suppresses the notebook's text output for the last expression
plt.ylabel("cumulative_explained_variance_ratio");

In [None]:
# Get amount of retained components with cumulative explained variance more than 70%
retained_components = np.argmax(cumulative_explained_variance_ratio >= 0.7)

retained_components

In [None]:
# Get analyzer for only components which cumulative explained variance is more than 70%
pca_reduced = decomposition.PCA(n_components=retained_components)

# Get decomposition for chosen components
X_reduced = pca_reduced.fit_transform(X_scaled)

# Get variance loss
loss = 1 - cumulative_explained_variance_ratio[retained_components]

print(f"Variance loss is {loss:.2f}")