# creating dataset without vdem columns


In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

In [None]:
# Load the v2.4 feature table. The path is relative, so it is resolved
# against the working directory the notebook is run from.
cm_features = pd.read_csv('../data/cm_features_v2.4.csv')

In [None]:
# Collect every column whose label contains the substring 'vdem'.
# `like=` performs a plain substring test, which selects the same labels as
# searching for the metacharacter-free pattern 'vdem'.
vdem_columns = cm_features.filter(like='vdem')
vdem_columns

In [None]:
# Feature table with the vdem columns removed. The labels are passed
# explicitly (`.columns`) instead of handing the whole DataFrame to `drop`,
# so the call does not depend on pandas iterating a DataFrame to recover
# its column labels.
cm_features_reduced = cm_features.drop(columns=vdem_columns.columns)

# creating dataset using PCA


In [None]:
# Scaling option 1 of 4: rescale every vdem column to the [0, 1] range.
# Each scaling cell rebinds `scaler` and `scaled_data`, so the PCA cells
# below operate on whichever scaling cell was executed last.
scaler = MinMaxScaler().fit(vdem_columns)
scaled_data = scaler.transform(vdem_columns)

In [None]:
# Scaling option 2 of 4: standardize every vdem column (zero mean, unit
# variance). Rebinds `scaler` and `scaled_data`.
scaler = StandardScaler().fit(vdem_columns)
scaled_data = scaler.transform(vdem_columns)

In [None]:
# Scaling option 3 of 4: center each vdem column on its median and scale by
# its interquartile range (RobustScaler defaults). Rebinds `scaler` and
# `scaled_data`.
scaler = RobustScaler().fit(vdem_columns)
scaled_data = scaler.transform(vdem_columns)

In [None]:
# Scaling option 4 of 4: the Normalizer is stored under the name `scaler`
# only so that the following cells can be reused unchanged.
# NOTE(review): unlike the three scalers above, Normalizer rescales each row
# (sample) to unit norm rather than rescaling each column - confirm this is
# the intended input for PCA.
scaler = Normalizer().fit(vdem_columns)
scaled_data = scaler.transform(vdem_columns)

In [None]:
# Fit a PCA that keeps every component so the full variance spectrum is
# available (`fit` returns the fitted estimator itself).
pca = PCA().fit(scaled_data)

# Running total of the explained-variance ratios: entry k is the share of
# variance captured by the first k + 1 components.
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()


In [None]:
# Plot cumulative explained variance against the number of retained
# components. Component counts start at 1, so explicit x values are supplied
# instead of the default 0-based positions; otherwise the curve is shifted
# one step left relative to the axis label.
component_counts = range(1, len(cumulative_explained_variance) + 1)
plt.plot(component_counts, cumulative_explained_variance)
plt.xlabel('Amount of components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [None]:
# Smallest number of components whose cumulative explained variance reaches
# the target share (90%).
target_variance = 0.90
reached_target = np.flatnonzero(cumulative_explained_variance >= target_variance)
if reached_target.size:
    # Position k holds the total for k + 1 components.
    optimal_n_components = int(reached_target[0]) + 1
else:
    # The target was never reached (e.g. a target of 1.0 combined with
    # floating-point rounding). Keep every component instead of silently
    # falling back to a single one, which is what argmax on an all-False
    # mask would give.
    optimal_n_components = len(cumulative_explained_variance)
print(f'Amount of components needed for saving {target_variance * 100}% of dispersion: {optimal_n_components}')

In [None]:
# Refit PCA keeping only the selected number of components and project the
# scaled vdem data onto them.
pca = PCA(n_components=optimal_n_components)
vdem_pca = pca.fit_transform(scaled_data)

# The projected rows are in the same order as the feature table's rows, so
# reuse its index. `join` aligns on index labels; a fresh RangeIndex would
# only line up when the feature table also carries a default index.
vdem_pca_df = pd.DataFrame(
    vdem_pca,
    index=cm_features_reduced.index,
    columns=[f'vdem_pca_{i + 1}' for i in range(vdem_pca.shape[1])],
)

# Non-vdem features plus the PCA components that replace the vdem columns.
combined_data = cm_features_reduced.join(vdem_pca_df)

In [None]:
# combined_data.to_csv('../data/cm_features_v2.5.csv', index=False)

# creating dataset using ICA

In [None]:
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the vdem columns (zero mean, unit variance) before ICA.
# Rebinds `scaler`.
scaler = StandardScaler().fit(vdem_columns)
vdem_columns_centered = scaler.transform(vdem_columns)

In [None]:
# Independent component analysis on the standardized vdem columns.
# No n_components is given, so FastICA uses all components (scikit-learn's
# documented default); random_state=42 fixes the seed so reruns are
# reproducible.
ica = FastICA(random_state=42)
vdem_ica = ica.fit_transform(vdem_columns_centered)

In [None]:
# One output column per independent component.
n_components = vdem_ica.shape[1]

# The ICA rows are in the same order as the feature table's rows, so reuse
# its index. `join` aligns on index labels; a fresh RangeIndex would only
# line up when the feature table also carries a default index.
vdem_ica_df = pd.DataFrame(
    vdem_ica,
    index=cm_features_reduced.index,
    columns=[f'vdem_ica_{i + 1}' for i in range(n_components)],
)

# Non-vdem features plus the ICA components that replace the vdem columns.
combined_data = cm_features_reduced.join(vdem_ica_df)

In [None]:

# Saving data
# combined_data.to_csv('../data/cm_features_v2.5.csv', index=False)
# print("saved to ../data/cm_features_v2.5.csv")
# print("Done")

# creating dataset using NMF

In [None]:
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale every vdem column to the [0, 1] range; the result is the
# non-negative matrix handed to NMF below. Rebinds `scaler`.
scaler = MinMaxScaler().fit(vdem_columns)
vdem_columns_scaled = scaler.transform(vdem_columns)

In [None]:
# Non-negative matrix factorization of the min-max scaled vdem columns.
# The component count is fixed by hand at 10 here, whereas the PCA section
# derives its count from a variance target.
# NOTE(review): no evidence in this notebook for why 10 was chosen - confirm.
n_components = 10
# random_state=42 fixes the seed so reruns are reproducible.
nmf = NMF(n_components=n_components, random_state=42)
vdem_nmf = nmf.fit_transform(vdem_columns_scaled)

In [None]:
# The NMF rows are in the same order as the feature table's rows, so reuse
# its index. `join` aligns on index labels; a fresh RangeIndex would only
# line up when the feature table also carries a default index.
vdem_nmf_df = pd.DataFrame(
    vdem_nmf,
    index=cm_features_reduced.index,
    columns=[f'vdem_nmf_{i + 1}' for i in range(n_components)],
)

# Non-vdem features plus the NMF components that replace the vdem columns.
combined_data = cm_features_reduced.join(vdem_nmf_df)

In [None]:
# combined_data.to_csv('../data/cm_features_v2.5.csv', index=False)
# print("saved to ../data/cm_features_v2.5.csv")
# print("Done")