In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

from tqdm.notebook import tqdm

import scipy.stats as ss
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


from itertools import product

# Notebook-wide setup: suppress every warning for the rest of the session and
# apply seaborn's default styling to all matplotlib figures drawn below.
warnings.simplefilter('ignore')
sns.set()

In [None]:
# Read the responses CSV (path is relative to the notebook's working directory)
# into `df`, the frame the cells below analyse.
df = pd.read_csv('data/responses_.csv')

In [None]:
# Show pandas' summary statistics for `df` as this cell's output.
df.describe()

## Class Distribution

In [None]:
# Draw one histogram per column of `df`, each in its own figure.
# plt.hist places the observed values (the classes) on the x-axis and how often
# they occur on the y-axis, so the axis labels have to follow that orientation.
for col in df.columns:
    plt.hist(df[col])
    plt.xlabel('Classes')
    plt.ylabel('Count')
    plt.title(f'Class Distribution for Column {col}')
    plt.show()

## Principal Component Analysis

In [None]:
# Standardise every column: subtract its mean, then divide by its standard deviation.
column_means = df.mean()
column_stds = df.std()
df_normalized = (df - column_means) / column_stds

# Keep as many principal components as there are columns, i.e. no reduction yet.
n_features = df.shape[1]
pca = PCA(n_components=n_features)
pca.fit(df_normalized)

In [None]:
# Tabulate the fitted components: one row per original column, one column per
# principal component, labelled PC0, PC1, ... (zero-based).
component_labels = [f'PC{idx}' for idx in range(len(df_normalized.columns))]
loadings = pd.DataFrame(
    pca.components_.T,
    index=df.columns,
    columns=component_labels,
)

In [None]:
# Heatmap of the loadings table (original columns x principal components).
ax = sns.heatmap(loadings, cbar_kws=dict(label='Principal Axes'))

# The last axes on the figure is presumably the colorbar that heatmap added;
# enlarge its label so it stays readable.
colorbar_axes = ax.figure.axes[-1]
colorbar_axes.yaxis.label.set_size(14)

plt.title('PCA Components')
plt.show()

In [None]:
# Report two attributes of the fitted PCA, one per output line.
print(
    f'Number of features seen during fit: {pca.n_features_in_}',
    f'Estimated Noise Covariance: {pca.noise_variance_}',
    sep='\n',
)

In [None]:
# Scree-style view of the first 15 explained-variance ratios: a line plot with
# stem markers overlaid on the same axes.
leading_ratios = pca.explained_variance_ratio_[:15]
plt.plot(leading_ratios)
plt.stem(leading_ratios)
plt.xlabel('Components')
plt.ylabel('Explained Variance')
plt.show()

In [None]:
# Explained-variance ratio of each component, and the sign-flipped first
# difference between consecutive components (positive where the ratio drops).
var_ratio = pca.explained_variance_ratio_
grad = -np.diff(var_ratio)

## Chi-Square Test

In [None]:
# Both sides of the comparison range over every column of `df`.
cat_var1, cat_var2 = list(df.columns), list(df.columns)

# Cartesian product: every ordered (column, column) pair, self-pairs included.
cat_var_prod = list(product(cat_var1, cat_var2))

In [None]:
# Chi-square test of independence for every ordered pair of distinct columns.
# `result` receives one (var1, var2, p_value) tuple per pair, in the order of
# `cat_var_prod`.
#
# The test is symmetric in its two variables, so each unordered pair is
# cross-tabulated and tested only once; the mirrored pair reuses the cached
# p-value instead of recomputing it.
result = []
p_value_by_pair = {}

for var1, var2 in tqdm(cat_var_prod):
    if var1 == var2:
        # A column against itself carries no information; skip it.
        continue

    pair_key = frozenset((var1, var2))
    if pair_key not in p_value_by_pair:
        contingency = pd.crosstab(df[var1], df[var2])
        # Element [1] of chi2_contingency's return value is the p-value.
        p_value_by_pair[pair_key] = ss.chi2_contingency(contingency)[1]

    result.append((var1, var2, p_value_by_pair[pair_key]))

In [None]:
# Long-format table of the pairwise test results. The third field of each
# `result` tuple is element [1] of chi2_contingency's output (the p-value),
# stored here under the column name 'coeff'.
pair_columns = ['var1', 'var2', 'coeff']
chi_test_output = pd.DataFrame(result, columns=pair_columns)

# Round to three decimals before reshaping.
rounded_values = np.around(chi_test_output['coeff'].values, decimals=3)
chi_test_output['coeff'] = rounded_values

# Wide format: rows are var1, columns are var2, cells are the rounded values.
pivot_ct = chi_test_output.pivot(values='coeff', index='var1', columns='var2')

In [None]:
# Large square figure so every var1 x var2 cell of the pivot table stays legible.
plt.figure(figsize=(15, 15))
sns.heatmap(data=pivot_ct, cmap='crest')
plt.show()