# DATA ANALYSIS
*Studying data distribution with Box Plots and applying PCA.*

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

## 1. Box Plots

In [None]:
# Load the feature table and the sector indicator table, then place them
# side by side so each row carries both its features and its sector flags.
feat_df = pd.read_csv("2. Labeling/features_2.csv")
sectors_df = pd.read_csv("2. Labeling/sectors_2.csv")
df = pd.concat([feat_df, sectors_df], axis=1)

features = list(feat_df.columns)
sectors = list(sectors_df.columns)

# One figure per feature: a box for every sector, built from the rows whose
# sector column equals 1.
for f in features:
    val = [df.loc[df[s] == 1, f].values for s in sectors]
    fig, ax = plt.subplots(figsize=(18, 10))
    ax.boxplot(val, labels=sectors)
    ax.set_title(f.capitalize())

## 2. PCA

In [None]:
# Min-max scale 'tempo' and 'loudness' to [0, 1] in place.
# NOTE(review): presumably the remaining features are already on a comparable
# scale — confirm, since PCA is sensitive to feature scale.
for col in ("tempo", "loudness"):
    col_min = feat_df[col].min()
    col_max = feat_df[col].max()
    feat_df[col] = (feat_df[col] - col_min) / (col_max - col_min)

# Fit a full PCA first to inspect how much variance each component explains.
pca = PCA()
pca.fit(feat_df)

# Fitted scikit-learn attributes carry a trailing underscore; without it the
# lookup raises AttributeError.
var_ratio = pca.explained_variance_ratio_
print(var_ratio)
print(np.cumsum(var_ratio))

# 4 components chosen since their cumulative explained variance was higher than 90%
pca = PCA(n_components=4)
pc4 = pca.fit_transform(feat_df)
pc4_df = pd.DataFrame(pc4, columns=["PC1", "PC2", "PC3", "PC4"])
pc4_df.to_csv("pc4.csv", index=False)