In [74]:
from pathlib import Path
import pandas as pd
import sklearn as skl
import numpy as np
from sklearn.feature_selection import VarianceThreshold
import plotly.express as px
from sklearn.decomposition import PCA


In [7]:
with open(Path('data/embeddings.csv')) as f:
    embeddings = pd.read_csv(f)

with open(Path('data/radiohead_extended.csv')) as f:
    radiohead = pd.read_csv(f)

## Dimension reduction

### Variance threshold

In [13]:
embeddings_vector = np.array(embeddings.iloc[:,1:])

In [55]:
desc.loc[['std','mean'],:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
std,0.153757,0.151143,0.158133,0.161337,0.170083,0.168914,0.165712,0.155099,0.163607,0.164633,...,0.167213,0.16791,0.13995,0.161547,0.16464,0.158794,0.168045,0.1609,0.160554,0.147896
mean,0.057006,0.061507,0.012814,0.073465,-0.07982,-0.004195,0.034183,-0.073542,0.078215,0.12241,...,-0.04907,0.034573,-0.102055,0.004679,-0.040945,-0.033365,0.004601,-0.046859,0.013257,0.007876


Visualizing mean and variance to have a feeling of each dimension's distribution

In [87]:
desc = embeddings.describe()
desc_array = desc.loc[['std', 'mean'],:].T
desc_array['var'] = desc.apply(lambda x: (x['std'] * x['std']))
px.scatter(desc_array, y='var', x='mean')

Visualizing data after throwing lower 50 percentile of variance

In [88]:
var_perc50 = np.percentile(desc_array['var'], 50)
sel = VarianceThreshold(var_perc50)
filtered = sel.fit_transform(embeddings.iloc[:,1:])
filtered.shape

(1755, 148)

In [89]:
desc = pd.DataFrame(filtered).describe()
desc_array = desc.loc[['std', 'mean'],:].T
desc_array['var'] = desc.apply(lambda x: (x['std'] * x['std']))
px.scatter(desc_array, y='var', x='mean')

In [77]:
pca = PCA(n_components=filtered.shape[1])
pca.fit(filtered)

In [80]:
pca.explained_variance_ratio_

array([0.03942775, 0.03058512, 0.02862924, 0.02256624, 0.02146725,
       0.01988268, 0.01908596, 0.01760903, 0.0167736 , 0.01619936,
       0.01563722, 0.01467925, 0.0136309 , 0.01352501, 0.01283384,
       0.01243482, 0.01209473, 0.01184337, 0.01117423, 0.01095075,
       0.01057252, 0.01043976, 0.01014518, 0.00993391, 0.00984462,
       0.0094165 , 0.00935815, 0.0091332 , 0.00897315, 0.00886572,
       0.00870389, 0.00856791, 0.00847756, 0.00834896, 0.00832268,
       0.00816393, 0.00792621, 0.00779343, 0.00768872, 0.00762987,
       0.00759689, 0.0075636 , 0.00739518, 0.00729243, 0.0071833 ,
       0.00701885, 0.00699459, 0.0069055 , 0.0068618 , 0.0067103 ,
       0.006666  , 0.00653898, 0.00649626, 0.00632474, 0.00626225,
       0.00620775, 0.00615717, 0.00610238, 0.00596627, 0.00594499,
       0.00590913, 0.00587681, 0.00575757, 0.00573503, 0.00567426,
       0.00555123, 0.00549696, 0.00545486, 0.00539854, 0.00536418,
       0.00526437, 0.00524981, 0.00508432, 0.00506361, 0.00500

In [81]:
pca.mean_

array([-0.0798203 , -0.00419546,  0.03418313,  0.07821544,  0.12240973,
       -0.00723206, -0.1284051 ,  0.03759918,  0.06833449,  0.02296839,
       -0.07488368,  0.04037099,  0.05665225,  0.04398576,  0.04187547,
       -0.03628922, -0.06232252, -0.05448715, -0.02692181, -0.01407053,
        0.04664414,  0.07194797, -0.07895999,  0.14737977, -0.04758526,
       -0.00879577, -0.06346764,  0.019658  , -0.06669783,  0.01970591,
       -0.01669369,  0.0222417 , -0.00769459, -0.09634308,  0.0618425 ,
       -0.05436297, -0.02390012, -0.06149265,  0.05074084, -0.05806022,
        0.02791889,  0.04829512, -0.04022908, -0.02422512,  0.0862319 ,
        0.03409816,  0.01779867,  0.02309403, -0.06644662, -0.03263488,
       -0.044539  ,  0.08199021, -0.00557615,  0.02760349, -0.05592245,
       -0.05956724, -0.01627915,  0.0051669 , -0.06797258,  0.0335671 ,
       -0.01497564, -0.02880249, -0.02763807,  0.07394126,  0.02385304,
        0.0124939 ,  0.05325119,  0.01719199, -0.06099785, -0.04