In [1]:
# import os

import pickle
import random as python_random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython import display
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the sample table. Later cells read its `fold` column to build the
# train/validation/test masks and its `category` column as the target labels.
split_csv_path = '../train_val_test_split/train_val_test_GoogleAudioSet.csv'
df_all = pd.read_csv(split_csv_path, index_col=0)
df_all

Unnamed: 0,file,source,category,weight,fold
0,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,0
1,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8
2,../data/interim/GoogleAudioSet_unbalanced_list...,Google_nature,0,1,5
3,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,1
4,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,1
...,...,...,...,...,...
13662,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,5
13663,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,3
13664,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8
13665,../data/interim/GoogleAudioSet_unbalanced_list...,Google_city,1,1,8


In [3]:
# Load the `raw`, `bg` and `fg` embedding matrices from their .npy files
# (presumably raw / background / foreground audio -- confirm against the
# feature-extraction step).
embedding_raw_matrix, embedding_bg_matrix, embedding_fg_matrix = [
    np.load(npy_path)
    for npy_path in (
        '../data/processed/20230304/embedding_raw_matrix.npy',
        '../data/processed/20230304/embedding_bg_matrix.npy',
        '../data/processed/20230304/embedding_fg_matrix.npy',
    )
]

In [4]:
# Load the `raw`, `bg` and `fg` MPS matrices from their .npy files.
mps_raw_matrix, mps_bg_matrix, mps_fg_matrix = [
    np.load(npy_path)
    for npy_path in (
        '../data/processed/20230304/mps_raw_matrix.npy',
        '../data/processed/20230304/mps_bg_matrix.npy',
        '../data/processed/20230304/mps_fg_matrix.npy',
    )
]

In [5]:
# Load the `raw`, `bg` and `fg` indices tables; the first CSV column becomes
# the DataFrame index in each case.
df_indices_raw, df_indices_bg, df_indices_fg = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/processed/20230304/df_indices_raw.csv',
        '../data/processed/20230304/df_indices_bg.csv',
        '../data/processed/20230304/df_indices_fg.csv',
    )
]

In [6]:
# Boolean row masks over df_all, keyed on the `fold` column:
# folds below 8 -> training, fold 8 -> validation, fold 9 -> test.
fold_ids = df_all['fold']
index_train = fold_ids.lt(8)
index_valid = fold_ids.eq(8)
index_test = fold_ids.eq(9)

# PCA dimensionality reduction

In [7]:
def get_transformer(n_components):
    """Build an unfitted impute -> PCA -> standardize pipeline.

    Parameters
    ----------
    n_components
        Passed straight through to ``PCA(n_components=...)``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline whose steps are named ``'imputer'``,
        ``'PCA'`` and ``'scaler'``, in that order.
    """
    # NOTE(review): the scaler is the last step, so PCA is fitted on imputed
    # but unscaled features and only the resulting components are
    # standardized -- confirm this ordering is intended.
    return Pipeline(steps=[
        # NaN entries are replaced by the per-column mean learned during fit.
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        # Fixed random_state keeps the decomposition reproducible across runs.
        ('PCA', PCA(n_components=n_components, random_state=23)),
        ('scaler', StandardScaler()),
    ])

Transform the MPS, indices, and embedding data

In [8]:
def data_pca(data, index_train, index_valid, index_test, n_components):
    """Split ``data`` three ways and project each split with one fitted pipeline.

    The pipeline from ``get_transformer`` is fitted on the training rows only
    and then applied to the training, validation and test rows, so the
    validation and test rows never influence the fitted parameters.

    Parameters
    ----------
    data
        Object that can be indexed with each of the three masks.
    index_train, index_valid, index_test
        Selectors used as ``data[mask]`` to pick the rows of each split.
    n_components
        Forwarded to ``get_transformer``.

    Returns
    -------
    tuple
        ``(train, valid, test)`` transformed data, in that order.
    """
    # Slice all three subsets up front, in train/valid/test order.
    subsets = [data[mask] for mask in (index_train, index_valid, index_test)]

    pipeline = get_transformer(n_components=n_components)
    pipeline.fit(subsets[0])  # training rows only

    train_pca, valid_pca, test_pca = [pipeline.transform(part) for part in subsets]
    return train_pca, valid_pca, test_pca

In [9]:
# Reduce each MPS matrix to the same number of principal components.
MPS_N_COMPONENTS = 10

train_mps_raw_pca, valid_mps_raw_pca, test_mps_raw_pca = data_pca(
    mps_raw_matrix, index_train, index_valid, index_test,
    n_components=MPS_N_COMPONENTS,
)
train_mps_bg_pca, valid_mps_bg_pca, test_mps_bg_pca = data_pca(
    mps_bg_matrix, index_train, index_valid, index_test,
    n_components=MPS_N_COMPONENTS,
)
train_mps_fg_pca, valid_mps_fg_pca, test_mps_fg_pca = data_pca(
    mps_fg_matrix, index_train, index_valid, index_test,
    n_components=MPS_N_COMPONENTS,
)

In [10]:
# Reduce each indices table to the same number of principal components.
# +/-inf values are turned into NaN first so they are handled as missing
# values downstream instead of reaching PCA as infinities.
INDICES_N_COMPONENTS = 20

train_indices_raw_pca, valid_indices_raw_pca, test_indices_raw_pca = data_pca(
    df_indices_raw.replace([np.inf, -np.inf], np.nan),
    index_train, index_valid, index_test,
    n_components=INDICES_N_COMPONENTS,
)
train_indices_bg_pca, valid_indices_bg_pca, test_indices_bg_pca = data_pca(
    df_indices_bg.replace([np.inf, -np.inf], np.nan),
    index_train, index_valid, index_test,
    n_components=INDICES_N_COMPONENTS,
)
train_indices_fg_pca, valid_indices_fg_pca, test_indices_fg_pca = data_pca(
    df_indices_fg.replace([np.inf, -np.inf], np.nan),
    index_train, index_valid, index_test,
    n_components=INDICES_N_COMPONENTS,
)

In [11]:
# Reduce each embedding matrix to 400 principal components.
train_embedding_raw_pca, valid_embedding_raw_pca, test_embedding_raw_pca = data_pca(embedding_raw_matrix, index_train, index_valid, index_test, n_components=400)
train_embedding_bg_pca, valid_embedding_bg_pca, test_embedding_bg_pca = data_pca(embedding_bg_matrix, index_train, index_valid, index_test, n_components=400)
# Fixed: the fg outputs were previously computed from embedding_bg_matrix
# (copy-paste slip), which made them duplicates of the bg features.
train_embedding_fg_pca, valid_embedding_fg_pca, test_embedding_fg_pca = data_pca(embedding_fg_matrix, index_train, index_valid, index_test, n_components=400)

In [13]:
# Target labels for each split: the `category` column restricted to the
# rows selected by the corresponding mask.
y_train, y_valid, y_test = [
    df_all.loc[split_mask, 'category']
    for split_mask in (index_train, index_valid, index_test)
]

In [14]:
# Bundle every PCA-reduced split plus the labels into a single list and pickle it.
# The list is positional: for each feature family (mps, indices, embedding) and
# each variant (raw, bg, fg) it holds train, valid, test in that order, followed
# by y_train, y_valid, y_test. Any reader must unpack in this same order.
aggregated_data = [train_mps_raw_pca, valid_mps_raw_pca, test_mps_raw_pca,
                  train_mps_bg_pca, valid_mps_bg_pca, test_mps_bg_pca,
                  train_mps_fg_pca, valid_mps_fg_pca, test_mps_fg_pca,
                  train_indices_raw_pca, valid_indices_raw_pca, test_indices_raw_pca,
                  train_indices_bg_pca, valid_indices_bg_pca, test_indices_bg_pca,
                  train_indices_fg_pca, valid_indices_fg_pca, test_indices_fg_pca,
                  train_embedding_raw_pca, valid_embedding_raw_pca, test_embedding_raw_pca,
                  train_embedding_bg_pca, valid_embedding_bg_pca, test_embedding_bg_pca,
                  train_embedding_fg_pca, valid_embedding_fg_pca, test_embedding_fg_pca,
                  y_train, y_valid, y_test]
# Use a context manager so the file is flushed and closed even if dump() raises.
with open('../data/processed/20230304/aggregated_data', 'wb') as pickle_file:
    pickle.dump(aggregated_data, pickle_file)