In [1]:
import sys
sys.path.insert(0, '../../src')
from utils import preprocession as prep
from utils.metrics import compute_metric, add_to_metrics, columns

import os
import pickle
import pandas as pd
import numpy as np

from sklearn import decomposition as mf

from tqdm.notebook import tqdm
tqdm.pandas()

## PCA with Matrix Factorization

In [2]:
def evaluatePCA(folder_name, data_name):
    """Fit a 2-component PCA per training set and score its reconstructions.

    For every key ``p`` of the training reactions, the imputed training frame
    is used to fit a PCA; the fitted model, the 2-D embeddings and the
    reconstructions (``inverse_transform`` of the embeddings) are written to
    disk, and the reconstructions are scored with ``compute_metric``.

    The model fitted on key ``0`` is evaluated on every test key; every other
    model is evaluated only on the test set with the same key.

    Parameters
    ----------
    folder_name : str
        Dataset folder, forwarded to ``prep.load_data`` and used in the
        output paths.
    data_name : str
        Dataset variant, forwarded to ``prep.load_data`` and used in the
        output paths.

    Returns
    -------
    pandas.DataFrame
        The collected metric rows with the imported ``columns`` as header;
        the same frame is also saved as ``metrics.csv`` in the ``MF`` folder.

    Notes
    -----
    The keys of the reaction mappings are presumably sparsity levels, with
    key ``0`` holding the complete data used as ground truth -- this is
    inferred from how the keys are used here; confirm against
    ``prep.load_data``.
    """
    train, test = prep.load_data(folder_name, data_name)
    _, train_reactions = train
    _, test_reactions = test

    method_path = f"../../embeddings/PCA/{folder_name}/{data_name}"
    output_path = f"../../embeddings/PCA/{folder_name}/{data_name}/MF"
    # output_path is nested inside method_path, so this creates both folders.
    os.makedirs(output_path, exist_ok=True)

    metrics = []
    for p in tqdm(train_reactions.keys()):
        train_data = prep.impute_dataframe(train_reactions[p])
        # Loop-invariant for the inner test loop: test sets are imputed with
        # the training column means, never with their own statistics.
        train_mean = train_data.mean()

        pca = mf.PCA(n_components=2).fit(train_data)
        with open(f'{method_path}/PCA_{p}.pkl', 'wb') as file:
            pickle.dump(pca, file)

        X = pd.DataFrame(pca.transform(train_data),
                         index=train_reactions[p].index, columns=['x', 'y'])
        X.to_csv(f'{method_path}/train_embedding_{p}.csv')

        # Reconstruct the full matrix from the 2-D embedding.
        train_predictions = pd.DataFrame(pca.inverse_transform(X.values),
                                         index=train_data.index, columns=train_data.columns)
        train_predictions.to_csv(f'{output_path}/train_predictions_{p}.csv')
        # Scored against the key-0 frame; the NaN mask of the current frame is
        # passed along (presumably to separate observed from imputed entries).
        train_result = compute_metric(train_predictions, train_reactions[0],
                                      train_reactions[p].isna(), silent=True)
        add_to_metrics(metrics, train_result,
                       [folder_name, data_name, 'Train', p, 'Train', p, 'PCA', 'Matrix Factorization'])

        # The model trained on key 0 is evaluated on every test key; all other
        # models only on the test set with the matching key.
        test_list = test_reactions.keys() if p == 0 else [p]
        for q in test_list:
            test_data = prep.impute_dataframe(test_reactions[q], mean=train_mean)

            # Index by the frame that is actually embedded (key q, not p):
            # for p == 0 the two differ, and using the wrong frame's index
            # mislabels rows or fails when the frames differ in length.
            Y = pd.DataFrame(pca.transform(test_data),
                             index=test_reactions[q].index, columns=['x', 'y'])
            Y.to_csv(f'{method_path}/test_embedding_{p}_{q}.csv')

            test_predictions = pd.DataFrame(pca.inverse_transform(Y.values),
                                            index=test_data.index, columns=test_data.columns)
            test_predictions.to_csv(f'{output_path}/test_predictions_{p}_{q}.csv')

            test_result = compute_metric(test_predictions, test_reactions[0],
                                         test_reactions[q].isna(), silent=True)
            add_to_metrics(metrics, test_result,
                           [folder_name, data_name, 'Train', p, 'Test', q, 'PCA', 'Matrix Factorization'])

    metrics = pd.DataFrame(metrics, columns=columns)
    metrics.to_csv(f'{output_path}/metrics.csv')
    return metrics

## Datasets

### Smartvote Original

In [3]:
# Evaluate PCA on the original Smartvote data, then preview five randomly
# drawn rows of the resulting metrics table.
metrics = evaluatePCA(folder_name='Smartvote', data_name='Original')
metrics.sample(5)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
71,Smartvote,Original,Train,80,Train,80,PCA,Matrix Factorization,Overall,0.725442,0.333859
18,Smartvote,Original,Train,0,Train,0,PCA,Matrix Factorization,Fit,0.814262,0.260816
5,Smartvote,Original,Train,60,Test,60,PCA,Matrix Factorization,Overall,0.770253,0.304135
82,Smartvote,Original,Train,50,Train,50,PCA,Matrix Factorization,Impute,0.778198,0.296063
65,Smartvote,Original,Train,40,Train,40,PCA,Matrix Factorization,Overall,0.795101,0.281849


### Smartvote Binary

In [4]:
# Evaluate PCA on the binary Smartvote data, then preview five randomly
# drawn rows of the resulting metrics table.
metrics = evaluatePCA(folder_name='Smartvote', data_name='Binary')
metrics.sample(5)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
74,Smartvote,Binary,Train,80,Test,80,PCA,Matrix Factorization,Overall,0.731494,0.424833
45,Smartvote,Binary,Train,0,Test,70,PCA,Matrix Factorization,Fit,0.774813,0.399551
32,Smartvote,Binary,Train,0,Test,80,PCA,Matrix Factorization,Overall,0.737149,0.422735
44,Smartvote,Binary,Train,0,Test,0,PCA,Matrix Factorization,Overall,0.825793,0.352643
67,Smartvote,Binary,Train,40,Test,40,PCA,Matrix Factorization,Impute,0.797356,0.379341


### Synthetic Multiclass

In [5]:
# Evaluate PCA on the original (multiclass) synthetic data, then preview five
# randomly drawn rows of the resulting metrics table.
metrics = evaluatePCA(folder_name='Synthetic_60_50', data_name='Original')
metrics.sample(5)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
60,Synthetic_60_50,Original,Train,30,Test,30,PCA,Matrix Factorization,Fit,0.941429,0.168581
85,Synthetic_60_50,Original,Train,50,Test,50,PCA,Matrix Factorization,Impute,0.886,0.225179
73,Synthetic_60_50,Original,Train,80,Test,80,PCA,Matrix Factorization,Impute,0.744167,0.310785
77,Synthetic_60_50,Original,Train,90,Train,90,PCA,Matrix Factorization,Overall,0.68875,0.330956
75,Synthetic_60_50,Original,Train,90,Train,90,PCA,Matrix Factorization,Fit,0.7575,0.29813


### Synthetic Binary

In [6]:
# Evaluate PCA on the binary synthetic data, then preview five randomly
# drawn rows of the resulting metrics table.
metrics = evaluatePCA(folder_name='Synthetic_60_50', data_name='Binary')
metrics.sample(5)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
85,Synthetic_60_50,Binary,Train,50,Test,50,PCA,Matrix Factorization,Impute,0.892,0.329213
81,Synthetic_60_50,Binary,Train,50,Train,50,PCA,Matrix Factorization,Fit,0.906667,0.311505
39,Synthetic_60_50,Binary,Train,0,Test,60,PCA,Matrix Factorization,Fit,0.884167,0.334888
64,Synthetic_60_50,Binary,Train,40,Train,40,PCA,Matrix Factorization,Impute,0.901042,0.310813
71,Synthetic_60_50,Binary,Train,80,Train,80,PCA,Matrix Factorization,Overall,0.776,0.405007
