In [3]:
import sys
sys.path.insert(0, '../../src')
from utils import preprocession as prep
from utils.metrics import compute_metric, add_to_metrics, columns

import os
import pickle
import pandas as pd
import numpy as np

## Baselines

In [11]:
def evaluateBaseline(folder_name, data_name):
    """Evaluate the mean-imputation baseline on one dataset.

    The train/test reaction frames are loaded with ``prep.load_data``. For
    every key ``p`` of the train reactions (recorded as the train sparsity in
    the metrics rows):

    * the train frame is imputed with ``prep.impute_dataframe`` and scored
      against ``train_reactions[0]``, with the ``isna()`` mask of the
      unimputed frame handed to ``compute_metric``;
    * test frames are imputed using the ``.mean()`` of the imputed train
      frame and scored against ``test_reactions[0]``. For ``p == 0`` every
      test key is evaluated; otherwise only the test frame with the same key.

    Key ``0`` is used as the reference in both splits (presumably the fully
    observed data -- confirm against ``prep.load_data``).

    Parameters
    ----------
    folder_name : str
        Forwarded to ``prep.load_data`` and stored in every metrics row.
    data_name : str
        Forwarded to ``prep.load_data`` and stored in every metrics row.

    Returns
    -------
    pandas.DataFrame
        The rows collected by ``add_to_metrics``, labelled with ``columns``.
    """
    train, test = prep.load_data(folder_name, data_name)
    _, train_reactions = train
    _, test_reactions = test

    rows = []
    for p in train_reactions.keys():
        train_predictions = prep.impute_dataframe(train_reactions[p])
        train_result = compute_metric(train_predictions, train_reactions[0], train_reactions[p].isna(), silent=True)
        add_to_metrics(rows, train_result, [folder_name, data_name, 'Train', p, 'Train', p, 'Baseline', 'Mean'])

        # The train mean does not depend on the test key, so compute it once
        # per train key instead of once per (train, test) pair.
        train_mean = train_predictions.mean()

        # A model fitted on key 0 is evaluated on every test key; any other
        # train key is only evaluated on the test frame with the same key.
        test_list = test_reactions.keys() if p == 0 else [p]
        for q in test_list:
            test_predictions = prep.impute_dataframe(test_reactions[q], mean=train_mean)
            test_result = compute_metric(test_predictions, test_reactions[0], test_reactions[q].isna(), silent=True)
            add_to_metrics(rows, test_result, [folder_name, data_name, 'Train', p, 'Test', q, 'Baseline', 'Mean'])

    return pd.DataFrame(rows, columns=columns)

## Datasets

### Smartvote Original

In [12]:
# Baseline metrics for the 'Smartvote' / 'Original' data; the last expression displays the table.
metrics_1 = evaluateBaseline(folder_name='Smartvote', data_name='Original')
metrics_1

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
0,Smartvote,Original,Train,60,Train,60,Baseline,Mean,Fit,1.000000,3.932887e-18
1,Smartvote,Original,Train,60,Train,60,Baseline,Mean,Impute,0.668941,3.663166e-01
2,Smartvote,Original,Train,60,Train,60,Baseline,Mean,Overall,0.801365,2.197899e-01
3,Smartvote,Original,Train,60,Test,60,Baseline,Mean,Fit,1.000000,3.822369e-18
4,Smartvote,Original,Train,60,Test,60,Baseline,Mean,Impute,0.669502,3.669851e-01
...,...,...,...,...,...,...,...,...,...,...,...
82,Smartvote,Original,Train,50,Train,50,Baseline,Mean,Impute,0.673389,3.639723e-01
83,Smartvote,Original,Train,50,Train,50,Baseline,Mean,Overall,0.834517,1.844126e-01
84,Smartvote,Original,Train,50,Test,50,Baseline,Mean,Fit,1.000000,3.584899e-18
85,Smartvote,Original,Train,50,Test,50,Baseline,Mean,Impute,0.663884,3.662570e-01


### Smartvote Binary

In [14]:
# Baseline metrics for the 'Smartvote' / 'Binary' data; show five randomly drawn rows.
metrics_2 = evaluateBaseline(folder_name='Smartvote', data_name='Binary')
metrics_2.sample(5)

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
61,Smartvote,Binary,Train,30,Test,30,Baseline,Mean,Impute,0.681034,0.453742
75,Smartvote,Binary,Train,90,Train,90,Baseline,Mean,Fit,1.0,0.0
23,Smartvote,Binary,Train,0,Test,20,Baseline,Mean,Overall,0.936368,0.090811
15,Smartvote,Binary,Train,10,Test,10,Baseline,Mean,Fit,1.0,0.0
42,Smartvote,Binary,Train,0,Test,0,Baseline,Mean,Fit,1.0,0.0


### Synthetic Multiclass

In [15]:
# Baseline metrics for the 'Synthetic_60_50' / 'Original' data; show five randomly drawn rows.
metrics_3 = evaluateBaseline(folder_name='Synthetic_60_50', data_name='Original')
metrics_3.sample(5)

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
1,Synthetic_60_50,Original,Train,60,Train,60,Baseline,Mean,Impute,0.65625,0.3472185
20,Synthetic_60_50,Original,Train,0,Train,0,Baseline,Mean,Overall,1.0,2.430264e-18
62,Synthetic_60_50,Original,Train,30,Test,30,Baseline,Mean,Overall,0.894667,0.1066601
4,Synthetic_60_50,Original,Train,60,Test,60,Baseline,Mean,Impute,0.646111,0.3559876
64,Synthetic_60_50,Original,Train,40,Train,40,Baseline,Mean,Impute,0.650417,0.3466178


### Synthetic Binary

In [16]:
# Baseline metrics for the 'Synthetic_60_50' / 'Binary' data; show five randomly drawn rows.
metrics_4 = evaluateBaseline(folder_name='Synthetic_60_50', data_name='Binary')
metrics_4.sample(5)

Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
54,Synthetic_60_50,Binary,Train,20,Test,20,Baseline,Mean,Fit,1.0,0.0
2,Synthetic_60_50,Binary,Train,60,Train,60,Baseline,Mean,Overall,0.795333,0.274989
77,Synthetic_60_50,Binary,Train,90,Train,90,Baseline,Mean,Overall,0.67725,0.423066
5,Synthetic_60_50,Binary,Train,60,Test,60,Baseline,Mean,Overall,0.789667,0.277867
39,Synthetic_60_50,Binary,Train,0,Test,60,Baseline,Mean,Fit,1.0,0.0


In [18]:
# Each per-dataset frame carries its own 0-based index; ignore_index=True renumbers the
# combined table so the index column written to the CSV is unique instead of repeating.
pd.concat([metrics_1, metrics_2, metrics_3, metrics_4], ignore_index=True).to_csv('../../results/baseline_metrics.csv')