In [15]:
import sys
sys.path.insert(0, '../../src')
from utils import preprocession as prep
from utils.metrics import compute_metric, add_to_metrics, columns
from utils import visualization as vis
from models.SVM import train_SVM_models, train_LR_models, predict_from_models

import pickle
import os
import pandas as pd
import numpy as np
import itertools


## Logistic Regression

- Load embeddings with naming scheme
- Fit Logistic Regression
- Predict / decode the reactions from the embedding
- Compare with Ground Truth
- Generate Test Embedding
- Predict and Compare again
- Save `metrics.csv` (together with the fitted models and the predictions) to `embeddings/<Encoder>/<Dataset>/<Datatype>/<Decoder>/` (here the decoder folder is `LR`)


In [10]:
def evaluateLR(folder_name, data_name, method, sparsities=range(0, 100, 10)):
    """Fit LR decoders on stored embeddings and score them on train and test data.

    For every level ``p`` in ``sparsities`` the function reads
    ``train_embedding_{p}.csv`` from the embedding folder, trains models with
    ``train_LR_models``, pickles them, predicts the training reactions and
    scores the predictions with ``compute_metric``. It then does the same for
    the matching test embeddings ``test_embedding_{p}_{q}.csv``:

    * for ``p == 0`` every key ``q`` of the test reactions is evaluated;
    * for any other ``p`` only ``q == p`` is evaluated.

    Models, predictions and the collected metrics are written below
    ``../../embeddings/{method}/{folder_name}/{data_name}/LR``.

    Parameters
    ----------
    folder_name : str
        Dataset folder, forwarded to ``prep.load_data`` and used in the paths.
    data_name : str
        Data variant, forwarded to ``prep.load_data`` and used in the paths.
    method : str
        Name of the embedding method; selects the embedding folder.
    sparsities : iterable of int, optional
        Levels ``p`` to evaluate. Each one must be a key of the loaded train
        reactions and have a ``train_embedding_{p}.csv`` on disk. Defaults to
        ``range(0, 100, 10)``, the grid that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The collected metric rows, with the imported ``columns`` as header.
        The same frame is saved as ``metrics.csv`` in the output folder.
    """
    train, test = prep.load_data(folder_name, data_name)
    # Only the second element of each pair is used: a mapping from level to a
    # reactions DataFrame (it is indexed by p/q and exposes .columns/.isna()).
    _, train_reactions = train
    _, test_reactions = test

    method_path = f"../../embeddings/{method}/{folder_name}/{data_name}"
    output_path = f"{method_path}/LR"
    os.makedirs(output_path, exist_ok=True)

    metrics = []
    for p in sparsities:
        train_embedding = pd.read_csv(f'{method_path}/train_embedding_{p}.csv', index_col=0)

        LR_models = train_LR_models(train_embedding.values, train_reactions[p])
        with open(f'{output_path}/models_{p}.pkl', 'wb') as file:
            pickle.dump(LR_models, file)

        train_predictions = pd.DataFrame(predict_from_models(LR_models, train_embedding.values),
                                         index=train_embedding.index, columns=train_reactions[p].columns)
        train_predictions.to_csv(f'{output_path}/train_predictions_{p}.csv')

        # Predictions are compared with the level-0 reactions (presumably the
        # complete ground truth -- confirm); the NaN mask of level p is passed
        # along so compute_metric can tell observed from missing entries.
        train_result = compute_metric(train_predictions, train_reactions[0], train_reactions[p].isna(), silent=True)
        add_to_metrics(metrics, train_result, [folder_name, data_name, 'Train', p, 'Train', p, method, 'LR'])

        # Models trained at level 0 are evaluated on every test level,
        # all other models only on the test set of their own level.
        test_list = test_reactions.keys() if p == 0 else [p]
        for q in test_list:
            test_embedding = pd.read_csv(f'{method_path}/test_embedding_{p}_{q}.csv', index_col=0)

            # NOTE(review): the index is taken from test_reactions[q], whereas
            # the train branch uses the embedding's own index -- confirm that
            # both are aligned row by row.
            test_predictions = pd.DataFrame(predict_from_models(LR_models, test_embedding.values),
                                            index=test_reactions[q].index, columns=test_reactions[q].columns)
            test_predictions.to_csv(f'{output_path}/test_predictions_{p}_{q}.csv')

            test_result = compute_metric(test_predictions, test_reactions[0], test_reactions[q].isna(), silent=True)
            add_to_metrics(metrics, test_result, [folder_name, data_name, 'Train', p, 'Test', q, method, 'LR'])

    metrics = pd.DataFrame(metrics, columns=columns)
    metrics.to_csv(f'{output_path}/metrics.csv')
    return metrics

In [16]:
# Experiment grid: every dataset is evaluated for every data type and
# every embedding method.
datasets = ['Smartvote', 'Synthetic_60_50']
datatypes = ['Original', 'Binary']
methods = ['PCA', 'TSNE']

# Nested loops visit the combinations in the same order as a cartesian
# product with the last list varying fastest.
for folder_name in datasets:
    for data_name in datatypes:
        for method in methods:
            metrics = evaluateLR(folder_name, data_name, method)

# `metrics` is reassigned on every iteration, so only the result of the last
# combination is shown here; evaluateLR writes each run's metrics to disk.
metrics.head()


Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
0,Synthetic_60_50,Binary,Train,0,Train,0,TSNE,LR,Fit,0.97175,0.142899
1,Synthetic_60_50,Binary,Train,0,Train,0,TSNE,LR,Impute,,
2,Synthetic_60_50,Binary,Train,0,Train,0,TSNE,LR,Overall,0.97175,0.142899
3,Synthetic_60_50,Binary,Train,0,Test,20,TSNE,LR,Fit,0.965,0.159694
4,Synthetic_60_50,Binary,Train,0,Test,20,TSNE,LR,Impute,0.963333,0.171679
