In [1]:
import os
import re
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, roc_auc_score

In [2]:
cd ../../src

/Users/cock/kDrive/PhD/Projects/Labs/beerslaw-lab/src


# Helper functions

In [3]:
# Load the id dictionary used by get_usernames_test, which reads
# idd['sequences'][idx]['learner_id'] to turn test indices into learner ids.
# The path is relative to the working directory set by the `cd` cell above.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('../data/sequenced_simulations/simplestate_secondslstm/id_dictionary.pkl', 'rb') as fp:
    idd = pickle.load(fp)

In [4]:
def crawl_resultspreds_paths(algorithm):
    """
    Walk the experiment folder `<base_path><algorithm>/` and sort the files found
    into two lists.

    `base_path` is read from the notebook's global namespace at call time.

    Returns:
        (prediction_paths, results_paths) where
        - prediction_paths holds every path containing 'predictions.pkl'
        - results_paths holds every path containing both 'xval' and 'pkl'
        A path satisfying both conditions appears in both lists.
    """
    root = '{}{}/'.format(base_path, algorithm)
    every_file = [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(root)
        for name in names
    ]

    prediction_paths = []
    results_paths = []
    for candidate in every_file:
        if 'predictions.pkl' in candidate:
            prediction_paths.append(candidate)
        if 'xval' in candidate and 'pkl' in candidate:
            results_paths.append(candidate)
    return prediction_paths, results_paths

def files_to_dict(paths, algo):
    '''
    Unpickle every file in `paths` and index the contents by sequence length.

    The length is the first all-digit path component (text between two '/') and is
    kept as a string, e.g. '30'. If several paths share a length, the last one wins.
    A path without such a component raises IndexError.

    `algo` is accepted but not used by this implementation.
    '''
    loaded = {}
    for file_path in paths:
        digit_components = re.findall('/([0-9]+)/', file_path)
        seq_length = digit_components[0]
        with open(file_path, 'rb') as handle:
            content = pickle.load(handle)
        loaded[seq_length] = content
    return loaded

def fill_in_predictions(preds):
    """
    Combine the per-length prediction files into one dictionary.

    Args:
        preds: dictionary keyed by the length as a string ('30', '40', '50', '60').
            Each value maps a username to a dictionary keyed by the integer length.

    Returns:
        Dictionary mapping each username to {length: prediction entry} for every
        length at which that username was predicted. Everything stored under
        preds['30'][username] is carried over; for the other files only the entry
        at that file's own length is taken.

    The input is left untouched: each learner's inner dictionary is copied before
    the longer lengths are added to it.
    """
    # Shallow-copy the inner dictionaries; a plain update() would share them with
    # preds['30'], and the assignments below would then modify the caller's data.
    full_preds = {
        username: dict(per_length)
        for username, per_length in preds['30'].items()
    }
    for key in ['40', '50', '60']:
        length = int(key)
        for username, per_length in preds[key].items():
            full_preds.setdefault(username, {})[length] = per_length[length]
    return full_preds

def collect_highest_length(preds, length, usernames):
    """
    Collect, for each username, the prediction made at `length`; when that learner
    has no prediction at `length` (sequence too short), fall back to the prediction
    at the highest length available for that learner.

    Args:
        preds: dictionary mapping username -> {length: prediction entry}.
        length: the length whose predictions are requested.
        usernames: iterable of usernames to collect.

    Returns:
        Dictionary mapping username -> prediction entry. Usernames absent from
        `preds` are printed and left out; usernames with no prediction at any
        length are left out silently.
    """
    current_predictions = {}
    for username in usernames:
        if username not in preds:
            # No prediction at any length for this learner: report it and move on.
            print(username)
            continue

        per_length = preds[username]
        if length in per_length:
            current_predictions[username] = per_length[length]
        elif per_length:
            # Carry the latest prediction forward. This used to be unreachable
            # because the `continue` ran for every missing length.
            current_predictions[username] = per_length[max(per_length)]
    return current_predictions

def compute_scores(cur_preds):
    """
    Score one set of predictions.

    Args:
        cur_preds: dictionary mapping a user to an entry holding 'pred', 'proba'
            and 'truth'. Element 1 of 'proba' is used as the score for the ROC AUC
            (presumably the positive-class probability - confirm against the model).

    Returns:
        Dictionary with the keys 'accuracy', 'balanced_accuracy', 'precision',
        'recall' and 'roc'.
    """
    entries = [cur_preds[user] for user in cur_preds]
    preds = [entry['pred'] for entry in entries]
    probabilities = [entry['proba'][1] for entry in entries]
    truths = [entry['truth'] for entry in entries]

    return {
        'accuracy': accuracy_score(truths, preds),
        'balanced_accuracy': balanced_accuracy_score(truths, preds),
        'precision': precision_score(truths, preds),
        'recall': recall_score(truths, preds),
        'roc': roc_auc_score(truths, probabilities),
    }
    
def compute_scores_foralllengths(results, full_preds, lengths=range(30, 70, 10)):
    """
    Compute the carry-forward scores for every requested length and fold.

    Args:
        results: dictionary keyed by the length as a string; each value is indexed
            by fold number (0-9) and holds that fold's 'test_indices'.
        full_preds: dictionary mapping username -> {length: prediction entry}.
        lengths: iterable of lengths to score. Defaults to 30, 40, 50, 60.

    Returns:
        (results, sc). `results` is the same object that was passed in, with
        'carry_on_scores' set on every fold. `sc` maps str(length) to the scores of
        the last fold processed for that length (earlier folds are overwritten).

    Reads the module-level `idd` dictionary to resolve test indices to usernames.
    """
    sc = {}
    # Honour the `lengths` argument; the loop used to hard-code range(30, 70, 10).
    for length in lengths:
        print('Length: {}'.format(length))

        all_usernames = get_usernames_test(results, length, idd)
        for f in range(10):
            cur_preds = collect_highest_length(full_preds, length, all_usernames[f])
            scores = compute_scores(cur_preds)

            results[str(length)][f]['carry_on_scores'] = scores
            sc[str(length)] = scores

    return results, sc
        
def get_usernames_test(results, length, idd):
    """
    Resolve the test indices of each of the 10 folds into learner ids.

    Args:
        results: dictionary keyed by the length as a string; the value is indexed
            by fold number and each fold holds a 'test_indices' sequence.
        length: length whose folds are looked up (converted with str()).
        idd: id dictionary; idd['sequences'][idx]['learner_id'] gives the learner.

    Returns:
        Dictionary mapping fold number (0-9) to the list of learner ids in that
        fold's test set, in the order of 'test_indices'.
    """
    fold_results = results[str(length)]
    return {
        fold: [
            idd['sequences'][idx]['learner_id']
            for idx in fold_results[fold]['test_indices']
        ]
        for fold in range(10)
    }

        
    


    

# SSAN

In [5]:
# Root of the experiment folder. crawl_resultspreds_paths reads this global at call
# time and walks '<base_path><algorithm>/'; the driver cell also writes results here.
base_path = '../experiments/nested early-chem-lab/'

In [10]:
# Find the predictions and cross-validation results files under '<base_path>ssan/'
prediction_paths, results_paths = crawl_resultspreds_paths('ssan')

# Read the prediction and results files into dictionaries keyed by length ('30', ...)
preds = files_to_dict(prediction_paths, 'ssan')
results = files_to_dict(results_paths, 'ssan')

# Merge the per-length predictions into one dictionary per learner
full_preds = fill_in_predictions(preds)

# Compute scores for all lengths (adds 'carry_on_scores' to every fold of results)
results, sc = compute_scores_foralllengths(results, full_preds)

# Save the updated results, one pickle per length, directly under '<base_path>ssan/'.
# NOTE(review): these file names contain 'xval' and 'pkl', so the crawl above picks
# them up on a re-run, and their path has no all-digit folder for files_to_dict to
# extract a length from (IndexError). Confirm the intended output location.
for length in results:
    path = '{}{}/early_nested_xval_m{}_l{}.pkl'.format(base_path, 'ssan', 'ssan', length)
    with open(path, 'wb') as fp:
        pickle.dump(results[length], fp)

Length: 30
vwfpuqaz
k7p5eryf
x5sm9pfu
uenn9vgu
ur9sxzx7
vqumj3t3
kq4eymtu
n6zakrku
xmh5qd3z
6tg95rzr
dnvedphf
amqnxthw
j6nndaxp
8jp62suc
jpccnav5
qqf6nsga
r42kwnt7
my6csh6m
cb3z5rjs
bjy2zpww
xe7c36dk
beypwbck
p6269reg
9p4h3zdx
pesfpzbv
y9tk3ysm
z6ur4njh
wyj76ntd
h4hxzc6s
982cf4dn
2ep3hayy
s9prcjfd
fs32fqe3
s47edumx
urwu33jd
bje92qp8
z8hvrhwb
wguewwkp
ecaf72fu
a57jkjgv
zy256ycq
mkw5afyy
zbabnwym
b8g49nvw
7ygreyfc
cp7mfn24
qfu6r8c7
rwax4gk7
5xuzjwdp
xuwyv9hz
7ck7mq6a
y8qbtkmf
va43652k
phupma28
3w54b2bu
7zjqat99
85pdk9mq
26z3wbqz
t7728bqp
ntqpqkpq
rdy7sx9b
33asfz2u
tzbaaz7e
qn4qjrvd
urxv4evt
3s6pz8qy
crak75dx
xau6wapw
x3ykresy
552gbupp
ac4q67z9
wvxkvhne
nmgve3yy
rfand3tt
n977pem4
z4nrhe2e
tvtbbfhg
uamuwrx2
p2u7xw38
xw9qt6r4
addf7f7d
npatrq7r
xpyjyx4m
3xcaamq9
v2kw3kup
sr34qyfx
chm4sr6j
6cs3annc
nrxpa2ac
kq2e6dgu
puzxz678
6h5vmwys
9rked2x5
5h9umyr6
gk7r7cee
qevswkvj
6ruh7enb
a6t5p3sb
4k4kc2k6
9p9gwu88
6j6u2yct
bqmcqxyx
krd7m9vb
nemddmx6
vjr7tshm
fw2ajjmt
4ak6da99
xtnmv9qf
9hdhrrat
wpszzhxa

In [11]:
# Inspect the merged per-learner predictions returned by fill_in_predictions.
full_preds

{'temu2736': {30: {'pred': 0,
   'proba': array([0.7769135, 0.2230865], dtype=float32),
   'truth': 1}},
 'ujpk3gf4': {30: {'pred': 1,
   'proba': array([0.36407328, 0.63592666], dtype=float32),
   'truth': 0}},
 '2hr6mkdc': {30: {'pred': 1,
   'proba': array([0.48087576, 0.51912427], dtype=float32),
   'truth': 1},
  40: {'pred': 1,
   'proba': array([0.40223423, 0.5977658 ], dtype=float32),
   'truth': 1},
  50: {'pred': 1,
   'proba': array([0.45802236, 0.5419776 ], dtype=float32),
   'truth': 1},
  60: {'pred': 0,
   'proba': array([0.5001175, 0.4998825], dtype=float32),
   'truth': 1}},
 'p44vw7td': {30: {'pred': 1,
   'proba': array([0.41828063, 0.5817194 ], dtype=float32),
   'truth': 1},
  40: {'pred': 1,
   'proba': array([0.3945232 , 0.60547686], dtype=float32),
   'truth': 1},
  50: {'pred': 1,
   'proba': array([0.3697501 , 0.63024986], dtype=float32),
   'truth': 1},
  60: {'pred': 1,
   'proba': array([0.32277894, 0.67722106], dtype=float32),
   'truth': 1}},
 'qsd9cb5e':

In [28]:
# Point the notebook at the capacitor-lab experiment folder. crawl_resultspreds_paths
# reads this global at call time, so every later crawl and save uses this root.
base_path = '../experiments/nested early-capacitor-lab/'

In [35]:
def files_to_dict(paths, algo='lstm'):
    '''
    Processes the list of files into a dictionary where the key is the length of the
    sequence for which the file was created, and the value is the unpickled file.

    Args:
        paths: iterable of pickle file paths. Each path is printed as it is read.
        algo: name of the algorithm folder that directly precedes the length folder
            in each path ('<algo>/<length>/'). Defaults to 'lstm', which keeps the
            single-argument call unchanged; accepting it also keeps the earlier
            two-argument call form, files_to_dict(paths, 'ssan'), working after
            this cell has redefined the function.

    Returns:
        Dictionary mapping the length (as a string, e.g. '30') to the loaded object.
        If several paths share a length, the last one wins.

    Raises:
        IndexError: if a path contains no '<algo>/<digits>/' component.
    '''
    # re.escape keeps algorithm names containing regex metacharacters literal.
    length_re = re.compile('{}/([0-9]+)/'.format(re.escape(algo)))
    dics = {}
    for path in paths:
        print(path)
        length = length_re.findall(path)[0]
        with open(path, 'rb') as fp:
            dics[length] = pickle.load(fp)
    return dics

In [40]:
def compute_scores_foralllengths(results, full_preds, lengths=range(30, 70, 10)):
    """
    Compute the carry-forward scores for every requested length and fold, skipping
    folds for which no prediction could be collected.

    Args:
        results: dictionary keyed by the length as a string; each value is indexed
            by fold number (0-9) and holds that fold's 'test_indices'.
        full_preds: dictionary mapping username -> {length: prediction entry}.
        lengths: iterable of lengths to score. Defaults to 30, 40, 50, 60.

    Returns:
        (results, sc). `results` is the same object that was passed in, with
        'carry_on_scores' set on every scored fold. `sc` maps str(length) to the
        scores of the last scored fold for that length (earlier folds are
        overwritten; a length with no scored fold has no entry).

    Reads the module-level `idd` dictionary to resolve test indices to usernames.
    Prints each length and the usernames of each fold as it goes.
    """
    sc = {}
    # Honour the `lengths` argument; the loop used to hard-code range(30, 70, 10).
    for length in lengths:
        print('Length: {}'.format(length))

        all_usernames = get_usernames_test(results, length, idd)
        for f in range(10):
            print(all_usernames[f])
            cur_preds = collect_highest_length(full_preds, length, all_usernames[f])
            if len(cur_preds) == 0:
                # Nothing to score for this fold; leave its results entry untouched.
                continue
            scores = compute_scores(cur_preds)

            results[str(length)][f]['carry_on_scores'] = scores
            sc[str(length)] = scores

    return results, sc

In [41]:
# Find the predictions and cross-validation results files under '<base_path>lstm/'
prediction_paths, results_paths = crawl_resultspreds_paths('lstm')

# Read the prediction and results files into dictionaries keyed by length ('30', ...)
preds = files_to_dict(prediction_paths)
results = files_to_dict(results_paths)

# Merge the per-length predictions into one dictionary per learner
full_preds = fill_in_predictions(preds)

# Compute scores for all lengths (adds 'carry_on_scores' to the scored folds of results)
results, sc = compute_scores_foralllengths(results, full_preds)

# Save the updated results, one pickle per length, directly under '<base_path>lstm/'.
# NOTE(review): these file names contain 'xval' and 'pkl', so the crawl above picks
# them up on a re-run, and their path has no 'lstm/<digits>/' component for
# files_to_dict to extract a length from (IndexError). Confirm the output location.
for length in results:
    path = '{}{}/early_nested_xval_m{}_l{}.pkl'.format(base_path, 'lstm', 'lstm', length)
    with open(path, 'wb') as fp:
        pickle.dump(results[length], fp)

../experiments/nested early-capacitor-lab/lstm/50/capacitor-early-lstm-50/binary_edm2021/2classes/lstm/raw_full/2022_03_03_0/results/predictions.pkl
../experiments/nested early-capacitor-lab/lstm/60/binary_edm2021/2classes/lstm/raw_full/2022_03_03_0/results/predictions.pkl
../experiments/nested early-capacitor-lab/lstm/30/capacitor-early-lstm-30/binary_edm2021/2classes/lstm/raw_full/2022_03_03_0/results/predictions.pkl
../experiments/nested early-capacitor-lab/lstm/40/capacitor-early-lstm-40/binary_edm2021/2classes/lstm/raw_full/2022_03_03_0/results/predictions.pkl
../experiments/nested early-capacitor-lab/lstm/50/capacitor-early-lstm-50/binary_edm2021/2classes/lstm/raw_full/2022_03_03_0/results/early_nested_xval_mlstm_l50.pkl
../experiments/nested early-capacitor-lab/lstm/60/binary_edm2021/2classes/lstm/raw_full/2022_03_03_0/results/early_nested_xval_mlstm_l60.pkl
../experiments/nested early-capacitor-lab/lstm/30/capacitor-early-lstm-30/binary_edm2021/2classes/lstm/raw_full/2022_03_03

In [37]:
# NOTE(review): this cell repeats the LSTM pipeline cell directly above it line for
# line, and its execution count (37) is lower than that cell's (41) - the notebook
# was run out of order. Consider keeping only one copy.
# Find the predictions and cross-validation results files under '<base_path>lstm/'
prediction_paths, results_paths = crawl_resultspreds_paths('lstm')

# Read the prediction and results files into dictionaries keyed by length ('30', ...)
preds = files_to_dict(prediction_paths)
results = files_to_dict(results_paths)

# Merge the per-length predictions into one dictionary per learner
full_preds = fill_in_predictions(preds)

# Compute scores for all lengths
results, sc = compute_scores_foralllengths(results, full_preds)

# Save the updated results, one pickle per length, directly under '<base_path>lstm/'
for length in results:
    path = '{}{}/early_nested_xval_m{}_l{}.pkl'.format(base_path, 'lstm', 'lstm', length)
    with open(path, 'wb') as fp:
        pickle.dump(results[length], fp)

{'271097298': {30: {'pred': 0,
   'proba': array([0.7694552 , 0.23054487], dtype=float32),
   'truth': 1},
  40: {'pred': 1,
   'proba': array([0.01592877, 0.98407125], dtype=float32),
   'truth': 1},
  50: {'pred': 1,
   'proba': array([2.3370115e-04, 9.9976629e-01], dtype=float32),
   'truth': 1},
  60: {'pred': 1,
   'proba': array([0.42806748, 0.57193255], dtype=float32),
   'truth': 1}},
 '33100398': {30: {'pred': 1,
   'proba': array([0.2296266, 0.7703734], dtype=float32),
   'truth': 1},
  40: {'pred': 1,
   'proba': array([0.12097234, 0.8790276 ], dtype=float32),
   'truth': 1},
  50: {'pred': 1,
   'proba': array([0.4586232, 0.5413768], dtype=float32),
   'truth': 1},
  60: {'pred': 0,
   'proba': array([0.53766644, 0.4623336 ], dtype=float32),
   'truth': 1}},
 '642389298': {30: {'pred': 0,
   'proba': array([0.8437312 , 0.15626873], dtype=float32),
   'truth': 0},
  40: {'pred': 0,
   'proba': array([0.9422979 , 0.05770213], dtype=float32),
   'truth': 0}},
 '983174198': {30