In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
import os

In [2]:
# FIND THE THRESHOLD
def get_thres_fold(gold_labels_train_folds, results_softmax, folds=5):
    """Pick one decision threshold per fold by matching label cardinality.

    For every fold, each candidate threshold ``t`` in ``0, 0.001, ..., 0.999``
    is scored by the absolute difference between

    * the training label cardinality: the sum of all entries of the fold's
      training labels divided by the number of training rows, and
    * the prediction label cardinality: the number of scores strictly greater
      than ``t`` divided by the number of prediction rows.

    The threshold with the smallest difference is kept; ties go to the
    smallest threshold because ``np.argmin`` returns the first minimum.

    Parameters
    ----------
    gold_labels_train_folds : mapping or sequence indexed by fold number
        ``gold_labels_train_folds[i]`` must support ``len()`` and
        ``to_numpy()`` (e.g. a pandas DataFrame of label indicators).
    results_softmax : mapping or sequence indexed by fold number
        ``results_softmax[i]`` is a 2-D array-like of prediction scores.
    folds : int, default 5
        Number of folds; indices ``0 .. folds-1`` are read from both inputs.

    Returns
    -------
    numpy.ndarray of shape ``(folds,)``
        ``optimal_t[i]`` is the selected threshold for fold ``i``.
    """
    T = np.arange(0, 1, step=0.001)
    # Sized by `folds` (it used to be a fixed length of 5, which broke for
    # folds > 5 and padded the result with zeros for folds < 5).
    optimal_t = np.zeros(folds)
    for i in range(folds):
        train_labels = gold_labels_train_folds[i]
        LCard_train = train_labels.to_numpy().sum() / len(train_labels)

        scores = np.asarray(results_softmax[i])
        test_size = scores.shape[0]
        diff = np.zeros_like(T)  # |LCard(train) - LCard(test)| per threshold
        for j, t in enumerate(T):
            LCard_test = (scores > t).sum() / test_size
            diff[j] = np.abs(LCard_train - LCard_test)

        optimal_t[i] = T[np.argmin(diff)]
    return optimal_t

In [3]:
# FIND THE THRESHOLD
def get_thres_fold_frame(gold_labels_train_folds, results_softmax, folds=5, n_frames=9):
    """Pick one decision threshold per (fold, frame) by matching label cardinality.

    For every fold and every frame column, each candidate threshold ``t`` in
    ``0, 0.001, ..., 0.999`` is scored by the absolute difference between

    * the frame's training rate: column sum of the fold's training labels
      divided by the number of training rows, and
    * the frame's predicted rate: number of scores in that column strictly
      greater than ``t`` divided by the number of prediction rows.

    The threshold with the smallest difference is kept; ties go to the
    smallest threshold because ``np.argmin`` returns the first minimum.

    Parameters
    ----------
    gold_labels_train_folds : mapping or sequence indexed by fold number
        ``gold_labels_train_folds[i]`` is a pandas DataFrame with one column
        per frame. It is only read; its column labels are left untouched.
    results_softmax : mapping or sequence indexed by fold number
        ``results_softmax[i]`` is a 2-D array-like with one column per frame.
    folds : int, default 5
        Number of folds; indices ``0 .. folds-1`` are read from both inputs.
    n_frames : int, default 9
        Number of frame columns to compute thresholds for.

    Returns
    -------
    numpy.ndarray of shape ``(folds, n_frames)``
        ``optimal_t[i, f]`` is the selected threshold for frame column ``f``
        in fold ``i``.
    """
    T = np.arange(0, 1, step=0.001)
    optimal_t = np.zeros((folds, n_frames))
    for i in range(folds):
        train_labels = gold_labels_train_folds[i]
        # Convert to a plain array so frames are addressed by position. This
        # avoids renaming the caller's columns in place and avoids integer
        # lookups on a label-indexed Series.
        LCard_train_frames = train_labels.sum(axis=0).to_numpy() / len(train_labels)

        scores = np.asarray(results_softmax[i])
        test_size = scores.shape[0]
        for frame in range(n_frames):
            frame_scores = scores[:, frame]
            diff = np.zeros_like(T)  # |LCard(train) - LCard(test)| per threshold
            for j, t in enumerate(T):
                LCard_test = (frame_scores > t).sum() / test_size
                diff[j] = np.abs(LCard_train_frames[frame] - LCard_test)

            optimal_t[i, frame] = T[np.argmin(diff)]

    return optimal_t

In [4]:
def plot_diff_by_thres(diff, T):
    """Scatter-plot cardinality differences against their thresholds.

    Parameters
    ----------
    diff : array-like
        One value per threshold, plotted on the y axis.
    T : array-like
        Thresholds, plotted on the x axis; same length as ``diff``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    y_label = "LCard(train) - LCard(Test)_t"
    df = pd.DataFrame({"Thresholds": T, y_label: diff})
    # x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, and keywords also work on older releases.
    lineplt = sns.lmplot(x="Thresholds", y=y_label, data=df, fit_reg=False)
    # Access the Figure (`.figure` is the current name, `.fig` the legacy one)
    fig = lineplt.figure if hasattr(lineplt, "figure") else lineplt.fig
    # Add a title to the Figure
    fig.suptitle('Absolute diffs of average train and test cardinalities, last fold', fontsize=12)
    # Show the plot
    plt.show()

In [5]:
def get_metrics(optimal_t, results, gold_labels_test_folds, multiclass=False, folds=5):
    """Compute evaluation metrics for every fold.

    Parameters
    ----------
    optimal_t : indexable or None
        ``optimal_t[i]`` is the threshold (scalar or per-frame row) applied to
        fold ``i``. Not read when ``multiclass`` is true.
    results : indexable by fold
        ``results[i]`` is a 2-D NumPy array of prediction scores.
    gold_labels_test_folds : indexable by fold
        ``gold_labels_test_folds[i]`` is a DataFrame of gold label indicators.
    multiclass : bool, default False
        When true, scores are binarized with ``> 0`` and the multi-label gold
        labels are additionally loaded via
        ``get_gold_labels(False, None, "dataset", german=False)`` to tell
        single-frame articles apart.
    folds : int, default 5
        Number of folds to evaluate.

    Returns
    -------
    dict
        ``{fold: {"hamming", "f1_macro", "f1_micro", "f1_weighted",
        "exact_match", "auc", "exact_match_multiple", "number_multiple",
        "exact_match_single"}}``. In multiclass mode the two ``*_multiple``
        entries are ``np.nan``.
    """
    per_fold = {}

    # Multiclass accuracy on single-frame articles needs the multi-label gold
    # set, because only that one records second labels when they exist.
    if multiclass:
        multilabeled_gold = get_gold_labels(False, None, "dataset", german=False)

    for i in range(folds):
        # gold labels of this fold, label-indicator format
        gold_labels = gold_labels_test_folds[i]
        size = len(gold_labels)

        # binarize the scores: any positive score in multiclass mode,
        # otherwise the fold's threshold
        cutoff = 0 if multiclass else optimal_t[i]
        binary_df_pred = results[i] > cutoff

        # frames that were never predicted or never occurred are left out
        never_predicted = np.where(binary_df_pred.sum(axis=0) == 0)[0]
        never_occurred = np.where(gold_labels.sum(axis=0) == 0)[0]
        excluded = np.union1d(never_occurred, never_predicted)
        label_indices = np.setdiff1d(np.arange(9), excluded)

        gold_selected = gold_labels.iloc[:, label_indices]
        pred_selected = binary_df_pred[:, label_indices]

        # hamming could use every frame; the selection is kept for consistency
        hamming = metrics.hamming_loss(gold_selected, pred_selected)

        # f1 is only well defined for frames that are both predicted and present
        f1_macro = metrics.f1_score(gold_selected, pred_selected, average='macro')
        f1_micro = metrics.f1_score(gold_selected, pred_selected, average='micro')
        f1_weighted = metrics.f1_score(gold_selected, pred_selected, average='weighted')

        # auc on the raw scores, weighted averaging
        auc = metrics.roc_auc_score(gold_selected, results[i][:, label_indices], average='weighted')

        if multiclass:
            # multi-label statistics do not apply here
            match_multiple = np.nan
            number_multiple = np.nan

            gold_argmax = gold_labels.to_numpy().argmax(1)
            # predictions are already one-hot, argmax recovers the class
            pred_argmax = binary_df_pred.argmax(1)
            hits = np.equal(gold_argmax, pred_argmax)
            exact_match = np.sum(hits) / size

            # accuracy restricted to articles with a single gold frame
            has_multiple = np.sum(multilabeled_gold[i], axis=1) > 1.0
            match_single = np.mean(hits[~has_multiple])
        else:
            cellwise = np.equal(gold_labels, binary_df_pred)
            exact_match = np.sum(cellwise.all(1)) / size

            has_multiple = np.sum(gold_labels, axis=1) > 1.0
            results_multiple = cellwise.loc[has_multiple]
            number_multiple = len(results_multiple)
            match_multiple = np.sum(results_multiple.all(1)) / number_multiple

            # accuracy restricted to articles with a single gold frame
            results_single = cellwise.loc[~has_multiple]
            number_single = len(results_single)
            match_single = np.sum(results_single.all(1)) / number_single

        per_fold[i] = {"hamming": hamming,
                       "f1_macro": f1_macro,
                       "f1_micro": f1_micro,
                       "f1_weighted": f1_weighted,
                       "exact_match": exact_match,
                       "auc": auc,
                       "exact_match_multiple": match_multiple,
                       "number_multiple": number_multiple,
                       "exact_match_single": match_single}
    return per_fold

In [6]:
def string_for_latex_table(rs):
    """Format selected metrics of ``rs`` as one ``&``-separated table row.

    ``rs`` is converted with ``astype(str)`` and the entries are concatenated
    in the order f1_macro, f1_weighted, f1_micro, auc, hamming,
    exact_match_single, exact_match_multiple, exact_match.
    """
    cells = rs.astype(str)
    column_order = ('f1_macro', 'f1_weighted', 'f1_micro', 'auc', 'hamming',
                    'exact_match_single', 'exact_match_multiple', 'exact_match')
    row = ""
    for position, key in enumerate(column_order):
        if position:
            row = row + '&'
        row = row + cells[key]
    return row

In [68]:
def collate_results(p, thres, multiclass, test_path, train_path, german=False):
    """Load pickled predictions, threshold them and return a LaTeX table row.

    Parameters
    ----------
    p : str
        Path of a pickle file holding per-fold prediction score arrays.
    thres : {'fold', 'fold_frame', 'sigmoid', 'multiclass'}
        Thresholding strategy: ``'fold'`` uses ``get_thres_fold``,
        ``'fold_frame'`` uses ``get_thres_fold_frame``, ``'sigmoid'`` uses a
        constant 0.5 and ``'multiclass'`` uses no threshold (``None``).
    multiclass : bool
        When true, every score row is reduced to its arg-max entry (all other
        entries set to 0) before evaluation.
    test_path, train_path : str or None
        Directories handed to ``get_gold_labels``; ``train_path`` is only read
        for the ``'fold'`` and ``'fold_frame'`` strategies.
    german : bool, default False
        Forwarded to ``get_gold_labels``.

    Returns
    -------
    str
        The fold-averaged metrics, rounded to two decimals, formatted by
        ``string_for_latex_table``.

    Raises
    ------
    ValueError
        If ``thres`` is not one of the known strategies.
    """
    # Validate first so that a typo fails fast, before any file is read.
    known_strategies = ('fold', 'fold_frame', 'sigmoid', 'multiclass')
    if thres not in known_strategies:
        raise ValueError(
            "Thresholding strategy not known: %r (expected one of %s)"
            % (thres, ", ".join(known_strategies))
        )

    # read results
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(p, "rb") as pickle_in:
        results = pickle.load(pickle_in)

    # read gold labels for test folds
    gold_labels_test_folds = get_gold_labels(False, train_path, test_path, german)

    # if multiclass results are from softmax
    if multiclass:
        results_mc = {}
        golds_mc = {}
        for fold in results.keys():
            # one-hot mask of the arg-max class of every row
            zeros = np.zeros_like(results[fold])
            zeros[np.arange(zeros.shape[0]), results[fold].argmax(axis=1)] = 1.0

            # keep only the softmax score of the predicted class
            results_mc[fold] = zeros * results[fold]
            golds_mc[fold] = gold_labels_test_folds[fold]

        # store results back in binarized format
        results = results_mc
        gold_labels_test_folds = golds_mc

    if thres == 'fold':
        gold_labels_train_folds = get_gold_labels(True, train_path, test_path, german)
        optimal_t = get_thres_fold(gold_labels_train_folds, results)
    elif thres == 'fold_frame':
        gold_labels_train_folds = get_gold_labels(True, train_path, test_path, german)
        optimal_t = get_thres_fold_frame(gold_labels_train_folds, results)
    elif thres == 'sigmoid':
        optimal_t = np.full(9, 0.5)
    else:  # 'multiclass'
        optimal_t = None

    # named fold_metrics so the sklearn `metrics` module is not shadowed
    fold_metrics = get_metrics(optimal_t, results, gold_labels_test_folds, multiclass=multiclass)
    s = string_for_latex_table(pd.DataFrame(fold_metrics).mean(axis=1).round(2))
    return s


In [58]:
def get_gold_labels(train, train_path, test_path, german=False, folds=5):
    """Read the gold labels of every fold from tab-separated files.

    Files are read without a header row and everything from the fourth
    column onwards (``iloc[:, 3:]``) is kept.

    Parameters
    ----------
    train : bool
        If true read ``<train_path>/<fold>/train.tsv``; otherwise read the
        dev file below ``test_path``.
    train_path : str or None
        Root directory of the training folds; only used when ``train`` is true.
    test_path : str or None
        Root directory of the dev data; only used when ``train`` is false.
    german : bool, default False
        The German dataset doesn't have folds for the dev set: when true (and
        ``train`` is false) the single file ``<test_path>/dev.tsv`` is read
        for every fold instead of ``<test_path>/<fold>/dev.tsv``.
    folds : int, default 5
        Number of folds to load.

    Returns
    -------
    dict
        ``{fold: DataFrame}`` for ``fold`` in ``0 .. folds-1``.
    """
    gold_labels = {}
    for i in range(folds):
        if train:
            tsv_path = os.path.join(train_path, str(i), "train.tsv")
        elif german:
            # same file for every fold; read again so that each fold owns an
            # independent DataFrame
            tsv_path = os.path.join(test_path, "dev.tsv")
        else:
            tsv_path = os.path.join(test_path, str(i), "dev.tsv")
        # train or test gold labels, in binarized format
        gold_labels[i] = pd.read_csv(tsv_path, header=None, sep='\t').iloc[:, 3:]
    return gold_labels

In [70]:
def collate_9binary(path):
    """Stack the predictions of nine per-frame binary classifiers per fold.

    For each fold ``i`` in ``0..4`` and each frame ``k`` in ``1..9`` the file
    ``<path>/frame<k>/<i>/predictions.pkl`` is unpickled and element ``[1]``
    of the loaded object is written into column ``k-1`` of the fold's matrix.

    Parameters
    ----------
    path : str
        Experiment directory containing the ``frame1`` .. ``frame9`` folders.

    Returns
    -------
    dict
        ``{fold: numpy.ndarray}`` with arrays of shape ``(n_samples, 9)``,
        where ``n_samples`` is the length of the frame-1 predictions.
    """
    results_folds = {}
    for i in range(5):
        results = None
        for j in range(9):
            p = os.path.join(path, "frame" + str(j + 1), str(i), 'predictions.pkl')
            # NOTE: pickle.load can execute arbitrary code; only open trusted files.
            with open(p, 'rb') as pickle_in:
                predictions = pickle.load(pickle_in)[1]
            if results is None:
                # frame 1 is read first and fixes the dev set size
                results = np.zeros((len(predictions), 9))
            results[:, j] = predictions  # concatenate predictions column-wise
        results_folds[i] = results
    return results_folds

In [56]:
# Header cells for the printed result rows: a model column followed by the
# metrics in the order emitted by string_for_latex_table.
metric_names = [
    "Model",
    "f1-macro",
    "f1-weighted",
    "f1-micro",
    "auc",
    "hamming",
    "exact_match_single",
    "exact_match_multiple",
    "exact_match_all",
]

### Multilabel and Multiclass comparisons

In [45]:
p = ['multilabel_engbert_focal.pkl',
     'multilabel_multibert_focal.pkl',
     'multiclass_engbert_softmaxfocal.pkl',
     'multiclass_multibert_softmaxfocal.pkl']

print(' '.join(metric_names))

# Multi-label models: the threshold is a constant 0.5, so no train path is needed.
for pkl in p[:2]:
    row = collate_results(pkl, thres='sigmoid', multiclass=False,
                          test_path='dataset', train_path=None)
    print(pkl, ": ", row)

# Multi-class models: evaluated against the multiclass gold labels.
multiclass_dir = "dataset/dataset_multiclass_wide"
for pkl in p[2:]:
    row = collate_results(pkl, thres='multiclass', multiclass=True,
                          test_path=multiclass_dir,
                          train_path=multiclass_dir, german=False)
    print(pkl, ": ", row)

Model f1-macro f1-weighted f1-micro auc hamming exact_match_single exact_match_multiple exact_match_all
multilabel_engbert_focal.pkl :  0.8&0.85&0.86&0.97&0.04&0.81&0.53&0.74
multilabel_multibert_focal.pkl :  0.75&0.8&0.81&0.95&0.05&0.77&0.48&0.7
multiclass_engbert_softmaxfocal.pkl :  0.77&0.83&0.83&0.91&0.04&0.86&nan&0.83
multiclass_multibert_softmaxfocal.pkl :  0.73&0.79&0.79&0.89&0.05&0.82&nan&0.79


In [12]:
# Cased English BERT with focal loss, evaluated at the constant 0.5 threshold
# ('sigmoid'); train_path is None because that strategy reads no train labels.
p = 'multilabel_engbert_cased_focal.pkl'
print(p, ": ", collate_results(p, thres = 'sigmoid', 
                                  multiclass=False, test_path='dataset', train_path=None))

multilabel_engbert_cased_focal.pkl :  0.79&0.84&0.85&0.97&0.04&0.79&0.51&0.72


### 9 Binary Classifiers

In [72]:
# Collate the nine per-frame binary classifiers into one per-fold score
# matrix, cache it as a pickle in the working directory, then evaluate it
# like a multi-label model.
# NOTE(review): the base directory is an absolute path; presumably it only
# exists on the original author's machine -- confirm before re-running.
expname = 'binary_multibert_focal_9binary_classifiers'
p = expname+".pkl"
path = os.path.join("/usr4/cs591/akyurek/linknb", expname)
results_folds = collate_9binary(path)
with open(p,'wb') as f:
    pickle.dump(results_folds, f)
    
# thres is given as sigmoid as we want 0.5 to be the thres
print(p, ": ", collate_results(p, thres='sigmoid', 
                               multiclass=False, test_path='dataset', train_path=None))

binary_multibert_focal_9binary_classifiers.pkl :  0.74&0.82&0.82&0.95&0.05&0.69&0.58&0.66


### Different Softmax Loss Experiments Metrics

In [73]:
# MultiBERT softmax-loss variants, each evaluated with two thresholding
# strategies: LCard-1 = one threshold per fold (thres='fold') and
# LCard-2 = one threshold per fold and frame (thres='fold_frame').
p = ["multilabel_multibert_softmax.pkl",
     "multilabel_multibert_softmax_weighted.pkl",
     "multilabel_multibert_log_normalized_softmax.pkl",
     "multilabel_multibert_normalized_log_softmax.pkl"]

print(' '.join(metric_names))

for i in p:
    s1 = collate_results(i, thres = 'fold', multiclass=False, 
                         test_path='dataset', train_path='dataset')
    s2 = collate_results(i, thres = 'fold_frame', multiclass=False, 
                         test_path='dataset', train_path='dataset')

    print(i, " (LCard-1)", s1)
    print(i, " (LCard-2)", s2)

Model f1-macro f1-weighted f1-micro auc hamming exact_match_single exact_match_multiple exact_match_all
multilabel_multibert_softmax.pkl  (LCard-1) 0.75&0.81&0.82&0.95&0.05&0.77&0.52&0.71
multilabel_multibert_softmax.pkl  (LCard-2) 0.74&0.8&0.8&0.95&0.05&0.76&0.45&0.68
multilabel_multibert_softmax_weighted.pkl  (LCard-1) 0.78&0.82&0.82&0.96&0.05&0.77&0.43&0.69
multilabel_multibert_softmax_weighted.pkl  (LCard-2) 0.78&0.81&0.81&0.96&0.05&0.76&0.45&0.69
multilabel_multibert_log_normalized_softmax.pkl  (LCard-1) 0.64&0.73&0.72&0.88&0.08&0.74&0.06&0.58
multilabel_multibert_log_normalized_softmax.pkl  (LCard-2) 0.63&0.72&0.72&0.88&0.08&0.75&0.07&0.58
multilabel_multibert_normalized_log_softmax.pkl  (LCard-1) 0.75&0.81&0.82&0.95&0.05&0.76&0.51&0.7
multilabel_multibert_normalized_log_softmax.pkl  (LCard-2) 0.75&0.82&0.82&0.95&0.05&0.76&0.49&0.7


### Zero-shot and Few-shot results for German

In [37]:
p = ['multilabel_multibert_german_zero_shot_focal.pkl',
     'multilabel_multibert_german_zero_shot_softmax.pkl',
     'multilabel_multibert_german_zero_shot_softmax_weighted.pkl',
     'multilabel_multibert_german_30_shot_focal.pkl',
     'multilabel_multibert_german_30_shot_softmax.pkl',]
print(' '.join(metric_names))

# One row per experiment: (pickle, thresholding strategy, test dir, train dir).
# The 'sigmoid' runs use a constant 0.5 threshold and need no train dir.
german_runs = [
    (p[0], 'sigmoid', "dataset/german", None),
    (p[1], 'fold', "dataset/german", "dataset"),
    (p[2], 'fold', "dataset/german", "dataset"),
    (p[3], 'sigmoid', "dataset/german/30shot", None),
    (p[4], 'fold', "dataset/german/30shot", "dataset"),
]
for pkl, strategy, test_dir, train_dir in german_runs:
    row = collate_results(pkl, thres=strategy, multiclass=False,
                          test_path=test_dir, train_path=train_dir, german=True)
    print(pkl, ": ", row)

Model f1-macro f1-weighted f1-micro auc hamming exact_match_single exact_match_multiple exact_match_all
multilabel_multibert_german_zero_shot_focal.pkl :  0.53&0.66&0.69&0.87&0.12&0.46&0.34&0.39
multilabel_multibert_german_zero_shot_softmax.pkl :  0.5&0.66&0.68&0.85&0.12&0.58&0.29&0.42
multilabel_multibert_german_zero_shot_softmax_weighted.pkl :  0.52&0.58&0.59&0.84&0.16&0.45&0.17&0.29
multilabel_multibert_german_30_shot_focal.pkl :  0.72&0.78&0.77&0.95&0.12&0.61&0.38&0.46
multilabel_multibert_german_30_shot_softmax.pkl :  0.63&0.69&0.72&0.93&0.14&0.65&0.21&0.37


### Exploring the translation in both directions

In [38]:
pickles = ['multilabel_multibert_focal_train_en2de_test_de.pkl',
           'multilabel_multibert_softmax_train_en2de_test_de.pkl',
           'multilabel_multibert_softmax_weighted_train_en2de_test_de.pkl',
           'multilabel_multibert_focal_train_en_test_de2en.pkl',
           'multilabel_multibert_softmax_train_en_test_de2en.pkl',
           'multilabel_multibert_softmax_weighted_train_en_test_de2en.pkl',
           'multilabel_engbert_focal_train_en_test_de2en.pkl',
           'multilabel_engbert_softmax_train_en_test_de2en.pkl',
           'multilabel_engbert_softmax_weighted_train_en_test_de2en.pkl']

en2de_train_dir = "dataset/german/TrainEnglish2German"
en2de_test_dir = "dataset/german/TrainEnglish2German/0"
de2en_dir = "dataset/german/TestGerman2English"

# One row per experiment: (pickle, thresholding strategy, test dir, train dir).
# The 'sigmoid' runs use a constant 0.5 threshold and need no train dir.
translation_runs = [
    # MultiBERT, train set translated EN->DE, tested on German
    (pickles[0], 'sigmoid', en2de_test_dir, None),
    (pickles[1], 'fold', en2de_test_dir, en2de_train_dir),
    (pickles[2], 'fold', en2de_test_dir, en2de_train_dir),
    # MultiBERT, trained on English, test set translated DE->EN
    (pickles[3], 'sigmoid', de2en_dir, None),
    (pickles[4], 'fold', de2en_dir, de2en_dir),
    (pickles[5], 'fold', de2en_dir, de2en_dir),
    # English BERT, trained on English, test set translated DE->EN
    (pickles[6], 'sigmoid', de2en_dir, None),
    (pickles[7], 'fold', de2en_dir, de2en_dir),
    (pickles[8], 'fold', de2en_dir, de2en_dir),
]
for pkl, strategy, test_dir, train_dir in translation_runs:
    print(collate_results(pkl, thres=strategy, multiclass=False,
                          test_path=test_dir,
                          train_path=train_dir, german=True))

0.59&0.69&0.68&0.88&0.12&0.75&0.27&0.48
0.55&0.69&0.7&0.88&0.12&0.74&0.28&0.48
0.52&0.6&0.59&0.87&0.15&0.72&0.23&0.44
0.6&0.72&0.75&0.9&0.1&0.77&0.4&0.56
0.59&0.71&0.74&0.88&0.1&0.77&0.38&0.55
0.6&0.71&0.73&0.89&0.1&0.75&0.36&0.53
0.63&0.75&0.78&0.91&0.09&0.74&0.47&0.59
0.58&0.72&0.75&0.89&0.1&0.77&0.37&0.55
0.61&0.74&0.76&0.9&0.1&0.77&0.39&0.56


In [10]:
# Cased English BERT with focal loss on the German->English translated test
# set, evaluated at the constant 0.5 threshold ('sigmoid').
p = "multilabel_engbert_cased_focal_train_en_test_de2en.pkl"

print(collate_results(p, thres = 'sigmoid', multiclass=False, 
                      test_path="dataset/german/TestGerman2English", 
                      train_path=None, german=True))

0.58&0.72&0.75&0.9&0.1&0.72&0.41&0.54


## Appendix

In [6]:
def collate_4binary(path):
    """Stack predictions and gold labels of the four binary classifiers per fold.

    For each fold ``i`` in ``0..4`` and each frame in ``[1, 2, 4, 5]`` the file
    ``<path>/frame<frame>/<i>/predictions.pkl`` is unpickled; element ``[1]``
    (predictions) and element ``[2]`` (gold labels) of the loaded object are
    written into the column that matches the frame's position in
    ``[1, 2, 4, 5]``.

    Parameters
    ----------
    path : str
        Experiment directory containing the ``frame<k>`` folders.

    Returns
    -------
    (dict, dict)
        ``results_folds`` and ``golds_folds``, each ``{fold: numpy.ndarray}``
        with arrays of shape ``(n_samples, 4)``, where ``n_samples`` is the
        length of the frame-1 predictions.
    """
    frames = [1, 2, 4, 5]
    results_folds = {}
    golds_folds = {}
    for i in range(5):
        results = None
        golds = None
        # Enumerating all four frames keeps column j aligned with frames[j];
        # previously frames 2/4/5 landed in columns 0..2, overwriting frame 1
        # and leaving the last column empty, and the gold matrix stayed zero.
        for j, frame in enumerate(frames):
            p = os.path.join(path, "frame" + str(frame), str(i), 'predictions.pkl')
            # NOTE: pickle.load can execute arbitrary code; only open trusted files.
            with open(p, 'rb') as pickle_in:
                preds_golds = pickle.load(pickle_in)

            if results is None:
                # frame 1 is read first and fixes the dev set size
                size = len(preds_golds[1])
                results = np.zeros((size, len(frames)))
                golds = np.zeros((size, len(frames)))

            results[:, j] = preds_golds[1]
            golds[:, j] = preds_golds[2]

        results_folds[i] = results
        golds_folds[i] = golds

    return results_folds, golds_folds

### 4 Binary Classifiers

In [173]:
def get_4binary_metrics(expname, base_dir="/usr4/cs591/akyurek/linknb"):
    """Fold-averaged precision/recall of the binary classifiers for frames 1, 2, 4, 5.

    For each frame and each fold ``i`` in ``0..4`` the file
    ``<base_dir>/<expname>/frame<frame>/<i>/predictions.pkl`` is unpickled;
    element ``[1]`` is binarized with ``> 0.5`` and scored against element
    ``[2]`` (the binary gold labels).

    Parameters
    ----------
    expname : str
        Experiment folder name below ``base_dir``.
    base_dir : str, optional
        Directory holding the experiment folders. Defaults to the location
        that was previously hard-coded.

    Returns
    -------
    (dict, dict)
        ``m[frame] = {'precision': ..., 'recall': ...}`` -- means over folds,
        rounded to two decimals; folds in which the frame is never predicted
        are ignored for precision (``np.nanmean``).
        ``d[frame] = {fold: mean of the gold labels, rounded to two decimals}``.
    """
    path = os.path.join(base_dir, expname)

    m = {}  # metrics
    d = {}  # distributions
    for frame in [1, 2, 4, 5]:

        precisions = np.zeros(5)
        recalls = np.zeros(5)
        dists = {}
        for i in range(5):
            pkl_path = os.path.join(path, "frame" + str(frame), str(i), 'predictions.pkl')
            # NOTE: pickle.load can execute arbitrary code; only open trusted files.
            with open(pkl_path, 'rb') as pickle_in:
                preds_golds = pickle.load(pickle_in)

            preds = preds_golds[1] > 0.5
            golds = preds_golds[2]  # binary

            # compute precision and recall
            never_predicted = preds.sum() == 0
            if never_predicted:
                print("Frame %d in fold %d is never predicted." % (frame, i))
                # precision is undefined without positive predictions; store
                # NaN directly instead of computing a value that is discarded
                precisions[i] = np.nan
            else:
                precisions[i] = metrics.precision_score(golds, preds)
            recalls[i] = metrics.recall_score(golds, preds)

            # compute data distribution 1 / (1+0)
            dists[i] = golds.mean().round(2)

        d[frame] = dists
        m[frame] = {'precision': np.nanmean(precisions).round(2),
                    'recall': recalls.mean().round(2)}
    return m, d


In [174]:
# Precision/recall (first table) and per-fold positive rate of the gold labels
# (second table) for the four English-BERT binary classifiers.
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

Unnamed: 0,precision,recall
1,0.76,0.78
2,0.85,0.92
4,0.63,0.55
5,0.73,0.67


Unnamed: 0,0,1,2,3,4
1,0.02,0.03,0.03,0.02,0.03
2,0.12,0.14,0.11,0.14,0.13
4,0.03,0.03,0.04,0.03,0.03
5,0.07,0.07,0.07,0.08,0.07


In [168]:
# NOTE(review): the recorded output lists only frame 5 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 9')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 9


Unnamed: 0,precision,recall
5,0.72727,0.67


Unnamed: 0,0
5,0.07


In [163]:
# NOTE(review): the recorded output lists only frame 5 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 8')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 8


Unnamed: 0,precision,recall
5,0.68571,0.67


Unnamed: 0,0
5,0.07


In [158]:
# NOTE(review): the recorded output lists only frame 5 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 6')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 6


Unnamed: 0,precision,recall
5,0.67647,0.64


Unnamed: 0,0
5,0.07


In [161]:
# NOTE(review): the recorded output lists only frame 4 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 8')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 8


Unnamed: 0,precision,recall
4,0.6,0.6


Unnamed: 0,0
4,0.03


In [165]:
# NOTE(review): the recorded output lists only frame 4 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 7')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 7


Unnamed: 0,precision,recall
4,0.58824,0.67


Unnamed: 0,0
4,0.03


In [156]:
# NOTE(review): the recorded output lists only frame 4 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 6')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 6


Unnamed: 0,precision,recall
4,0.61111,0.73


Unnamed: 0,0
4,0.03


In [154]:
# NOTE(review): the recorded output lists only frame 1 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 7')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 7


Unnamed: 0,precision,recall
1,0.7,0.58


Unnamed: 0,0
1,0.02


In [153]:
# NOTE(review): the recorded output lists only frame 1 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 9')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 9


Unnamed: 0,precision,recall
1,0.7,0.58


Unnamed: 0,0
1,0.02


In [151]:
# NOTE(review): the recorded output lists only frame 2 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 6 frame 2')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 6 frame 2


Unnamed: 0,precision,recall
2,0.77778,0.93


Unnamed: 0,0
2,0.12


In [150]:
# NOTE(review): the recorded output lists only frame 2 and a single fold,
# while get_4binary_metrics as defined in this notebook covers frames 1, 2, 4
# and 5 over five folds -- presumably produced by an earlier version of the
# function; the printed label is set by hand. Re-run to refresh.
print('epochs 7 frame 2')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 7 frame 2


Unnamed: 0,precision,recall
2,0.78261,0.9


Unnamed: 0,0
2,0.12


In [149]:
# NOTE(review): the recorded output does not match this cell: it prints
# "epochs 9" (without "frame 1") and its table row is frame 2. The cell was
# presumably edited after it was last run -- re-run or fix the label.
print('epochs 9 frame 1')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

epochs 9


Unnamed: 0,precision,recall
2,0.71429,0.92


Unnamed: 0,0
2,0.12


In [142]:
# NOTE(review): the recorded output shows no "epochs 8" line and lists only
# frames 1 and 2 for a single fold, so it was presumably produced by an
# earlier version of this cell and of get_4binary_metrics. Re-run to refresh.
print('epochs 8')
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

Unnamed: 0,precision,recall
1,0.7,0.58
2,0.71795,0.93


Unnamed: 0,0
1,0.02
2,0.12


In [30]:
# Same evaluation as the other 4-binary cells. The recorded output reports
# that frame 1 is never predicted in fold 0 and contains the tail of a
# precision warning.
# NOTE(review): this cell repeats the code of several neighbouring cells;
# the differing results presumably come from different saved predictions.
expname = 'binary_engbert_focal_4binary_classifiers'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T)

Frame 1 in fold 0 is never predicted.


  'precision', 'predicted', average, warn_for)


Unnamed: 0,precision,recall
1,0.08,0.02
2,0.12,0.06
4,0.72,0.56
5,0.76,0.67


Unnamed: 0,0,1,2,3,4
1,0.02,0.03,0.03,0.02,0.03
2,0.12,0.14,0.11,0.14,0.13
4,0.03,0.03,0.04,0.03,0.03
5,0.07,0.07,0.07,0.08,0.07


In [114]:
# Evaluate the 9-binary MultiBERT classifiers on our test set for frames
# 1, 2, 4 and 5; the second display shows each frame's gold positive rate
# averaged over the folds.
expname = 'binary_multibert_focal_9binary_classifiers_for_4_frames'
m,d = get_4binary_metrics(expname)
display(pd.DataFrame(m).T)
display(pd.DataFrame(d).T.mean(axis=1))

Unnamed: 0,precision,recall
1,0.86,0.69
2,0.82,0.97
4,0.36,0.82
5,0.59,0.82


1    0.026
2    0.128
4    0.032
5    0.072
dtype: float64

In [109]:
# get the distributions of classes in a set
def get_data_dists(path, train):
    """Per-fold label distribution and share of rows without any label.

    Reads ``<path>/<fold>/train.tsv`` (``train`` true) or
    ``<path>/<fold>/dev.tsv`` (``train`` false) for folds 0..4, keeps columns
    3..11 and renames them 1..9.

    Returns
    -------
    (dict, numpy.ndarray)
        ``dists[fold]`` is the column-wise mean rounded to two decimals;
        ``ratio_99_list[fold]`` is the fraction of rows whose nine label
        columns sum to zero. That fraction is also printed for each fold.
    """
    file = 'train.tsv' if train else 'dev.tsv'
    dists = {}
    ratio_99_list = np.zeros(5)
    for i in range(5):
        table = pd.read_csv(os.path.join(path, str(i), file), sep='\t', header=None)
        labels = table[np.arange(3, 12)]
        labels.columns = np.arange(1, 10)
        dists[i] = labels.mean(axis=0).round(2)

        # rows in which none of the nine label columns is set
        unlabeled_rows = labels.sum(axis=1) == 0
        ratio_99 = unlabeled_rows.sum() / len(labels)
        ratio_99_list[i] = ratio_99
        print("Number of non-thematic frames in %s for fold %d is %.4f" % (file, i, ratio_99))
    return dists, ratio_99_list

In [111]:
# Label distribution of the dev folds. frame1 is used since the data under
# all frame folders are the same.
# NOTE(review): absolute path, presumably only valid on the original author's
# machine; os.path.join with a single argument returns it unchanged.
path = os.path.join("/usr4/cs591/akyurek/link/transformers/lei/dataset/frame1")

dists, ratio_99_list = get_data_dists(path, False)
print("Mean ratio_99 over folds is %.4f" % ratio_99_list.mean())
display(pd.DataFrame(dists))


Number of non-thematic frames in dev.tsv for fold 0 is 0.4922
Number of non-thematic frames in dev.tsv for fold 1 is 0.4931
Number of non-thematic frames in dev.tsv for fold 2 is 0.4931
Number of non-thematic frames in dev.tsv for fold 3 is 0.4960
Number of non-thematic frames in dev.tsv for fold 4 is 0.4980
Mean ratio_99 over folds is 0.4945


Unnamed: 0,0,1,2,3,4
1,0.02,0.03,0.03,0.02,0.03
2,0.12,0.14,0.11,0.14,0.13
3,0.17,0.17,0.17,0.17,0.17
4,0.03,0.03,0.04,0.03,0.03
5,0.07,0.07,0.07,0.08,0.07
6,0.05,0.04,0.05,0.05,0.04
7,0.1,0.1,0.11,0.1,0.1
8,0.02,0.02,0.02,0.02,0.02
9,0.04,0.03,0.04,0.04,0.04


In [113]:
# Label distribution of the train folds. frame1 is used since the data under
# all frame folders are the same.
# NOTE(review): absolute path, presumably only valid on the original author's
# machine; os.path.join with a single argument returns it unchanged.
path = os.path.join("/usr4/cs591/akyurek/link/transformers/lei/dataset/frame1")

dists, ratio_99_list = get_data_dists(path, True)
print("Mean ratio_99 over folds is %.4f" % ratio_99_list.mean())
display(pd.DataFrame(dists))

Number of non-thematic frames in train.tsv for fold 0 is 0.4762
Number of non-thematic frames in train.tsv for fold 1 is 0.4723
Number of non-thematic frames in train.tsv for fold 2 is 0.4815
Number of non-thematic frames in train.tsv for fold 3 is 0.4723
Number of non-thematic frames in train.tsv for fold 4 is 0.4815
Mean ratio_99 over folds is 0.4768


Unnamed: 0,0,1,2,3,4
1,0.04,0.04,0.04,0.04,0.04
2,0.13,0.13,0.14,0.14,0.12
3,0.17,0.18,0.17,0.17,0.18
4,0.03,0.03,0.03,0.03,0.03
5,0.08,0.07,0.07,0.07,0.07
6,0.04,0.06,0.05,0.05,0.05
7,0.11,0.1,0.1,0.11,0.1
8,0.02,0.02,0.02,0.02,0.02
9,0.04,0.04,0.04,0.04,0.05


In [14]:
def plot_distribution_predictions(results):
    """Plot predicted probabilities as a line chart and as a density chart.

    ``results`` is turned into a DataFrame whose columns are renamed
    ``"1"`` .. ``"9"``. The line chart shows x positions 0..100; the density
    chart is limited to x in (-0.2, 0.3) and y in (0, 85).
    """
    frame_names = [str(number) for number in range(1, 10)]
    probabilities = pd.DataFrame(results)
    probabilities.columns = frame_names

    # predicted probability of every frame, sample by sample
    line_ax = probabilities.plot.line(xlim=(0, 100))
    line_ax.set_ylabel("Predicted Probabilities")
    line_ax.set_xlabel("Test Samples")

    # distribution of the predicted probabilities per frame
    density_ax = probabilities.plot.density(xlim=(-.2, .3), ylim=(0, 85))
    density_ax.set_xlabel("Predicted Probabilities")

In [None]:
# Paired t-test between two related samples.
# NOTE(review): rvs1 and rvs2 are not defined anywhere in the visible part of
# this notebook, so this cell raises NameError on a fresh kernel -- define the
# two samples first. The import also sits apart from the import cell at the
# top of the notebook.
from scipy import stats
stats.ttest_rel(rvs1,rvs2)