In [165]:
from os import getcwd
from os.path import join, dirname
import pandas as pd
import pickle
import string 
import numpy as np
import spacy
import re

# Project layout: the repository root is the parent of the current working
# directory, and the data / models / utils folders sit directly under it.
PATH_REPO = dirname(getcwd())
PATH_DATA, PATH_MODELS, PATH_UTILS = (
    join(PATH_REPO, folder) for folder in ('data', 'models', 'utils')
)

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import sys
# Put the utils folder on the import path; this has to run before the
# `evaluate` and `classify` imports that follow (presumably both modules live
# in PATH_UTILS -- confirm against the repository layout).
sys.path.append(PATH_UTILS)

from evaluate import evaluate
from classify import get_predictions

# 1. Get Testing set

In [166]:
# Load the test set; its 'category' column supplies the reference labels
# that every model below is scored against.
gold = pd.read_csv(join(PATH_DATA, 'test.csv'))
y_true = gold['category'].values
# Number of test examples (shown as the cell output).
gold.shape[0]

10000

In [167]:
# `probas` maps a model-family name to a dict {run id: array of scores, one per
# test example}. It is created here and filled in by the cells below.
probas = {}

# Random baseline: the score of each test example is the mean of n independent
# fair 0/1 draws.
#
# The sum of n fair 0/1 draws follows a Binomial(n, 0.5) distribution, so a
# single binomial draw per example divided by n has exactly the same
# distribution as averaging n separately generated 0/1 arrays -- without
# keeping n arrays of len(gold) integers in memory at once.
n = 100000
# Local, seeded generator: makes the baseline reproducible across runs without
# touching numpy's global random state.
baseline_rng = np.random.RandomState(0)
ones_per_example = baseline_rng.binomial(n, 0.5, size=len(gold))
probas['Random'] = {'0': ones_per_example / n}

In [168]:
# Bag-of-words baseline: TF-IDF features (1- to 3-grams, English stop words
# removed, terms seen in fewer than 2 documents dropped) fed to a
# class-balanced logistic regression.
probas['bow'] = {}

train = pd.read_csv(join(PATH_DATA, 'train_.csv'))
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, stop_words='english')
X = vectorizer.fit_transform(train.text_pp.values)
y = train.category.values

# Seed the shuffle as well as the classifier so that re-running the cell
# trains on the same row order and yields the same scores.
X, y = shuffle(X, y, random_state=0)

clf = LogisticRegression(random_state=0, class_weight='balanced').fit(X, y)

# Score the test set with the vocabulary fitted on the training data.
# Column 1 of predict_proba is the probability of the second entry of
# clf.classes_.
X_gold = vectorizer.transform(gold.loc[:, 'text_pp'].values)
probas['bow'][0] = clf.predict_proba(X_gold)[:, 1]

# 2. FSL

In [173]:
# Read the 'prob_hatred' column of each of the n FSL prediction files,
# keyed by run index.
probas['FSL'] = {}
n = 15
for i in range(n):
    run_preds = pd.read_csv(join(PATH_DATA, f'FSL_blog_{i}_preds.csv'))
    probas['FSL'][i] = run_preds['prob_hatred'].values

# Extra entry: element-wise average of the n runs, stored as a single "run".
fsl_runs = [probas['FSL'][run] for run in range(n)]
probas['FSL_mean'] = {'0': np.mean(fsl_runs, axis=0)}

In [174]:
# Read the 'prob_hatred' column of each of the n FSL_MINI prediction files,
# keyed by run index.
probas['FSL_MINI'] = {}
n = 15
for i in range(n):
    run_preds = pd.read_csv(join(PATH_DATA, f'FSL_blog_MINI_{i}_preds.csv'))
    probas['FSL_MINI'][i] = run_preds['prob_hatred'].values

# Extra entry: element-wise average of the n runs, stored as a single "run".
fsl_mini_runs = [probas['FSL_MINI'][run] for run in range(n)]
probas['FSL_MINI_mean'] = {'0': np.mean(fsl_mini_runs, axis=0)}

# 3. Distill

In [175]:
# Each distilled model has a single prediction file; store its 'prob_hatred'
# column as run 0 of the corresponding family.
distill_files = (
    ('distill', 'FSL_mean_distill_preds.csv'),
    ('distill_MINI', 'FSL_mean_distill_MINI_preds.csv'),
)
for family, preds_file in distill_files:
    v = pd.read_csv(join(PATH_DATA, preds_file))
    probas[family] = {0: v.loc[:, 'prob_hatred'].values}

# 4. Results of general metrics

In [176]:
# Print a markdown table with one row per model family in `probas`.
# Each cell is "median(std)" of the metric over the runs of that family.
title =   '|Models| F1 | P | R |'
subhead = '|:-----:|:--:|:-:|:-:|'

print(title)
print(subhead)

# Summary statistics per family ({'median': ..., 'std': ...}).
recalls = {key: {} for key in probas}
precisions = {key: {} for key in probas}
f1s = {key: {} for key in probas}
# Per-run values per family, in the iteration order of probas[key].
rs = {key: [] for key in probas}
ps = {key: [] for key in probas}
fs = {key: [] for key in probas}

for key, runs in probas.items():
    for run_scores in runs.values():
        # Scores are turned into predictions by get_predictions with p=0.5.
        y_preds = get_predictions(run_scores, p=0.5)
        metric = evaluate(y_true, y_preds)
        # np.mean reduces whatever evaluate() reports for this run to a single
        # number (the macro R / macro P of this run).
        run_r = np.mean(metric.get('recall'))
        run_p = np.mean(metric.get('precision'))
        # Macro F: harmonic mean of the macro P and macro R above. The 'f1'
        # entry of `metric` is not used.
        run_f = 2 * run_r * run_p / (run_r + run_p) if (run_r + run_p) > 0 else 0
        rs[key].append(run_r)
        ps[key].append(run_p)
        fs[key].append(run_f)

    # Median and spread of the per-run macro metrics for this family.
    med_r, std_r = np.median(rs[key]), np.std(rs[key])
    med_p, std_p = np.median(ps[key]), np.std(ps[key])
    med_f, std_f = np.median(fs[key]), np.std(fs[key])

    recalls[key]['median'], recalls[key]['std'] = med_r, std_r
    precisions[key]['median'], precisions[key]['std'] = med_p, std_p
    # Stored F1 median = harmonic mean of the median P and median R, guarded
    # against a zero denominator in the same way as the per-run F above.
    # Note: the table prints the median of the per-run F values instead; the
    # two can differ when a family has several runs.
    med_sum = med_p + med_r
    f1s[key]['median'] = 2 * med_p * med_r / med_sum if med_sum > 0 else 0
    f1s[key]['std'] = std_f

    print('|{}|{:.2f}({:.2f})|{:.2f}({:.2f})|{:.2f}({:.2f})|'.format(key, med_f, std_f,
                                                                     med_p, std_p,
                                                                     med_r, std_r))

|Models| F1 | P | R |
|:-----:|:--:|:-:|:-:|
|Random|50.73(0.00)|50.44(0.00)|51.02(0.00)|
|bow|56.40(0.00)|63.07(0.00)|51.00(0.00)|
|FSL|64.41(4.65)|65.34(2.87)|64.80(10.39)|
|FSL_mean|67.86(0.00)|67.75(0.00)|67.98(0.00)|
|FSL_MINI|63.05(1.40)|63.70(1.69)|62.46(4.00)|
|FSL_MINI_mean|63.87(0.00)|64.56(0.00)|63.20(0.00)|
|distill|68.34(0.00)|67.58(0.00)|69.12(0.00)|
|distill_MINI|64.15(0.00)|64.52(0.00)|63.78(0.00)|


|Models| F1 | P | R |
|:-----:|:--:|:-:|:-:|
|Random|50.73(0.00)|50.44(0.00)|51.02(0.00)|
|bow|56.40(0.00)|63.07(0.00)|51.00(0.00)|
|FSL|64.41(4.65)|65.34(2.87)|64.80(10.39)|
|FSL_mean|67.86(0.00)|67.75(0.00)|67.98(0.00)|
|FSL_MINI|63.05(1.40)|63.70(1.69)|62.46(4.00)|
|FSL_MINI_mean|63.87(0.00)|64.56(0.00)|63.20(0.00)|
|distill|68.34(0.00)|67.58(0.00)|69.12(0.00)|
|distill_MINI|64.15(0.00)|64.52(0.00)|63.78(0.00)|
