In [1]:
import glob
import pickle

import pandas as pd
import numpy as np
import gensim
from gensim import corpora, models
from gensim.models import word2vec

from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

from utils import *

import sklearn
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier

In [71]:
# Read the three headerless, tab-separated splits; columns 0 and 1 are kept as strings.
train, dev, test = (
    pd.read_csv(split_path, sep='\t', header=None, dtype={0: str, 1: str})
    for split_path in ('train.tsv', 'dev.tsv', 'test.tsv')
)

In [72]:
def vectorize_labels(df):
    """Convert the label strings in column 0 of `df` into integer vectors.

    Every label is walked character by character and each character is passed
    through ``int``, so a label such as ``'0110'`` becomes
    ``array([0, 1, 1, 0])``.

    Returns a plain Python list holding one NumPy array per row of `df`.
    """
    return [
        np.array([int(flag) for flag in label_str])
        for label_str in df[0]
    ]

In [52]:
def train_svm(train_file, dev_file='dev.tsv', test_file='test.tsv'):
    """Fit a one-vs-rest linear SVM on TF-IDF features and predict dev/test labels.

    Parameters
    ----------
    train_file : str
        Headerless TSV used for fitting: column 0 is the label string,
        column 1 the text.
    dev_file, test_file : str, optional
        TSVs with the same layout that are only predicted on. They default to
        ``'dev.tsv'`` and ``'test.tsv'``, the paths that used to be hard-coded.

    Returns
    -------
    tuple
        ``(clf, dev_y, test_y, dev_pred, test_pred)``: the fitted classifier,
        the gold label vectors of both splits and the predicted ones.

    Notes
    -----
    The fitted ``TfidfVectorizer`` is not returned. To score other text with
    ``clf`` the vectorizer has to be rebuilt from the same `train_file`,
    otherwise the feature columns will not line up.
    """
    read_opts = dict(sep='\t', header=None, dtype={0: str, 1: str})
    train = pd.read_csv(train_file, **read_opts)
    dev = pd.read_csv(dev_file, **read_opts)
    test = pd.read_csv(test_file, **read_opts)

    # Vocabulary and IDF weights come from the training text only;
    # dev and test are merely projected into that feature space.
    tfidf_vectorizer = TfidfVectorizer()
    train_X = tfidf_vectorizer.fit_transform(train[1])
    dev_X = tfidf_vectorizer.transform(dev[1])
    test_X = tfidf_vectorizer.transform(test[1])

    train_y = vectorize_labels(train)
    dev_y = vectorize_labels(dev)
    test_y = vectorize_labels(test)

    # probability=True is kept so the returned classifier still offers predict_proba.
    clf = OneVsRestClassifier(SVC(probability=True, kernel='linear'))
    clf.fit(train_X, train_y)

    dev_pred = clf.predict(dev_X)
    test_pred = clf.predict(test_X)

    return clf, dev_y, test_y, dev_pred, test_pred

In [63]:
# Train the SVM on 'train.tsv'; keeps the classifier plus gold labels and predictions for dev/test.
clf, dev_y, test_y, dev_pred, test_pred = train_svm('train.tsv')

In [64]:
def eval_model_on_file(clsf, train_file, test_file):
    """Score an already-fitted classifier on the labelled rows of `test_file`.

    Parameters
    ----------
    clsf
        Fitted estimator exposing ``predict``.
    train_file : str
        TSV whose text column (1) is used to rebuild the TF-IDF vocabulary.
        It must be the file `clsf` was trained on, because the vectorizer is
        refit here rather than passed in.
    test_file : str
        TSV to evaluate on: column 0 is the label string, column 1 the text.

    Returns
    -------
    The predictions for `test_file`. Accuracy and micro-averaged F1 are
    printed on one line as a side effect.
    """
    read_opts = dict(sep='\t', header=None, dtype={0: str, 1: str})
    train = pd.read_csv(train_file, **read_opts)
    eval_df = pd.read_csv(test_file, **read_opts)

    # Only the fitted vocabulary/IDF state is needed, so fit() is enough;
    # the transformed training matrix was never used.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(train[1])

    eval_X = tfidf_vectorizer.transform(eval_df[1])
    eval_y = vectorize_labels(eval_df)

    eval_pred = clsf.predict(eval_X)

    print(metrics.accuracy_score(eval_y, eval_pred), metrics.f1_score(eval_y, eval_pred, average='micro'))

    return eval_pred

In [77]:
# Score clf on dev.tsv; the helper rebuilds TF-IDF from train.tsv and prints accuracy and micro-F1.
dev_pred = eval_model_on_file(clf, 'train.tsv','dev.tsv')

0.5880149812734082 0.7235859124866597


In [65]:
# Score the same clf on cord19_test.tsv, with TF-IDF again rebuilt from train.tsv.
eval_model_on_file(clf, 'train.tsv','cord19_test.tsv')

0.26 0.5560165975103735


In [78]:
# One SVM per training-file variant 'train_<ratio>.tsv'; fitted models are kept in `clfs`, keyed by ratio.
clfs = {}

for ratio in ['0.50']:
    print(ratio)
    # NOTE: this rebinds the notebook-level clf, dev_y, test_y, dev_pred and test_pred on every pass.
    clf, dev_y, test_y, dev_pred, test_pred = train_svm('train_{}.tsv'.format(ratio))
    
    clfs[ratio] = clf
    
    # First line: dev accuracy and micro-F1; second line: the same two scores on test.
    print(metrics.accuracy_score(dev_y, dev_pred),metrics.f1_score(dev_y, dev_pred, average='micro'))
    print(metrics.accuracy_score(test_y, test_pred),metrics.f1_score(test_y, test_pred, average='micro'))
    print('')

0.50
0.5617977528089888 0.6969529085872576
0.580625 0.7259548369110677



In [42]:
from sklearn import metrics

In [73]:
# TF-IDF is fitted on the training text only; dev and test are projected into the same space.
tfidf_vectorizer = TfidfVectorizer()

train_X = tfidf_vectorizer.fit_transform(train[1])
dev_X, test_X = (tfidf_vectorizer.transform(split[1]) for split in (dev, test))

# Gold label vectors for each split, taken from column 0.
train_y, dev_y, test_y = (vectorize_labels(split) for split in (train, dev, test))

In [74]:
# One-vs-rest logistic regression with default hyperparameters, fitted on the TF-IDF training matrix.
lr = OneVsRestClassifier(LogisticRegression())
lr.fit(train_X, train_y)

OneVsRestClassifier(estimator=LogisticRegression())

In [75]:
# Predict both splits with the logistic-regression model.
dev_pred = lr.predict(dev_X)
test_pred = lr.predict(test_X)

# Cell output: (accuracy, micro-averaged F1) on the dev split.
metrics.accuracy_score(dev_y, dev_pred),metrics.f1_score(dev_y, dev_pred, average='micro')

(0.533083645443196, 0.6745230078563412)

In [76]:
# Cell output: (accuracy, micro-averaged F1) on the test split.
metrics.accuracy_score(test_y, test_pred),metrics.f1_score(test_y, test_pred, average='micro')

(0.585, 0.7223178427997704)

In [None]:
# Load the pickled dev-set results of a previous run.
# NOTE: pickle can execute arbitrary code on load; only open files produced by our own pipeline.
# The context manager closes the file handle, which the bare open() call left dangling.
with open('../hedwig-data/datasets/LitCovid/dev_longformer-base_train_0.05.tsv_1024_metrics.p', 'rb') as fh:
    dev_preds = pickle.load(fh)

In [None]:
# Display the unpickled object to inspect its structure.
dev_preds

In [None]:
# Micro-averaged F1 of the first element of the loaded pickle against the dev gold labels.
# NOTE(review): dev_y[1:] drops the first gold row -- presumably the pickled run lacks it; confirm alignment.
metrics.f1_score(dev_y[1:], dev_preds[0],average='micro')

In [None]:
# Accuracy of the first element of the loaded pickle against the dev gold labels.
# NOTE(review): dev_y[1:] drops the first gold row -- presumably the pickled run lacks it; confirm alignment.
metrics.accuracy_score(dev_y[1:], dev_preds[0])

In [None]:
# Whitespace-token count of every training text (column 1).
lens = list(map(lambda text: len(text.split()), train[1]))

In [None]:
# Average number of whitespace-separated tokens per training text.
np.mean(lens)

In [62]:
# Build a table of dev/test accuracy and F1 from the pickled metric files matching the BioBERT-512 pattern.
metric_dict = {}

for filename in glob.glob('../hedwig-data/datasets/LitCovid/dev*biobert*512*metrics*'):
    # NOTE: pickle can execute arbitrary code on load; only open files produced by our own pipeline.
    with open(filename, 'rb') as fh:
        metric_tuple = pickle.load(fh)
    # The first pickle entry is unpacked as a 5-tuple; positions 0 and 3 are read as accuracy and F1.
    acc_dev, _, _, f1_dev, _ = metric_tuple[0]

    try:
        # The matching test file is looked up by swapping 'dev' for 'test' in the path.
        with open(filename.replace('dev', 'test'), 'rb') as fh:
            metric_tuple = pickle.load(fh)
        acc_test, _, _, f1_test, _ = metric_tuple[0]
    except Exception:
        # Best effort: name the run whose test metrics could not be read and fall back to zeros.
        # `Exception` (not a bare except) lets KeyboardInterrupt / kernel interrupts propagate.
        print(filename)
        f1_test = 0
        acc_test = 0

    metric_dict[filename] = [acc_dev, acc_test, f1_dev, f1_test]

m_list = list(metric_dict.values())
m_names = ['Dev Acc', 'Test Acc', 'Dev F1', 'Test F1']

# Row label = file name with its first 4 and last 10 characters removed.
pd.DataFrame(m_list, columns=m_names, index=[m.split('/')[-1][4:-10] for m in metric_dict.keys()]).sort_index()

Unnamed: 0,Dev Acc,Test Acc,Dev F1,Test F1
biobert_cord19_512,0.39,0.674375,0.692308,0.813176
biobert_pretrained_run_512,0.667917,0.645,0.802748,0.787938
biobert_train.tsv_512,0.65875,0.674171,0.802457,0.813089
biobert_train_0.01.tsv_512,0.385,0.387117,0.41413,0.405383
biobert_train_0.05.tsv_512,0.5675,0.561601,0.688985,0.689312
biobert_train_0.10.tsv_512,0.63125,0.639149,0.750259,0.756517
biobert_train_0.20.tsv_512,0.65375,0.666041,0.797115,0.793867


In [None]:

# Recompute dev/test accuracy and micro-F1 from the pickled prediction files matching the pattern below.
metric_dict = {}

# To score without the last label column, slice every gold and predicted vector with d[:-1]
# before calling the metric functions (this was previously toggled via commented-out lines).

for filename in glob.glob('../hedwig-data/datasets/LitCovid/dev*long*train.tsv*prediction*'):
    # NOTE: pickle can execute arbitrary code on load; only open files produced by our own pipeline.
    with open(filename, 'rb') as fh:
        dev_preds = pickle.load(fh)[0]

    # NOTE(review): dev_y[1:] / test_y[1:] drop the first gold row -- presumably the
    # prediction files lack it; confirm the alignment before trusting these numbers.
    f1_dev = metrics.f1_score(dev_y[1:], dev_preds, average='micro')
    acc_dev = metrics.accuracy_score(dev_y[1:], dev_preds)

    try:
        # The matching test file is looked up by swapping 'dev' for 'test' in the path.
        with open(filename.replace('dev', 'test'), 'rb') as fh:
            test_preds = pickle.load(fh)[0]

        f1_test = metrics.f1_score(test_y[1:], test_preds, average='micro')
        acc_test = metrics.accuracy_score(test_y[1:], test_preds)
    except Exception:
        # Best effort: name the run whose test predictions could not be scored and fall back to zeros.
        # `Exception` (not a bare except) lets KeyboardInterrupt / kernel interrupts propagate.
        print(filename)
        f1_test = 0
        acc_test = 0

    metric_dict[filename] = [acc_dev, acc_test, f1_dev, f1_test]

m_list = list(metric_dict.values())
m_names = ['Dev Acc', 'Test Acc', 'Dev F1', 'Test F1']

# Row label = file name with its first 4 and last 10 characters removed.
# NOTE(review): the -10 looks copied from the '*metrics*' cell; check it against the prediction-file suffix.
pd.DataFrame(m_list, columns=m_names, index=[m.split('/')[-1][4:-10] for m in metric_dict.keys()]).sort_index()

In [None]:
pd.DataFrame({'model': metrics.keys(),'m_list