In [8]:
from LanguageModels.BERT import BERT
from LanguageModels.Word2Vec import Word2Vec
from LanguageModels.BagOfWords import BagOfWords
from LanguageModels.CustomWord2Vec import CustomWord2Vec
from Preprocessing.LemmatizerPreprocessor import LemmatizerPreprocessor
from Preprocessing.DataLoader import DataLoader
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.manifold import TSNE
import umap

import torch

%matplotlib notebook

In [9]:
# Request CUDA by default (set to False to force CPU), then fall back to CPU
# when this machine has no usable GPU.
is_cuda = True
if is_cuda and not torch.cuda.is_available():
    is_cuda = False

In [10]:
# Read the dataset from the CSV file; `data` is whatever DataLoader.load() returns
# and is passed unchanged to lm.featurize() further down.
data_loader = DataLoader('../data/EMNLP2020.csv')
data = data_loader.load()

In [11]:
# Build the text preprocessor that is handed to lm.featurize() together with `data`.
# NOTE(review): presumably lemmatizes tokens, going by the class name — confirm in
# Preprocessing.LemmatizerPreprocessor.
lp = LemmatizerPreprocessor()

In [93]:
# Select the language model used to turn text into feature vectors.
# Exactly one `lm = ...` line should be active; the alternatives are kept
# commented out so the model can be swapped without touching later cells.

## Bag Of Words
lm = BagOfWords()

## Custom word2vec
# lm = CustomWord2Vec('../data/customw2v.p')

## Custom word2vec (presumably the 50-dimensional variant, going by the file name — TODO confirm)
# lm = CustomWord2Vec('../data/customw2v_50.p')

## W2V (pretrained GloVe 200d vectors, per the file name)
# lm = Word2Vec(path='../data/glove.6B/glove.6B.200d.txt')

## Word2Vec Pretrained + Finetuned
# lm = CustomWord2Vec('../data/glove.6B/glove.6B.200d.finetuned.p')

## Bert (runs on GPU only when is_cuda is True)
# lm = BERT(cuda = is_cuda)

## SciBert (same BERT wrapper, different pretrained checkpoint)
# lm = BERT(cuda = is_cuda, path = 'allenai/scibert_scivocab_uncased')


In [94]:
# Apply preprocessing and vectorization to build one (features, labels) pair per task.
# Per the shapes printed by this cell, X is (n_samples, n_features) and y is (n_samples, 1).

#### multiclass case, negative samples kept (remove_neg_samples=False)
# NOTE(review): in the recorded results the negatives show up as an extra class (11) — confirm.
X_multiclass_full, y_multiclass_full = lm.featurize(data, lp, mode='multiclass', remove_neg_samples=False)
assert X_multiclass_full.shape[0] == y_multiclass_full.shape[0], "feature/label row count mismatch"

print(X_multiclass_full.shape, y_multiclass_full.shape)

#### multiclass case, negative samples removed (remove_neg_samples=True)
X_multiclass, y_multiclass = lm.featurize(data, lp, mode='multiclass', remove_neg_samples=True)
assert X_multiclass.shape[0] == y_multiclass.shape[0], "feature/label row count mismatch"

# Use the array's own .max(): the builtin max() iterates over rows of the
# (n, 1) label array and only works because every row has a single element.
print(X_multiclass.shape, y_multiclass.shape, y_multiclass.max())

#### binary case (note 0=in conference, 1=not in conference)
X_binary, y_binary = lm.featurize(data, lp, mode='binary')
assert X_binary.shape[0] == y_binary.shape[0], "feature/label row count mismatch"

print(X_binary.shape, y_binary.shape, y_binary.max())

(1450, 10861) (1450, 1)
(750, 6202) (750, 1) [10]
(1450, 10861) (1450, 1) [1]


## Test model expressiveness

### Binary

In [95]:
# Point the generic X / y names used by the cells below at the binary task.
X = X_binary
y = y_binary

In [96]:
# Hold out 33% of the samples for testing; the split is seeded and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.reshape(-1,), test_size=0.33, random_state=42, stratify=y
)

# Standardize the features, then fit an SVM classifier (SVC defaults, gamma='auto').
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)

# Predict once per split and reuse the result; the original re-ran
# clf.predict(X_test) for the accuracy and again for the report.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Training Accuracy: ", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy: ", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Training Accuracy:  0.9824922760041195
Testing Accuracy:  0.778705636743215
              precision    recall  f1-score   support

           0       0.79      0.78      0.78       248
           1       0.77      0.78      0.77       231

    accuracy                           0.78       479
   macro avg       0.78      0.78      0.78       479
weighted avg       0.78      0.78      0.78       479



In [97]:
# Disabled: optional 2-D / 3-D projection of X for visual inspection.
# Uncomment one pair (t-SNE or UMAP) before enabling the plotting cells below.

# X_embedded2d = TSNE(n_components=2).fit_transform(X)
# X_embedded3d = TSNE(n_components=3).fit_transform(X)

# X_embedded2d = umap.UMAP(n_components=2).fit_transform(X)
# X_embedded3d = umap.UMAP(n_components=3).fit_transform(X)

In [98]:
# Disabled: 2-D scatter of the embedding colored by label; needs X_embedded2d
# from the (also disabled) embedding cell.
# plt.scatter(X_embedded2d[:,0], X_embedded2d[:,1], c=y)

In [99]:
# Disabled: 3-D scatter of the embedding colored by label; needs X_embedded3d
# from the (also disabled) embedding cell.
# fig = plt.figure()
# ax = fig.add_subplot(projection='3d')
# ax.scatter(X_embedded3d[:,0], X_embedded3d[:,1], X_embedded3d[:,2], c=y)

### Multiclass PO

In [100]:
# Point the generic X / y names at the multiclass task without negative samples.
X = X_multiclass
y = y_multiclass

In [101]:
# Hold out 33% of the samples for testing; the split is seeded and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.reshape(-1,), test_size=0.33, random_state=42, stratify=y
)

# Standardize the features, then fit an SVM classifier (SVC defaults, gamma='auto').
# NOTE(review): with the recorded Bag-of-Words run this model predicts a single
# class for every test sample (recall 1.00 for class 2, 0.00 elsewhere) — the
# accuracy figure alone is misleading here.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)

# Predict once per split and reuse the result; the original re-ran
# clf.predict(X_test) for the accuracy and again for the report.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Training Accuracy: ", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy: ", accuracy_score(y_test, y_test_pred))
# Classes that are never predicted have undefined precision; report them as 0
# explicitly instead of relying on the default, which does the same but warns.
print(classification_report(y_test, y_test_pred, zero_division=0))

Training Accuracy:  0.8446215139442231
Testing Accuracy:  0.2217741935483871
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        27
           2       0.22      1.00      0.36        55
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        46
           5       0.00      0.00      0.00        21
           6       0.00      0.00      0.00        19
           7       0.00      0.00      0.00         9
           8       0.00      0.00      0.00        23
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00        32

    accuracy                           0.22       248
   macro avg       0.02      0.10      0.04       248
weighted avg       0.05      0.22      0.08       248



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Multiclass Full

In [102]:
# Point the generic X / y names at the multiclass task that keeps negative samples.
X = X_multiclass_full
y = y_multiclass_full

In [103]:
# Hold out 33% of the samples for testing; the split is seeded and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.reshape(-1,), test_size=0.33, random_state=42, stratify=y
)

# Standardize the features, then fit an SVM classifier (SVC defaults, gamma='auto').
# NOTE(review): with the recorded Bag-of-Words run almost every test sample is
# assigned to class 11 (recall 1.00), so most other classes are never predicted.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)

# Predict once per split and reuse the result; the original re-ran
# clf.predict(X_test) for the accuracy and again for the report.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Training Accuracy: ", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy: ", accuracy_score(y_test, y_test_pred))
# Classes that are never predicted have undefined precision; report them as 0
# explicitly instead of relying on the default, which does the same but warns.
print(classification_report(y_test, y_test_pred, zero_division=0))

Training Accuracy:  0.756951596292482
Testing Accuracy:  0.4968684759916493
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        27
           2       0.88      0.13      0.22        55
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        46
           5       0.00      0.00      0.00        21
           6       0.00      0.00      0.00        19
           7       0.00      0.00      0.00         9
           8       0.00      0.00      0.00        23
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00        32
          11       0.49      1.00      0.66       231

    accuracy                           0.50       479
   macro avg       0.12      0.10      0.08       479
weighted avg       0.34      0.50      0.34       479



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
