In [1]:
from LanguageModels.Word2Vec import Word2Vec
from LanguageModels.BagOfWords import BagOfWords
from LanguageModels.CustomWord2Vec import CustomWord2Vec
from Preprocessing.LemmatizerPreprocessor import LemmatizerPreprocessor
from Preprocessing.DataLoader import DataLoader
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.manifold import TSNE
import umap


%matplotlib

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ryan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ryan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ryan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Using matplotlib backend: Qt5Agg


In [2]:
# Load data
# Builds a DataLoader for the EMNLP2020 CSV and calls .load() on it; the result
# is what every `featurize` call below receives as its first argument.
# NOTE(review): the path is relative, so this presumably requires the kernel's
# working directory to be the notebook's own folder — confirm.
data = DataLoader('../data/EMNLP2020.csv').load()

In [3]:
# Load preprocessor
# `lp` is passed, together with `data`, to every `featurize` call below.
lp = LemmatizerPreprocessor()

In [4]:
#### Load language model
# Exactly one assignment below should be active at a time. Every option binds
# the same name, `w2v`, which the featurization cell calls `.featurize` on.

## Word2Vec Pretrained
# Uses the GloVe 6B 200-dimensional text file; the path below expects the
# extracted archive under ../data/glove.6B/. To fetch the archive:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
w2v = Word2Vec(path='../data/glove.6B/glove.6B.200d.txt') 

## Word2Vec Pretrained + Finetuned
# w2v = CustomWord2Vec('../data/glove.6B/glove.6B.200d.finetuned.p')


## Custom word2vec
# w2v = CustomWord2Vec('../data/customw2v.p')

## Bag Of Words
# w2v = BagOfWords()

In [5]:
# Build feature matrices and label arrays from the loaded data, once per
# labelling scheme. `w2v` is whichever language model is active in the cell
# above and `lp` is the preprocessor handed to it.

#### multilabel case
(X_multilabel, y_multilabel) = w2v.featurize(
    data,
    lp,
    mode='multilabel',
)
print(X_multilabel.shape, y_multilabel.shape)

#### multiclass case
(X_multiclass, y_multiclass) = w2v.featurize(
    data,
    lp,
    mode='multiclass',
    remove_neg_samples=True,
)
# The last value printed is the column-wise maximum of the single label column.
print(X_multiclass.shape, y_multiclass.shape, y_multiclass.max(axis=0))

#### binary case (note 0=in conference, 1=not in conference)
(X_binary, y_binary) = w2v.featurize(
    data,
    lp,
    mode='binary',
)
print(X_binary.shape, y_binary.shape, y_binary.max(axis=0))

  0%|          | 0/1450 [00:00<?, ?it/s]

(1327, 200) (1327, 2)


  0%|          | 0/1450 [00:00<?, ?it/s]

(627, 200) (627, 1) [10]


  0%|          | 0/1450 [00:00<?, ?it/s]

(1327, 200) (1327, 1) [1]


## Plot 2D and 3D projections

### Binary

In [None]:
# Point the generic plotting inputs at the binary-task features and labels.
X = X_binary
y = y_binary

In [None]:
# Reduce the feature vectors in `X` to 2 and 3 dimensions for plotting.
# A fixed random_state makes the layout repeatable across kernel restarts;
# without it every run produces a different embedding.

# t-SNE alternative:
# X_embedded2d = TSNE(n_components=2, random_state=42).fit_transform(X)
# X_embedded3d = TSNE(n_components=3, random_state=42).fit_transform(X)

X_embedded2d = umap.UMAP(n_components=2, random_state=42).fit_transform(X)
X_embedded3d = umap.UMAP(n_components=3, random_state=42).fit_transform(X)

In [None]:
# Draw on a freshly created figure. This notebook runs with a GUI backend
# (`%matplotlib`), where a bare `plt.scatter` adds its points to whichever
# figure is already current instead of starting a new plot.
fig, ax = plt.subplots()
# Labels are flattened to 1-D so each point gets exactly one colour value.
ax.scatter(X_embedded2d[:, 0], X_embedded2d[:, 1], c=np.ravel(y))
ax.set(title='2-D projection (binary labels)',
       xlabel='component 1', ylabel='component 2');

In [None]:
# 3-D scatter of the three-component embedding, coloured by label.
# The title and axis labels identify this window when several figures are open.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Labels are flattened to 1-D so each point gets exactly one colour value.
ax.scatter(X_embedded3d[:, 0], X_embedded3d[:, 1], X_embedded3d[:, 2], c=np.ravel(y))
ax.set(title='3-D projection (binary labels)',
       xlabel='component 1', ylabel='component 2', zlabel='component 3');

### Multiclass

In [6]:
# Point the generic plotting inputs at the multiclass-task features and labels.
X = X_multiclass
y = y_multiclass

In [7]:
# Reduce the feature vectors in `X` to 2 and 3 dimensions for plotting.
# A fixed random_state makes the layout repeatable across kernel restarts;
# without it every run produces a different embedding.

# t-SNE alternative:
# X_embedded2d = TSNE(n_components=2, random_state=42).fit_transform(X)
# X_embedded3d = TSNE(n_components=3, random_state=42).fit_transform(X)

X_embedded2d = umap.UMAP(n_components=2, random_state=42).fit_transform(X)
X_embedded3d = umap.UMAP(n_components=3, random_state=42).fit_transform(X)

In [8]:
# Draw on a freshly created figure. This notebook runs with a GUI backend
# (`%matplotlib`), where a bare `plt.scatter` adds its points to whichever
# figure is already current (for example a 3-D figure opened earlier).
fig, ax = plt.subplots()
# Labels are flattened to 1-D so each point gets exactly one colour value.
ax.scatter(X_embedded2d[:, 0], X_embedded2d[:, 1], c=np.ravel(y))
ax.set(title='2-D projection (multiclass labels)',
       xlabel='component 1', ylabel='component 2');

<matplotlib.collections.PathCollection at 0x18e6603ddd8>

In [9]:
# 3-D scatter of the three-component embedding, coloured by label.
# The title and axis labels identify this window when several figures are open.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Labels are flattened to 1-D so each point gets exactly one colour value.
ax.scatter(X_embedded3d[:, 0], X_embedded3d[:, 1], X_embedded3d[:, 2], c=np.ravel(y))
ax.set(title='3-D projection (multiclass labels)',
       xlabel='component 1', ylabel='component 2', zlabel='component 3');

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x18e65e34e10>

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y.reshape(-1,), test_size=0.33, random_state=42)

# clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
# clf.fit(X_train, y_train)

In [None]:
# accuracy_score(y_train, clf.predict(X_train) )

In [None]:
# accuracy_score(y_test, clf.predict(X_test))

In [None]:
# print(classification_report(y_test, clf.predict(X_test)))

In [None]:
# counts = {}
# for label in y:
#     try:
#         counts[label[0]] += 1
#     except:
#         counts.update({label[0]:1})

In [None]:
# from sklearn.cluster import KMeans

# init = [w2v.convert(['language', 'model', 'natural']), w2v.convert(['server', 'cyber', 'latency'])]

# kmeans = KMeans(n_clusters=2, random_state=0, init=np.array(init)).fit(X)