In [73]:
import pandas as pd

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# flair model stuff
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
# from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

# fastai ULMFiT model stuff
import fastai
from fastai import *
from fastai.text import *

# fasttext model stuff
import fasttext

In [2]:
# read in data and format for model

def _load_labelled(path):
    """Read a header-less, tab-separated file of (label, text) rows.

    The two columns are named "label" and "text", and every label is cast to
    str and prefixed with "__label__" (the label marker fastText's supervised
    mode looks for by default).
    """
    frame = pd.read_csv(path, header=None, sep='\t')
    frame.columns = ["label", "text"]
    frame['label'] = '__label__' + frame['label'].astype(str)
    return frame


train = _load_labelled('../data/email_intent_train.txt')
test = _load_labelled('../data/email_intent_test.txt')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat([a, b]) is the supported equivalent and keeps the same row order.
data = pd.concat([train, test])

# Shuffle all rows. The fixed random_state makes the row order (and therefore
# the split files written from `data`) identical on every run.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
# split data into train (first 80% of rows), test (next 10%) and dev (last 10%)

# Compute the slice boundaries once so the three slices cannot drift apart.
train_end = int(len(data)*0.8)
test_end = int(len(data)*0.9)

# Every file is written tab-separated, without the index and without a header
# row. header=False is used for all three (the original passed header=None for
# train.csv only; to_csv documents the parameter as a bool or list of names).
for split_name, split_frame in (
    ("train", data.iloc[:train_end]),
    ("test", data.iloc[train_end:test_end]),
    ("dev", data.iloc[test_end:]),
):
    split_frame.to_csv(f'../data/practice_model/{split_name}.csv', sep='\t', index=False, header=False)

## Baseline: Naive Bayes, Linear SVM, Logistic Regression

* Easy Implementation - https://medium.com/data-from-the-trenches/text-classification-the-first-step-toward-nlp-mastery-f5f95d525d73

In [4]:
# preprocessing

# Punctuation and control characters that `preprocess` turns into spaces.
# The translation table is built once here instead of on every call.
_PREPROCESS_FILTERS = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
_PREPROCESS_TRANSLATION = str.maketrans(dict.fromkeys(_PREPROCESS_FILTERS, " "))


def preprocess(text):
    """Normalize one document for the bag-of-words / TF-IDF baselines.

    Steps, in order:
      1. strip leading and trailing whitespace,
      2. delete every run of digits,
      3. lower-case the text,
      4. replace each punctuation character, tab and newline with a space.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The normalized text. Spaces introduced by step 4 are kept, so the
        result may contain repeated or trailing spaces.
    """
    text = text.strip()

    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
    text = re.sub(r"\d+", "", text)

    text = text.lower()

    return text.translate(_PREPROCESS_TRANSLATION)

# Normalize the text column in place; the baseline split and vectorizer cells
# below read `data["text"]` after this step. The train/test/dev CSV files were
# written before this line, so they still contain the raw text.
data["text"] = data["text"].apply(preprocess)

In [5]:
# Baseline split: reshuffle the preprocessed rows, keep the first 90% for
# training and hold out the remaining 10% for testing.

shuffled_data = data.sample(frac=1).reset_index(drop=True)

# The training slice ends at int(0.9 * n); the test slice starts where it ends.
s_train = shuffled_data.iloc[:int(len(data)*0.9)]
s_test = shuffled_data.iloc[len(s_train):]

In [6]:
# Bag-of-words features

# English stop words are skipped and `preprocess` is supplied as the
# vectorizer's preprocessor (the text column was already passed through
# `preprocess` once when `data["text"]` was normalized).
vectorizer = CountVectorizer(preprocessor=preprocess, stop_words="english")

# Learn the vocabulary from the training split only, then encode both splits
# as matrices of per-document word counts.
training_features = vectorizer.fit_transform(s_train["text"])
test_features = vectorizer.transform(s_test["text"])

In [7]:
# TF-IDF features over unigrams and bigrams

# Same stop-word list and preprocessor as the bag-of-words vectorizer;
# ngram_range=(1, 2) adds two-word phrases to the vocabulary.
tf_vectorizer = TfidfVectorizer(
    preprocessor=preprocess,
    stop_words="english",
    ngram_range=(1, 2),
)

# Fit on the training split only, then encode both splits as TF-IDF weighted
# vectors (not raw word counts).
tf_training_features = tf_vectorizer.fit_transform(s_train["text"])
tf_test_features = tf_vectorizer.transform(s_test["text"])

In [8]:
# Gaussian Naive Bayes baseline on both feature sets

# The feature matrices are densified with .toarray() before being handed to
# GaussianNB (presumably because it needs dense input; the SVM and logistic
# regression cells pass the matrices as-is).

# -- bag-of-words features
nb_model = GaussianNB().fit(training_features.toarray(), s_train["label"])
nb_acc = nb_model.score(test_features.toarray(), s_test["label"])
print("Accuracy on the test dataset with BOW vectorization: {:.2f}".format(nb_acc*100))

# -- TF-IDF features
nb_tf_model = GaussianNB().fit(tf_training_features.toarray(), s_train["label"])
nb_tf_acc = nb_tf_model.score(tf_test_features.toarray(), s_test["label"])
print("Accuracy on the test dataset with TF vectorization: {:.2f}".format(nb_tf_acc*100))

Accuracy on the test dataset with BOW vectorization: 58.06
Accuracy on the test dataset with TF vectorization: 60.00


In [9]:
# Linear SVM baseline on both feature sets

# -- bag-of-words features: fit, predict on the held-out split, score
model = LinearSVC().fit(training_features, s_train["label"])
y_pred = model.predict(test_features)
acc = accuracy_score(s_test["label"], y_pred)
print("Accuracy on the test dataset with BOW vectorization: {:.2f}".format(acc*100))

# -- TF-IDF features: same procedure with a fresh classifier
tf_model = LinearSVC().fit(tf_training_features, s_train["label"])
tf_y_pred = tf_model.predict(tf_test_features)
tf_acc = accuracy_score(s_test["label"], tf_y_pred)
print("Accuracy on the test dataset with TF vectorization: {:.2f}".format(tf_acc*100))

Accuracy on the test dataset with BOW vectorization: 67.53
Accuracy on the test dataset with TF vectorization: 72.90


In [10]:
# Logistic Regression baseline on both feature sets

# -- bag-of-words features; .score() reports mean accuracy on the held-out split
log_model = LogisticRegression().fit(training_features, s_train["label"])
log_acc = log_model.score(test_features, s_test["label"])
print("Accuracy on the test dataset with BOW vectorization: {:.2f}".format(log_acc*100))

# -- TF-IDF features
log_tf_model = LogisticRegression().fit(tf_training_features, s_train["label"])
log_tf_acc = log_tf_model.score(tf_test_features, s_test["label"])
print("Accuracy on the test dataset with TF vectorization: {:.2f}".format(log_tf_acc*100))

Accuracy on the test dataset with BOW vectorization: 73.76
Accuracy on the test dataset with TF vectorization: 72.69


## Flair Model: Allows combination of different kinds of word embeddings

* Model - https://github.com/flairNLP/flair
* Explanation - https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/
* Easy Implementation - https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f

In [None]:
# Build the flair corpus from the split files

# folder that holds the train, test and dev files
data_folder = '../data/practice_model'

# CSV column index -> role: column 1 carries the text, column 0 the label
column_name_map = {1: "text", 0: "label_topic"}

# The files are tab-separated and skip_header=False tells flair not to skip a
# first row (the split files were written without a header).
corpus: Corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    skip_header=False,
    delimiter='\t',
)

In [66]:
# Train a flair text classifier on stacked word embeddings

# GloVe vectors plus the forward and backward "fast" Flair language models.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# An LSTM over the (re-projected) word embeddings yields one vector per document.
# NOTE(review): newer flair releases deprecate DocumentLSTMEmbeddings in favour
# of DocumentRNNEmbeddings - confirm against the installed version.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label classifier whose label set is derived from the corpus.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False,
)
trainer = ModelTrainer(classifier, corpus)

# './' is passed as the first argument (presumably the output directory -
# confirm against the flair version in use). The call stays the last
# expression so its return value is shown as the cell output.
trainer.train('./', max_epochs=10)

2020-05-24 16:36:44,102 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpqk89ldj2


100%|██████████| 160000128/160000128 [03:39<00:00, 730082.18B/s] 

2020-05-24 16:40:23,868 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpqk89ldj2 to cache at /Users/nataliewang/.flair/embeddings/glove.gensim.vectors.npy





2020-05-24 16:40:24,094 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpqk89ldj2
2020-05-24 16:40:24,815 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpvk4ktywm


100%|██████████| 21494764/21494764 [00:26<00:00, 812018.60B/s] 

2020-05-24 16:40:52,148 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpvk4ktywm to cache at /Users/nataliewang/.flair/embeddings/glove.gensim
2020-05-24 16:40:52,181 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpvk4ktywm





2020-05-24 16:40:53,986 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp3mjfp051


100%|██████████| 19689779/19689779 [00:24<00:00, 805576.13B/s] 

2020-05-24 16:41:19,129 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp3mjfp051 to cache at /Users/nataliewang/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2020-05-24 16:41:19,162 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp3mjfp051





2020-05-24 16:41:19,905 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp6ci7guch


100%|██████████| 19689779/19689779 [00:20<00:00, 943009.28B/s] 

2020-05-24 16:41:41,701 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp6ci7guch to cache at /Users/nataliewang/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt
2020-05-24 16:41:41,734 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp6ci7guch



  


2020-05-24 16:41:41,792 Computing label dictionary. Progress:


100%|██████████| 4182/4182 [00:01<00:00, 3970.32it/s]

2020-05-24 16:41:42,918 [b'__label__No', b'__label__Yes']
2020-05-24 16:41:42,925 ----------------------------------------------------------------------------------------------------
2020-05-24 16:41:42,927 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=214




2020-05-24 16:42:01,902 epoch 1 - iter 11/117 - loss 0.70836253 - samples/sec: 18.65
2020-05-24 16:42:22,094 epoch 1 - iter 22/117 - loss 0.69428441 - samples/sec: 17.50
2020-05-24 16:42:43,909 epoch 1 - iter 33/117 - loss 0.69242128 - samples/sec: 16.19
2020-05-24 16:43:08,382 epoch 1 - iter 44/117 - loss 0.67784121 - samples/sec: 14.85
2020-05-24 16:43:29,958 epoch 1 - iter 55/117 - loss 0.67401460 - samples/sec: 16.36
2020-05-24 16:43:49,654 epoch 1 - iter 66/117 - loss 0.66823633 - samples/sec: 17.92
2020-05-24 16:44:12,963 epoch 1 - iter 77/117 - loss 0.66822653 - samples/sec: 15.14
2020-05-24 16:44:32,273 epoch 1 - iter 88/117 - loss 0.66663722 - samples/sec: 19.02
2020-05-24 16:44:53,397 epoch 1 - iter 99/117 - loss 0.67022049 - samples/sec: 16.72
2020-05-24 16:45:16,356 epoch 1 - iter 110/117 - loss 0.66924216 - samples/sec: 15.37
2020-05-24 16:45:32,523 ----------------------------------------------------------------------------------------------------
2020-05-24 16:45:32,524 

2020-05-24 17:11:24,005 epoch 7 - iter 66/117 - loss 0.51994762 - samples/sec: 15.92
2020-05-24 17:11:49,245 epoch 7 - iter 77/117 - loss 0.51758517 - samples/sec: 13.98
2020-05-24 17:12:09,247 epoch 7 - iter 88/117 - loss 0.52253921 - samples/sec: 17.65
2020-05-24 17:12:32,898 epoch 7 - iter 99/117 - loss 0.52428676 - samples/sec: 14.92
2020-05-24 17:12:51,971 epoch 7 - iter 110/117 - loss 0.51994464 - samples/sec: 18.52
2020-05-24 17:13:12,028 ----------------------------------------------------------------------------------------------------
2020-05-24 17:13:12,035 EPOCH 7 done: loss 0.5197 - lr 0.1000000
2020-05-24 17:13:48,998 DEV : loss 1.6004676818847656 - score 0.4181
2020-05-24 17:13:49,217 BAD EPOCHS (no improvement): 1
2020-05-24 17:13:49,224 ----------------------------------------------------------------------------------------------------
2020-05-24 17:14:12,126 epoch 8 - iter 11/117 - loss 0.64657290 - samples/sec: 15.46
2020-05-24 17:14:30,924 epoch 8 - iter 22/117 - lo

{'test_score': 0.771551724137931,
 'dev_score_history': [0.5021551724137931,
  0.5258620689655172,
  0.4849137931034483,
  0.625,
  0.7068965517241379,
  0.7262931034482759,
  0.41810344827586204,
  0.7327586206896551,
  0.6961206896551724,
  0.7413793103448276],
 'train_loss_history': [0.6649040309791892,
  0.6310247648984958,
  0.6030245005575001,
  0.5808376736111112,
  0.5557609664069282,
  0.5466362495198209,
  0.5196669252000303,
  0.5333108894335918,
  0.5089793034598359,
  0.4998229920354664],
 'dev_loss_history': [0.7192593812942505,
  0.7386118769645691,
  0.9432142376899719,
  0.8019097447395325,
  0.5636223554611206,
  0.5493430495262146,
  1.6004676818847656,
  0.5582072734832764,
  0.624588668346405,
  0.529371976852417]}

### Flair Test Score with default params: 0.77155

## FastAI: ULMFiT - Discriminative fine-tuning, Slanted triangular learning rates, and Gradual unfreezing

* Explanation - https://arxiv.org/pdf/1801.06146.pdf
* Easy Implementation - https://www.analyticsvidhya.com/blog/2018/11/tutorial-text-classification-ulmfit-fastai-library/

## FB's FastText

* Model - https://github.com/facebookresearch/fastText
* Explanation - https://arxiv.org/pdf/1607.01759.pdf
* Easy Implementation - https://idevji.com/2017/11/04/tutorial-text-classification-with-python-using-fasttext/

In [69]:
# Train a supervised fastText model on the training split (10 epochs,
# hierarchical-softmax loss) and evaluate it on the dev file.
ft_model = fasttext.train_supervised(
    "../data/practice_model/train.csv",
    epoch=10,
    loss='hs',
)
results = ft_model.test("../data/practice_model/dev.csv")

# `results` is not printed at the moment; the prints below are kept disabled.
# print("Precision on dev dataset: {:.2f}".format(results[1]))
# print("Recall on dev dataset: {:.2f}".format(results[2]))

# Held-out test file: column 0 holds the label, column 1 the text.
test_data = pd.read_csv("../data/practice_model/test.csv", sep='\t', header=None)

# Predict each row individually and keep the first label that predict() returns.
ft_y_pred = test_data[1].apply(lambda row_text: ft_model.predict(row_text)[0][0])
ft_acc = accuracy_score(test_data[0], ft_y_pred)

print("Accuracy on the test dataset with FastText: {:.2f}".format(ft_acc*100))

Accuracy on the test dataset with FastText: 80.65


## HuggingFace Transformers

* Model - https://github.com/huggingface/transformers
* Explanation - TODO: add a reference (candidate: https://arxiv.org/pdf/1910.03771.pdf)
* Easy Implementation - https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed

## EXAM: Explicit Interaction Mechanism towards Text Classification (Encode, Interaction, Aggregation)

* Model - https://github.com/NonvolatileMemory/AAAI_2019_EXAM
* Explanation - https://arxiv.org/pdf/1811.09386.pdf