In [96]:
import pandas as pd

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# flair model stuff
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
# from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

# fastai ULMFiT model stuff
import fastai
from fastai import *
from fastai.text import *

# fasttext model stuff
import fasttext

# keras for cnn stuff
from nltk import word_tokenize
import gensim
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

In [2]:
# read in data and format for model

train = pd.read_csv('../data/email_intent_train.txt', header=None, sep='\t')
train.columns = ["label", "text"]
train['label'] = '__label__' + train['label'].astype(str)

test = pd.read_csv('../data/email_intent_test.txt', header=None, sep='\t')
test.columns = ["label", "text"]
test['label'] = '__label__' + test['label'].astype(str)

data = train.append(test)
data = data.sample(frac=1).reset_index(drop=True)

In [3]:
# split data into train, dev, and test

data.iloc[0:int(len(data)*0.8)].to_csv('../data/practice_model/train.csv', sep='\t', index = False, header = None)
data.iloc[int(len(data)*0.8):int(len(data)*0.9)].to_csv('../data/practice_model/test.csv', sep='\t', index = False, header = False)
data.iloc[int(len(data)*0.9):].to_csv('../data/practice_model/dev.csv', sep='\t', index = False, header = False);

## Baseline: Naive Bayes, Linear SVM, Logistic Regression

* Easy Implementation - https://medium.com/data-from-the-trenches/text-classification-the-first-step-toward-nlp-mastery-f5f95d525d73

In [4]:
# preprocessing

def preprocess(text):
    
    # strip whitespaces
    text = text.strip()
    
    # remove numbers
    text = re.sub("(\d)+", "", text)
    
    # lower case everything
    text = text.lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    
    return text

data["text"] = data["text"].apply(preprocess)

In [5]:
# split data into train and test

shuffled_data = data.sample(frac=1).reset_index(drop=True)
s_train = shuffled_data.iloc[0:int(len(data)*0.9)]
s_test = shuffled_data.iloc[int(len(data)*0.9):]

In [6]:
# Bag of Words vectorization

# this vectorizer will skip stop words
vectorizer = CountVectorizer(
    stop_words="english",
    preprocessor=preprocess
)

# fit the vectorizer on the training text
training_features = vectorizer.fit_transform(s_train["text"])

# Transform each text into a vector of word counts
test_features = vectorizer.transform(s_test["text"])

In [7]:
# Term Frequency vectorization
# TODO Different kinds of tf

# this vectorizer will skip stop words
tf_vectorizer = TfidfVectorizer(stop_words="english",
                             preprocessor=preprocess,
                             ngram_range=(1, 2))


# fit the vectorizer on the training text
tf_training_features = tf_vectorizer.fit_transform(s_train["text"])

# Transform each text into a vector of word counts
tf_test_features = tf_vectorizer.transform(s_test["text"])

In [8]:
# Training Naive Bayes

nb_model = GaussianNB()
nb_model.fit(training_features.toarray(), s_train["label"])
nb_acc = nb_model.score(test_features.toarray(), s_test["label"])

print("Accuracy on the test dataset with BOW vectorization: {:.2f}".format(nb_acc*100))

nb_tf_model = GaussianNB()
nb_tf_model.fit(tf_training_features.toarray(), s_train["label"])
nb_tf_acc = nb_tf_model.score(tf_test_features.toarray(), s_test["label"])

print("Accuracy on the test dataset with TF vectorization: {:.2f}".format(nb_tf_acc*100))

Accuracy on the test dataset with BOW vectorization: 55.27
Accuracy on the test dataset with TF vectorization: 59.35


In [9]:
# Training Linear SVM

model = LinearSVC()
model.fit(training_features, s_train["label"])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(s_test["label"], y_pred)

print("Accuracy on the test dataset with BOW vectorization: {:.2f}".format(acc*100))

tf_model = LinearSVC()
tf_model.fit(tf_training_features, s_train["label"])
tf_y_pred = tf_model.predict(tf_test_features)

# Evaluation
tf_acc = accuracy_score(s_test["label"], tf_y_pred)

print("Accuracy on the test dataset with TF vectorization: {:.2f}".format(tf_acc*100))

Accuracy on the test dataset with BOW vectorization: 71.18
Accuracy on the test dataset with TF vectorization: 72.90


In [10]:
# Training Logistic Regression

log_model = LogisticRegression()
log_model.fit(training_features, s_train["label"])
log_acc = log_model.score(test_features, s_test["label"])

print("Accuracy on the test dataset with BOW vectorization: {:.2f}".format(log_acc*100))

log_tf_model = LogisticRegression()
log_tf_model.fit(tf_training_features, s_train["label"])
log_tf_acc = log_tf_model.score(tf_test_features, s_test["label"])

print("Accuracy on the test dataset with TF vectorization: {:.2f}".format(log_tf_acc*100))

Accuracy on the test dataset with BOW vectorization: 71.18
Accuracy on the test dataset with TF vectorization: 70.54


## Flair Model: Allows combination of different kinds of word embeddings

* Model - https://github.com/flairNLP/flair
* Explanation - https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/
* Easy Implementation - https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f

In [11]:
# load in corpus

# this is the folder in which train, test and dev files reside
data_folder = '../data/practice_model'

# column format indicating which columns hold the text and label(s)
column_name_map = {1: "text", 0: "label_topic"}

# load corpus containing training, test and dev data
corpus: Corpus = CSVClassificationCorpus(data_folder,
                                         column_name_map,
                                         skip_header=False,
                                         delimiter='\t',    
) 

2020-06-01 12:05:35,189 Reading data from ../data/practice_model
2020-06-01 12:05:35,190 Train: ../data/practice_model/train.csv
2020-06-01 12:05:35,191 Dev: ../data/practice_model/dev.csv
2020-06-01 12:05:35,192 Test: ../data/practice_model/test.csv


In [66]:
# using Flair model

word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)

classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)
trainer = ModelTrainer(classifier, corpus)

trainer.train('./', max_epochs=10)

2020-05-24 16:36:44,102 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpqk89ldj2


100%|██████████| 160000128/160000128 [03:39<00:00, 730082.18B/s] 

2020-05-24 16:40:23,868 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpqk89ldj2 to cache at /Users/nataliewang/.flair/embeddings/glove.gensim.vectors.npy





2020-05-24 16:40:24,094 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpqk89ldj2
2020-05-24 16:40:24,815 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpvk4ktywm


100%|██████████| 21494764/21494764 [00:26<00:00, 812018.60B/s] 

2020-05-24 16:40:52,148 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpvk4ktywm to cache at /Users/nataliewang/.flair/embeddings/glove.gensim
2020-05-24 16:40:52,181 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmpvk4ktywm





2020-05-24 16:40:53,986 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp3mjfp051


100%|██████████| 19689779/19689779 [00:24<00:00, 805576.13B/s] 

2020-05-24 16:41:19,129 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp3mjfp051 to cache at /Users/nataliewang/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2020-05-24 16:41:19,162 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp3mjfp051





2020-05-24 16:41:19,905 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp6ci7guch


100%|██████████| 19689779/19689779 [00:20<00:00, 943009.28B/s] 

2020-05-24 16:41:41,701 copying /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp6ci7guch to cache at /Users/nataliewang/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt
2020-05-24 16:41:41,734 removing temp file /var/folders/__/6ygy68tn6c74340_57scyx500000gn/T/tmp6ci7guch



  


2020-05-24 16:41:41,792 Computing label dictionary. Progress:


100%|██████████| 4182/4182 [00:01<00:00, 3970.32it/s]

2020-05-24 16:41:42,918 [b'__label__No', b'__label__Yes']
2020-05-24 16:41:42,925 ----------------------------------------------------------------------------------------------------
2020-05-24 16:41:42,927 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=214




2020-05-24 16:42:01,902 epoch 1 - iter 11/117 - loss 0.70836253 - samples/sec: 18.65
2020-05-24 16:42:22,094 epoch 1 - iter 22/117 - loss 0.69428441 - samples/sec: 17.50
2020-05-24 16:42:43,909 epoch 1 - iter 33/117 - loss 0.69242128 - samples/sec: 16.19
2020-05-24 16:43:08,382 epoch 1 - iter 44/117 - loss 0.67784121 - samples/sec: 14.85
2020-05-24 16:43:29,958 epoch 1 - iter 55/117 - loss 0.67401460 - samples/sec: 16.36
2020-05-24 16:43:49,654 epoch 1 - iter 66/117 - loss 0.66823633 - samples/sec: 17.92
2020-05-24 16:44:12,963 epoch 1 - iter 77/117 - loss 0.66822653 - samples/sec: 15.14
2020-05-24 16:44:32,273 epoch 1 - iter 88/117 - loss 0.66663722 - samples/sec: 19.02
2020-05-24 16:44:53,397 epoch 1 - iter 99/117 - loss 0.67022049 - samples/sec: 16.72
2020-05-24 16:45:16,356 epoch 1 - iter 110/117 - loss 0.66924216 - samples/sec: 15.37
2020-05-24 16:45:32,523 ----------------------------------------------------------------------------------------------------
2020-05-24 16:45:32,524 

2020-05-24 17:11:24,005 epoch 7 - iter 66/117 - loss 0.51994762 - samples/sec: 15.92
2020-05-24 17:11:49,245 epoch 7 - iter 77/117 - loss 0.51758517 - samples/sec: 13.98
2020-05-24 17:12:09,247 epoch 7 - iter 88/117 - loss 0.52253921 - samples/sec: 17.65
2020-05-24 17:12:32,898 epoch 7 - iter 99/117 - loss 0.52428676 - samples/sec: 14.92
2020-05-24 17:12:51,971 epoch 7 - iter 110/117 - loss 0.51994464 - samples/sec: 18.52
2020-05-24 17:13:12,028 ----------------------------------------------------------------------------------------------------
2020-05-24 17:13:12,035 EPOCH 7 done: loss 0.5197 - lr 0.1000000
2020-05-24 17:13:48,998 DEV : loss 1.6004676818847656 - score 0.4181
2020-05-24 17:13:49,217 BAD EPOCHS (no improvement): 1
2020-05-24 17:13:49,224 ----------------------------------------------------------------------------------------------------
2020-05-24 17:14:12,126 epoch 8 - iter 11/117 - loss 0.64657290 - samples/sec: 15.46
2020-05-24 17:14:30,924 epoch 8 - iter 22/117 - lo

{'test_score': 0.771551724137931,
 'dev_score_history': [0.5021551724137931,
  0.5258620689655172,
  0.4849137931034483,
  0.625,
  0.7068965517241379,
  0.7262931034482759,
  0.41810344827586204,
  0.7327586206896551,
  0.6961206896551724,
  0.7413793103448276],
 'train_loss_history': [0.6649040309791892,
  0.6310247648984958,
  0.6030245005575001,
  0.5808376736111112,
  0.5557609664069282,
  0.5466362495198209,
  0.5196669252000303,
  0.5333108894335918,
  0.5089793034598359,
  0.4998229920354664],
 'dev_loss_history': [0.7192593812942505,
  0.7386118769645691,
  0.9432142376899719,
  0.8019097447395325,
  0.5636223554611206,
  0.5493430495262146,
  1.6004676818847656,
  0.5582072734832764,
  0.624588668346405,
  0.529371976852417]}

### Flair Test Score with default params: 0.77155

## FastAI: ULMFiT - Discriminative fine-tuning, Slanted triangular learning rates, and Gradual unfreezing

* Model Doc - https://docs.fast.ai/index.html
* Explanation - https://arxiv.org/pdf/1801.06146.pdf
* Easy Implementation - https://www.analyticsvidhya.com/blog/2018/11/tutorial-text-classification-ulmfit-fastai-library/

In [30]:
train = pd.read_csv("../data/practice_model/train.csv", sep='\t', header = None)
dev = pd.read_csv("../data/practice_model/dev.csv", sep='\t', header = None)
test = pd.read_csv("../data/practice_model/test.csv", sep='\t', header = None)

# Language model data
data_lm = TextLMDataBunch.from_df(train_df=train, valid_df=dev, test_df=test, path="")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = train, valid_df = dev, vocab=data_lm.train_ds.vocab, bs=32)

In [32]:
# fine-tune pre-trained language model

learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7)

In [33]:
# train the learner object with learning rate = 1e-2

learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,4.824266,4.155685,0.240179,00:53


In [34]:
# save encoder for later

learn.save_encoder('fastai_encoder')

In [37]:
# build classifier with encoder

learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)
learn.load_encoder('fastai_encoder')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3719 items)
x: TextList
xxbos i do n't get to enjoy life 's pleasures .,xxbos xxmaj you do n't need to do this right away , when you get a chance go into xxmaj unify for xxmaj feb and show an actual volume of 0 for the 25th - 28th and a volume of xxunk for the 28th .,xxbos xxmaj in a recent discussion with one of xxmaj enron 's folks the xxmaj city ( or their lawyer ) raised the issue that xxmaj xxunk xxmaj xxunk had represented them in the past , and that there might be a potential xxunk .,xxbos i did n't get home until 10 and brian was on the phone xxunk 11 , so i figured you were xxunk by then .,xxbos i will be here all day tomorrow .
y: CategoryList
__label__No,__label__Yes,__label__No,__label__No,__label__No
Path: .;

Valid: LabelList (465 items)
x: TextList
xxbos xxmaj please add xxmaj xxunk xxmaj xxunk to our bidweek index survey spreadsheet .,xxbos i would like to invite you to address the group on e - xxmaj xxunk issues at 

In [38]:
# fit model

learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.642192,0.537274,0.731183,00:47
1,0.591593,0.57696,0.666667,00:44
2,0.600386,0.524698,0.724731,00:53
3,0.587529,0.505716,0.744086,00:53
4,0.567034,0.539854,0.72043,00:47
5,0.560789,0.52064,0.72043,00:46
6,0.567282,0.515134,0.71828,00:47
7,0.553607,0.495926,0.729032,00:47
8,0.548344,0.477352,0.769892,00:47
9,0.553365,0.49266,0.746237,00:47


## FB's FastText

* Model - https://github.com/facebookresearch/fastText
* Explanation - https://arxiv.org/pdf/1607.01759.pdf
* Easy Implementation - https://idevji.com/2017/11/04/tutorial-text-classification-with-python-using-fasttext/

In [69]:
ft_model = fasttext.train_supervised("../data/practice_model/train.csv", epoch=10, loss='hs')
results = ft_model.test("../data/practice_model/dev.csv")

# print("Precision on dev dataset: {:.2f}".format(results[1]))
# print("Recall on dev dataset: {:.2f}".format(results[2]))

test_data = pd.read_csv("../data/practice_model/test.csv", sep='\t', header=None)

# Evaluation
ft_y_pred = test_data[1].apply(lambda x: ft_model.predict(x)[0][0])
ft_acc = accuracy_score(test_data[0], ft_y_pred)

print("Accuracy on the test dataset with FastText: {:.2f}".format(ft_acc*100))

Accuracy on the test dataset with FastText: 80.65


## CNN

* Explanation - https://www.aclweb.org/anthology/D14-1181.pdf
* Easy Implementation - https://towardsdatascience.com/cnn-sentiment-analysis-1d16b7c5a0e7

In [58]:
train = pd.read_csv("../data/practice_model/train.csv", sep='\t', header = None)
dev = pd.read_csv("../data/practice_model/dev.csv", sep='\t', header = None)
test = pd.read_csv("../data/practice_model/test.csv", sep='\t', header = None)

In [67]:
s_train["tokens"] = [word_tokenize(sentence) for sentence in s_train["text"]]
s_test["tokens"] = [word_tokenize(sentence) for sentence in s_test["text"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [90]:
# training vocabulary

all_training_words = [word for tokens in s_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in s_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))

print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

70297 words total, with a vocabulary size of 6820
Max sentence length is 212


In [91]:
# test vocabulary 

all_test_words = [word for tokens in s_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in s_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))

print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

7754 words total, with a vocabulary size of 1945
Max sentence length is 89


In [141]:
# load in google news word2vec model

word2vec = gensim.models.KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin.gz', binary=True)  


In [142]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [143]:
training_embeddings = get_word2vec_embeddings(word2vec, s_train, generate_missing=True)

In [144]:

MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

In [145]:
train_nos = [1 if label == "__label__No" else 0 for label in s_train["label"]]
train_yes = [0 if label == "__label__No" else 1 for label in s_train["label"]]

s_train["pos"] = train_yes
s_train["neg"] = train_nos

test_nos = [1 if label == "__label__No" else 0 for label in s_test["label"]]
test_yes = [0 if label == "__label__No" else 1 for label in s_test["label"]]

s_test["pos"] = test_yes
s_test["neg"] = test_nos

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value

In [146]:
s_train.head()

Unnamed: 0,label,text,tokens,pos,neg
0,__label__No,quora keeps throwing mouth watering insults at...,"[quora, keeps, throwing, mouth, watering, insu...",0,1
1,__label__No,just after lunch if possible,"[just, after, lunch, if, possible]",0,1
2,__label__No,parquet ever get in touch with you,"[parquet, ever, get, in, touch, with, you]",0,1
3,__label__Yes,i wanted to see if we could set up a time to m...,"[i, wanted, to, see, if, we, could, set, up, a...",1,0
4,__label__Yes,chris you set up a meeting with me next week t...,"[chris, you, set, up, a, meeting, with, me, ne...",1,0


In [179]:
s_test.head()

Unnamed: 0,label,text,tokens,pos,neg
4184,__label__Yes,can you come and see me when you get in,"[can, you, come, and, see, me, when, you, get,...",1,0
4185,__label__Yes,if questions arise around an eol deal please ...,"[if, questions, arise, around, an, eol, deal, ...",1,0
4186,__label__No,if you re looking for work or just want to ma...,"[if, you, re, looking, for, work, or, just, wa...",0,1
4187,__label__No,when you sow a faith seed let me tell you w...,"[when, you, sow, a, faith, seed, let, me, tell...",0,1
4188,__label__Yes,quick give me a cheap stock that looks promis...,"[quick, give, me, a, cheap, stock, that, looks...",1,0


In [147]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(s_train["text"].tolist())
training_sequences = tokenizer.texts_to_sequences(s_train["text"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 6838 unique tokens.


In [148]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [149]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(6839, 300)


In [150]:
test_sequences = tokenizer.texts_to_sequences(s_test["text"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [151]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [152]:
label_names = ['pos', 'neg']

In [153]:
y_train = s_train[label_names].values

In [161]:
y_train

array([[0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       ...,
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1]])

In [162]:
x_train = train_cnn_data
y_tr = y_train

In [163]:
len(list(label_names))

2

In [164]:

model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 50, 300)      2051700     input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_24 (Conv1D)              (None, 49, 200)      120200      embedding_8[0][0]                
__________________________________________________________________________________________________
conv1d_25 (Conv1D)              (None, 48, 200)      180200      embedding_8[0][0]                
____________________________________________________________________________________________

In [165]:
num_epochs = 10
batch_size = 34

In [166]:
# train model

hist = model.fit(x_train, y_train, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Train on 3765 samples, validate on 419 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [167]:
# test model

predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [168]:
labels = [1, 0]

In [169]:

prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [178]:
sum(s_test.pos==prediction_labels)/len(prediction_labels)

0.8344086021505376

In [174]:
s_test.label.value_counts()

__label__No     270
__label__Yes    195
Name: label, dtype: int64

## HuggingFace Transformers

* Model - https://github.com/huggingface/transformers
* Explanation - 
* Easy Implementation - https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed

## EXAM: Explicit Interaction Mechanism towards Text Classification (Encode, Interaction, Aggregation)

* Model - https://github.com/NonvolatileMemory/AAAI_2019_EXAM
* Explanation - https://arxiv.org/pdf/1811.09386.pdf