# _Word Embeddings & Text Classification_

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
#!kaggle datasets download -d uciml/sms-spam-collection-dataset -p ~/.kaggle
#!unzip ~/.kaggle/sms-spam-collection-dataset.zip

In [25]:
from pathlib import Path
import pandas as pd
import re
import string

import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

import spacy
import numpy as np

## _Get Data_

In [33]:
def datapath():
    # gets path to resources folder
    return Path().cwd().parent / 'resources'

In [34]:
datapath = datapath()

In [6]:
def get_text(datapath):
    df = pd.read_csv(datapath/'spam.csv', encoding='ISO-8859-1')
    df = df[['v1','v2']]
    df.columns = ['labels','text']
    df.labels = df.labels.replace('ham',0)
    df.labels = df.labels.replace('spam',1)
    df = df.sample(frac=1, random_state=1).reset_index(drop=True)
    return df

In [7]:
df = get_text(datapath)
df.head(10)

Unnamed: 0,labels,text
0,0,Convey my regards to him
1,0,"[Û_] anyway, many good evenings to u! s"
2,0,My sort code is and acc no is . The bank is n...
3,0,Sorry i din lock my keypad.
4,1,"Hi babe its Chloe, how r u? I was smashed on s..."
5,0,Ok i thk i got it. Then u wan me 2 come now or...
6,0,Oi when you gonna ring
7,0,Will be office around 4 pm. Now i am going hos...
8,0,Have you heard about that job? I'm going to th...
9,0,Oh my God. I'm almost home


In [8]:
print('number of spams: ',len(df[df.labels==1]))
print('number of non-spams: ',len(df[df.labels==0]))
print('number of documents: ', len(df))

number of spams:  747
number of non-spams:  4825
number of documents:  5572


## _Text Normalization and Tokenization_

In [9]:
def sub_special_tokens(text):
    # note I stole many of these regexes regularly from S.O.
    # convert simple URLs to xxurl token (e.g. www.google.com, http:google.com -> xxurl)
    text = re.sub(r' www.', ' http://www.', text)
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', ' xxurl ', text)
    # convert (British) phone numbers to xxphone token (e.g. 09058097218 -> xxphone)
    pat = r'\d{3}[-\.\s]??\d{4}[-\.\s]??\d{4}|\d{5}[-\.\s]??\d{3}[-\.\s]??\d{3}|(?:\d{4}\)?[\s-]?\d{3}[\s-]?\d{4})'
    text = re.sub(pat, ' xxphone ', text)
    # replace monetary values with xxmon token
    text = text.replace('£','$ ')
    text = re.sub(r'(\d+)[ ]{0,1}p', '$ 0.\1', text)
    text = re.sub(r'\$[ ]*(\d+[,\.])*\d+', ' xxmon ', text)
    # put xxup token before words in all caps (easy way to recognize info from capitalizing a word)
    text = re.sub(r'(\b[A-Z][A-Z0-9]*\b)', r' xxup \1 ', text)
    # put xxcap token before words with capitalized first letter (easy way to recognize first word in a sentence)
    text = re.sub(r'(\b[A-Z][a-z0-9]+\b)', r' xxcap \1 ', text)
    # convert some common text "emojis" to xxemoji: ;), :), :(, :-(, etc
    text = re.sub(r'[:;][ ]*[-]*[ ]*[()]', ' xxemoji ', text)
    return text

def normalize_text(text):
    # converts common patterns into special tokens
    text = sub_special_tokens(text)
    # convert text to lowercase
    text = text.lower()
    # strip out any lingering html tags
    text = re.sub(r'<[^>]*>', '', text)
    # convert all common abrevs to regular word
    text = text.replace('&',' and ')
    text = re.sub(r'\bu\b', ' you ', text)
    text = re.sub(r'\bur\b', ' your ', text)
    text = re.sub(r'\b2\b', ' to ', text)
    text = re.sub(r'\b4\b', ' for ', text)
    # put spaces between punctuation (eg: 9.Blah -> 9 . Blah)
    puncts = r'[' + re.escape(string.punctuation) + r']'
    text = re.sub('(?<! )(?=' + puncts + ')|(?<=' + puncts + ')(?! )', r' ', text)
    # strip non-ascii characters (easy way to denoise text a bit)
    text = text.encode("ascii", errors="ignore").decode()
    # remove all punctuation except ?
    text = re.sub(r"[^\w\s?]",' xxpunct ',text)
    # convert all other numbers to xxnum token (e.g. 123, 1.2.3, 1-2-3 -> xxnum)
    text = re.sub(r'\b([.-]*[0-9]+[.-]*)+\b', ' xxnum ', text)
    # remove nltk's common set of stop words (common for classical NLP analysis)
    stop_words = stopwords.words('english')
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # stem words using nltk snowball stemmer, e.g. converts {run, running, runs} all to "run"
    stemmer = SnowballStemmer('english')
    stemmed_text = ''
    for word in text.split():
            stemmed_text = stemmed_text + stemmer.stem(word) + ' '
    text = stemmed_text
    # sub the occurance of 2 or more spaces with a single space
    text = re.sub(r'[ ]{2,}',' ',text)
    return text

In [10]:
df['text_processed'] = df['text'].apply(normalize_text)
df.head(10)

Unnamed: 0,labels,text,text_processed
0,0,Convey my regards to him,xxcap convey regard
1,0,"[Û_] anyway, many good evenings to u! s",xxpunct _ xxpunct anyway xxpunct mani good eve...
2,0,My sort code is and acc no is . The bank is n...,xxcap sort code acc xxpunct xxcap bank natwest...
3,0,Sorry i din lock my keypad.,xxcap sorri din lock keypad xxpunct
4,1,"Hi babe its Chloe, how r u? I was smashed on s...",xxcap hi babe xxcap chloe xxpunct r ? xxup sma...
5,0,Ok i thk i got it. Then u wan me 2 come now or...,xxcap ok thk got xxpunct xxcap wan come wat ?
6,0,Oi when you gonna ring,xxcap oi gonna ring
7,0,Will be office around 4 pm. Now i am going hos...,xxcap offic around xxmon xxpunct xxpunct xxpun...
8,0,Have you heard about that job? I'm going to th...,xxcap heard job ? xxup xxpunct go wildlif talk...
9,0,Oh my God. I'm almost home,xxcap oh xxcap god xxpunct xxup xxpunct almost...


In [11]:
nlp = spacy.load('en_core_web_sm')

def tokenize(text, nlp):
    doc = nlp(text)
    tokens = ' '.join(token.text for token in doc)
    return tokens

In [15]:
df['text_processed'] = df['text_processed'].apply(lambda x: tokenize(x, nlp))

In [21]:
df.to_csv(datapath/'spam_processed.csv', index=False)

In [35]:
def spampath():
    # gets path to resources folder
    return Path().cwd().parent / 'spam_data'

spampath = spampath()

In [26]:
train, validate, test = np.split(df.sample(frac=1, random_state=1), [int(0.6*len(df)), int(0.8*len(df))])

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3343 entries, 1078 to 425
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   labels          3343 non-null   int64 
 1   text            3343 non-null   object
 2   text_processed  3343 non-null   object
dtypes: int64(1), object(2)
memory usage: 104.5+ KB


In [37]:
train.to_csv(spampath/'train.csv', index=False)

In [38]:
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1114 entries, 697 to 371
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   labels          1114 non-null   int64 
 1   text            1114 non-null   object
 2   text_processed  1114 non-null   object
dtypes: int64(1), object(2)
memory usage: 34.8+ KB


In [39]:
validate.to_csv(spampath/'dev.csv', index=False)

In [40]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1115 entries, 3482 to 5157
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   labels          1115 non-null   int64 
 1   text            1115 non-null   object
 2   text_processed  1115 non-null   object
dtypes: int64(1), object(2)
memory usage: 34.8+ KB


In [41]:
test.to_csv(spampath/'test.csv', index=False)

## _Embeddings_

In [46]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, DocumentRNNEmbeddings, Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings([glove_embedding,
                                              flair_embedding_backward,
                                              flair_embedding_forward])

In [50]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

data_folder = spampath

column_name_map = {2: 'text', 0: 'label_topic'}

corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    skip_header=True,
    delimiter=','
)

2020-05-21 20:33:00,079 Reading data from /notebooks/learning/spam_data
2020-05-21 20:33:00,080 Train: /notebooks/learning/spam_data/train.csv
2020-05-21 20:33:00,081 Dev: /notebooks/learning/spam_data/dev.csv
2020-05-21 20:33:00,081 Test: /notebooks/learning/spam_data/test.csv


In [51]:
# create the label dictionary
label_dict = corpus.make_label_dictionary()

2020-05-21 20:33:01,423 Computing label dictionary. Progress:


100%|██████████| 3343/3343 [00:00<00:00, 3625.74it/s]

2020-05-21 20:33:02,631 [b'0', b'1']





In [52]:
# make a list of word embeddings
word_embeddings = [WordEmbeddings('glove')]

# init document embedding 
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256
)

In [53]:
# create text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [54]:
# init text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# start the training
trainer.train(spampath, max_epochs=10)

2020-05-21 20:36:14,533 ----------------------------------------------------------------------------------------------------
2020-05-21 20:36:14,534 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
    (rnn): GRU(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2020-05-21 20:36:14,535 ----------------------------------------------------------------------------------------------------
2020-05-21 20:36:14,536 Corpus: "Corpus: 3343 train + 1114 dev + 1115 test sentences"
2020-05-21 20:36:14,536 ----------------------------------------------------------------------------------------------------
2020-05-21 20:36

2020-05-21 20:37:14,200 EPOCH 5 done: loss 0.1506 - lr 0.1000
2020-05-21 20:37:16,064 DEV : loss 0.17742173373699188 - score 0.9434
2020-05-21 20:37:16,624 BAD EPOCHS (no improvement): 1
2020-05-21 20:37:16,625 ----------------------------------------------------------------------------------------------------
2020-05-21 20:37:17,532 epoch 6 - iter 10/105 - loss 0.10164637 - samples/sec: 594.21
2020-05-21 20:37:18,062 epoch 6 - iter 20/105 - loss 0.12046549 - samples/sec: 659.01
2020-05-21 20:37:18,609 epoch 6 - iter 30/105 - loss 0.13060856 - samples/sec: 608.70
2020-05-21 20:37:19,207 epoch 6 - iter 40/105 - loss 0.12946705 - samples/sec: 566.98
2020-05-21 20:37:19,832 epoch 6 - iter 50/105 - loss 0.14134797 - samples/sec: 545.76
2020-05-21 20:37:20,406 epoch 6 - iter 60/105 - loss 0.14396010 - samples/sec: 582.32
2020-05-21 20:37:20,900 epoch 6 - iter 70/105 - loss 0.14024993 - samples/sec: 698.36
2020-05-21 20:37:22,862 epoch 6 - iter 80/105 - loss 0.13855941 - samples/sec: 635.47


{'test_score': 0.9677,
 'dev_score_history': [0.8716,
  0.8914,
  0.9452,
  0.9551,
  0.9434,
  0.9551,
  0.9417,
  0.9551,
  0.9578,
  0.9623],
 'train_loss_history': [0.3271535288719904,
  0.2627847296851022,
  0.22043548237000193,
  0.1896901235871372,
  0.15060285879742533,
  0.1360213768801519,
  0.14464515138949666,
  0.13471248827519872,
  0.12050398377080758,
  0.11418782712093421],
 'dev_loss_history': [tensor(0.3075, device='cuda:0'),
  tensor(0.2576, device='cuda:0'),
  tensor(0.1767, device='cuda:0'),
  tensor(0.1368, device='cuda:0'),
  tensor(0.1774, device='cuda:0'),
  tensor(0.1312, device='cuda:0'),
  tensor(0.1518, device='cuda:0'),
  tensor(0.1308, device='cuda:0'),
  tensor(0.1163, device='cuda:0'),
  tensor(0.1162, device='cuda:0')]}

In [60]:
from flair.models import TextClassifier

classifier = TextClassifier.load(spampath/'best-model.pt')

2020-05-21 20:50:10,683 loading file /notebooks/learning/spam_data/best-model.pt


In [63]:
sentence = Sentence('Want to earn some money!?!')

classifier.predict(sentence)

print(sentence.labels)

[0 (0.9950962662696838)]
