# Classical NLP

## Import packages

In [120]:
import pandas as pd
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#nltk.download('omw-1.4')

## Load data

In [121]:
url = 'https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv'

In [122]:
data = pd.read_csv(url, encoding='latin1', usecols=['v1','v2'])
data.columns = ['label','raw_text']
data['text'] = data.raw_text

In [123]:
data.sample(10)

Unnamed: 0,label,raw_text,text
4119,ham,Babe! How goes that day ? What are you up to ?...,Babe! How goes that day ? What are you up to ?...
557,ham,I know that my friend already told that.,I know that my friend already told that.
233,ham,Yes:)here tv is always available in work place..,Yes:)here tv is always available in work place..
2736,ham,Really? I crashed out cuddled on my sofa.,Really? I crashed out cuddled on my sofa.
1873,spam,You have WON a guaranteed å£1000 cash or a å£2...,You have WON a guaranteed å£1000 cash or a å£2...
123,ham,I am going to sao mu today. Will be done only ...,I am going to sao mu today. Will be done only ...
5198,ham,She.s fine. I have had difficulties with her p...,She.s fine. I have had difficulties with her p...
3756,spam,YES! The only place in town to meet exciting a...,YES! The only place in town to meet exciting a...
4734,ham,Oh k:)after that placement there ah?,Oh k:)after that placement there ah?
482,ham,Watching tv lor...,Watching tv lor...


## Pre-processing steps

1. Remove punctuation
2. Remove capitalization
3. Remove stopwords
4. Tokenization
5. Stemming / Lemmatizing
6. Expressing text as numbers. (Computer can only work with numbers, you know...)

## Remove punctuation

In [124]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [125]:
data['text'] = data.text.apply(remove_punctuation)
data.head()

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


## Remove capitalization
Let's set everything to lower case.
(Quersion: Why not upper case?)

In [126]:
data['text']= data['text'].apply(lambda x: x.lower())
data.head()

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


## Remove numbers

In [127]:
def remove_number(string):
    return(re.sub(r'[0-9]*','',string))

In [128]:
data['text'] = data.text.apply(lambda x: remove_number(x))
data.head()

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


## Tokenization
* What is a token?
* Does token = word?
* grams, bigrams, trigrams and n-grams

In [129]:
def tokenization(text):
    tokens = re.split(' ',text)
    return tokens

In [130]:
data['text'] = data.text.apply(tokenization)

In [131]:
data.head()

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, , a, wkly, comp, to, win, fa..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."


## Remove stopwords
What is a stopword?

In [132]:
#Stopwords
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [133]:
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [134]:
data['text'] = data.text.apply(remove_stopwords)
data.head()

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, , wkly, comp, win, fa, cup, fina..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


## Stemming / Lemmatizing
* What's the difference?
* Which one do you think is better?

Let's use lemmatizing:

In [135]:
#defining the function for lemmatization
def lemmatizer(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [136]:
for example in ['dogs','children','abaci']:
    print(example, '->' ,wordnet_lemmatizer.lemmatize(example))

dogs -> dog
children -> child
abaci -> abacus


In [137]:
data['text'] = data['text'].apply(lambda x:lemmatizer(x))
data.head()

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, , wkly, comp, win, fa, cup, fina..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"


Note: In the last row, _goes_ has changed to _go_

## Expressing text as numbers

* Document-Term Matrix (DTM)
* Embeddings

Let's save embeddings for later. For now, let's dive into the DTM

There are two kinds of DTM:

* those based on _term frequency_, a.k.a. _Bag of Words_ (BOW)
* those based on _TF-IDF_

_Scikit-Learn_ has functions to build both kinds of DTMs, but it requires text (not a list of tokens). So let's join our tokens:

In [138]:
data['text'] = data.text.apply(lambda x: ' '.join(x))
data.head()

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


### DTMs based on term frequency (Bag of Words)

In [139]:
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(data.text)

In [140]:
dtm.shape

(5572, 7918)

In [141]:
print(dtm.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


The DTM s __sparse__.

In [142]:
def visualize(DTM):
    
    td = pd.DataFrame(DTM.todense()).iloc[:5]  
    td.columns = vectorizer.get_feature_names_out()
    term_document_matrix = td.T
    term_document_matrix.columns = ['Doc '+str(i) for i in range(1, 6)]
    term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)
    
    # Top 25 words 
    term_document_matrix = term_document_matrix.sort_values(by ='total_count',ascending=False)[:20] 
    
    # Print the first 10 rows 
    print(term_document_matrix.drop(columns=['total_count']).head(20))
    
    pass

In [143]:
visualize(dtm)

             Doc 1  Doc 2  Doc 3  Doc 4  Doc 5
go               1      0      0      0      1
fa               0      0      2      0      0
say              0      0      0      2      0
entry            0      0      2      0      0
questionstd      0      0      1      0      0
think            0      0      0      0      1
wif              0      1      0      0      0
ok               0      1      0      0      0
apply            0      0      1      0      0
jurong           1      0      0      0      0
over             0      0      1      0      0
around           0      0      0      0      1
point            1      0      0      0      0
comp             0      0      1      0      0
dun              0      0      0      1      0
wkly             0      0      1      0      0
may              0      0      1      0      0
txt              0      0      1      0      0
life             0      0      0      0      1
lar              0      1      0      0      0


_Comments:_
* What does each row/column of the DTM represent?
* Why are there so many zeros?
    * Sparse matrix
    * Implications for storage in memory and computation
* Notice any problems with this DTM? Can we do any better?

### DTMs base on TF-IDFs

$$
TF-IDF = \frac{\text{Frequency of term } i \text{ in a document}}{\text{Number of words in that document}} \times \log_2\left(\frac{\text{Number of documents in corpus}}{\text{Number of documents that contain term } i}\right)
$$

* A measure of how much a term $i$ is important for a document $D$
* Building blocks:
    * Term Frequency
    * Document Frequency
    * Inverse Document Frequency
* What is the rationale behind TF-IDF?

In [144]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_DTM = tfidf_vectorizer.fit_transform(data.text)

In [145]:
tfidf_DTM.shape

(5572, 7918)

In [146]:
print(tfidf_DTM.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [147]:
visualize(tfidf_DTM)


            Doc 1     Doc 2     Doc 3     Doc 4     Doc 5
say      0.000000  0.000000  0.000000  0.601553  0.000000
oni      0.000000  0.546256  0.000000  0.000000  0.000000
hor      0.000000  0.000000  0.000000  0.523599  0.000000
joking   0.000000  0.523327  0.000000  0.000000  0.000000
fa       0.000000  0.000000  0.499030  0.000000  0.000000
nah      0.000000  0.000000  0.000000  0.000000  0.450358
usf      0.000000  0.000000  0.000000  0.000000  0.444936
wif      0.000000  0.431339  0.000000  0.000000  0.000000
lar      0.000000  0.408051  0.000000  0.000000  0.000000
go       0.155085  0.000000  0.000000  0.000000  0.246698
though   0.000000  0.000000  0.000000  0.000000  0.394408
entry    0.000000  0.000000  0.382427  0.000000  0.000000
early    0.000000  0.000000  0.000000  0.378247  0.000000
jurong   0.349890  0.000000  0.000000  0.000000  0.000000
amore    0.349890  0.000000  0.000000  0.000000  0.000000
dun      0.000000  0.000000  0.000000  0.348435  0.000000
around   0.000

## All-in-One

In [148]:
def preprocess(data):

    data['text'] = data.raw_text
    data['text'] = data.text.apply(remove_punctuation)
    data['text']= data['text'].apply(lambda x: x.lower())
    data['text'] = data.text.apply(lambda x: remove_number(x))
    data['text'] = data.text.apply(tokenization)
    data['text'] = data.text.apply(remove_stopwords)
    data['text'] = data['text'].apply(lambda x:lemmatizer(x))
    data['text'] = data['text'].apply(lambda x: ' '.join(x))
    
    return(data)

In [149]:
preprocess(data)

Unnamed: 0,label,raw_text,text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u å£ pound prize cla...
5568,ham,Will Ì_ b going to esplanade fr home?,ì b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestion
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...
