# 1. Objective

This notebook illustrates the helper functions used for tokenization, normalization, stemming, and lemmatization, applied to a sample movie review from the class corpus.

## Imports

In [1]:
import nltk
import re

import pandas as pd

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

from nltk.corpus import stopwords

## Data

In [2]:
# Corpus CSV file name; a bare name, so it is resolved against the current working directory.
data_file = 'DSP453_ClassCorpus.csv'

In [3]:
# Load the whole corpus into a DataFrame using pandas' default CSV parsing options.
class_corpus = pd.read_csv(data_file)

In [4]:
# Sanity check: (rows, columns) of the loaded corpus.
class_corpus.shape

(190, 6)

In [5]:
# Preview the first ten rows to see which columns are available.
class_corpus.head(10)

Unnamed: 0,FileName,StudentName,Genre,ReviewType,MovieTitle,MovieReview
0,KHS_Doc1_TheRing,KHS,Horror,Negative,The Ring,"Described as a drama, horror and mystery, The ..."
1,KHS_Doc2_TheRing,KHS,Horror,Negative,The Ring,"I hated it, but I grant that it does tap into..."
2,KHS_Doc3_TheRing,KHS,Horror,Negative,The Ring,Ehren Kruger (no relation to Freddy) showed a ...
3,KHS_Doc4_TheRing,KHS,Horror,Negative,The Ring,Not all arthouse thrillers are destined to be ...
4,KHS_Doc5_TheRing,KHS,Horror,Negative,The Ring,Rarely has a more serious effort produced a le...
5,KHS_Doc6_TheRing,KHS,Horror,Positive,The Ring,The opening segment ends on an unsettling note...
6,KHS_Doc7_TheRing,KHS,Horror,Positive,The Ring,A big-budget American remake of a Japanese sen...
7,KHS_Doc8_TheRing,KHS,Horror,Positive,The Ring,"Like most horror movies, ""The Ring"" is somethi..."
8,KHS_Doc9_TheRing,KHS,Horror,Positive,The Ring,Paul Schrader turns Auto Focus into an anti-se...
9,KHS_Doc10_TheRing,KHS,Horror,Positive,The Ring,"Gore Verbinskis The Ring , from a screenplay b..."


In [6]:
# Pick the review stored at row label 9 as the running example; a single
# .loc lookup selects row and column together instead of chaining two indexers.
sample_review_text = class_corpus.loc[9, 'MovieReview']

In [7]:
# Show only the first 100 characters of the raw review to keep the output short.
sample_review_text[0:100]

'Gore Verbinskis The Ring , from a screenplay by Ehren Kruger, is not some Wagnerian operatic film, a'

# 2. Normalization

In [8]:
# English stop-word list held as a set so membership tests in remove_stop_words() are cheap.
# Uses the `stopwords` corpus reader imported at the top of the notebook.
stop_words = set(stopwords.words('english'))

In [9]:
def remove_punctuation(text):
    """Replace every character that is not an ASCII letter with a space.

    Despite the name, this removes more than punctuation: digits, whitespace
    characters and non-ASCII letters (e.g. accented characters) are each
    replaced by one space as well.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string inputs such as ``None`` or numbers are accepted.

    Returns:
        A string of the same length as ``str(text)`` containing only
        ASCII letters and spaces.
    """
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', str(text))

def lower_case(text):
    """Return a lower-cased copy of ``text``.

    Args:
        text: Object exposing a ``lower()`` method (a ``str`` in this notebook).

    Returns:
        Whatever ``text.lower()`` returns.
    """
    lowered = text.lower()
    return lowered

def remove_tags(text):
    """Replace markup tags such as ``<br>`` or ``</p>`` with a `` <> `` placeholder.

    The pattern uses literal angle brackets. The HTML-escaped form
    (``&lt;`` / ``&gt;``) only matches text that is already entity-encoded and
    never matches a real tag.

    Call this before any step that strips ``<`` and ``>`` from the text,
    otherwise there is nothing left for the pattern to match. The `` <> ``
    placeholder consists only of non-word characters, so
    ``remove_special_chars_and_digits`` turns it into plain whitespace.

    Args:
        text: String that may contain markup tags.

    Returns:
        The string with every ``<...>`` span (matched lazily, within a
        single line) replaced by `` <> ``.
    """
    return re.sub("</?.*?>", " <> ", text)

def remove_special_chars_and_digits(text):
    """Collapse each run of digits and non-word characters into one space.

    Whitespace counts as a non-word character, so consecutive spaces are
    squeezed into a single space as a side effect. Underscores are word
    characters and are therefore kept.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    digit_or_nonword_run = re.compile("(\\d|\\W)+")
    return digit_or_nonword_run.sub(" ", text)

def remove_stop_words(tokenized_text):
    """Drop every token that appears in the module-level ``stop_words`` set.

    Matching is plain membership, so tokens must already be normalized the
    same way as the entries of ``stop_words`` for the filter to catch them.

    Args:
        tokenized_text: Iterable of tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [token for token in tokenized_text if token not in stop_words]


In [10]:
# Tag removal has to come first: remove_punctuation() replaces every
# non-letter with a space, which destroys the delimiters remove_tags()
# looks for, so running it afterwards could never match anything.
normalized_sample_text = remove_tags(sample_review_text)
normalized_sample_text = remove_punctuation(normalized_sample_text)
normalized_sample_text = lower_case(normalized_sample_text)
# Only letters and spaces remain at this point, so this step collapses
# runs of spaces into a single space.
normalized_sample_text = remove_special_chars_and_digits(normalized_sample_text)

In [11]:
# First 100 characters of the normalized review, for comparison with the raw preview.
normalized_sample_text[0:100]

'gore verbinskis the ring from a screenplay by ehren kruger is not some wagnerian operatic film as it'

In [12]:
# Split the normalized review into tokens with NLTK's word tokenizer.
normalized_sample_tokens = nltk.word_tokenize(normalized_sample_text)

In [13]:
# Inspect the first 15 tokens (stop words are still present at this stage).
normalized_sample_tokens[0:15]

['gore',
 'verbinskis',
 'the',
 'ring',
 'from',
 'a',
 'screenplay',
 'by',
 'ehren',
 'kruger',
 'is',
 'not',
 'some',
 'wagnerian',
 'operatic']

In [14]:
# Filter out stop words; note that this rebinds the same name, so the
# unfiltered token list is no longer available after this cell runs.
normalized_sample_tokens = remove_stop_words(normalized_sample_tokens)

# 3. Lemmatization 

In [15]:
# WordNet-based lemmatizer reused by the lemmatization cells below.
lemmatizer = WordNetLemmatizer()

In [16]:
# Print each of the first 15 filtered tokens next to its lemma.
# NOTE(review): lemmatize() is called without a part-of-speech argument;
# NLTK documents noun as the default - confirm this is the intended behaviour.
for token in normalized_sample_tokens[:15]:
    print(' | '.join((token, lemmatizer.lemmatize(token))))

gore | gore
verbinskis | verbinskis
ring | ring
screenplay | screenplay
ehren | ehren
kruger | kruger
wagnerian | wagnerian
operatic | operatic
film | film
title | title
might | might
suggest | suggest
rather | rather
remake | remake
box | box


# 4. Stemming

In [17]:
# Porter stemmer reused by the stemming cells below.
stemmer = PorterStemmer()

In [18]:
# Print each of the first 15 filtered tokens next to its Porter stem.
for token in normalized_sample_tokens[:15]:
    print(' | '.join((token, stemmer.stem(token))))

gore | gore
verbinskis | verbinski
ring | ring
screenplay | screenplay
ehren | ehren
kruger | kruger
wagnerian | wagnerian
operatic | operat
film | film
title | titl
might | might
suggest | suggest
rather | rather
remake | remak
box | box


**Question:** Should we apply both lemmatization and stemming? If so, in which order should they be applied?

In [19]:
# Lemmatize several inflections of "run" to see which ones are reduced.
# NOTE(review): no part-of-speech argument is passed; NLTK documents noun as
# the default, which may be why some verb forms come back unchanged - confirm.
for lemma in map(lemmatizer.lemmatize, ('run', 'runs', 'running', 'ran')):
    print(lemma)

run
run
running
ran


In [20]:
# Stem the same inflections of "run", plus the unrelated word "random",
# to compare the stemmer's output with the lemmatizer's.
for stem in map(stemmer.stem, ('run', 'runs', 'running', 'ran', 'random')):
    print(stem)

run
run
run
ran
random


# Reading
[On lemmatization and stemming](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html#:~:text=Lemmatization%20usually%20refers%20to%20doing,is%20known%20as%20the%20lemma%20.)