In [1]:
import nltk
import re

import pandas as pd
import numpy as np

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords

In [2]:
def add_movie_descriptor(data: pd.DataFrame, corpus_df: pd.DataFrame):
    """
    Adds a "Descriptor" column to `data` in place, built from the columns of `corpus_df`
    in the form {Genre}_{Movie Title}_{P|N}_{DocID}.

    The sentiment flag is 'P' when 'Review Type (pos or neg)' equals 'Positive' and 'N'
    for any other value. Nothing is returned.
    """
    is_positive = corpus_df['Review Type (pos or neg)'] == 'Positive'
    sentiment_flag = np.where(is_positive, 'P', 'N')
    genre_and_title = corpus_df['Genre of Movie'] + '_' + corpus_df['Movie Title']
    doc_id_text = corpus_df['Doc_ID'].astype(str)
    data['Descriptor'] = genre_and_title + '_' + sentiment_flag + '_' + doc_id_text

def get_corpus_df(path):
    """
    Reads the corpus CSV at `path` (UTF-8), adds the "Descriptor" column, sorts the rows by
    descriptor and returns the frame indexed by Doc_ID, with Doc_ID also kept as the last column.
    """
    corpus = pd.read_csv(path, encoding="utf-8")
    add_movie_descriptor(corpus, corpus)
    corpus = corpus.sort_values(['Descriptor']).set_index(['Doc_ID'])
    # set_index moved Doc_ID out of the columns; add it back so it stays addressable by name
    corpus['Doc_ID'] = corpus.index
    return corpus

# Data

In [3]:
# Class corpus CSV, read straight from raw.githubusercontent.com
CORPUS_PATH = (
    'https://raw.githubusercontent.com/djp840/MSDS_453_Public/main/MSDS453_ClassCorpus/MSDS453_QA_20220906.csv'
)
class_corpus = get_corpus_df(CORPUS_PATH)

In [4]:
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 40 to 199
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   DSI_Title                 200 non-null    object
 1   Text                      200 non-null    object
 2   Submission File Name      200 non-null    object
 3   Student Name              200 non-null    object
 4   Genre of Movie            200 non-null    object
 5   Review Type (pos or neg)  200 non-null    object
 6   Movie Title               200 non-null    object
 7   Descriptor                200 non-null    object
 8   Doc_ID                    200 non-null    int64 
dtypes: int64(1), object(8)
memory usage: 15.6+ KB


# Data Wrangling

To conduct data wrangling with `nltk`, we first need to download the following NLTK data packages.

In [5]:
# NLTK data used below: the stop-word list, the Punkt tokenizer models, WordNet and omw-1.4.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt models for word_tokenize from 'punkt_tab' instead of 'punkt';
# fetching both keeps Restart & Run All working on old and new installations.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jensen116/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Kept as a set so the membership tests in remove_stop_words are hash lookups
STOP_WORDS = set(stopwords.words('english'))

## Step 1: Pre-processing

Below is a set of helper functions that help us format the text of the reviews. 

In [7]:
def remove_punctuation(text):
    """
    Replaces every character other than the ASCII letters a-z / A-Z with a space.

    The input is passed through str() first, so non-string values are accepted.
    Despite the name, digits and non-ASCII letters are replaced as well.
    """
    as_string = str(text)
    return re.sub('[^a-zA-Z]', ' ', as_string)

def lower_case(text):
    """Returns `text` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_tags(text):
    """
    Replaces each entity-escaped tag (the shortest same-line run from "&lt;" to "&gt;")
    with the placeholder " &lt;&gt; ".
    """
    # NOTE(review): the delimiters are the HTML entities, not literal angle brackets, so markup
    # written with literal angle brackets is left untouched -- confirm this is intended.
    escaped_tag = "&lt;/?.*?&gt;"
    placeholder = " &lt;&gt; "
    return re.sub(escaped_tag, placeholder, text)

def remove_special_chars_and_digits(text):
    """
    Collapses every run of digits and/or non-word characters (whitespace included) into a
    single space. Underscores count as word characters and are kept.
    """
    digits_or_non_word = "(\\d|\\W)+"
    return re.sub(digits_or_non_word, " ", text)

def remove_stop_words(tokenized_text):
    """Returns a new list holding, in order, the tokens that are not in STOP_WORDS."""
    kept_tokens = []
    for token in tokenized_text:
        if token in STOP_WORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

Let us now look at the impact of applying these functions on our reviews text.

In [8]:
# Doc_ID is the index, so 9 is looked up as a label (the review with Doc_ID 9), not as a position
sample_review_text = class_corpus.loc[9, 'Text']

In [9]:
sample_review_text[0:100]

'Liam Neeson was up for\x97but never got to play\x97James Bond, and now he\x92s getting even. In the violent, '

In [10]:
# Apply the four normalization helpers one after another, feeding each result into the next step
normalized_sample_text = remove_punctuation(sample_review_text)
normalized_sample_text = lower_case(normalized_sample_text)
normalized_sample_text = remove_tags(normalized_sample_text)
normalized_sample_text = remove_special_chars_and_digits(normalized_sample_text)

In [11]:
normalized_sample_text[0:100]

'liam neeson was up for but never got to play james bond and now he s getting even in the violent chu'

Once the text is free of punctuation, tags and special characters we tokenize the text (i.e., split the text at white spaces). There is a built-in `word_tokenize` function from `nltk` that helps achieve this task.

In [12]:
normalized_sample_tokens = nltk.word_tokenize(normalized_sample_text)

In [13]:
normalized_sample_tokens[0:15]

['liam',
 'neeson',
 'was',
 'up',
 'for',
 'but',
 'never',
 'got',
 'to',
 'play',
 'james',
 'bond',
 'and',
 'now',
 'he']

A crucial step after word tokenization is to remove stop words from the tokens generated. 

In [14]:
normalized_sample_tokens = remove_stop_words(normalized_sample_tokens)

In [15]:
normalized_sample_tokens[0:15]

['liam',
 'neeson',
 'never',
 'got',
 'play',
 'james',
 'bond',
 'getting',
 'even',
 'violent',
 'churning',
 'laughably',
 'derivative',
 'action',
 'bruiser']

## Step 2: Lemmatization

Once we have the tokens free of stop words, we then consider lemmatization. This helps bring variations of the words to a common base (i.e., the lemma). 

In [16]:
lemmatizer = WordNetLemmatizer()

In [17]:
# Print each of the first 15 tokens next to its lemma to see what lemmatization changes
for word in normalized_sample_tokens[:15]:
    lemmatized_word = lemmatizer.lemmatize(word)
    print(f'{word} | {lemmatized_word}')

liam | liam
neeson | neeson
never | never
got | got
play | play
james | james
bond | bond
getting | getting
even | even
violent | violent
churning | churning
laughably | laughably
derivative | derivative
action | action
bruiser | bruiser


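For more detail, see [lemmatization and stemming](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html#:~:text=Lemmatization%20usually%20refers%20to%20doing,is%20known%20as%20the%20lemma%20.). Note that `WordNetLemmatizer.lemmatize` treats every word as a noun unless a part-of-speech tag is passed, which is why verb forms such as "got" and "getting" come back unchanged in the sample above.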

In [18]:
def apply_lemmatization(tokenized_text):
    """Returns the lemma of every token, in order, using the notebook-level `lemmatizer`."""
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [19]:
apply_lemmatization(normalized_sample_tokens)[0:25]

['liam',
 'neeson',
 'never',
 'got',
 'play',
 'james',
 'bond',
 'getting',
 'even',
 'violent',
 'churning',
 'laughably',
 'derivative',
 'action',
 'bruiser',
 'taken',
 'suave',
 'power',
 'knuckled',
 'lethal',
 'secret',
 'agent',
 'named',
 'bryan',
 'mill']

# Important - Prevalent Terms

Exploratory data analysis on text involves getting a sense of tokens that you feel are important (i.e., they represent the *intent* of the corpus) and are prevalent (i.e., they are abundantly represented in the corpus).

This is a qualitative exercise. Judgement of what tokens are *important* rests with the analyst.

Let us apply all the wrangling steps detailed earlier in this notebook to bring the data to a form where we can arrive at important-prevalent terms.

Note that we are about to execute a specific set of data wrangling steps to generate tokens from input text. Changing one or more of the steps followed at this stage will alter the number and variety of the tokens generated.

In [20]:
def normalize(input_text):
    '''
    Runs the raw review text through the normalization helpers, in this order:
    1. remove_punctuation
    2. lower_case
    3. remove_tags
    4. remove_special_chars_and_digits
    and returns the resulting string.
    '''
    # NOTE(review): step 1 has already replaced every non-letter with a space, so by the time
    # step 3 runs there is no '&' or ';' left for the tag pattern to match -- confirm whether
    # remove_tags was meant to run first.
    steps = (remove_punctuation, lower_case, remove_tags, remove_special_chars_and_digits)
    text = input_text
    for step in steps:
        text = step(text)
    return text

In [21]:
def tokenize(text):
    '''
    Splits `text` into tokens with nltk.word_tokenize and returns the tokens that remain
    after remove_stop_words has filtered them.
    '''
    return remove_stop_words(nltk.word_tokenize(text))

In [22]:
def lemmatize(tokenized_text, lemmatizer=WordNetLemmatizer()):
    '''
    Returns a list with the lemma of each token in `tokenized_text`, in order.

    `lemmatizer` may be any object exposing lemmatize(word). The default WordNetLemmatizer is
    created once, when this function is defined, and reused on every call.
    '''
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [23]:
class_corpus.head()

Unnamed: 0_level_0,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,Descriptor,Doc_ID
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
40,KCM_Doc1_AngelHasFallen,Boredom sets in long before the start of Angel...,KCM_Doc1_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_40,40
41,KCM_Doc2_AngelHasFallen,\nWho ARE all these people?\n\nThat was what...,KCM_Doc2_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_41,41
42,KCM_Doc3_AngelHasFallen,Ric Roman Waughs Angel Has Fallen sees U.S. S...,KCM_Doc3_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_42,42
43,KCM_Doc4_AngelHasFallen,There is a certain mindless pleasure in the Fa...,KCM_Doc4_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_43,43
44,KCM_Doc5_AngelHasFallen,"No, you dont need to have seen Olympus Has F...",KCM_Doc5_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_44,44


In [24]:
class_corpus['normalized_review'] = class_corpus['Text'].apply(normalize)

In [25]:
class_corpus.head()

Unnamed: 0_level_0,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,Descriptor,Doc_ID,normalized_review
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
40,KCM_Doc1_AngelHasFallen,Boredom sets in long before the start of Angel...,KCM_Doc1_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_40,40,boredom sets in long before the start of angel...
41,KCM_Doc2_AngelHasFallen,\nWho ARE all these people?\n\nThat was what...,KCM_Doc2_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_41,41,who are all these people that was what i mutt...
42,KCM_Doc3_AngelHasFallen,Ric Roman Waughs Angel Has Fallen sees U.S. S...,KCM_Doc3_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_42,42,ric roman waugh s angel has fallen sees u s se...
43,KCM_Doc4_AngelHasFallen,There is a certain mindless pleasure in the Fa...,KCM_Doc4_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_43,43,there is a certain mindless pleasure in the fa...
44,KCM_Doc5_AngelHasFallen,"No, you dont need to have seen Olympus Has F...",KCM_Doc5_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_44,44,no you don t need to have seen olympus has fal...


In [26]:
class_corpus['tokenized_review'] = class_corpus['normalized_review'].apply(tokenize)

In [27]:
class_corpus.head()

Unnamed: 0_level_0,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,Descriptor,Doc_ID,normalized_review,tokenized_review
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
40,KCM_Doc1_AngelHasFallen,Boredom sets in long before the start of Angel...,KCM_Doc1_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_40,40,boredom sets in long before the start of angel...,"[boredom, sets, long, start, angel, fallen, st..."
41,KCM_Doc2_AngelHasFallen,\nWho ARE all these people?\n\nThat was what...,KCM_Doc2_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_41,41,who are all these people that was what i mutt...,"[people, muttered, entered, first, screening, ..."
42,KCM_Doc3_AngelHasFallen,Ric Roman Waughs Angel Has Fallen sees U.S. S...,KCM_Doc3_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_42,42,ric roman waugh s angel has fallen sees u s se...,"[ric, roman, waugh, angel, fallen, sees, u, se..."
43,KCM_Doc4_AngelHasFallen,There is a certain mindless pleasure in the Fa...,KCM_Doc4_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_43,43,there is a certain mindless pleasure in the fa...,"[certain, mindless, pleasure, fallen, movies, ..."
44,KCM_Doc5_AngelHasFallen,"No, you dont need to have seen Olympus Has F...",KCM_Doc5_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_44,44,no you don t need to have seen olympus has fal...,"[need, seen, olympus, fallen, london, fallen, ..."


In [28]:
class_corpus['lemmatized_tokens'] = class_corpus['tokenized_review'].apply(lemmatize)

In [29]:
class_corpus.head()

Unnamed: 0_level_0,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,Descriptor,Doc_ID,normalized_review,tokenized_review,lemmatized_tokens
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
40,KCM_Doc1_AngelHasFallen,Boredom sets in long before the start of Angel...,KCM_Doc1_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_40,40,boredom sets in long before the start of angel...,"[boredom, sets, long, start, angel, fallen, st...","[boredom, set, long, start, angel, fallen, sta..."
41,KCM_Doc2_AngelHasFallen,\nWho ARE all these people?\n\nThat was what...,KCM_Doc2_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_41,41,who are all these people that was what i mutt...,"[people, muttered, entered, first, screening, ...","[people, muttered, entered, first, screening, ..."
42,KCM_Doc3_AngelHasFallen,Ric Roman Waughs Angel Has Fallen sees U.S. S...,KCM_Doc3_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_42,42,ric roman waugh s angel has fallen sees u s se...,"[ric, roman, waugh, angel, fallen, sees, u, se...","[ric, roman, waugh, angel, fallen, see, u, sec..."
43,KCM_Doc4_AngelHasFallen,There is a certain mindless pleasure in the Fa...,KCM_Doc4_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_43,43,there is a certain mindless pleasure in the fa...,"[certain, mindless, pleasure, fallen, movies, ...","[certain, mindless, pleasure, fallen, movie, w..."
44,KCM_Doc5_AngelHasFallen,"No, you dont need to have seen Olympus Has F...",KCM_Doc5_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_44,44,no you don t need to have seen olympus has fal...,"[need, seen, olympus, fallen, london, fallen, ...","[need, seen, olympus, fallen, london, fallen, ..."


To count the occurrences of each token across the corpus we use the [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) from scikit-learn.

To be able to use the count vectorizer, we need the corpus to be a list of strings, one per review. Let us join the lemmatized tokens of each review in the class corpus back into a single string.

In [30]:
def join_tokens(lemmatized_tokens):
    """Joins the tokens into one string, separated by single spaces."""
    separator = ' '
    return separator.join(lemmatized_tokens)

In [31]:
class_corpus['lemmatized_text'] = class_corpus['lemmatized_tokens'].apply(join_tokens)

In [32]:
class_corpus.head()

Unnamed: 0_level_0,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,Descriptor,Doc_ID,normalized_review,tokenized_review,lemmatized_tokens,lemmatized_text
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
40,KCM_Doc1_AngelHasFallen,Boredom sets in long before the start of Angel...,KCM_Doc1_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_40,40,boredom sets in long before the start of angel...,"[boredom, sets, long, start, angel, fallen, st...","[boredom, set, long, start, angel, fallen, sta...",boredom set long start angel fallen start jour...
41,KCM_Doc2_AngelHasFallen,\nWho ARE all these people?\n\nThat was what...,KCM_Doc2_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_41,41,who are all these people that was what i mutt...,"[people, muttered, entered, first, screening, ...","[people, muttered, entered, first, screening, ...",people muttered entered first screening angel ...
42,KCM_Doc3_AngelHasFallen,Ric Roman Waughs Angel Has Fallen sees U.S. S...,KCM_Doc3_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_42,42,ric roman waugh s angel has fallen sees u s se...,"[ric, roman, waugh, angel, fallen, sees, u, se...","[ric, roman, waugh, angel, fallen, see, u, sec...",ric roman waugh angel fallen see u secret serv...
43,KCM_Doc4_AngelHasFallen,There is a certain mindless pleasure in the Fa...,KCM_Doc4_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_43,43,there is a certain mindless pleasure in the fa...,"[certain, mindless, pleasure, fallen, movies, ...","[certain, mindless, pleasure, fallen, movie, w...",certain mindless pleasure fallen movie watchin...
44,KCM_Doc5_AngelHasFallen,"No, you dont need to have seen Olympus Has F...",KCM_Doc5_AngelHasFallen,KCM,Action,Negative,Angel Has Fallen,Action_Angel Has Fallen_N_44,44,no you don t need to have seen olympus has fal...,"[need, seen, olympus, fallen, london, fallen, ...","[need, seen, olympus, fallen, london, fallen, ...",need seen olympus fallen london fallen underst...


In [33]:
count_vectorizer = CountVectorizer()

In [34]:
dtm_class_corpus = count_vectorizer.fit_transform(class_corpus['lemmatized_text'])

In [35]:
# Turn the document-term matrix into a dense frame: one row per Doc_ID, one column per token.
# toarray() materialises every zero count -- presumably acceptable for a 200-review corpus.
word_counts_class_corpus = pd.DataFrame(dtm_class_corpus.toarray(), 
                                        columns=count_vectorizer.get_feature_names_out(), 
                                        index=class_corpus.index)

In [36]:
word_counts_class_corpus.head()

Unnamed: 0_level_0,aaron,ab,abandon,abandoned,abandoning,abandonment,abducted,abduction,abductor,abducts,...,zip,zippier,zipping,zippy,zoey,zombie,zone,zonk,zoom,zora
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
(word_counts_class_corpus.sum()
                         .sort_values(ascending=False)
                         .head(25))

film         1005
movie         804
one           614
like          495
time          427
get           355
make          353
character     327
even          309
scene         289
much          264
way           256
action        256
would         250
also          232
thing         231
first         225
year          220
come          220
take          218
well          213
story         213
good          210
see           205
go            205
dtype: int64

Based on these prevalent terms, we will need to decide which of them are also important in the context of the movie reviews.

# Summary

Data wrangling in the context of text as input comes down to a two-stage process:
1. Pre-processing (or *normalizing*) the text by removing punctuation, tags and special characters, lower-casing the words, and removing the stop words. The end result at this point is a list of tokens for each review.
2. *Lemmatization* applied to the normalized list of tokens to bring them down to a common lemma. 