In [1]:
! pip install session_info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting session_info
  Downloading session_info-1.0.0.tar.gz (24 kB)
Collecting stdlib_list
  Downloading stdlib_list-0.8.0-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 938 kB/s 
[?25hBuilding wheels for collected packages: session-info
  Building wheel for session-info (setup.py) ... [?25l[?25hdone
  Created wheel for session-info: filename=session_info-1.0.0-py3-none-any.whl size=8048 sha256=cfbcc5f40fbe81469c7296158ac4d3dc73147b0fe76711375418f67e97ee88e2
  Stored in directory: /root/.cache/pip/wheels/bd/ad/14/6a42359351a18337a8683854cfbba99dd782271f2d1767f87f
Successfully built session-info
Installing collected packages: stdlib-list, session-info
Successfully installed session-info-1.0.0 stdlib-list-0.8.0


In [2]:
import nltk
import re
import session_info

import pandas as pd

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords

In [3]:
session_info.show()

# Data

In [4]:
data_file = 'DSP453_ClassCorpus_v1.csv'

In [5]:
class_corpus = pd.read_csv(data_file)

In [6]:
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Doc_ID                    100 non-null    int64 
 1   DSI_Title                 100 non-null    object
 2   Text                      100 non-null    object
 3   Submission File Name      100 non-null    object
 4   Student Name              100 non-null    object
 5   Genre of Movie            100 non-null    object
 6   Review Type (pos or neg)  100 non-null    object
 7   Movie Title               100 non-null    object
dtypes: int64(1), object(7)
memory usage: 6.4+ KB


# Data Wrangling

To conduct data wrangling with `nltk`, we first need to download the following additional NLTK data resources (corpora and tokenizer models).

In [7]:
# Fetch the NLTK data packages used in the rest of this notebook.
# 'stopwords' backs nltk.corpus.stopwords.words('english') (STOP_WORDS below).
nltk.download('stopwords')
# presumably the tokenizer model behind nltk.word_tokenize — TODO confirm
nltk.download('punkt')
# presumably the WordNet data read by WordNetLemmatizer — TODO confirm
nltk.download('wordnet')
# NOTE(review): looks like a companion to 'wordnet' (Open Multilingual Wordnet) — confirm it is still required
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
# Set of English stop words from the NLTK corpus; remove_stop_words tests
# each token for membership in it.
STOP_WORDS = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

## Step 1: Pre-processing

Below is a set of helper functions that help us format the text of the reviews. 

In [9]:
def remove_punctuation(text):
    '''
    Replace every character that is not an ASCII letter with a space.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Digits, whitespace and punctuation are each replaced
    one-for-one, so the result has the same length as ``str(text)``.
    '''
    as_string = str(text)
    non_letter = '[^a-zA-Z]'
    return re.sub(non_letter, ' ', as_string)

def lower_case(text):
    '''Return ``text`` converted to lower case via its ``lower()`` method.'''
    lowered = text.lower()
    return lowered

def remove_tags(text):
    '''
    Replace markup tags in ``text`` with a placeholder surrounded by spaces.

    Two forms are handled:
    1. Literal tags such as ``<br/>`` or ``</p>`` become `` <> ``. The tag
       must start with a letter after the optional slash, so comparisons
       like ``5 < 6`` are left alone.
    2. Entity-escaped tags such as ``&lt;b&gt;`` become `` &lt;&gt; ``,
       which is the only form the function recognised previously.
    '''
    # Literal angle-bracket tags were previously passed through untouched.
    text = re.sub(r"</?[A-Za-z][^<>]*>", " <> ", text)
    return re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

def remove_special_chars_and_digits(text):
    '''
    Collapse each run of digits and/or non-word characters into one space.

    Word characters other than digits (letters and the underscore) are kept.
    '''
    digits_or_nonword = re.compile("(\\d|\\W)+")
    return digits_or_nonword.sub(" ", text)

def remove_stop_words(tokenized_text):
    '''Return the tokens of ``tokenized_text`` that are not in ``STOP_WORDS``, in their original order.'''
    kept_tokens = []
    for token in tokenized_text:
        if token in STOP_WORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

Let us now look at the impact of applying these functions on our reviews text.

In [13]:
sample_review_text = class_corpus['Text'][9]

In [14]:
sample_review_text[0:100]

'"While not a perfect movie by any means, there were plenty of funny moments to make this enjoyable. '

In [15]:
# Strip tags first: once punctuation has been replaced by spaces the tag
# delimiters are gone and remove_tags has nothing left to match.
normalized_sample_text = remove_tags(sample_review_text)
normalized_sample_text = remove_punctuation(normalized_sample_text)
normalized_sample_text = lower_case(normalized_sample_text)
normalized_sample_text = remove_special_chars_and_digits(normalized_sample_text)

In [16]:
normalized_sample_text[0:100]

' while not a perfect movie by any means there were plenty of funny moments to make this enjoyable it'

Once the text is free of punctuation, tags and special characters we tokenize the text (i.e., split the text at white spaces). There is a built-in `word_tokenize` function from `nltk` that helps achieve this task.

In [17]:
normalized_sample_tokens = nltk.word_tokenize(normalized_sample_text)

In [18]:
normalized_sample_tokens[0:15]

['while',
 'not',
 'a',
 'perfect',
 'movie',
 'by',
 'any',
 'means',
 'there',
 'were',
 'plenty',
 'of',
 'funny',
 'moments',
 'to']

A crucial step after word tokenization is to remove stop words from the tokens generated. 

In [19]:
normalized_sample_tokens = remove_stop_words(normalized_sample_tokens)

In [20]:
normalized_sample_tokens[0:15]

['perfect',
 'movie',
 'means',
 'plenty',
 'funny',
 'moments',
 'make',
 'enjoyable',
 'suppose',
 'everyone',
 'needs',
 'relax',
 'enjoy',
 'life',
 'hilarious']

## Step 2: Lemmatization

Once we have the tokens free of stop words, we then consider lemmatization. This helps bring variations of the words to a common base (i.e., the lemma). 

In [21]:
lemmatizer = WordNetLemmatizer()

In [22]:
# Show each of the first 15 tokens next to its lemma.
for token in normalized_sample_tokens[0:15]:
    print(token, lemmatizer.lemmatize(token), sep=' | ')

perfect | perfect
movie | movie
means | mean
plenty | plenty
funny | funny
moments | moment
make | make
enjoyable | enjoyable
suppose | suppose
everyone | everyone
needs | need
relax | relax
enjoy | enjoy
life | life
hilarious | hilarious


For more detail, see the Stanford IR book's section on [lemmatization and stemming](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html#:~:text=Lemmatization%20usually%20refers%20to%20doing,is%20known%20as%20the%20lemma%20.).

In [23]:
def apply_lemmatization(tokenized_text):
    '''Return a list with the lemma of every token, using the notebook-level ``lemmatizer``.'''
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [24]:
apply_lemmatization(normalized_sample_tokens)[0:25]

['perfect',
 'movie',
 'mean',
 'plenty',
 'funny',
 'moment',
 'make',
 'enjoyable',
 'suppose',
 'everyone',
 'need',
 'relax',
 'enjoy',
 'life',
 'hilarious',
 'laughing',
 'entire',
 'time',
 'definitely',
 'tearing',
 'really',
 'needed',
 'great',
 'laugh',
 'movie']

# Important - Prevalent Terms

Exploratory data analysis on text involves getting a sense of tokens that you feel are important (i.e., they represent the *intent* of the corpus) and are prevalent (i.e., they are abundantly represented in the corpus).

This is a qualitative exercise. Judgement of what tokens are *important* rests with the analyst.

Let us apply all the wrangling steps detailed earlier in this notebook to bring the data to a form where we can arrive at important-prevalent terms.

Note that we are about to execute a specific set of data wrangling steps to generate tokens from input text. Changing one or more of the steps followed at this stage will alter the number and variety of the tokens generated.

In [25]:
def normalize(input_text):
    '''
    Normalization involves the following steps:
    1. Remove tags (i.e., HTML tags)
    2. Remove punctuation
    3. Lower case all words
    4. Remove all special characters and digits

    Tags are removed first because removing punctuation replaces the
    characters that delimit a tag with spaces, after which no tag can be
    recognised. The input is coerced with ``str()`` so non-string cells are
    handled the same way they were when remove_punctuation ran first.
    '''
    text = remove_tags(str(input_text))
    text = remove_punctuation(text)
    text = lower_case(text)
    text = remove_special_chars_and_digits(text)

    return text

In [26]:
def tokenize(text):
    '''
    Turn normalized text into a list of tokens:
    1. Split the text into tokens with ``nltk.word_tokenize``
    2. Drop the stop words from those tokens
    '''
    return remove_stop_words(nltk.word_tokenize(text))

In [27]:
def lemmatize(tokenized_text, lemmatizer=WordNetLemmatizer()):
    '''
    Return the lemma of each token in ``tokenized_text`` as a new list.

    The tokens are expected to be normalized with stop words already removed.
    The default ``lemmatizer`` is created once, when the function is defined.
    '''
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [28]:
class_corpus.head()

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title
0,101,HBW_Doc1_HolmesAndWatson,"""Holmes and Watson review: a lumbering Sherloc...",HBW_Doc1_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson
1,102,HBW_Doc2_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: No, Sh-t Sherlo...",HBW_Doc2_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson
2,103,HBW_Doc3_HolmesAndWatson,"""It is often said that Sherlock Holmes, the le...",HBW_Doc3_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson
3,104,HBW_Doc4_HolmesAndWatson,"""Holmes & Watson wasnï¿½t shown at all to the ...",HBW_Doc4_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson
4,105,HBW_Doc5_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: Will Ferrell an...",HBW_Doc5_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson


In [30]:
class_corpus['normalized_review'] = class_corpus['Text'].apply(normalize)

In [31]:
class_corpus.head()

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,normalized_review
0,101,HBW_Doc1_HolmesAndWatson,"""Holmes and Watson review: a lumbering Sherloc...",HBW_Doc1_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes and watson review a lumbering sherlock...
1,102,HBW_Doc2_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: No, Sh-t Sherlo...",HBW_Doc2_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review no sh t sherlock this wi...
2,103,HBW_Doc3_HolmesAndWatson,"""It is often said that Sherlock Holmes, the le...",HBW_Doc3_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,it is often said that sherlock holmes the leg...
3,104,HBW_Doc4_HolmesAndWatson,"""Holmes & Watson wasnï¿½t shown at all to the ...",HBW_Doc4_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson wasn t shown at all to the pres...
4,105,HBW_Doc5_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: Will Ferrell an...",HBW_Doc5_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review will ferrell and john c ...


In [32]:
class_corpus['tokenized_review'] = class_corpus['normalized_review'].apply(tokenize)

In [33]:
class_corpus.head()

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,normalized_review,tokenized_review
0,101,HBW_Doc1_HolmesAndWatson,"""Holmes and Watson review: a lumbering Sherloc...",HBW_Doc1_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes and watson review a lumbering sherlock...,"[holmes, watson, review, lumbering, sherlockia..."
1,102,HBW_Doc2_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: No, Sh-t Sherlo...",HBW_Doc2_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review no sh t sherlock this wi...,"[holmes, watson, review, sh, sherlock, ferrell..."
2,103,HBW_Doc3_HolmesAndWatson,"""It is often said that Sherlock Holmes, the le...",HBW_Doc3_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,it is often said that sherlock holmes the leg...,"[often, said, sherlock, holmes, legendary, det..."
3,104,HBW_Doc4_HolmesAndWatson,"""Holmes & Watson wasnï¿½t shown at all to the ...",HBW_Doc4_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson wasn t shown at all to the pres...,"[holmes, watson, shown, press, advance, releas..."
4,105,HBW_Doc5_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: Will Ferrell an...",HBW_Doc5_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review will ferrell and john c ...,"[holmes, watson, review, ferrell, john, c, rei..."


In [34]:
class_corpus['lemmatized_tokens'] = class_corpus['tokenized_review'].apply(lemmatize)

In [35]:
class_corpus.head()

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,normalized_review,tokenized_review,lemmatized_tokens
0,101,HBW_Doc1_HolmesAndWatson,"""Holmes and Watson review: a lumbering Sherloc...",HBW_Doc1_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes and watson review a lumbering sherlock...,"[holmes, watson, review, lumbering, sherlockia...","[holmes, watson, review, lumbering, sherlockia..."
1,102,HBW_Doc2_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: No, Sh-t Sherlo...",HBW_Doc2_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review no sh t sherlock this wi...,"[holmes, watson, review, sh, sherlock, ferrell...","[holmes, watson, review, sh, sherlock, ferrell..."
2,103,HBW_Doc3_HolmesAndWatson,"""It is often said that Sherlock Holmes, the le...",HBW_Doc3_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,it is often said that sherlock holmes the leg...,"[often, said, sherlock, holmes, legendary, det...","[often, said, sherlock, holmes, legendary, det..."
3,104,HBW_Doc4_HolmesAndWatson,"""Holmes & Watson wasnï¿½t shown at all to the ...",HBW_Doc4_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson wasn t shown at all to the pres...,"[holmes, watson, shown, press, advance, releas...","[holmes, watson, shown, press, advance, releas..."
4,105,HBW_Doc5_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: Will Ferrell an...",HBW_Doc5_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review will ferrell and john c ...,"[holmes, watson, review, ferrell, john, c, rei...","[holmes, watson, review, ferrell, john, c, rei..."


To count the occurrences of each token across the corpus we use the [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) from scikit-learn.

To be able to use the count vectorizer, each document in the corpus must be a single string rather than a list of tokens. Let us join the tokens in the `lemmatized_tokens` column of the class corpus back into one string per review.

In [36]:
def join_tokens(lemmatized_tokens):
    '''Concatenate the tokens into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(lemmatized_tokens)

In [37]:
class_corpus['lemmatized_text'] = class_corpus['lemmatized_tokens'].apply(join_tokens)

In [38]:
class_corpus.head()

Unnamed: 0,Doc_ID,DSI_Title,Text,Submission File Name,Student Name,Genre of Movie,Review Type (pos or neg),Movie Title,normalized_review,tokenized_review,lemmatized_tokens,lemmatized_text
0,101,HBW_Doc1_HolmesAndWatson,"""Holmes and Watson review: a lumbering Sherloc...",HBW_Doc1_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes and watson review a lumbering sherlock...,"[holmes, watson, review, lumbering, sherlockia...","[holmes, watson, review, lumbering, sherlockia...",holmes watson review lumbering sherlockian kno...
1,102,HBW_Doc2_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: No, Sh-t Sherlo...",HBW_Doc2_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review no sh t sherlock this wi...,"[holmes, watson, review, sh, sherlock, ferrell...","[holmes, watson, review, sh, sherlock, ferrell...",holmes watson review sh sherlock ferrell john ...
2,103,HBW_Doc3_HolmesAndWatson,"""It is often said that Sherlock Holmes, the le...",HBW_Doc3_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,it is often said that sherlock holmes the leg...,"[often, said, sherlock, holmes, legendary, det...","[often, said, sherlock, holmes, legendary, det...",often said sherlock holmes legendary detective...
3,104,HBW_Doc4_HolmesAndWatson,"""Holmes & Watson wasnï¿½t shown at all to the ...",HBW_Doc4_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson wasn t shown at all to the pres...,"[holmes, watson, shown, press, advance, releas...","[holmes, watson, shown, press, advance, releas...",holmes watson shown press advance release one ...
4,105,HBW_Doc5_HolmesAndWatson,"""ï¿½Holmes & Watsonï¿½ Review: Will Ferrell an...",HBW_Doc5_HolmesAndWatson,HBW,Comedy,Negative,Holmes and Watson,holmes watson review will ferrell and john c ...,"[holmes, watson, review, ferrell, john, c, rei...","[holmes, watson, review, ferrell, john, c, rei...",holmes watson review ferrell john c reilly fai...


In [39]:
count_vectorizer = CountVectorizer()

In [40]:
dtm_class_corpus = count_vectorizer.fit_transform(class_corpus['lemmatized_text'])

In [41]:
word_counts_class_corpus = pd.DataFrame(dtm_class_corpus.toarray(), 
                                        columns=count_vectorizer.get_feature_names_out(), 
                                        index=class_corpus.index)

In [42]:
word_counts_class_corpus.head()

Unnamed: 0,aaannyway,abandon,abandoned,abc,abduct,abducted,abductor,abel,aberration,ability,...,zimmer,zippier,zipping,zippo,zoey,zombie,zombified,zone,zoolander,zora
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
(word_counts_class_corpus.sum()
                         .sort_values(ascending=False)
                         .head(25))

movie        466
film         445
one          323
time         252
like         228
get          192
bond         181
even         158
scene        146
make         143
character    141
good         132
story        127
much         118
go           118
also         114
way          114
would        108
holmes       103
thing        100
take          99
year          97
action        96
first         95
really        92
dtype: int64

Based on these prevalent terms, we need to decide which of them are also important in the context of the movie reviews.

# Summary

When the input is text, data wrangling comes down to a two-stage process:
1. Pre-processing (or *Normalizing*) the text by removing punctuation, tags & special characters, lower casing the words and removing the stop words. The end result at this point is a list of tokens for each review.
2. *Lemmatization* applied on the normalized list of tokens to bring them down to a common lemma. 