### Loading Libraries and Ingesting Data

In [13]:
#!pip install --upgrade pip

In [12]:
#!pip install phrasemachine
#!pip install nltk
#!pip install rake_nltk

In [81]:
# NLP Packages

import pandas as pd
import numpy as np
from collections import Counter
import phrasemachine
import nltk
from rake_nltk import Rake
import re
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import ngrams, FreqDist

In [15]:
# only need to run once
#nltk.download('stopwords')
#nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erinmcmahon/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/erinmcmahon/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [46]:
#nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erinmcmahon/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Adding Pre-processing functions

In [24]:
# Create a list of stop words from nltk
stop_words = set(stopwords.words("english"))

In [16]:
# Pre-process dataset to remove punctuation
def remove_punctuation(in_text):
    text = re.sub('[^a-zA-Z]', ' ', str(in_text))
    return text

In [17]:
# Pre-process dataset to lower case it
def lower_case(in_text):
    text = in_text.lower()    
    return text

In [18]:
# Pre-process dataset to remove tags
def remove_tags(in_text):    
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",in_text)
    return text

In [19]:
# Pre-process dataset to remove special characters and digits
def remove_special_chars_and_digits(in_text):
    text = re.sub("(\\d|\\W)+", " ", in_text)
    return text

In [20]:
# Pre-process dataset to appy Stemming
def apply_stemming(in_text):
    stemmer=PorterStemmer()
    word_list = nltk.word_tokenize(in_text)
    output = ' '.join([stemmer.stem(w) for w in word_list])
    return output

In [21]:
# Pre-process dataset to apply Lemmatization
def apply_lemmatization(in_text):
    # Lemmatization
    lem = WordNetLemmatizer()
    word_list = nltk.word_tokenize(in_text)
    output = ' '.join([lem.lemmatize(w) for w in word_list])
    return output

In [22]:
# Remove stop words
def remove_stop_words(in_text):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(in_text)  
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)
    return filtered_sentence

In [23]:
# Run Phase Machine
def run_phrase_machine(in_text):
    phrases=phrasemachine.get_phrases(in_text)
    return phrases

In [25]:
#Run Rake Keyword Extractor
def run_rake(in_text):
    r = Rake()
    r.extract_keywords_from_text(in_text)
    rake_phrases= r.get_ranked_phrases()
    return rake_phrases

Rapid Automatic Keyword Extraction algorithm that drives to determine key phrases in a body of text by analyzing word frequency and co-occurance with other words in the text.

In [26]:
# Run NLTK Tokenizer
def run_nltk_tokenizer(in_text):
    tokens=nltk.word_tokenize(in_text)
    return tokens

In [27]:
# Run NLTK Sentence Tokenizer
def run_nltk_sent_tokenizer(in_corpus):
    sents = nltk.sent_tokenize(in_corpus)
    return sents

In [28]:
#Run word-ngram Tokenizer
def run_nltk_tokenizer_word_ngrams(in_text, ngram_size):
    n_grams = ngrams(nltk.word_tokenize(in_text), ngram_size)
    return [ ' '.join(grams) for grams in n_grams]

In [29]:
#Get Frequ Dist 
def get_freq_dist(terms):
    all_counts = dict()
    all_counts = FreqDist(terms)
    return all_counts

In [32]:
text_df = pd.read_csv('https://raw.githubusercontent.com/erinmcmahon26/NLP-Chat-Bot/main/EMU_Movie_Reviews.csv')

In [33]:
text_df.shape

(10, 2)

In [34]:
text_df.info

<bound method DataFrame.info of                   FileName                                             Review
0   EMU_Doc1_TheConjuring3  I must admit that when I sat down to watch the...
1   EMU_Doc2_TheConjuring3  While The Conjuring franchise has stood as one...
2   EMU_Doc3_TheConjuring3  We’re well into the world and the lore of the ...
3   EMU_Doc4_TheConjuring3  James Wan's 2013 feature The Conjuring was som...
4   EMU_Doc5_TheConjuring3  Two Conjuring films and several spinoffs estab...
5   EMU_Doc6_TheConjuring3  Right from the first movie, James Wan had bigg...
6   EMU_Doc7_TheConjuring3  Money is no issue for The Conjuring films. The...
7   EMU_Doc8_TheConjuring3  When a film trots out the phrase “based on a t...
8   EMU_Doc9_TheConjuring3  The so-called "Conjuring universe" is so succe...
9  EMU_Doc10_TheConjuring3  I remember seeing James Wan’s The Conjuring fo...>

In [35]:
text_df.head()

Unnamed: 0,FileName,Review
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...


### EDA 

In [49]:
# Define a consolidated clean data function + tokenization

def clean_text_tokenize(document):
    remove_punc_text = remove_punctuation(document)
    lower_text =lower_case(remove_punc_text)
    remove_tag_text = remove_tags(lower_text)
    remove_special_chars_text = remove_special_chars_and_digits(remove_tag_text)
    stop_words = set(stopwords.words('english'))
    word_tokens = run_nltk_tokenizer(remove_special_chars_text)
    tokens = [w for w in word_tokens if not w in stop_words]
    tokens = []
    for w in word_tokens:
        if w not in stop_words:
            tokens.append(w)
    return tokens

Below we are applying all preprocessing/data cleaning functions to the movie reviews and separating each review into its own document:

In [67]:
text_df['tokens'] = text_df['Review'].apply(lambda x: clean_text_tokenize(x))

In [176]:
text_df.head()

Unnamed: 0,FileName,Review,tokens
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...,"[must, admit, sat, watch, addition, conjuring,..."
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...,"[conjuring, franchise, stood, one, successful,..."
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...,"[well, world, lore, warrens, ed, lorraine, fic..."
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...,"[james, wan, feature, conjuring, something, sp..."
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...,"[two, conjuring, films, several, spinoffs, est..."


In [73]:
text_df['tokens'].head()

0    [must, admit, sat, watch, addition, conjuring,...
1    [conjuring, franchise, stood, one, successful,...
2    [well, world, lore, warrens, ed, lorraine, fic...
3    [james, wan, feature, conjuring, something, sp...
4    [two, conjuring, films, several, spinoffs, est...
Name: tokens, dtype: object

In [143]:
text_df.iloc[1]

FileName                               EMU_Doc2_TheConjuring3
Review      While The Conjuring franchise has stood as one...
tokens      [conjuring, franchise, stood, one, successful,...
Name: 1, dtype: object

In [74]:
doc1 = text_df['tokens'].values[0]
doc2 = text_df['tokens'].values[1]
doc3 = text_df['tokens'].values[2]
doc4 = text_df['tokens'].values[3]
doc5 = text_df['tokens'].values[4]
doc6 = text_df['tokens'].values[5]
doc7 = text_df['tokens'].values[6]
doc8 = text_df['tokens'].values[7]
doc9 = text_df['tokens'].values[8]
doc10 = text_df['tokens'].values[9]

In [76]:
print(len(doc1))
print(type(doc1))
print(len(set(doc1)))

228
<class 'list'>
159


In [82]:
Counter(doc1).most_common(10)

[('movie', 12),
 ('conjuring', 11),
 ('devil', 10),
 ('made', 10),
 ('watch', 4),
 ('horror', 4),
 ('good', 4),
 ('franchise', 3),
 ('say', 3),
 ('experience', 3)]

In [83]:
all_docs = doc1+doc2+doc3+doc4+doc5+doc6+doc7+doc8+doc9+doc10

In [85]:
Counter(all_docs).most_common(10)

[('conjuring', 69),
 ('film', 57),
 ('made', 53),
 ('devil', 52),
 ('horror', 39),
 ('movie', 34),
 ('warrens', 31),
 ('series', 29),
 ('chaves', 28),
 ('franchise', 27)]

In [88]:
document1 = text_df.Review.values[0]
document2 = text_df.Review.values[1]
document3 = text_df.Review.values[2]
document4 = text_df.Review.values[3]
document5 = text_df.Review.values[4]
document6 = text_df.Review.values[5]
document7 = text_df.Review.values[6]
document8 = text_df.Review.values[7]
document9 = text_df.Review.values[8]
document10 = text_df.Review.values[9]

In [90]:
#Run this first to get sentences from text.
sentences_1=run_nltk_sent_tokenizer(document1)
sentences_2=run_nltk_sent_tokenizer(document2)
sentences_3=run_nltk_sent_tokenizer(document3)
sentences_4=run_nltk_sent_tokenizer(document4)
sentences_5=run_nltk_sent_tokenizer(document5)
sentences_6=run_nltk_sent_tokenizer(document6)
sentences_7=run_nltk_sent_tokenizer(document7)
sentences_8=run_nltk_sent_tokenizer(document8)
sentences_9=run_nltk_sent_tokenizer(document9)
sentences_10=run_nltk_sent_tokenizer(document10)

In [106]:
terms_1 = []
for sentence in sentences_1:
    terms_1 = terms_1 + run_rake(sentence)

fd_1 = get_freq_dist(terms_1)
fd_1

FreqDist({'conjuring': 11, 'devil made': 10, 'movie': 8, 'watch': 4, 'franchise': 3, 'say': 3, 'movies': 3, 'sit': 2, 'course': 2, 'less': 2, ...})

In [94]:
terms_2 = []
for sentence in sentences_2:
    terms_2 = terms_2 + run_rake(sentence)

fd_2 = get_freq_dist(terms_2)
fd_2

FreqDist({'film': 10, 'devil made': 6, 'conjuring': 6, 'franchise': 4, 'one': 3, 'focus': 3, 'sense': 3, 'feature': 3, 'conjuring franchise': 2, 'enough': 2, ...})

In [95]:
terms_3 = []
for sentence in sentences_3:
    terms_3 = terms_3 + run_rake(sentence)

fd_3 = get_freq_dist(terms_3)
fd_3

FreqDist({'’': 9, 'love': 3, 'devil made': 3, 'warren ’': 3, 'wan': 3, 'franchise': 3, 'well': 2, 'real': 2, 'arne johnson': 2, 'conjuring': 2, ...})

In [96]:
terms_4 = []
for sentence in sentences_4:
    terms_4 = terms_4 + run_rake(sentence)

fd_4 = get_freq_dist(terms_4)
fd_4

FreqDist({'well': 5, 'conjuring': 4, 'warrens': 4, 'ed': 3, 'devil made': 3, 'lorraine': 3, 'bring': 3, 'see': 3, 'one': 3, 'real': 2, ...})

In [97]:
terms_5 = []
for sentence in sentences_5:
    terms_5 = terms_5 + run_rake(sentence)

fd_5 = get_freq_dist(terms_5)
fd_5

FreqDist({'’': 4, 'devil made': 3, 'series': 3, 'lorraine': 3, 'wan': 3, 'ed': 2, 'much': 2, 'two conjuring films': 1, 'several spinoffs established': 1, 'substantial following': 1, ...})

In [98]:
terms_6 = []
for sentence in sentences_6:
    terms_6 = terms_6 + run_rake(sentence)

fd_6 = get_freq_dist(terms_6)
fd_6

FreqDist({'series': 5, 'devil made': 5, 'conjuring': 5, 'lorraine warren': 3, 'real': 3, 'warrens': 3, 'david glatzel': 3, 'johnson': 3, 'movie': 3, 'right': 2, ...})

In [99]:
terms_7 = []
for sentence in sentences_7:
    terms_7 = terms_7 + run_rake(sentence)

fd_7 = get_freq_dist(terms_7)
fd_7

FreqDist({'’': 11, 'devil made': 7, 'arne ’': 3, 'issue': 2, 'series': 2, 'one': 2, 'chaves': 2, 'innocence': 2, 'buy': 2, 'conjuring films': 1, ...})

In [100]:
terms_8 = []
for sentence in sentences_8:
    terms_8 = terms_8 + run_rake(sentence)

fd_8 = get_freq_dist(terms_8)
fd_8

FreqDist({'’': 20, 'well': 6, '“': 4, 'film': 4, 'devil made': 3, 'conjuring': 3, 'though': 3, 'warrens': 3, 'work': 3, 'couple': 2, ...})

In [101]:
terms_9 = []
for sentence in sentences_9:
    terms_9 = terms_9 + run_rake(sentence)

fd_9 = get_freq_dist(terms_9)
fd_9

FreqDist({'’': 8, 'series': 2, 'see': 2, 'lorraine ’': 2, 'ed': 2, 'role': 2, 'even': 2, 'conjuring universe': 1, 'successful': 1, 'scary': 1, ...})

In [102]:
terms_10 = []
for sentence in sentences_10:
    terms_10 = terms_10 + run_rake(sentence)

fd_10 = get_freq_dist(terms_10)
fd_10

FreqDist({'’': 24, 'devil made': 8, 'jump scares': 5, 'film': 5, 'conjuring': 4, 'audience': 4, 'story': 4, 'way': 3, 'la llorona': 3, 'ever': 3, ...})

In [117]:
fd_1_list = list(fd_1.most_common(10))
fd_2_list = list(fd_2.most_common(10))

In [120]:
fd_1_df = pd.DataFrame(fd_1_list, columns = ['FDDoc1', 'FDDoc1_Counts'])
fd_1_df

Unnamed: 0,FDDoc1,FDDoc1_Counts
0,conjuring,11
1,devil made,10
2,movie,8
3,watch,4
4,franchise,3
5,say,3
6,movies,3
7,sit,2
8,course,2
9,less,2


In [121]:
fd_2_df = pd.DataFrame(fd_2_list, columns = ['FDDoc2', 'FDDoc2_Counts'])
fd_2_df

Unnamed: 0,FDDoc2,FDDoc2_Counts
0,film,10
1,devil made,6
2,conjuring,6
3,franchise,4
4,one,3
5,focus,3
6,sense,3
7,feature,3
8,conjuring franchise,2
9,enough,2


In [122]:
#Run this first to get sentences from text.
sentences=run_nltk_sent_tokenizer(document1)

In [124]:
#nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/erinmcmahon/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [125]:
#Explore different extractors and difference preprocessing techniques
for sentence in sentences:
    print(sentence)
    print("===================NLTK Tokenizer===================")
    print(run_nltk_tokenizer(sentence))
    print("===================NLTK Word NGRAM Tokenizer 2 words===================")
    print(run_nltk_tokenizer_word_ngrams(sentence,2))
    print("===================NLTK Word NGRAM Tokenizer 3 words===================")
    print(run_nltk_tokenizer_word_ngrams(sentence,3))
    print("===================Phrase Machine===================")
    #phrases=run_phrase_machine(sentence)
    #for term in phrases["counts"].keys():
    #    print(term)
    print("===================Rake===================")
    print(run_rake(sentence))
    print("===================NLTK Tokenizer===================")
    print(run_nltk_tokenizer((sentence)))
    print("===================NLTK Tokenizer LOWER CASE===================")
    print(run_nltk_tokenizer(lower_case(sentence)))
    print("===================NLTK Tokenizer REMOVE STOP WORDS===================")
    print(remove_stop_words(sentence))   
    print("===================NLTK Tokenizer REMOVED PUNCTUATION===================")
    print(run_nltk_tokenizer(remove_punctuation(sentence)))
    print("===================NLTK Tokenizer REMOVED TAGS===================")
    print(run_nltk_tokenizer(remove_tags(sentence)))
    print("===================NLTK Tokenizer REMOVED CHARS AND DIGITS===================")
    print(run_nltk_tokenizer(remove_special_chars_and_digits(sentence)))
    print("===================NLTK Tokenizer STEMMING APPLIED===================")
    print(run_nltk_tokenizer(apply_stemming(sentence)))
    print("===================NLTK Tokenizer LEMMATIZATION APPLIED===================")
    print(run_nltk_tokenizer(apply_lemmatization(sentence)))
    #break

I must admit that when I sat down to watch the 2021 addition to "The Conjuring" franchise, I was not harboring much of any overly great expectations or hopes, because since the first movie it has been a steady downward slope.
['I', 'must', 'admit', 'that', 'when', 'I', 'sat', 'down', 'to', 'watch', 'the', '2021', 'addition', 'to', '``', 'The', 'Conjuring', "''", 'franchise', ',', 'I', 'was', 'not', 'harboring', 'much', 'of', 'any', 'overly', 'great', 'expectations', 'or', 'hopes', ',', 'because', 'since', 'the', 'first', 'movie', 'it', 'has', 'been', 'a', 'steady', 'downward', 'slope', '.']
['I must', 'must admit', 'admit that', 'that when', 'when I', 'I sat', 'sat down', 'down to', 'to watch', 'watch the', 'the 2021', '2021 addition', 'addition to', 'to ``', '`` The', 'The Conjuring', "Conjuring ''", "'' franchise", 'franchise ,', ', I', 'I was', 'was not', 'not harboring', 'harboring much', 'much of', 'of any', 'any overly', 'overly great', 'great expectations', 'expectations or', 'o

EDA with a dictionary created using a for loop instead of the above code:

In [174]:
# for loop version

reviews = {}
for row in text_df.iloc:
    key = row.FileName
    value = run_nltk_sent_tokenizer(row.Review)
    reviews[key] = value
reviews

{'EMU_Doc1_TheConjuring3': ['I must admit that when I sat down to watch the 2021 addition to "The Conjuring" franchise, I was not harboring much of any overly great expectations or hopes, because since the first movie it has been a steady downward slope.',
  'Still, as I had the chance to sit down and watch "The Conjuring: The Devil Made Me Do It" from writers David Leslie Johnson-McGoldrick and James Wan.',
  'So of course I did it.',
  'And I have to say that director Michael Chaves managed to deliver a movie that was only slightly entertaining.',
  '"The Conjuring: The Devil Made Me Do It" was a whole lot of nothing going on, and you can essentially just watch the beginning and the last 25 minutes of the movie and skip on everything in between.',
  'The storyline written for "The Conjuring: The Devil Made Me Do It" was bland and slow paced, with very little of much excitement or interest happening in between the start and the end of the movie.',
  'And that ultimately led to a less 

In [175]:
for key in reviews:
    print("this is the file name: " + key)
    for sentence in reviews[key]:
        print(sentence)
        print("===================NLTK Tokenizer===================")
        print(run_nltk_tokenizer(sentence))       
        break
    break
    
for sentence in reviews['EMU_Doc1_TheConjuring3']:
    print(sentence)
    print("===================NLTK Tokenizer===================")
    print(run_nltk_tokenizer(sentence)) 
    break

this is the file name: EMU_Doc1_TheConjuring3
I must admit that when I sat down to watch the 2021 addition to "The Conjuring" franchise, I was not harboring much of any overly great expectations or hopes, because since the first movie it has been a steady downward slope.
['I', 'must', 'admit', 'that', 'when', 'I', 'sat', 'down', 'to', 'watch', 'the', '2021', 'addition', 'to', '``', 'The', 'Conjuring', "''", 'franchise', ',', 'I', 'was', 'not', 'harboring', 'much', 'of', 'any', 'overly', 'great', 'expectations', 'or', 'hopes', ',', 'because', 'since', 'the', 'first', 'movie', 'it', 'has', 'been', 'a', 'steady', 'downward', 'slope', '.']
I must admit that when I sat down to watch the 2021 addition to "The Conjuring" franchise, I was not harboring much of any overly great expectations or hopes, because since the first movie it has been a steady downward slope.
['I', 'must', 'admit', 'that', 'when', 'I', 'sat', 'down', 'to', 'watch', 'the', '2021', 'addition', 'to', '``', 'The', 'Conjuring