## Load Libraries and Ingest Data

In [68]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import phrasemachine
import nltk
from rake_nltk import Rake
import re
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import ngrams, FreqDist

In [61]:
text_df = pd.read_csv('https://raw.githubusercontent.com/erinmcmahon26/NLP-Chat-Bot/main/EMU_Movie_Reviews.csv')

In [51]:
text_df.head()

Unnamed: 0,FileName,Review
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...


## EDA

### Tokenization and Normalization

In [202]:
# create corpus
corpus = text_df.Review
print(len(corpus))

10


hmmmmm how do I keep from splitting up can't to can t and have it do cant instead...?

In [81]:
def remove_punctuation(in_text):
    words = in_text.split()
    text = re.sub('[^a-zA-Z]', ' ', str(words))
    return text

In [175]:
clean = text_df['Review'].apply(lambda x:remove_punctuation(x))

In [177]:
clean[0]

'  I    must    admit    that    when    I    sat    down    to    watch    the            addition    to     The    Conjuring     franchise     I    was    not    harboring    much    of    any    overly    great    expectations    or    hopes     because    since    the    first    movie    it    has    been    a    steady    downward    slope     Still     as    I    had    the    chance    to    sit    down    and    watch     The    Conjuring     The    Devil    Made    Me    Do    It     from    writers    David    Leslie    Johnson McGoldrick    and    James    Wan     So    of    course    I    did    it     And    I    have    to    say    that    director    Michael    Chaves    managed    to    deliver    a    movie    that    was    only    slightly    entertaining      The    Conjuring     The    Devil    Made    Me    Do    It     was    a    whole    lot    of    nothing    going    on     and    you    can    essentially    just    watch    the    beginning    and    th

In [82]:
def lower_case(in_text):
    text = in_text.lower()    
    return text

In [83]:
def remove_tags(in_text):    
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",in_text)
    return text

In [84]:
def remove_special_chars_and_digits(in_text):
    text = re.sub("(\\d|\\W)+", " ", in_text)
    return text

In [85]:
# combined function to clean reviews
# take out puntucation, lower case all words, remove special characters
def clean_text(document):
    remove_punc_text = remove_punctuation(document)
    lower_text =lower_case(remove_punc_text)
    remove_tag_text = remove_tags(lower_text)
    remove_special_chars_text = remove_special_chars_and_digits(remove_tag_text)
    return remove_special_chars_text

In [86]:
text_df['clean_text'] = text_df['Review'].apply(lambda x:clean_text(x))

In [58]:
text_df.head()

Unnamed: 0,FileName,Review,clean_text
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...,i must admit that when i sat down to watch th...
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...,while the conjuring franchise has stood as on...
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...,we re well into the world and the lore of the...
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...,james wan s feature the conjuring was somethi...
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...,two conjuring films and several spinoffs esta...


In [43]:
# confirm that the function did what it intended to do - yep!
text_df['clean_text'][0]

'i must admit that when i sat down to watch the addition to the conjuring franchise i was not harboring much of any overly great expectations or hopes because since the first movie it has been a steady downward slope still as i had the chance to sit down and watch the conjuring the devil made me do it from writers david leslie johnson mcgoldrick and james wan so of course i did it and i have to say that director michael chaves managed to deliver a movie that was only slightly entertaining the conjuring the devil made me do it was a whole lot of nothing going on and you can essentially just watch the beginning and the last minutes of the movie and skip on everything in between the storyline written for the conjuring the devil made me do it was bland and slow paced with very little of much excitement or interest happening in between the start and the end of the movie and that ultimately led to a less than mediocre movie experience for me and yeah i am a horror veteran so the conjuring th

In [93]:
from nltk.tokenize import TreebankWordTokenizer
def run_nltk_tokenizer(in_text):
    tokenizer = TreebankWordTokenizer()
    tokens=tokenizer.tokenize(in_text)
    return tokens

In [94]:
# removing stop words and tokenizing the clean text
def text_tokenize(document):
    stop_words = set(stopwords.words('english'))
    word_tokens = run_nltk_tokenizer(document)
    tokens = [w for w in word_tokens if not w in stop_words]
    tokens = []
    for w in word_tokens:
        if w not in stop_words:
            tokens.append(w)
    return tokens

In [95]:
text_df['text_tokens'] = text_df['clean_text'].apply(lambda x:text_tokenize(x))

In [46]:
text_df.head()

Unnamed: 0,FileName,Review,clean_text,text_tokens
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...,i must admit that when i sat down to watch the...,"[must, admit, sat, watch, addition, conjuring,..."
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...,while the conjuring franchise has stood as one...,"[conjuring, franchise, stood, one, successful,..."
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...,we re well into the world and the lore of the ...,"[well, world, lore, warrens, ed, lorraine, fic..."
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...,james wan s feature the conjuring was somethin...,"[james, wan, feature, conjuring, something, sp..."
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...,two conjuring films and several spinoffs estab...,"[two, conjuring, films, several, spinoffs, est..."


In [98]:
text_df['text_tokens'][0]

['must',
 'admit',
 'sat',
 'watch',
 'addition',
 'conjuring',
 'franchise',
 'harboring',
 'much',
 'overly',
 'great',
 'expectations',
 'hopes',
 'since',
 'first',
 'movie',
 'steady',
 'downward',
 'slope',
 'still',
 'chance',
 'sit',
 'watch',
 'conjuring',
 'devil',
 'made',
 'writers',
 'david',
 'leslie',
 'johnson',
 'mcgoldrick',
 'james',
 'wan',
 'course',
 'say',
 'director',
 'michael',
 'chaves',
 'managed',
 'deliver',
 'movie',
 'slightly',
 'entertaining',
 'conjuring',
 'devil',
 'made',
 'whole',
 'lot',
 'nothing',
 'going',
 'essentially',
 'watch',
 'beginning',
 'last',
 'minutes',
 'movie',
 'skip',
 'everything',
 'storyline',
 'written',
 'conjuring',
 'devil',
 'made',
 'bland',
 'slow',
 'paced',
 'little',
 'much',
 'excitement',
 'interest',
 'happening',
 'start',
 'end',
 'movie',
 'ultimately',
 'led',
 'less',
 'mediocre',
 'movie',
 'experience',
 'yeah',
 'horror',
 'veteran',
 'conjuring',
 'devil',
 'made',
 'walk',
 'park',
 'jump',
 'scare',


In [124]:
Counter(text_df['text_tokens'][0]).most_common(10)

[('movie', 12),
 ('conjuring', 11),
 ('devil', 10),
 ('made', 10),
 ('watch', 4),
 ('horror', 4),
 ('good', 4),
 ('franchise', 3),
 ('say', 3),
 ('experience', 3)]

### Most common words over all documents

In [131]:
# list of all tokens from each document
# not sure I need this
bag_of_words = Counter()
for row in text_df.iloc:
    bag_of_words += Counter(row.text_tokens)
    
len(bag_of_words)

1813

In [132]:
type(bag_of_words)

collections.Counter

In [133]:
Counter(bag_of_words).most_common(20)

[('conjuring', 69),
 ('film', 57),
 ('made', 53),
 ('devil', 52),
 ('horror', 39),
 ('movie', 34),
 ('warrens', 31),
 ('series', 29),
 ('chaves', 28),
 ('franchise', 27),
 ('wan', 27),
 ('ed', 27),
 ('lorraine', 27),
 ('story', 26),
 ('arne', 26),
 ('farmiga', 24),
 ('wilson', 23),
 ('films', 22),
 ('well', 21),
 ('david', 20)]

In [201]:
# creating a list of all vocabulary
total_vocab = [x for x in bag_of_words]
total_vocab

['must',
 'admit',
 'sat',
 'watch',
 'addition',
 'conjuring',
 'franchise',
 'harboring',
 'much',
 'overly',
 'great',
 'expectations',
 'hopes',
 'since',
 'first',
 'movie',
 'steady',
 'downward',
 'slope',
 'still',
 'chance',
 'sit',
 'devil',
 'made',
 'writers',
 'david',
 'leslie',
 'johnson',
 'mcgoldrick',
 'james',
 'wan',
 'course',
 'say',
 'director',
 'michael',
 'chaves',
 'managed',
 'deliver',
 'slightly',
 'entertaining',
 'whole',
 'lot',
 'nothing',
 'going',
 'essentially',
 'beginning',
 'last',
 'minutes',
 'skip',
 'everything',
 'storyline',
 'written',
 'bland',
 'slow',
 'paced',
 'little',
 'excitement',
 'interest',
 'happening',
 'start',
 'end',
 'ultimately',
 'led',
 'less',
 'mediocre',
 'experience',
 'yeah',
 'horror',
 'veteran',
 'walk',
 'park',
 'jump',
 'scare',
 'moments',
 'could',
 'seen',
 'coming',
 'mile',
 'away',
 'particularly',
 'dark',
 'brooding',
 'scary',
 'impressive',
 'foray',
 'genre',
 'special',
 'effects',
 'good',
 'def

In [194]:
# get tf of one word in all the documents
tf_conjuring = bag_of_words['conjuring']/len(bag_of_words)
tf_conjuring

0.038058466629895205

#### Vectorizing

Turning the top 20 most common words into vectors

In [198]:
vector = []
for key, value in bag_of_words.most_common(20):
    vector.append(value/len(bag_of_words))
vector

[0.038058466629895205,
 0.03143960286817429,
 0.02923331494760066,
 0.028681742967457253,
 0.02151130722559294,
 0.018753447324875896,
 0.01709873138444567,
 0.015995587424158852,
 0.015444015444015444,
 0.014892443463872035,
 0.014892443463872035,
 0.014892443463872035,
 0.014892443463872035,
 0.014340871483728626,
 0.014340871483728626,
 0.013237727523441808,
 0.0126861555432984,
 0.012134583563154992,
 0.011583011583011582,
 0.011031439602868174]

In [208]:
tf = text_df.text_tokens.apply(lambda x:pd.Series(x).value_counts()).fillna(0)

In [209]:
tf.sort_index(inplace=True, axis=1)
tf

Unnamed: 0,able,absent,absolutely,accepted,account,accurate,achieve,achieves,achieving,act,...,yeah,year,yes,yesteryear,yet,young,younger,zealous,zero,zips
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [210]:
idf = pd.Series([np.log((float(text_df.shape[0])+1)/(len([x for x in text_df.text_tokens.values if token in x])+1))+1 for token in tf.columns])
idf.index = tf.columns
print(idf)

able          2.704748
absent        2.299283
absolutely    2.299283
accepted      2.704748
account       2.704748
                ...   
young         2.011601
younger       2.704748
zealous       2.704748
zero          2.704748
zips          2.704748
Length: 1813, dtype: float64


### Comparing EDA Methods

#### Method 1

Remove punctuation, stop words, tags, and made lower case. No other text cleaning.