## Load Libraries and Ingest Data

In [68]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import phrasemachine
import nltk
from rake_nltk import Rake
import re
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import ngrams, FreqDist

In [380]:
text_df = pd.read_csv('https://raw.githubusercontent.com/erinmcmahon26/NLP-Chat-Bot/main/EMU_Movie_Reviews.csv')

In [381]:
text_df.head()

Unnamed: 0,FileName,Review
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...


## EDA

### Tokenization and Normalization

In [382]:
# create corpus
corpus = text_df.Review
print(len(corpus))

10


hmmmmm how do I keep from splitting up can't to can t and have it do cant instead...?

In [383]:
def remove_punctuation(in_text):
    text = re.sub('[^a-zA-Z]', ' ', str(in_text))
    return text

In [334]:
def lower_case(in_text):
    text = in_text.lower()    
    return text

In [335]:
def remove_tags(in_text):    
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",in_text)
    return text

In [336]:
def remove_special_chars_and_digits(in_text):
    text = re.sub("(\\d|\\W)+", " ", in_text)
    return text

In [337]:
# combined function to clean reviews
# take out puntucation, lower case all words, remove special characters
def clean_text(document):
    remove_punc_text = remove_punctuation(document)
    lower_text =lower_case(remove_punc_text)
    remove_tag_text = remove_tags(lower_text)
    remove_special_chars_text = remove_special_chars_and_digits(remove_tag_text)
    return remove_special_chars_text

In [338]:
text_df['clean_text'] = text_df['Review'].apply(lambda x:clean_text(x))

In [339]:
text_df.head()

Unnamed: 0,FileName,Review,clean_text
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...,i must admit that when i sat down to watch th...
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...,while the conjuring franchise has stood as on...
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...,we re well into the world and the lore of the...
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...,james wan s feature the conjuring was somethi...
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...,two conjuring films and several spinoffs esta...


In [340]:
# confirm that the function did what it intended to do - yep!
text_df['clean_text'][0]

' i must admit that when i sat down to watch the addition to the conjuring franchise i was not harboring much of any overly great expectations or hopes because since the first movie it has been a steady downward slope still as i had the chance to sit down and watch the conjuring the devil made me do it from writers david leslie johnson mcgoldrick and james wan so of course i did it and i have to say that director michael chaves managed to deliver a movie that was only slightly entertaining the conjuring the devil made me do it was a whole lot of nothing going on and you can essentially just watch the beginning and the last minutes of the movie and skip on everything in between the storyline written for the conjuring the devil made me do it was bland and slow paced with very little of much excitement or interest happening in between the start and the end of the movie and that ultimately led to a less than mediocre movie experience for me and yeah i am a horror veteran so the conjuring t

In [341]:
from nltk.tokenize import TreebankWordTokenizer
def run_nltk_tokenizer(in_text):
    tokenizer = TreebankWordTokenizer()
    tokens=tokenizer.tokenize(in_text)
    return tokens

In [342]:
# Pre-process dataset to apply Lemmatization
def apply_lemmatization(in_text):
    # Lemmatization
    lem = WordNetLemmatizer()
    word_list = nltk.word_tokenize(in_text)
    output = ' '.join([lem.lemmatize(w) for w in word_list])
    return output

In [343]:
# removing stop words and tokenizing the clean text
def text_tokenize(document):
    stop_words = set(stopwords.words('english'))
    word_tokens = run_nltk_tokenizer(apply_lemmatization(document))
    tokens = [w for w in word_tokens if not w in stop_words]
    tokens = []
    for w in word_tokens:
        if w not in stop_words:
            tokens.append(w)
    return tokens

In [344]:
text_df['text_tokens'] = text_df['clean_text'].apply(lambda x:text_tokenize(x))

In [345]:
text_df.head()

Unnamed: 0,FileName,Review,clean_text,text_tokens
0,EMU_Doc1_TheConjuring3,I must admit that when I sat down to watch the...,i must admit that when i sat down to watch th...,"[must, admit, sat, watch, addition, conjuring,..."
1,EMU_Doc2_TheConjuring3,While The Conjuring franchise has stood as one...,while the conjuring franchise has stood as on...,"[conjuring, franchise, ha, stood, one, success..."
2,EMU_Doc3_TheConjuring3,We’re well into the world and the lore of the ...,we re well into the world and the lore of the...,"[well, world, lore, warren, ed, lorraine, fict..."
3,EMU_Doc4_TheConjuring3,James Wan's 2013 feature The Conjuring was som...,james wan s feature the conjuring was somethi...,"[james, wan, feature, conjuring, wa, something..."
4,EMU_Doc5_TheConjuring3,Two Conjuring films and several spinoffs estab...,two conjuring films and several spinoffs esta...,"[two, conjuring, film, several, spinoffs, esta..."


In [346]:
text_df['text_tokens'][0]

['must',
 'admit',
 'sat',
 'watch',
 'addition',
 'conjuring',
 'franchise',
 'wa',
 'harboring',
 'much',
 'overly',
 'great',
 'expectation',
 'hope',
 'since',
 'first',
 'movie',
 'ha',
 'steady',
 'downward',
 'slope',
 'still',
 'chance',
 'sit',
 'watch',
 'conjuring',
 'devil',
 'made',
 'writer',
 'david',
 'leslie',
 'johnson',
 'mcgoldrick',
 'james',
 'wan',
 'course',
 'say',
 'director',
 'michael',
 'chaves',
 'managed',
 'deliver',
 'movie',
 'wa',
 'slightly',
 'entertaining',
 'conjuring',
 'devil',
 'made',
 'wa',
 'whole',
 'lot',
 'nothing',
 'going',
 'essentially',
 'watch',
 'beginning',
 'last',
 'minute',
 'movie',
 'skip',
 'everything',
 'storyline',
 'written',
 'conjuring',
 'devil',
 'made',
 'wa',
 'bland',
 'slow',
 'paced',
 'little',
 'much',
 'excitement',
 'interest',
 'happening',
 'start',
 'end',
 'movie',
 'ultimately',
 'led',
 'le',
 'mediocre',
 'movie',
 'experience',
 'yeah',
 'horror',
 'veteran',
 'conjuring',
 'devil',
 'made',
 'wa',
 

In [347]:
Counter(text_df['text_tokens'][0]).most_common(10)

[('movie', 15),
 ('wa', 12),
 ('conjuring', 11),
 ('devil', 10),
 ('made', 10),
 ('watch', 4),
 ('horror', 4),
 ('good', 4),
 ('franchise', 3),
 ('say', 3)]

### Most common words over all documents

In [348]:
# list of all tokens from each document
# not sure I need this
bag_of_words = Counter()
for row in text_df.iloc:
    bag_of_words += Counter(row.text_tokens)
    
len(bag_of_words)

1705

In [349]:
type(bag_of_words)

collections.Counter

In [350]:
Counter(bag_of_words).most_common(20)

[('film', 79),
 ('conjuring', 69),
 ('made', 53),
 ('devil', 52),
 ('warren', 49),
 ('movie', 43),
 ('horror', 40),
 ('wa', 35),
 ('scare', 29),
 ('series', 29),
 ('chaves', 28),
 ('franchise', 27),
 ('ha', 27),
 ('wan', 27),
 ('story', 27),
 ('ed', 27),
 ('lorraine', 27),
 ('arne', 26),
 ('farmiga', 24),
 ('wilson', 23)]

In [351]:
# creating a list of all vocabulary
total_vocab = [x for x in bag_of_words]
total_vocab

['must',
 'admit',
 'sat',
 'watch',
 'addition',
 'conjuring',
 'franchise',
 'wa',
 'harboring',
 'much',
 'overly',
 'great',
 'expectation',
 'hope',
 'since',
 'first',
 'movie',
 'ha',
 'steady',
 'downward',
 'slope',
 'still',
 'chance',
 'sit',
 'devil',
 'made',
 'writer',
 'david',
 'leslie',
 'johnson',
 'mcgoldrick',
 'james',
 'wan',
 'course',
 'say',
 'director',
 'michael',
 'chaves',
 'managed',
 'deliver',
 'slightly',
 'entertaining',
 'whole',
 'lot',
 'nothing',
 'going',
 'essentially',
 'beginning',
 'last',
 'minute',
 'skip',
 'everything',
 'storyline',
 'written',
 'bland',
 'slow',
 'paced',
 'little',
 'excitement',
 'interest',
 'happening',
 'start',
 'end',
 'ultimately',
 'led',
 'le',
 'mediocre',
 'experience',
 'yeah',
 'horror',
 'veteran',
 'walk',
 'park',
 'jump',
 'scare',
 'moment',
 'could',
 'seen',
 'coming',
 'mile',
 'away',
 'particularly',
 'dark',
 'brooding',
 'scary',
 'impressive',
 'foray',
 'genre',
 'special',
 'effect',
 'good',

In [284]:
# get tf of one word in all the documents
tf_conjuring = bag_of_words['conjuring']/len(bag_of_words)
tf_conjuring

0.04046920821114369

#### Vectorizing

Turning the top 20 most common words into vectors

In [285]:
vector = []
for key, value in bag_of_words.most_common(20):
    vector.append(value/len(bag_of_words))
vector

[0.04633431085043988,
 0.04046920821114369,
 0.031085043988269796,
 0.030498533724340176,
 0.02873900293255132,
 0.025219941348973606,
 0.02346041055718475,
 0.020527859237536656,
 0.017008797653958945,
 0.017008797653958945,
 0.016422287390029325,
 0.015835777126099706,
 0.015835777126099706,
 0.015835777126099706,
 0.015835777126099706,
 0.015835777126099706,
 0.015835777126099706,
 0.015249266862170088,
 0.01407624633431085,
 0.013489736070381233]

### Get TF-IDF of words in all documents

Using sklearn

In [286]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [287]:
# using the clean tokens created from above
tf_idf_model = TfidfVectorizer()
tf_idf_model.fit(total_vocab)

TfidfVectorizer()

In [288]:
tf_idf_model.vocabulary_

{'must': 999,
 'admit': 21,
 'sat': 1293,
 'watch': 1651,
 'addition': 19,
 'conjuring': 280,
 'franchise': 630,
 'wa': 1635,
 'harboring': 710,
 'much': 992,
 'overly': 1078,
 'great': 684,
 'expectation': 539,
 'hope': 748,
 'since': 1356,
 'first': 598,
 'movie': 990,
 'ha': 699,
 'steady': 1424,
 'downward': 440,
 'slope': 1373,
 'still': 1429,
 'chance': 215,
 'sit': 1361,
 'devil': 402,
 'made': 925,
 'writer': 1688,
 'david': 363,
 'leslie': 885,
 'johnson': 831,
 'mcgoldrick': 949,
 'james': 824,
 'wan': 1639,
 'course': 315,
 'say': 1299,
 'director': 417,
 'michael': 966,
 'chaves': 224,
 'managed': 934,
 'deliver': 383,
 'slightly': 1371,
 'entertaining': 496,
 'whole': 1665,
 'lot': 916,
 'nothing': 1028,
 'going': 672,
 'essentially': 503,
 'beginning': 123,
 'last': 863,
 'minute': 974,
 'skip': 1366,
 'everything': 514,
 'storyline': 1433,
 'written': 1689,
 'bland': 146,
 'slow': 1374,
 'paced': 1082,
 'little': 902,
 'excitement': 525,
 'interest': 805,
 'happening': 7

In [398]:
vectorizer.idf_

array([7.81014245, 7.81014245, 7.81014245, ..., 7.81014245, 7.81014245,
       7.81014245])

In [289]:
tf_idf_vector = tf_idf_model.fit_transform(total_vocab)

In [290]:
print(tf_idf_vector.shape)
print(type(tf_idf_vector))

(1705, 1702)
<class 'scipy.sparse.csr.csr_matrix'>


In [292]:
tf_idf_array = tf_idf_vector.toarray()
print(tf_idf_array)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [293]:
words_set = tf_idf_model.get_feature_names_out()
print(words_set)

['able' 'absent' 'absolutely' ... 'zealous' 'zero' 'zip']


In [294]:
df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)
df_tf_idf

Unnamed: 0,able,absent,absolutely,accepted,account,accurate,achieve,achieves,achieving,act,...,yeah,year,yes,yesteryear,yet,young,younger,zealous,zero,zip
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [405]:
# well the .mean isn't helpful but I can't get it to work without a function there
weights = np.asarray(tf_idf_vector.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tf_idf_model.get_feature_names_out(), 'weight':weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
0,able,0.000587
1143,point,0.000587
1141,plucked,0.000587
1140,plot,0.000587
1139,plenty,0.000587
1138,pleasing,0.000587
1137,please,0.000587
1136,plead,0.000587
1135,playing,0.000587
1134,player,0.000587


In [403]:
vector.shape

(1, 1810)

In [402]:
vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])