# Movie Reviews

In [1]:
import pandas as pd
import re
from sklearn.pipeline import Pipeline
import string
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from  sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


data = pd.read_csv("reviews.csv")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [2]:
# locating the default characters considered as punctuations.

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [3]:
import string
import re
# Define a function to remove punctuation in our messages
def no_punctuation(reviews):
    reviews = "".join([char for char in reviews if char not in string.punctuation])
    return reviews
def removelines(value):
    return ''.join(value.splitlines())
def lower_text(clean_text):
    # converting clean text to lowercase
    clean_text = clean_text.lower()
    return clean_text
def remove_num(clean_text):
    # remove numbers
    clean_textnonum = re.sub(r'\d+', '', clean_text)
    return clean_textnonum
data['reviews'] = data['reviews'].apply(lambda x: removelines(x))
data['reviews'] = data['reviews'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]"," ",x.lower().strip()))
data['reviews'] = data['reviews'].apply(lambda x: no_punctuation(x))
data['reviews']=data['reviews'].apply(lambda x: lower_text(x))
data['reviews']=data['reviews'].apply(lambda x: remove_num(x))
data.head()

Unnamed: 0,target,reviews
0,neg,plot two teen couples go to a church party ...
1,neg,the happy bastard s quick movie review damn th...
2,neg,it is movies like these that make a jaded movi...
3,neg,quest for camelot is warner bros first...
4,neg,synopsis a mentally unstable man undergoing ...


In [4]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

In [5]:
stop_words.extend([ 'first', 'second', 'third', 'me', 'haha', 'lol', 'oof', 'cds'])#we added to our list of stopwords

In [6]:
# Lemmatize
import nltk
from nltk.tokenize import word_tokenize
def lemmatizing_text(clean_text):
    #words= nltk.word_tokenize(x)
    clean_text = clean_text.apply(lambda x: ' ' .join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(x) if word not in stop_words]))
    return clean_text

data['reviews'] = lemmatizing_text(data['reviews'])

data.head()

    #return clean_tokens  

Unnamed: 0,target,reviews
0,neg,plot two teen couple go church party drink dri...
1,neg,happy bastard quick movie review damn yk bug g...
2,neg,movie like make jaded movie viewer thankful in...
3,neg,quest camelot warner bros feature length fully...
4,neg,synopsis mentally unstable man undergoing psyc...


In [7]:
# Save dataframe into csv file
data.to_csv(r'data.csv', index = False)

## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Word representation of the texts.

In [8]:
# Create CountVectorizer object
vectorizer = CountVectorizer()# (1,1) means we only search ngrams.

In [9]:
# Generate matrix of word vectors
bow = vectorizer.fit_transform(data['reviews'])

In [10]:
# Convert bow_matrix into a DataFrame
bow_df = pd.DataFrame(bow.toarray())

In [11]:
# Map the column names to vocabulary
bow_df.columns = vectorizer.get_feature_names()

In [12]:
# Print bow_df
bow_df

Unnamed: 0,aa,aaa,aaaaaaaaah,aaaaaaaahhhh,aaaaaah,aaaahhhs,aahs,aaliyah,aalyah,aamir,...,zuko,zukovsky,zulu,zundel,zurg,zweibel,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
messg = data['reviews'].values
counts = vectorizer.fit_transform(messg)

In [14]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classes = data['target'].values
classifier.fit(counts, classes)

MultinomialNB()

In [15]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(classifier, counts, classes, cv=5)

# Print the accuracy of each fold:
print(scores)

# Print the mean accuracy of all 5 folds
print(scores.mean())

[0.805  0.825  0.805  0.8325 0.7825]
0.8099999999999999


## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Word representation of the texts.

In [16]:
# Create CountVectorizer object
vectorizers = CountVectorizer(ngram_range=(2,2))# (2,2) means we only search bigrams.
 # default range is (1,1), meaning we only search for unigrams. 
    # (1,2) means we search for both unigrams and bigrams. 

In [17]:
mess = data['reviews'].values
count = vectorizers.fit_transform(mess)

In [18]:
# save list to file
def save_list(lines, filename):
	# convert lines to a single blob of text
	data = '\n'.join(lines)
	# open file
	file = open(filename, 'w')
	# write text
	file.write(data)
	# close file
	file.close()
 
# save tokens to a vocabulary file
save_list(bow_df, 'X_bow.txt')

In [19]:
classifiers = MultinomialNB()
classe = data['target'].values
classifier.fit(counts, classe)

MultinomialNB()

In [20]:
scores = cross_val_score(classifiers, count, classe, cv=5)

# Print the accuracy of each fold:
print(scores)

# Print the mean accuracy of all 5 folds
print(scores.mean())

[0.7525 0.7525 0.7325 0.805  0.75  ]
0.7585


In [21]:
# Define the service key and endpoint of Azure Text Analytics
cog_key = 'key'
cog_endpoint = 'endpoint'

print('Ready to use cognitive services at {} using key {}'.format(cog_endpoint, cog_key))

Ready to use cognitive services at https://azurecognitiveservices-badewa.cognitiveservices.azure.com/ using key 068af73f1cfc40dd844c73421b251c63


In [22]:
data1=data[['reviews']]

In [23]:
data1

Unnamed: 0,reviews
0,plot two teen couple go church party drink dri...
1,happy bastard quick movie review damn yk bug g...
2,movie like make jaded movie viewer thankful in...
3,quest camelot warner bros feature length fully...
4,synopsis mentally unstable man undergoing psyc...
...,...
1995,wow movie everything movie funny dramatic inte...
1996,richard gere commanding actor always great fil...
1997,glory starring matthew broderick denzel washin...
1998,steven spielberg epic film world war ii unques...


In [24]:
data1['char_count']=data1['reviews'].apply(lambda x: len(x))
data1

Unnamed: 0,reviews,char_count
0,plot two teen couple go church party drink dri...,2147
1,happy bastard quick movie review damn yk bug g...,826
2,movie like make jaded movie viewer thankful in...,1767
3,quest camelot warner bros feature length fully...,1870
4,synopsis mentally unstable man undergoing psyc...,2762
...,...,...
1995,wow movie everything movie funny dramatic inte...,2487
1996,richard gere commanding actor always great fil...,1174
1997,glory starring matthew broderick denzel washin...,4205
1998,steven spielberg epic film world war ii unques...,2144


In [25]:
data1[data1.char_count > 1600]

Unnamed: 0,reviews,char_count
0,plot two teen couple go church party drink dri...,2147
2,movie like make jaded movie viewer thankful in...,1767
3,quest camelot warner bros feature length fully...,1870
4,synopsis mentally unstable man undergoing psyc...,2762
5,capsule planet mar police taking custody accus...,2500
...,...,...
1994,thriller set modern day seattle marked marky m...,1797
1995,wow movie everything movie funny dramatic inte...,2487
1997,glory starring matthew broderick denzel washin...,4205
1998,steven spielberg epic film world war ii unques...,2144


In [26]:
indexNames = data1[data1['char_count'] > 1600].index

In [27]:
data1.drop(indexNames , inplace=True)

In [28]:
data1

Unnamed: 0,reviews,char_count
1,happy bastard quick movie review damn yk bug g...,826
18,law crowd pleasing romantic movie state two le...,1415
29,one sided doom gloom documentary possible anni...,1128
36,among multitude erotic thriller released early...,1180
40,lengthy lousy two word describe boring drama e...,1027
...,...,...
1959,entertaining hour awaits audience film set ear...,1470
1976,robert redford river run film watch often mast...,728
1977,richard gere one favorite actor however like c...,1580
1987,tree lounge directoral debut one favorite acto...,649


In [29]:
from azure.cognitiveservices.language.textanalytics import TextAnalyticsClient
from msrest.authentication import CognitiveServicesCredentials
# Import necessary libraries
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
def authenticate_client():
    ta_credential = AzureKeyCredential(cog_key)
    text_analytics_client = TextAnalyticsClient(
            endpoint=cog_endpoint, 
            credential=ta_credential)
    return text_analytics_client

client = authenticate_client()

In [30]:
# Configure Azure Text Analytics client library

text_sentiment = []
key_phrase_extract = []

# Pass post text content to Azure Text Analytics and collect sentiment result
for index, headers in data1.iterrows():
    reviews_content = str(headers['reviews'])
    print("Review content: {}".format(reviews_content))
    documents = [reviews_content]
    response = client.analyze_sentiment(documents=documents, show_opinion_mining=True, language="en")[0]
    sentiment = response.sentiment
    print("review Content Sentiment: {}".format(sentiment))
    response_ekp = client.extract_key_phrases(documents = documents)[0]
    for phrase in response_ekp.key_phrases:
                print("Key phrase: {}".format(phrase))
                key_phrase_extract.append(phrase)
                print(key_phrase_extract)
    text_sentiment.append([reviews_content, sentiment, key_phrase_extract])

    key_phrase_extract = []
    # Convert collected post text content with sentiment to Pandas dataframes.
    text_sentiment = pd.DataFrame(text_sentiment, columns=['reviews','sentiment',
                                                                                 'key_phrase_extract'])


Review content: happy bastard quick movie review damn yk bug got head start movie starring jamie lee curtis another baldwin brother william time story regarding crew tugboat come across deserted russian tech ship strangeness kick power back little know power within going gore bringing action sequence virus still feel empty like movie going flash substance know crew really middle nowhere know origin took ship big pink flashy thing hit mir course know donald sutherland stumbling around drunkenly throughout hey let chase people around robot acting average even like curtis likely get kick work halloween h sutherland wasted baldwin well acting like baldwin course real star stan winston robot design schnazzy cgi occasional good gore shot like picking someone brain robot body part really turn movie otherwise pretty much sunken ship movie
review Content Sentiment: negative
Key phrase: baldwin course real star stan winston robot design schnazzy cgi
['baldwin course real star stan winston robot 

In [31]:
text_sentiment

Unnamed: 0,reviews,sentiment,key_phrase_extract
0,happy bastard quick movie review damn yk bug g...,negative,[baldwin course real star stan winston robot d...


⚠️ Please push the exercise once you are done 🙃

## 🏁 