# Movie Reviews

In [26]:
import pandas as pd
import re
from sklearn.pipeline import Pipeline
import string
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from  sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Load the labelled reviews; the preview below shows an unnamed index column
# plus the `target` and `reviews` columns.
data = pd.read_csv("reviews.csv")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [27]:
# Inspect the characters that `string.punctuation` contains.

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [28]:
import string
import re
# Define a function to remove punctuation in our messages
def no_punctuation(reviews):
    """Return `reviews` with every item found in `string.punctuation` dropped.

    Iterates over `reviews` (character by character for a string) and joins
    whatever survives into a single string.
    """
    kept = []
    for symbol in reviews:
        if symbol in string.punctuation:
            continue
        kept.append(symbol)
    return "".join(kept)
def removelines(value):
    """Collapse a multi-line string onto a single line.

    Line breaks are replaced by a single space rather than removed outright:
    joining with an empty string would fuse the last word of one line with the
    first word of the next ("good\\nmovie" -> "goodmovie"), corrupting tokens.

    Parameters
    ----------
    value : str
        Text that may contain line breaks.

    Returns
    -------
    str
        The lines of `value` joined by single spaces.
    """
    return ' '.join(value.splitlines())
def lower_text(clean_text):
    """Return a lowercase copy of `clean_text`."""
    return clean_text.lower()
def remove_num(clean_text):
    """Return `clean_text` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', clean_text)
# Clean the review text in one chained pass, in the same order as before:
# drop line breaks, lowercase/strip and blank out non-alphanumerics,
# strip punctuation, lowercase, then remove digits.
data['reviews'] = (
    data['reviews']
    .apply(removelines)
    .apply(lambda text: re.sub(r"[^a-zA-Z0-9]", " ", text.lower().strip()))
    .apply(no_punctuation)
    .apply(lower_text)
    .apply(remove_num)
)
data.head()

Unnamed: 0,target,reviews
0,neg,plot two teen couples go to a church party ...
1,neg,the happy bastard s quick movie review damn th...
2,neg,it is movies like these that make a jaded movi...
3,neg,quest for camelot is warner bros first...
4,neg,synopsis a mentally unstable man undergoing ...


In [29]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# NLTK's English stop words; this collection is extended in a later cell and
# consulted by lemmatizing_text to filter tokens.
stop_words = stopwords.words("english")
# Single WordNet lemmatizer instance reused by lemmatizing_text.
lemmatizer = WordNetLemmatizer()

In [30]:
# Add our own extra words to the list of stop words (in place).
stop_words += [
    'first', 'second', 'third', 'me',
    'haha', 'lol', 'oof', 'cds',
]

In [31]:
# Lemmatize
import nltk
from nltk.tokenize import word_tokenize
def lemmatizing_text(clean_text):
    """Drop stop words from each review and lemmatize the remaining tokens.

    Parameters
    ----------
    clean_text : object exposing ``.apply``
        The reviews to process; the notebook passes the ``data['reviews']``
        column, whose entries are strings.

    Returns
    -------
    The result of ``clean_text.apply``: each review rebuilt from its kept,
    lemmatized tokens joined by single spaces.
    """
    # Snapshot the module-level stop-word list as a set once per call so the
    # per-token membership test is a hash lookup instead of a list scan.
    ignored = set(stop_words)

    def _lemmatize_review(review):
        # Tokenize, filter out stop words, lemmatize what is left.
        tokens = nltk.word_tokenize(review)
        return ' '.join(
            lemmatizer.lemmatize(token) for token in tokens if token not in ignored
        )

    return clean_text.apply(_lemmatize_review)

# Replace the cleaned reviews with their stop-word-free, lemmatized form.
data['reviews'] = lemmatizing_text(data['reviews'])

data.head()

Unnamed: 0,target,reviews
0,neg,plot two teen couple go church party drink dri...
1,neg,happy bastard quick movie review damn yk bug g...
2,neg,movie like make jaded movie viewer thankful in...
3,neg,quest camelot warner bros feature length fully...
4,neg,synopsis mentally unstable man undergoing psyc...


## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Words representation of the texts.

In [32]:
# Create CountVectorizer object
# With no arguments the default n-gram range (1, 1) applies: unigrams only.
vectorizer = CountVectorizer()

In [33]:
# Fit the vectorizer on the cleaned reviews and generate the matrix of word counts
bow = vectorizer.fit_transform(data['reviews'])

In [34]:
# Convert the bag-of-words matrix `bow` into a dense DataFrame (one row per review).
# NOTE(review): .toarray() materialises every cell; presumably acceptable at
# this corpus size - confirm memory use before scaling up.
bow_df = pd.DataFrame(bow.toarray())

In [35]:
# Map the column names to vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    bow_df.columns = vectorizer.get_feature_names_out()
else:
    bow_df.columns = vectorizer.get_feature_names()

In [36]:
# Display bow_df (the rendered preview is truncated to the first/last rows and columns)
bow_df

Unnamed: 0,aa,aaa,aaaaaaaaah,aaaaaaaahhhh,aaaaaah,aaaahhhs,aahs,aaliyah,aalyah,aamir,...,zuko,zukovsky,zulu,zundel,zurg,zweibel,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Cleaned reviews as a raw array, vectorised again with the same
# CountVectorizer instance (refits it on the same text used for `bow`).
messg = data['reviews'].values
counts = vectorizer.fit_transform(messg)

In [38]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes model for the unigram counts.
classifier = MultinomialNB()
# Labels from the `target` column (the preview shows values such as "neg").
classes = data['target'].values
# Fit on the full data set.
# NOTE(review): the cross-validation cell presumably refits per fold, so this
# fit would only serve as a sanity check - confirm.
classifier.fit(counts, classes)

MultinomialNB()

In [39]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the classifier on the unigram counts.
scores = cross_val_score(estimator=classifier, X=counts, y=classes, cv=5)

# Print the accuracy of each fold, then the mean accuracy of all 5 folds,
# each on its own line.
print(scores, scores.mean(), sep="\n")

[0.805  0.825  0.805  0.8325 0.7825]
0.8099999999999999


## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Words representation of the texts.

In [40]:
# Create CountVectorizer object
# ngram_range=(2, 2) means we only search for bigrams.
# The default range is (1, 1), meaning we only search for unigrams;
# (1, 2) would search for both unigrams and bigrams.
vectorizers = CountVectorizer(ngram_range=(2,2))

In [41]:
# Cleaned reviews as a raw array, vectorised with the bigram CountVectorizer.
mess = data['reviews'].values
count = vectorizers.fit_transform(mess)

In [42]:
# Multinomial Naive Bayes model for the bigram counts.
classifiers = MultinomialNB()
classe = data['target'].values
# Fit the bigram model on the bigram counts. (This line previously refit the
# unigram `classifier` on the unigram `counts`, leaving `classifiers` unfitted.)
classifiers.fit(count, classe)

MultinomialNB()

In [43]:
# 5-fold cross-validation of the bigram classifier on the bigram counts.
scores = cross_val_score(estimator=classifiers, X=count, y=classe, cv=5)

# Print the accuracy of each fold, then the mean accuracy of all 5 folds,
# each on its own line.
print(scores, scores.mean(), sep="\n")

[0.7525 0.7525 0.7325 0.805  0.75  ]
0.7585


⚠️ Please push the exercise once you are done 🙃

## 🏁 