# Import the necessary Modules and Methods

In [576]:

import pandas as pd
import string
import random
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from sklearn.metrics import classification_report

pd.set_option('display.max_colwidth', None)

# Reading the Data

In [577]:
all_tagged_movie_reviews = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
# print(all_tagged_movie_reviews[:2])
cols = ['Review', 'Sentiment']
df = pd.DataFrame(all_tagged_movie_reviews, columns=cols)

# Cleaning The Data & Removing Irrelevant Words

In [578]:
lemmatizer = WordNetLemmatizer()

def clean_sentence(sentence):
    
    for word in sentence:
        word = word.lower()  # Case folding
    
    # # Remove numbers
    tokens = sentence
    stop_words = set(stopwords.words('english'))
    stop_words.add('\\n')
    tokens = [token for token in tokens if token not in string.punctuation]  # Removing irrelevant punctuation
    tokens = [word for word in tokens if word not in stop_words]  # Removing irrelelvant stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatizing the words i.e changing the words to  their base form
    x = ' '.join(tokens)
    x.replace(" '", "'")
    return x

df['Review'] = df['Review'].apply(clean_sentence)


Getting Some Basic Information About the Dataset

In [579]:
print('Average number of words in each review:',int(np.mean(df['Review'].str.len().tolist())))
print('The number of positive reviews', df['Sentiment'].value_counts().to_list()[0])
print('The number of negative reviews', df['Sentiment'].value_counts().to_list()[1])

Average number of words in each review: 2441
The number of positive reviews 1000
The number of negative reviews 1000


# Splitting the Data in Test and Training

In [580]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

train_sents, test_sents = train_test_split(df['Review'],test_size=0.1,random_state=42)  # They are pd Series right now
train_rev , test_rev = train_test_split(df['Sentiment'],test_size=0.1,random_state=42)

In [581]:
train_rev = list(train_rev)  # Converting the pd Series in list datatype
test_rev = list(test_rev)

test_sents = list(test_sents)

train_sents = list(train_sents)
train_sents[:2]

['robert redford good playing character incredible god given gift able act like ordinary people natural 1984 played fallen angelic character roy hobbs baseball player destined best ever still acted like humble farm boy butch cassidy sundance kid 1969 dangerous shot west yet came like nothing boyishly good looking charmer even indecent proposal 1993 played man good making money believed could buy love redford still able exude aura shy decency especially reciting tale lost love latest film horse whisperer based best selling novel nicholas evans redford play character redford also directed film although swore would never direct movie suppose character tom booker man amazing gift understanding somehow communicating horse good pas despite enormous rare gift come training troubled horse magazine article coin phrase horse whisperer describe uncanny talent booker see everyday cattle rancher whose greatest fear growing old longer purpose life booker talent taxed service annie maclean kristin sc

In [582]:
test_sents[:2]

['verdict spine chilling drama horror maestro stephen king featuring outstanding oscar winning performance kathy bates geez french saunders field day set work parodying sorry non british reader may familiar french saunders apology pair british comedienne jennifer saunders later went become edina monsoon absolutely fabulous series film spoof year back including alien exorcist misery needle say amidst chucklesome impersonation kathy bates resemblance quite uncanny dawn french got pretty nasty sledgehammer reach jennifer saunder leg despite lingering memory sketch although seen film couple time hobbling scene le disturbing still left screaming telly revulsion may memorable scene certainly worth watching stephen king whose film tv adaptation tend vary quality strike gold simple yet strikingly compelling tale nicely crafted psychological horror effortlessly succeeds drawing plight writer phil sheldon james caan rescued car accident annie wilkes bates introduces writer number one fan soon be

# Getting the POS Tags from our POS Tagger

Getting the probabiliy values from json files

In [583]:
import json

with open('transition_prob.json','r') as json_file:
    transition_prob = json.load(json_file)
    
with open('emission_prob.json','r') as json_file:
    emission_prob = json.load(json_file)

with open('start_prob.json','r') as json_file:
    start_prob = json.load(json_file)

In [584]:
# Define the Viterbi algorithm for POS tagging
def viterbi(sentence, tags, transition_prob, emission_prob, start_prob):
    
    sentence = nltk.word_tokenize(sentence)
    V = [{}]  # Initialize the Viterbi matrix
    path = {}  # Store the best path
    
    # Initialization step: calculate probabilities for the first word
    for tag in tags:
        if sentence[0] not in emission_prob[tag]:
            emission_prob[tag][sentence[0]] = 1
        V[0][tag] = start_prob[tag] * emission_prob[tag][sentence[0]]
        path[tag] = tag
    
    # Base case for sentences with only one word
    if len(sentence) == 1:
        prob, tag = max((V[len(sentence) - 1][tag], tag) for tag in tags)
        return (path[tag], prob)
    
    # Recursion step: fill in the Viterbi matrix for the rest of the sentence
    for t in range(1, len(sentence)):
        V.append({tag: 0 for tag in tags})
        newpath = {}
        
        for tag in tags:
            if sentence[t] not in emission_prob[tag]:
                emission_prob[tag][sentence[t]] = 1
            # Find the best previous tag and its probability
            (prob1, state) = max((V[t-1][prev] * transition_prob[prev][tag] * emission_prob[tag][sentence[t]], prev) for prev in tags)
            V[t][tag] = prob1  # Update the probability in the Viterbi matrix
            newpath[tag] = path[state] + " " + tag  # Update the best path
        
        # Normalize probabilities to prevent underflow
        for tag in tags:
            if all(value < 1e-5 or value == 0 for value in V[t].values()):
                V[t] = {k: v * 1000 for k, v in V[t].items()}

        path = newpath  # Update the best path
    
    # Termination step: find the best tag for the last word
    prob, tag = max((V[len(sentence) - 1][tag], tag) for tag in tags)
    path_list = path[tag].split()  # Convert the best path to a list of tags
    return path_list


In [585]:
tags = set(transition_prob.keys())
pos_tags = []

for train_sent in train_sents:
    pos_tag = viterbi(train_sent,tags,transition_prob,emission_prob,start_prob)  #Creating list of pos tags of training sentences
    pos_tags.append(pos_tag)


pos_tags1 = []

for test_sent in test_sents:
    pos_tag = viterbi(test_sent,tags,transition_prob,emission_prob,start_prob)  #Creating list of pos tags of test sentences
    pos_tags1.append(pos_tag)

In [586]:
# Joining sentences and pos tags for training data
tokens = [train_sent.split() for train_sent in train_sents]
tokens = [[word + tag.lower()  for word,tag in zip(train_sent,pos_tag)] for train_sent,pos_tag in zip(tokens,pos_tags)]

train_sents = [' '.join(token) for token in tokens]
train_sents[:2]

['robertnoun redfordadp goodadj playingnoun characternoun incredibleadj godnoun givenverb giftdet ableadj actnoun likeadp ordinaryadj peoplenoun naturaladj 1984noun playedverb fallenverb angelicadj characternoun roynoun hobbsadp baseballnoun playernoun destinedverb bestadv everadv stilladv actedverb likeadp humbleadj farmnoun boynoun butchadp cassidydet sundanceadj kidnoun 1969adp dangerousadj shotnoun westnoun yetadv cameverb likeadp nothingnoun boyishlyadp goodadj lookingverb charmeradv evenadv indecentadj proposalnoun 1993prt playedverb mannoun goodadj makingnoun moneynoun believedverb couldverb buyverb lovenoun redford. stilladv ableadj exudenoun auraadp shyadj decencynoun especiallyadv recitingadj talenoun lostverb loveverb latestadj filmnoun horsenoun whisperer. basedverb bestadj sellingnoun novelnoun nicholasnoun evansnoun redfordadp playnoun characternoun redford. alsoadv directedverb filmnoun althoughadp sworepron wouldverb neveradv directadj movienoun supposeverb characternou

In [587]:
# Joining sentences and pos tags for testing data
tokens = [test_sent.split() for test_sent in test_sents]
tokens = [[word + tag.lower()  for word,tag in zip(test_sent,pos_tag)] for test_sent,pos_tag in zip(tokens,pos_tags1)]

test_sents = [' '.join(token) for token in tokens]
test_sents[:2]

['verdictnoun spinenoun chillingverb dramanoun horrornoun maestronoun stephennoun kingnoun featuringverb outstandingadj oscarnoun winningverb performancenoun kathynoun batesadp geezdet frenchadj saundersnoun fieldnoun daynoun setverb workverb parodyingdet sorryadj nonadj britishadj readernoun mayverb familiaradj frenchadj saundersnoun apologynoun pairnoun britishnoun comedienne. jenniferpron saundersverb lateradv wentverb becomeverb edinadet monsoonnoun absolutelyadv fabulousadj seriesnoun filmnoun spoofadp yearnoun backadv includingadp alienadj exorcistnoun miserynoun needlenoun sayverb amidstadp chucklesomedet impersonationadj kathynoun batesnoun resemblancenoun quiteadv uncannyadj dawnnoun frenchnoun gotverb prettyadv nastyadj sledgehammernoun reachverb jenniferadp saunderdet legnoun despiteadp lingeringdet memorynoun sketchnoun althoughadp seenverb filmnoun couplenoun timenoun hobblingadp scenenoun lenoun disturbingverb stilladv leftverb screamingverb tellydet revulsionnoun mayverb

# TF-IDF and BoW Feature Extraction

In [598]:
# vectorizer using Tfidf vectorizer
vectorizer = TfidfVectorizer(max_features=60000)

#Extracting features from the sentences
features_tfidf_train = vectorizer.fit_transform(train_sents)
features_tfidf_test = vectorizer.transform(test_sents)

#Getting the vectors in a dataframe
tfidf_df = pd.DataFrame(features_tfidf_train.toarray(),columns=vectorizer.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,000,0009fadp,000adp,000det,000noun,000num,000verb,007,007adp,007det,...,zundelpron,zurgadp,zweibeladp,zwick,zwickadp,zwickdet,zwicknoun,zwickverb,zwigoffadp,zyciedet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.025607,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.037467,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [596]:
tag_weights = {       #Weights given to each pos tag
    'noun': 0.1,
    'verb': 11,
    'adj': 11,
    'det': 1,
    'num': 0,
    '.': 0,
    'x': 0,
    'conj': 0,
    'adp': 0,
    'pron': 0
}

In [597]:
for col in tfidf_df.columns:
    for tag in tag_weights:
        if tag in col:
            tfidf_df[col] *= tag_weights[tag]  # Multiplying the weights of the associated pos tags

features_tfidf_train = tfidf_df.values

In [594]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Create a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(features_tfidf_train, train_rev)  

# Predict the labels
rev_pred = nb_classifier.predict(features_tfidf_test)

# Calculate accuracy
accuracy = accuracy_score(test_rev, rev_pred)

# Generate classification report
report = classification_report(test_rev, rev_pred)

# Print results
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')



Accuracy: 0.795
Classification Report:
              precision    recall  f1-score   support

         neg       0.77      0.83      0.80        99
         pos       0.82      0.76      0.79       101

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.79       200
weighted avg       0.80      0.80      0.79       200

