In [173]:
import re
import nltk
import random
import numpy as np
import pandas as pd
from nltk import pos_tag
from sklearn.svm import SVC
from nltk.corpus import treebank
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('treebank')
nltk.download("stopwords")
nltk.download("movie_reviews")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

#Part1: POS Tagger for Treebank Corpus

In [119]:
treebank_sents = treebank.tagged_sents()                                        # Creating word vocabulary and tag set

words = set()
tags = set()

for sentence in treebank_sents:
  for word, tag in sentence:
    words.add(word)
    tags.add(tag)

tags.add('Start')
tags.add('End')

word2idx = {word: idx for idx, word in enumerate(words)}                         # Word-to-index and Index-to-Word Maps
idx2word = {idx: word for idx, word in enumerate(words)}

tag2idx = {tag: idx for idx, tag in enumerate(tags)}                             # Tag-to-index and Index-to-Tag Maps
idx2tag = {idx: tag for idx, tag in enumerate(tags)}

# Hidden Markov Model States, Viterbi Algorithm, and Smoothing for training on Different Corpora

In [121]:
def HMM_States(corpus):
  emission_counts = defaultdict(lambda: defaultdict(int))                       # Creating the emission and transition probability dictionaries by iterating through all the sentences in the corpus and calculating counts of emission
  tag_counts = defaultdict(int)                                                 # of words given tags and also transition from one tag to another for all (tag, word) & (prev_tag, curr_tag) pairs possible respectively
  emission_prob = defaultdict(lambda: defaultdict(float))

  for sentence in corpus:
    for word, tag in sentence:
      emission_counts[tag][word] += 1
      tag_counts[tag] += 1

  for sentence in corpus:
    for word, tag in sentence:
      emission_prob[tag][word] = emission_counts[tag][word]/tag_counts[tag]

  transition_counts = defaultdict(lambda: defaultdict(int))
  start_tag_counts = defaultdict(int)
  transition_prob = defaultdict(lambda: defaultdict(float))

  for sentence in corpus:                                                       # Calculating total number of tags that can appear right after Start
    prev_tag = None
    for _, tag in sentence:
      if prev_tag is not None:
        transition_counts[prev_tag][tag] += 1
      else:
        start_tag_counts[tag] += 1
      prev_tag = tag

  total_start_count = sum(start_tag_counts.values())

  for tag in tags:
    transition_prob['Start'][tag] = start_tag_counts[tag]/total_start_count     # Transition probability of Start -> Tag = count[tag] at start / total tags possible at start

  for prev_tag in transition_counts:
    total_transition_count = sum(transition_counts[prev_tag].values())
    for tag in transition_counts[prev_tag]:
      transition_prob[prev_tag][tag] = transition_counts[prev_tag][tag]
      transition_prob[prev_tag][tag] /= total_transition_count

  return emission_prob, transition_prob


def Viterbi(tokenized_sent, emission_prob, transition_prob):
  tokenized_sent.append('End')
  VITERBI = np.zeros((len(tags), len(tokenized_sent)+1))                        # VITERBI.shape = (len(tags)+2, len(tokenized_sent)+1), but since 'Start' and 'End' have already been added to tags and also 'End' has been appended to sent
  BACKPOINTER = np.zeros((len(tokenized_sent)))

  VITERBI[tag2idx['Start']][0] = 1                                               # The value of Start tag in the 1st column should be set to 1 in the VITERBI Matrix

  for i in range(1, len(tokenized_sent)+1):
    maxV = np.max(VITERBI[:, i-1])                                              # Starting from column number 2, maxV gets the maximum value present in the i-1th column of the VITERBI Matrix, and argV gets its index to be passed to the
    argV = np.argmax(VITERBI[:, i-1])                                           # BACKPOINTER to trace back the most probable POS tags path
    BACKPOINTER[i-1] = argV
    for j in range(len(tags)):
      VITERBI[j][i] = maxV
      VITERBI[j][i] *= transition_prob[idx2tag[argV]][idx2tag[j]];
      VITERBI[j][i] *= emission_prob[idx2tag[j]][tokenized_sent[i-1]];

  return BACKPOINTER

def apply_smoothing(emission_prob, new_words, tags, smoothing_factor=0.05):     # Added some smoothing to handle unseen words from Movie Dataset
  smoothed_emission_prob = {tag: dict(emission_prob[tag]) for tag in tags}

  for tag in tags:
    for word in new_words:
      if word not in smoothed_emission_prob[tag]:
        smoothed_emission_prob[tag][word] = smoothing_factor #/ (smoothing_factor * len(emission_prob[tag]) + len(new_words))
  return smoothed_emission_prob

In [136]:
treebank_pos_tags = [' '.join([f"{tag}" for word, tag in tagged_sentence]) for tagged_sentence in treebank.tagged_sents()]
emission_prob, transition_prob = HMM_States(treebank_sents)

##Generating POS tags for 10 sentences in the treebank corpus

In [137]:
for sent in range(10):
  tokens = treebank.sents()[sent]
  BACKPOINTER = Viterbi(tokens, emission_prob, transition_prob)
  decoded_tag_seq = " ".join([idx2tag[BACKPOINTER[i]] for i in range(1, len(BACKPOINTER))])

  sentence = " ".join(treebank.sents()[sent])

  print(f"Sentence        : {sentence}\nActual Sequence : {treebank_pos_tags[sent]}\nDecoded Sequence: {decoded_tag_seq}\n\n")

Sentence        : Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
Actual Sequence : NNP NNP , CD NNS JJ , MD VB DT NN IN DT JJ NN NNP CD .
Decoded Sequence: NNP NNP , CD NNS JJ , MD VB DT NN IN DT JJ NN NNP CD .


Sentence        : Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .
Actual Sequence : NNP NNP VBZ NN IN NNP NNP , DT NNP VBG NN .
Decoded Sequence: NNP NNP VBZ NN IN NNP NNP , DT JJ NN NN .


Sentence        : Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named *-1 a nonexecutive director of this British industrial conglomerate .
Actual Sequence : NNP NNP , CD NNS JJ CC JJ NN IN NNP NNP NNP NNP , VBD VBN -NONE- DT JJ NN IN DT JJ JJ NN .
Decoded Sequence: NNP NNP , CD NNS JJ CC JJ NN IN NNP NNP NNP NNP , VBD VBN -NONE- DT JJ NN IN DT JJ JJ NN .


Sentence        : A form of asbestos once used * * to make Kent cigarette filters has caused a high percentage of cancer deaths amon

#Part 2: Vanilla Sentiment Analysis Model

In [124]:
stemmer = PorterStemmer()                                                       # Method to pre-process the text by doing stemming and removing stopwords
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
  words = nltk.word_tokenize(text.lower())
  words = [stemmer.stem(word) for word in words if word.isalnum() and word not in stop_words]
  return ' '.join(words)

In [125]:
def clean_text(text):                                                           # Method to remove special characters and eliminate white space
  pattern = r'[^A-Za-z0-9\s]'
  cleaned_text = re.sub(pattern, '', text)
  cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
  cleaned_text = cleaned_text.strip()
  return cleaned_text

In [132]:
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]    # Loading the dataset
random.shuffle(documents)

texts = [' '.join(document) for document, _ in documents]
labels = [label for _, label in documents]

processed_text = [preprocess_text(clean_text(text)) for text in texts]

## Training the Vanilla Sentiment Analysis Model

In [133]:
X_train, X_test, y_train, y_test = train_test_split(processed_text, labels, test_size=0.4, random_state=42) # For the Vanilla sentiment analysis model, we split the data into train-test-val in the ratio 4:3:3
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Finally we use tf-idf to create word embeddings                               # tf-idf gave better results as compared to word2vec and glove. This is probably because this dataset doesn't require sequential information (context). Otherwise
tfidf_vectorizer = TfidfVectorizer(max_features=5000)                           # we would have had to use another model which can work with sequential data, like LSTM or Transformers
X_train = tfidf_vectorizer.fit_transform(X_train)
X_val = tfidf_vectorizer.transform(X_val)
X_test = tfidf_vectorizer.transform(X_test)

classifier = SVC()                                                              # Using a standard Support Vector Machine (gave better results as compared to Logit, Naive Bayes and Decision Trees)

classifier.fit(X_train, y_train)
val_predictions = classifier.predict(X_val)

val_accuracy = accuracy_score(y_val, val_predictions)
val_classification_report = classification_report(y_val, val_predictions)

print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:\n", val_classification_report)

test_predictions = classifier.predict(X_test)

test_accuracy = accuracy_score(y_test, test_predictions)
test_classification_report = classification_report(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Test Classification Report:\n", test_classification_report)

Validation Accuracy: 0.8175
Validation Classification Report:
               precision    recall  f1-score   support

         neg       0.82      0.83      0.83       211
         pos       0.81      0.80      0.81       189

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400

Test Accuracy: 0.8075
Test Classification Report:
               precision    recall  f1-score   support

         neg       0.80      0.80      0.80       190
         pos       0.82      0.81      0.82       210

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400



#Part 3: POS Tagging of Movie Dataset

##Since the POS model is trained on the Treebank corpus, we need to apply smoothing to the emission_prob matrix for it to be able to better handle unseen words from the new corpus (Movie Dataset)

###We notice that the performance of the Viterbi POS Tagger is quite poor on the movie reviews dataset. A possible explanation of this is that even after smoothing, the unseen words have very little probability and since the sentences are very long, it results in the probabilities multiplying and becoming very close to 0, due to which the tag starts to converge towards tag with tag index 0

In [134]:
for sentence in texts:
  for word in sentence.split():
    words.add(word)

emission_prob_smooth = apply_smoothing(emission_prob, words, tags)
print(emission_prob == emission_prob_smooth)

sent = texts[1]
sent_tokens = sent_tokenize(sent)
for sents in sent_tokens:
  token = word_tokenize(sents)
  BACKPOINTER = Viterbi(token, emission_prob_smooth, transition_prob)
  decoded_tag_seq = " ".join([idx2tag[BACKPOINTER[i]] for i in range(1, len(BACKPOINTER))])
  print(token)
  print(decoded_tag_seq)

False
['martin', 'scorsese', "'", 's', 'triumphant', 'adaptation', 'of', 'edith', 'wharton', "'", 's', 'the', 'age', 'of', 'innocence', 'is', 'a', 'stunning', 'film', 'for', 'the', 'quintessential', 'new', 'york', 'filmmaker', ',', 'the', 'man', 'who', 'brought', 'the', 'streets', 'of', 'taxi', 'driver', 'and', 'mean', 'streets', 'to', 'life', '.', 'End']
DT NN IN DT NN IN IN DT NN IN DT DT JJ IN DT VBZ DT NN IN DT DT NN JJ NN IN , DT JJ NN IN DT NN IN DT JJ CC NNP NNP TO VB .
['it', 'seems', 'like', 'an', 'odd', 'choice', 'for', 'scorsese', 'to', 'do', 'a', 'period', 'piece', 'in', 'the', 'early', '1900', "'", 's', ',', 'but', 'the', 'fact', 'that', 'he', 'pulls', 'it', 'off', 'so', 'brilliantly', 'is', 'a', 'wonder', ',', 'and', 'a', 'testament', 'to', 'the', 'greatness', 'of', 'scorsese', 'as', 'a', 'filmmaker', '.', 'End']
PRP VBD -NONE- DT NN IN DT NN TO VB DT JJ NNS IN DT NN IN DT NN , NNP DT JJ IN DT NN PRP VBD -NONE- TO MD MD MD MD MD MD MD MD MD MD MD MD MD MD MD MD
['this', '

# POS Tagging Movie Dataset using NLTK POS Tagger

In [186]:
pos_tagged_sentences = []                                                       # Generating pos tags for the entire movie dataset
for sent in processed_text:
  tokens = sent_tokenize(sent)
  tokens = word_tokenize(sent)
  pos_tagged_sentences.append(pos_tag(tokens))

sent_pos_tags = [" ".join([tag for _, tag in tags]) for tags in pos_tagged_sentences]

In [197]:
# Strategy and Explanation for Part 3

# 1.	Generate the POS tags for the dataset
# 2.	Use train-test-split to split texts as well as pos_tags into the following classes: -
# a.	X_train_text, X_val_text, X_test_text , y_train, y_test, y_val
# b.	X_train_tags, X_val_tags, X_test_tags
# 3.	Use 2 tf-idf vectorizers, one for embedding the texts and another for embedding the tags
# 4.	Stack the POS tags with their corresponding sentences to create a feature vector
# a.	X_train_combined = [X_train_text_tfidf, X_train_tags_tfidf]
# b.	X_val_combined = [X_val_text_tfidf, X_ val _tags_tfidf]
# c.	X_test_combined = [X_ test _text_tfidf, X_ test _tags_tfidf]
# 5.	Finally, run the same model on these vectors.


X_train_text, X_test_text, y_train, y_test = train_test_split(processed_text, labels, test_size=0.4, random_state=42)
X_val_text, X_test_text, y_val, y_test = train_test_split(X_test_text, y_test, test_size=0.5, random_state=42)

X_train_tags, X_test_tags = train_test_split(sent_pos_tags, test_size=0.4, random_state=42)
X_val_tags, X_test_tags = train_test_split(X_test_tags, test_size=0.5, random_state=42)

# Create TF-IDF vectorizers for text and POS tags
text_vectorizer = TfidfVectorizer(max_features=5000)
pos_vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the text data
X_train_text_tfidf = text_vectorizer.fit_transform([sent for sent in X_train_text])
X_val_text_tfidf = text_vectorizer.transform([sent for sent in X_val_text])
X_test_text_tfidf = text_vectorizer.transform([sent for sent in X_test_text])

# Fit and transform the POS tag data
X_train_pos_tfidf = pos_vectorizer.fit_transform([tags for tags in X_train_tags])
X_val_pos_tfidf = pos_vectorizer.transform([tags for tags in X_val_tags])
X_test_pos_tfidf = pos_vectorizer.transform([tags for tags in X_test_tags])

# Combine the TF-IDF vectors for text and POS tags
X_train_combined = pd.concat([pd.DataFrame(X_train_text_tfidf.toarray()), pd.DataFrame(X_train_pos_tfidf.toarray())], axis=1) # The pos tags are vertically stacked beside their corresponding sentences as features[i] = [sentence[i], pos_tag[i]]
X_val_combined = pd.concat([pd.DataFrame(X_val_text_tfidf.toarray()), pd.DataFrame(X_val_pos_tfidf.toarray())], axis=1)
X_test_combined = pd.concat([pd.DataFrame(X_test_text_tfidf.toarray()), pd.DataFrame(X_test_pos_tfidf.toarray())], axis=1)

# Train a Support Vector Classifier (SVC)
classifier = SVC()
classifier.fit(X_train_combined, y_train)

val_predictions = classifier.predict(X_val_combined)
val_accuracy = accuracy_score(y_val, val_predictions)
val_classification_report = classification_report(y_val, val_predictions)

print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:\n", val_classification_report)

test_predictions = classifier.predict(X_test_combined)

test_accuracy = accuracy_score(y_test, test_predictions)
test_classification_report = classification_report(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Test Classification Report:\n", test_classification_report)

Validation Accuracy: 0.82
Validation Classification Report:
               precision    recall  f1-score   support

         neg       0.83      0.83      0.83       211
         pos       0.81      0.80      0.81       189

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400

Test Accuracy: 0.8125
Test Classification Report:
               precision    recall  f1-score   support

         neg       0.80      0.81      0.80       190
         pos       0.83      0.81      0.82       210

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

