# Learning from Big Data: Module 1 - Natural Language Processing
#### Session 2 - Supervised Learning

# Introduction
#### Illustration of the implementation of sentiment and content NBC-based methods.

# Installation
#### Based on the data available in the GitHub repo: https://github.com/guiliberali/Learning-from-Big-Data-Module-1
+ Input files are read from the local clone of the repo
+ Output files are saved in a folder next to the local clone of the GitHub repo (in the root folder of the project)

# Packages

In [None]:
# Loading the required packages
import re
import string
import numpy as np
import pandas as pd
from collections import Counter, namedtuple
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# 1. The Naive Bayes Classifier (NBC) Functions

In [None]:
def compute_posterior_sentiment(prior, corpus_in, dict_words, p_w_given_c, TOT_DIMENSIONS):
    """Return the Naive Bayes posterior over the sentiment classes for one review.

    Starting from ``prior``, the class probabilities are updated with Bayes'
    rule once per occurrence of every lexicon word found in ``corpus_in``.

    Args:
        prior: Sequence of class probabilities, ordered [positive, negative]
            to match the likelihood vector built from ``p_w_given_c``.
        corpus_in: Text of a single review.
        dict_words: Vocabulary handed to ``CountVectorizer``; only these
            words are counted in the review.
        p_w_given_c: Table exposing ``words``, ``pos_likelihood`` and
            ``neg_likelihood``, each indexable by row position.
        TOT_DIMENSIONS: Number of classes. Kept for interface compatibility;
            the posterior takes its shape from ``prior``.

    Returns:
        dict with two entries:
            'posterior_': the updated probability vector. It equals the prior
                when the review contains no lexicon word, or when none of the
                words found has a row in ``p_w_given_c.words``.
            'words_': the vocabulary words found in the review, or ``['']``
                when none is found.

    Raises:
        Exception: if an updated posterior sums to more than 1.01.
    """
    prior = np.array(prior)
    vec = CountVectorizer(vocabulary=dict_words, lowercase=True)
    word_matrix = vec.fit_transform([corpus_in]).toarray()

    # No lexicon word occurs in the review: there is no evidence, keep the prior.
    if word_matrix.sum() == 0:
        return {'posterior_': prior, 'words_': ['']}

    # Column positions of the vocabulary words that occur in this review
    word_matrix_indices = np.where(word_matrix > 0)[1]

    # The feature names do not change inside the loop, so look them up once
    feature_names = vec.get_feature_names_out()

    # Start from the prior so that the result is still a valid probability
    # vector (not all zeros) when no word below has a likelihood row.
    posterior = prior

    # Loop around words found in review
    for word_matrix_index in word_matrix_indices:
        word = feature_names[word_matrix_index]

        # Skip vocabulary words that have no likelihood row
        p_w_given_c_indices = np.where(p_w_given_c.words == word)[0]
        if p_w_given_c_indices.size == 0:
            continue

        p_w_given_c_index = p_w_given_c_indices[0]
        vec_likelihood = np.array([p_w_given_c.pos_likelihood[p_w_given_c_index],
                                   p_w_given_c.neg_likelihood[p_w_given_c_index]])

        # One Bayesian update per occurrence of the word; the posterior of one
        # update is the prior of the next.
        for _ in range(word_matrix[0, word_matrix_index]):
            numerat = posterior * vec_likelihood
            denomin = posterior.dot(vec_likelihood)
            posterior = numerat / denomin

            # Sanity check: a normalised posterior cannot sum to more than 1
            if np.sum(posterior) > 1.01:
                raise Exception('ERROR')

    return {'posterior_': posterior, 'words_': feature_names[word_matrix_indices]}


def compute_posterior_content(prior, corpus_in, dict_words, p_w_given_c, BIGRAM, TOT_DIMENSIONS):
    """Return the Naive Bayes posterior over the content topics for one review.

    Starting from ``prior``, the topic probabilities are updated with Bayes'
    rule once per occurrence of every lexicon term found in ``corpus_in``.

    Args:
        prior: Topic probabilities ordered [storyline, acting, visual] to
            match the likelihood vector built from ``p_w_given_c``. Lists are
            accepted and converted to a NumPy array.
        corpus_in: Text of a single review.
        dict_words: Vocabulary handed to ``CountVectorizer``; only these
            terms are counted in the review.
        p_w_given_c: Table exposing ``words``, ``storyline``, ``acting`` and
            ``visual``, each indexable by row position.
        BIGRAM: Upper bound of the n-gram range, i.e. terms of 1 up to
            ``BIGRAM`` words are extracted from the review.
        TOT_DIMENSIONS: Number of topics. Kept for interface compatibility;
            the posterior takes its shape from ``prior``.

    Returns:
        dict with one entry, 'posterior_': the updated probability array with
        the same shape as ``prior``. It equals the prior when the review
        contains no lexicon term, or when none of the terms found has a row
        in ``p_w_given_c.words``.

    Raises:
        Exception: if an updated posterior sums to more than 1.01.
    """
    # Accept list input as the sentiment function does; arrays pass through as-is
    prior = np.asarray(prior)
    vec = CountVectorizer(vocabulary=dict_words, lowercase=True, ngram_range=(1, BIGRAM))
    word_matrix = vec.fit_transform([corpus_in]).toarray()

    # Start from the prior: it is returned unchanged when there is no evidence
    posterior = prior

    # No lexicon term occurs in the review: keep the prior.
    if word_matrix.sum() == 0:
        return {'posterior_': posterior}

    # Column positions of the vocabulary terms that occur in this review
    word_matrix_indices = np.where(word_matrix > 0)[1]

    # The feature names do not change inside the loop, so look them up once
    feature_names = vec.get_feature_names_out()

    # Loop around words found in review
    for word_matrix_index in word_matrix_indices:
        word = feature_names[word_matrix_index]

        # Skip terms without a likelihood row, as the sentiment function does,
        # instead of failing with an IndexError on the empty match.
        p_w_given_c_indices = np.where(p_w_given_c.words == word)[0]
        if p_w_given_c_indices.size == 0:
            continue

        p_w_given_c_index = p_w_given_c_indices[0]
        vec_likelihood = np.array([p_w_given_c.storyline[p_w_given_c_index],
                                   p_w_given_c.acting[p_w_given_c_index],
                                   p_w_given_c.visual[p_w_given_c_index]])

        # One Bayesian update per occurrence of the term; the posterior of one
        # update is the prior of the next.
        for _ in range(word_matrix[0, word_matrix_index]):
            numerat = posterior * vec_likelihood
            denomin = posterior.dot(vec_likelihood)
            posterior = numerat / denomin

            # Sanity check: a normalised posterior cannot sum to more than 1
            if np.sum(posterior) > 1.01:
                raise Exception('ERROR')

    return {'posterior_': posterior}

# 2. Start of the Main Code

## Loading the Review Data

In [3]:
# Uniform priors: one over the sentiment classes, one over the content topics
PRIOR_SENT = 0.5
PRIOR_CONTENT = 1 / 3

# Read the reviews file and keep only the columns listed below, in this order
reviews_raw = pd.read_csv(
    '../../data/reviews/reviews_tiny.csv', encoding='ISO-8859-1'
)[[
    'movie_name',
    'review_code',
    'reviewer',
    'review_date',
    'num_eval',
    'prob_sentiment',
    'words_in_lexicon_sentiment_and_review',
    'ratio_helpful',
    'raters',
    'prob_storyline',
    'prob_acting',
    'prob_sound_visual',
    'full_text',
    'processed_text',
    'release_date',
    'first_week_box_office',
    'MPAA',
    'studio',
    'num_theaters',
]]

# Number of reviews to process
TOT_REVIEWS = reviews_raw.shape[0]

## Loading the Training Data

In [4]:
# Load the three content dictionaries (storyline, acting, visual) in one pass
dictionary_storyline, dictionary_acting, dictionary_visual = (
    pd.read_csv(lexicon_path)
    for lexicon_path in (
        '../../data/lexicons/storyline_33k.txt',
        '../../data/lexicons/acting_33k.txt',
        '../../data/lexicons/visual_33k.txt',
    )
)

#### `TODO:` Compute the word likelihoods from the 3 content dictionaries (i.e., your training data). Here, we load a list of 100 words with **fake** topic/content likelihoods and a list with 100 **fake** sentiment likelihoods.

**Note** that these are just examples. These 100-word lists are **not** to be used in your assignment; you are expected to compute the content likelihoods for all the words in the training data yourself.

In [5]:
# TODO: compute the content likelihoods for all the words in the training data...
# Until then, read the example file of fake content likelihoods
likelihoods_content = pd.read_csv(
    filepath_or_buffer='../../data/lexicons/example_100_fake_likelihood_content.csv'
)

# The content lexicon is the first column, with every entry cast to str
lexicon_content = list(likelihoods_content.iloc[:, 0].astype(str))

#### `TODO:` Search for a list of sentiment words that fits your *research question*. This is available from the literature.

**For example**, you may want to look at **positive** and **negative** sentiment (hence two dimensions) or you may want to look at other sentiment dimensions, such as specific **emotions** (excitement, fear, etc.). The list of 100 words with fake likelihoods for the sentiment used below is **not** to be used in your assignment.

In [6]:
# TODO: compute the sentiment likelihoods using your sentiment words list...
# Until then, read the example file of fake sentiment likelihoods
likelihoods_sentiment = pd.read_csv(
    filepath_or_buffer='../../data/lexicons/example_100_fake_likelihood_sentiment.csv'
)

# The sentiment lexicon is the first column, with every entry cast to str
lexicon_sentiment = list(likelihoods_sentiment.iloc[:, 0].astype(str))

## NBC Sentiment Analysis

In [7]:
# Settings that do not change between reviews are prepared once, before the loop
TOT_DIMENSIONS = 2
punctuation_table = str.maketrans('', '', string.punctuation)

# Classify the sentiment of every review with the NBC and store the result.
# NOTE: rows are read by position (.iloc) and written by label (.loc); this
# assumes reviews_raw still has the default 0..n-1 index it was loaded with.
for review_index in range(TOT_REVIEWS):
    if review_index % 100 == 0:
        print(f"Computing sentiment of review #{review_index}")

    prior_sent = [PRIOR_SENT, 1 - PRIOR_SENT]   # Reset the prior as each review is looked at separately

    # A missing cell is read as NaN; treat it as empty text so that str()
    # does not turn it into the literal token 'nan'.
    raw_text = reviews_raw['processed_text'].iloc[review_index]
    text_review = '' if pd.isna(raw_text) else str(raw_text)

    # Pre-process the review to remove punctuation marks and numbers
    # Note: we are not removing stopwords here (nor elsewhere - a point for improvement)
    text_review = text_review.translate(punctuation_table)
    text_review = ''.join([i for i in text_review if not i.isdigit()])

    # Computing posterior probability the review is positive
    sent_results = compute_posterior_sentiment(prior=prior_sent,
                                               corpus_in=text_review,
                                               dict_words=lexicon_sentiment,
                                               p_w_given_c=likelihoods_sentiment,
                                               TOT_DIMENSIONS=TOT_DIMENSIONS)

    words_sent = sent_results['words_']
    posterior_sent = sent_results['posterior_']

    # Store the probability of the first (positive) class and the lexicon
    # words that were found in the review
    reviews_raw.loc[review_index, 'prob_sentiment'] = posterior_sent[0]
    reviews_raw.loc[review_index, 'words_in_lexicon_sentiment_and_review'] = ' '.join(words_sent)

Computing sentiment of review #0
Computing sentiment of review #100
Computing sentiment of review #200
Computing sentiment of review #300
Computing sentiment of review #400
Computing sentiment of review #500
Computing sentiment of review #600
Computing sentiment of review #700
Computing sentiment of review #800
Computing sentiment of review #900


## NBC Content Analysis

In [8]:
# Settings that do not change between reviews are prepared once, before the loop
TOT_DIMENSIONS = 3
punctuation_table = str.maketrans('', '', string.punctuation)

# Classify the content/topic of every review with the NBC and store the result.
# NOTE: rows are read by position (.iloc) and written by label (.loc); this
# assumes reviews_raw still has the default 0..n-1 index it was loaded with.
for review_index in range(TOT_REVIEWS):
    if review_index % 100 == 0:
        print(f'Computing content of review # {review_index}')

    # Skip reviews without text. pd.read_csv parses empty cells as NaN by
    # default, so comparing against "" alone would never skip anything.
    full_text = reviews_raw['full_text'].iloc[review_index]
    if pd.isna(full_text) or full_text == "":
        continue

    # A missing processed text is treated as empty so that str() does not
    # turn NaN into the literal token 'nan'.
    raw_text = reviews_raw['processed_text'].iloc[review_index]
    text_review = '' if pd.isna(raw_text) else str(raw_text)

    # Pre-process the review to remove punctuation marks and numbers
    # Note: we are not removing stopwords here (nor elsewhere - a point for improvement)
    text_review = text_review.translate(punctuation_table)
    text_review = ''.join([i for i in text_review if not i.isdigit()])

    # Compute posterior probability the review is about each topic/content.
    # The prior is rebuilt for every review, as a 1 x TOT_DIMENSIONS array.
    prior_content = np.repeat(PRIOR_CONTENT, TOT_DIMENSIONS).reshape(-1, TOT_DIMENSIONS)
    posterior_content = compute_posterior_content(prior=prior_content,
                                                  corpus_in=text_review,
                                                  dict_words=lexicon_content,
                                                  p_w_given_c=likelihoods_content,
                                                  BIGRAM=2,
                                                  TOT_DIMENSIONS=TOT_DIMENSIONS)

    # The posterior has one row; its entries are storyline, acting, visual
    topic_posterior = posterior_content['posterior_'][0]
    reviews_raw.loc[review_index, 'prob_storyline'] = topic_posterior[0]
    reviews_raw.loc[review_index, 'prob_acting'] = topic_posterior[1]
    reviews_raw.loc[review_index, 'prob_sound_visual'] = topic_posterior[2]

# processed_reviews is another name for reviews_raw, not a copy
processed_reviews = reviews_raw

# Save the updated file, now including the sentiment and content/topic posteriors.
processed_reviews.to_csv('../../output/test_processed_reviews.csv', index=False)

Computing content of review # 0
Computing content of review # 100
Computing content of review # 200
Computing content of review # 300
Computing content of review # 400
Computing content of review # 500
Computing content of review # 600
Computing content of review # 700
Computing content of review # 800
Computing content of review # 900


## Performance: Confusion Matrix

In [9]:
# Read the judges' scores, used as ground truth for the content classifier
ground_truth_judges = pd.read_csv(filepath_or_buffer='../../data/judges/judges.csv')

#### `TODO:` Compare the performance of your NBC implementation (for content) against the judges ground truth.

Do this by running your algorithm on the sentences labeled by the judges and comparing your classification against the ground truth. Provide the following:
+ The confusion matrix
+ The model precision
+ The accuracy score
+ **An interpretation of your findings**

In [10]:
# TODO: Implementation...

# 3. Vader Implementation

In [11]:
# Initializing for VADER
analyzer = SentimentIntensityAnalyzer()

# Patterns used to clean every review, compiled once before the loop
short_word_pattern = re.compile(r'\b\w{1,2}\b')   # words of one or two characters
non_letter_pattern = re.compile(r'[^a-zA-Z ]+')   # digits, punctuation, other symbols

# Score every review with VADER and store the 'pos' score in vader_pos.
# Reviews that are skipped get no vader_pos value.
for review_index in range(TOT_REVIEWS):
    if (review_index % 100) == 0:
        print(f"Computing VADER sentiment of review #{review_index}")

    # Skip reviews without text. pd.read_csv parses empty cells as NaN by
    # default, so comparing against "" alone would never skip anything, and
    # a NaN processed text would make the regex substitution raise TypeError.
    full_text = reviews_raw.loc[review_index, 'full_text']
    processed_text = reviews_raw.loc[review_index, 'processed_text']
    if pd.isna(full_text) or full_text == "" or pd.isna(processed_text):
        continue

    # Pre-process the review: drop words of one or two characters, replace
    # everything except ASCII letters and spaces (numbers, punctuation marks)
    # with a space, then collapse repeated whitespace.
    text_review = short_word_pattern.sub('', str(processed_text))
    text_review = non_letter_pattern.sub(' ', text_review)
    text_review = ' '.join(text_review.split())

    # Analyze sentiment
    vader_scores = analyzer.polarity_scores(text_review)
    reviews_raw.loc[review_index, 'vader_pos'] = vader_scores['pos']

# processed_reviews is another name for reviews_raw, not a copy
processed_reviews = reviews_raw
processed_reviews.to_csv('../../output/VADER_processed_reviews.csv', index=False)

Computing VADER sentiment of review #0
Computing VADER sentiment of review #100
Computing VADER sentiment of review #200
Computing VADER sentiment of review #300
Computing VADER sentiment of review #400
Computing VADER sentiment of review #500
Computing VADER sentiment of review #600
Computing VADER sentiment of review #700
Computing VADER sentiment of review #800
Computing VADER sentiment of review #900


## Vader vs. NBC comparison

#### `TODO:` Compare the performance of your NBC implementation (for sentiment) against VADER: treat the VADER classification as if it were the ground truth, build the confusion matrix, and compute precision and recall.

**Note** that we are now interested in understanding how and how much the two classifications differ (we are not implying that VADER is error-free). We are interested in uncovering sources of systemic differences that can be attributed to the algorithms or lexicons.
+ **Do interpret your findings.**

In [12]:
# TODO: Implementation...