# MOVIE REVIEWS AND SENTIMENT ANALYSIS DATASETS

In [None]:
import codecs
import urllib
import urllib2
import nltk
import re
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import json
from __future__ import division
import io
import numpy as np
import html2text
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from operator import itemgetter
import codecs

## Movie reviews

The movie reviews are a set of HTML files saved in a local folder. The first step is to collect all the file names.

In [None]:
# Folder that holds the downloaded review pages (one .html file per review).
#direct = "file:///C:\Users\s161328\Downloads\movie\\"
direct = "/Users/Ferran/Downloads/movie/"
html_suffix = ".html"
# Keep every review file name without its extension. Only the trailing suffix
# is removed, so a name that contains ".html" somewhere in the middle is not
# truncated and direct + name + ".html" always points back to the same file.
# The listing is sorted because listdir() returns entries in arbitrary order.
filename_list = [f[:-len(html_suffix)]
                 for f in sorted(listdir(direct))
                 if f.endswith(html_suffix)]

Next, the title of each movie is extracted. In this database, some titles are stored with the leading article moved to the end (e.g. "Godfather, The" instead of "The Godfather"). In these cases, the article is moved back to the front of the title.

In [None]:
# Articles that may appear after the comma in a reversed title ("Godfather, The").
articles_list = ["The", "A", "Les", "Il", "El", "Eles", "La", "Los", "Las", "De", "Das", "Den", "Die", "Det", "Der",
                 "An", "Le", "L'", "Os", "Una", "O", "al-", "Lo"]
# title_filtered contains the title for each review and, if necessary, the title is put in the correct order
title_filtered = []
# File names of the pages whose title could be extracted, in the same order as title_filtered
titled_filenames = []
for f in filename_list:
    path = direct + f + ".html"
    try:
        #t = urllib2.urlopen(path).read()
        with codecs.open(path, 'r') as page:
            t = page.read()
        soup = BeautifulSoup(t, 'html.parser')
        # The title of the movies has the format Review for __title__ (__year__)
        title = re.search(r'Review for (.+) \(', soup.title.string).group(1)
    except (IOError, OSError, UnicodeError, AttributeError, TypeError):
        # Unreadable file, missing <title> tag or a title in an unexpected format
        print("No title")
        continue
    # If a title contains a comma, and the right part is contained in articles_list, the order is exchanged.
    aux = re.search(r'(.+), (.+)', title)
    if aux is not None and aux.group(2) in articles_list:
        title = aux.group(2) + ' ' + aux.group(1)
    title_filtered.append(title)
    titled_filenames.append(f)
# Pages without a usable title are dropped from filename_list as well, so that
# filename_list[k] is always the file that title_filtered[k] was read from.
filename_list = titled_filenames

The last step is to extract the review text. In the following code, the paragraphs that contain review text are cleaned and joined into a single string. Finally, for the movies that already exist in the database, the review is added to that movie's list of reviews.

In [None]:
# We start loading the movies dictionary obtained from Wikipedia
#with open("C:\Users\s161328\Downloads\movies_dict.txt") as f:
with open("movies_dict.txt") as f:
    movies_dict = json.load(f)
movies_dict_keys = list(movies_dict.keys())
# Titles are compared in lower case and without colons
movies_dict_keys_proc = [re.sub(":", "", m.lower()) for m in movies_dict_keys]
total = 0

# Normalised title -> original dictionary key. setdefault keeps the first key
# when two titles normalise to the same string, as list.index() would.
key_by_normalised = {}
for key, key_proc in zip(movies_dict_keys, movies_dict_keys_proc):
    key_by_normalised.setdefault(key_proc, key)

# We add to each movie, two extra keys, "Reviews" and "Reviews grade"
for movie in movies_dict_keys:
    movies_dict[movie]["Reviews"] = []
    movies_dict[movie]["Reviews grade"] = ""

paragraph_pattern = re.compile(r'<p>(.+?)</p>', re.DOTALL)
copyright_pattern = re.compile(r"Copyright \d+")

# Every title is paired with the file it was read from; this relies on
# filename_list and title_filtered being index-aligned. A counter that only
# advances on matching titles would read the reviews from the wrong files.
for filename, title in zip(filename_list, title_filtered):
    movie_key = key_by_normalised.get(re.sub(":", "", title.lower()))
    if movie_key is None:
        # The reviewed movie is not part of the Wikipedia dictionary
        continue
    path = direct + filename + ".html"
    #t = urllib2.urlopen(path).read()
    with codecs.open(path, 'r') as page:
        t = page.read()
    soup = BeautifulSoup(t, 'html.parser')
    # From each review we extract all paragraphs, which contain the movie review
    clean_paragraphs = []
    for paragraph in soup.find_all('p'):
        found = paragraph_pattern.search(str(paragraph))
        if found is None:
            # e.g. an empty paragraph or a <p> tag that carries attributes
            continue
        text = found.group(1)
        # Paragraphs that contain the Copyright information are discarded
        if copyright_pattern.search(text) is None:
            clean_paragraphs.append(text.strip())
    # All paragraphs are joined and cleaned from html characters
    joined = "\n".join(clean_paragraphs)
    if isinstance(joined, bytes):
        # Byte strings are decoded before html2text; text strings pass through unchanged
        joined = joined.decode("utf-8")
    review_text = html2text.html2text(joined)
    # Reviews texts are added to the dictionary
    movies_dict[movie_key]["Reviews"].append(review_text)

# The resulting dictionary is saved in a txt file
with open("movies_dict_with_reviews.txt", 'w') as out:
    json.dump(movies_dict, out)

## Sentiment analysis

In this part, sentiment is calculated based on the reviews added to the dictionary in the previous section. Moreover, instead of using the dataset from week 7 to calculate sentiment, we generate our own lists of the most important words for positive and negative reviews (each with an associated value) from a set of highly polar reviews (saved in txt files).

Some functions have to be defined for calculating sentiment:

In [None]:
# Calculates the sentiment of a text, given a dictionary that matches words with a score
def calculate_sentiment(text, sentiment_dict):
    """Return the summed score of the sentiment words in *text* divided by its token count.

    text: raw text; it is tokenized with preprocess().
    sentiment_dict: maps words to numeric scores. Keys are matched in lower case.

    Returns 0 (and prints a message) when no token of the text is a key of
    sentiment_dict. Repeated tokens are counted once per occurrence.
    """
    token_list = preprocess(text)
    # Lower-cased view of the dictionary, built once: constant-time lookups and
    # no KeyError for keys that are not stored in lower case.
    scores = {word.lower(): value for word, value in sentiment_dict.items()}
    # A key that is already lower case takes precedence over a mixed-case twin.
    scores.update((word, value) for word, value in sentiment_dict.items()
                  if word == word.lower())
    matched = [scores[token] for token in token_list if token in scores]  # with repetitions
    if not matched:
        print("There are no sentiment words in the token list")
        return 0
    return sum(matched) / len(token_list)

# Processes and tokenizes a text
def preprocess(raw):
    """Return the list of stemmed, stopword-free tokens of *raw*.

    The text is lower-cased, split into runs of the letters a-z (digits,
    punctuation and any other character act as separators), filtered against
    the NLTK English stopword list and stemmed with the English Snowball
    stemmer. Token order and repetitions are preserved.
    """
    # We keep only the words of the lower-cased text
    tokens = re.findall(r'[a-z]+', raw.lower())

    # Exclusion set with the English stopwords; a set gives constant-time
    # membership tests, while the result is the same as with the plain list
    exclusion = set(stopwords.words('english'))

    # We remove morphological affixes from the remaining words, leaving only the word stem
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(w) for w in tokens if w not in exclusion]

# Builds the tf-idf score of every word in a word counter
def tfidf(words_counter, num_docs_containing, num_docs):
    """Return a dictionary that maps each word of *words_counter* to tf * idf.

    words_counter: maps each word to its term frequency (tf).
    num_docs_containing: maps words to the number of documents containing them;
        a word that is missing from it is treated as contained in no document.
    num_docs: numerator of the idf ratio.

    The idf of a word is log(num_docs / (1 + num_docs_containing[word])).
    """
    def inverse_document_frequency(word):
        # A missing word contributes 0, which yields log(num_docs)
        return np.log(num_docs / (1 + num_docs_containing.get(word, 0)))

    return {word: frequency * inverse_document_frequency(word)
            for word, frequency in words_counter.items()}

First of all, the text of every review is loaded and saved into a list, one list for the positive reviews and one for the negative reviews. In addition, each list is joined into a single string.

In [None]:
# negative_reviews collects the text of every negative review (one entry per
# .txt file); negative_reviews_str is the same text joined with spaces
#path = "C:/Users/s161328/Downloads/movies_reviews_classified/neg"
path = "/Users/Ferran/Downloads/movies_reviews_classified/neg"
negative_reviews = []
for review_file in (name for name in listdir(path) if name.endswith(".txt")):
    with open("%s/%s" % (path, review_file)) as handle:
        raw_review = handle.read()
    # The & character caused problems in html2text (it was interpreted as EOF),
    # so it is removed before the conversion
    without_ampersand = re.sub(r"\&", "", raw_review)
    negative_reviews.append(html2text.html2text(without_ampersand.decode("utf-8")))
negative_reviews_str = " ".join(negative_reviews)

In [None]:
# positive_reviews collects the text of every positive review (one entry per
# .txt file); positive_reviews_str is the same text joined with spaces
#path = "C:/Users/s161328/Downloads/movies_reviews_classified/pos"
path = "/Users/Ferran/Downloads/movies_reviews_classified/pos"
positive_reviews = []
for review_file in (name for name in listdir(path) if name.endswith(".txt")):
    with open("%s/%s" % (path, review_file)) as handle:
        raw_review = handle.read()
    # The & character is removed before the text is handed to html2text
    without_ampersand = re.sub(r"\&", "", raw_review)
    positive_reviews.append(html2text.html2text(without_ampersand.decode("utf-8")))
positive_reviews_str = " ".join(positive_reviews)

We use TF-IDF to generate the sentiment word dictionaries, which assign a value to each word according to its occurrence in positive and negative reviews. For this, we need to know the number of times a word is used in the reviews of the same polarity and the number of documents of the opposite polarity that contain this word. With this information, we build two dictionaries that give each word a score as a positive word or as a negative word.

In [None]:
# Processes and tokenizes the strings with all the positive and all the negative reviews
positive_words_tokens = preprocess(positive_reviews_str)
negative_words_tokens = preprocess(negative_reviews_str)

# Vocabulary (distinct tokens) of each polarity
positive_reviews_words_set = set(positive_words_tokens)
negative_reviews_words_set = set(negative_words_tokens)

# Set of distinct tokens of every single review
positive_reviews_set = [set(preprocess(review)) for review in positive_reviews]
negative_reviews_set = [set(preprocess(review)) for review in negative_reviews]

# Both counters are computed in this one block because each of them needs the
# vocabulary of one polarity and the per-review sets of the other polarity.

# For each word of the positive vocabulary, counts the negative reviews that contain it.
# Words that appear in no negative review get no entry.
num_docs_containing_pos = Counter()
for review_words in negative_reviews_set:
    num_docs_containing_pos.update(review_words & positive_reviews_words_set)

# For each word of the negative vocabulary, counts the positive reviews that contain it.
# Words that appear in no positive review get no entry.
num_docs_containing_neg = Counter()
for review_words in positive_reviews_set:
    num_docs_containing_neg.update(review_words & negative_reviews_words_set)

# Each counter is saved in the file that carries its name
with open("num_docs_containing_pos.txt", 'w') as out:
    json.dump(num_docs_containing_pos, out)
with open("num_docs_containing_neg.txt", 'w') as out:
    json.dump(num_docs_containing_neg, out)

In [None]:
# Number of occurrences of every token over all positive reviews
positive_words_counter = Counter(positive_words_tokens)
# The counter itself is saved, so that the file can be reloaded as positive_words_counter
with open("positive_words_counter.txt", 'w') as out:
    json.dump(positive_words_counter, out)

# Number of occurrences of every token over all negative reviews
negative_words_counter = Counter(negative_words_tokens)
with open("negative_words_counter.txt", 'w') as out:
    json.dump(negative_words_counter, out)

In [None]:
# The cells above are slow, which is why their results are stored in txt files.
# Reloading them here makes it possible to resume the work in a later session.

def _read_json(filename):
    # Parses the JSON content of the given file
    with open(filename) as handle:
        return json.load(handle)

num_docs_containing_neg = _read_json("num_docs_containing_neg.txt")
num_docs_containing_pos = _read_json("num_docs_containing_pos.txt")
positive_words_counter = _read_json("positive_words_counter.txt")
negative_words_counter = _read_json("negative_words_counter.txt")

In [None]:
# Builds both sentiment dictionaries: scores for positive words and for negative words.
# NOTE(review): the value passed as num_docs is the total number of counted
# tokens, not a number of documents - confirm that this is intended.
total_negative_tokens = sum(negative_words_counter.values())
total_positive_tokens = sum(positive_words_counter.values())
tf_idf_neg = tfidf(negative_words_counter, num_docs_containing_neg, total_negative_tokens)
tf_idf_pos = tfidf(positive_words_counter, num_docs_containing_pos, total_positive_tokens)

# The largest tf-idf value over both dictionaries is used to scale the scores
max_tf_idf_neg = max(tf_idf_neg.values())
max_tf_idf_pos = max(tf_idf_pos.values())
max_tf_idf = max(max_tf_idf_neg, max_tf_idf_pos)

# Negative words get the scaled value with a minus sign, positive words with a plus sign
sentiment_dict_neg = {word: -100*score/max_tf_idf for word, score in tf_idf_neg.items()}
sentiment_dict_pos = {word: 100*score/max_tf_idf for word, score in tf_idf_pos.items()}

Finally, the sentiment of each review is calculated as the sum of its scores from both dictionaries, and the average sentiment over the reviews of a movie is saved in "Reviews grade".

In [None]:
with open("movies_dict_with_reviews.txt") as f:
    movies_dict = json.load(f)
# Given both sentiment dictionaries, the sentiment analysis for each movie with reviews is performed and the result is
# saved in the "Reviews grade" key of the corresponding movie
for movie in movies_dict:
    total_sent = 0
    num_scored_rev = 0
    for rev in movies_dict[movie]["Reviews"]:
        # The sentiment of a specific review is the addition of the sentiment of positive and negative words
        rev_sent = calculate_sentiment(rev, sentiment_dict_pos) + calculate_sentiment(rev, sentiment_dict_neg)
        # Reviews with a total sentiment of 0 are treated as empty and left out of the average
        if rev_sent != 0:
            total_sent += rev_sent
            num_scored_rev += 1
    # The grade is only written when at least one review was scored. Dividing
    # unconditionally raises ZeroDivisionError for a movie whose reviews all
    # score 0; such a movie keeps the "Reviews grade" value loaded from the file.
    if num_scored_rev > 0:
        movies_dict[movie]["Reviews grade"] = total_sent / num_scored_rev
with open("movies_dict_final.txt", 'w') as out:
    json.dump(movies_dict, out)