In [24]:
#built in librairies 
from tqdm import tqdm

# pip libraires
import numpy as np
import json
import pandas as pd
from time import sleep
from random import randint
import csv
import datetime
import time
from utils.genres import additional_wordsets

# visualization librairies
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Natural Language Processing Librairies
import nltk
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
import re
import spacy
from spacy import displacy
import spacy_transformers

# Visualization librairies
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Calculating libraries
import scipy as sp

#statistical librairies
import pingouin 

# Import libraries
import requests
from bs4 import BeautifulSoup
import pickle

# Others
from functools import partial
from ast import literal_eval

#Vader
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Geopositioning and mapping
import geopandas as gpd 
import geopy 
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

In [25]:
# Load the raw tables and combine them into a single dataframe

# Plot summaries: tab-separated file without a header row
summaries = pd.read_csv(
    './data/plot_summaries.txt',
    sep="\t",
    header=None,
).rename(columns={0: 'wikipedia_movie_id', 1: 'Plot_summaries'})

# Movie metadata: tab-separated file without a header row
# NOTE(review): 'rebase_movie_ID' is presumably meant to be a Freebase id - confirm before renaming
movie_column_names = {
    0: 'wikipedia_movie_id',
    1: 'rebase_movie_ID',
    2: 'Movie_name',
    3: 'Movie_release',
    4: 'Box_office_revenue',
    5: 'Movie_runtime',
    6: 'Movie_language',
    7: 'Movie_country',
    8: 'Movie_genre',
}
movies = pd.read_table('./data/movie.metadata.tsv', header=None).rename(columns=movie_column_names)

# Attach the movie name to every summary (pd.merge defaults to an inner join on the shared id)
df_summaries = pd.merge(
    summaries,
    movies[['wikipedia_movie_id', 'Movie_name']],
    on='wikipedia_movie_id',
)

## NLP for summary comprehension

The first step is to find a way to extract the countries and nationalities from the synopsis of every movie, as well as the sentiment expressed towards these countries. We decided to use Named Entity Recognition (NER) for this task, more specifically the transformer-based pipeline provided by spaCy.

In [22]:
# Initialise the VADER sentiment analyzer and the spaCy NLP models

sentiment_analyzer = SentimentIntensityAnalyzer()

# Small pipeline, used where speed matters more than accuracy (lemmatization)
nlp_fast = spacy.load('en_core_web_sm')

# Transformer-based pipeline provided by spaCy, used for the entity recognition
nlp = spacy.load('en_core_web_trf')

# Render the named entities found in one example plot summary
# (row 13240, the movie "Rambo" according to the original notebook)
example_summary = df_summaries.loc[13240, 'Plot_summaries']
doc = nlp(example_summary)
displacy.render(doc, style="ent")

In [28]:
def clean_string(x:str)->str:
    """ Returns the input with stop words removed and the remaining words lemmatized

    The text is run through the small spaCy pipeline (``nlp_fast``). Stop words
    are dropped, every remaining token is replaced by its lemma passed through
    ``str.capitalize()``, and the results are joined with single spaces.

    Args:
        x (str): Contains the input to be lemmatized

    Returns:
        str: the capitalized lemmas of the non stop word tokens, separated by spaces
    """

    # Tokenize and lemmatize with the small spaCy model (no stemmer is involved)
    doc = nlp_fast(x)
    # Keep only the non stop words and return them as one normalized string
    return " ".join(token.lemma_.capitalize() for token in doc if not token.is_stop)

## Location Analysis

Once we have extracted the names of all the locations present in the synopsis, we need to find the country of each location and its position on a map. To do so, we use a geocode locator to extract the latitude and longitude, followed by a URL request to OpenStreetMap to get the country. This combination allows us to avoid overloading the geocoding service with requests.

In [None]:
# Build a Nominatim locator with a randomized user agent, then wrap its
# geocode method in a rate limiter (minimum delay of 1.2 seconds between calls)
user_agent = f'user_me_{randint(10000, 99999)}'
locator = Nominatim(user_agent=user_agent)
geocode = RateLimiter(locator.geocode, min_delay_seconds=1.2)

In [29]:
def find_country(country:str)->str:
    """ Returns the country of the location in the input

    The location is first geocoded with the module-level rate-limited
    ``geocode`` callable. The resulting coordinates are then reverse-geocoded
    through the Nominatim HTTP endpoint at ``zoom=3`` with English names.

    Args:
        country (str): Name of the location (landmark, city, country...)

    Returns:
        str: the ``display_name`` of the reverse lookup, or "Other" when the
             input is empty, the location is not found, or the reverse
             lookup fails
    """
    # Nothing to look up: skip the request (and the rate limiter delay)
    if not country or not country.strip():
        return "Other"

    address = geocode(country)
    if address is None:
        return "Other"

    params = {
        'lat': address.raw['lat'],
        'lon': address.raw['lon'],
        'format': 'json',
        'accept-language': 'en',
        'zoom': 3,
    }

    try:
        result = requests.get(
            url='https://nominatim.openstreetmap.org/reverse',
            params=params,
            # Nominatim's usage policy asks for an identifying User-Agent;
            # reuse the one configured for the geopy locator
            headers={'User-Agent': user_agent},
            # Without a timeout a stalled connection would block the whole run
            timeout=10,
        )
        result.raise_for_status()
        return result.json()['display_name']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network/HTTP failure, non-JSON body, or a JSON answer without a
        # 'display_name' (e.g. an error payload): fall back to the default label
        return "Other"


In [30]:
def dict_mean(dict_list:list)->dict:
    """ Returns the mean value of each key in the dictionary list

    The keys are taken from the first dictionary; every dictionary in the
    list must contain all of them.

    Args:
        dict_list (list): list of dictionaries with the same keys

    Returns:
        dict: dictionary with the mean value for each key, or an empty
              dictionary when the list is empty
    """
    # An empty list has no keys to average (and would fail on dict_list[0])
    if not dict_list:
        return {}

    count = len(dict_list)
    return {key: sum(d[key] for d in dict_list) / count for key in dict_list[0]}

In [31]:
def find_movie_location_and_sentiment(summary:str)->pd.Series:
    """ Returns a series with the location and sentiment information for the input summary

    Named entities are extracted with the transformer based spaCy pipeline
    (``nlp``). Every sentence is scored with the VADER analyzer and that score
    is attributed to each location (``GPE``) and nationality (``NORP``) entity
    found in the sentence. Scores of entities mentioned several times are averaged.

    Args:
        summary (str): summary of the movie

    Returns:
        pd.Series: A pandas series with, in this order:
                    - who: a dictionary with the mean sentiment for each nationality in the summary
                    - where: a dictionary with the mean sentiment for each location in the summary
                    - countries: a dictionary with the mean sentiment for each country in the summary
                    - vs_all: a dictionary with the sentiment of the whole summary
    """
    where = {}
    who = {}
    countries = {}
    # Country already resolved for a location name within this summary, so a
    # place mentioned several times is only geocoded once
    country_of = {}

    # Run the transformer based NER pipeline on the summary
    doc = nlp(summary)

    # Analyse the summary with Vader to get a general sentiment
    vs_all = sentiment_analyzer.polarity_scores(summary)

    # Loop through the sentences in the summary
    for sent in doc.sents:

        # Analyse the sentiment of each sentence
        phrase_sentiment = sentiment_analyzer.polarity_scores(str(sent))

        # Loop through the entities in the sentence
        for ent in sent.ents:
            if ent.label_ not in ('GPE', 'NORP'):
                continue

            # Normalize the entity text once; clean_string runs a spaCy pipeline
            name = clean_string(ent.text)

            if ent.label_ == 'GPE':
                # Location: record the sentiment for the place and for its country
                if name not in country_of:
                    country_of[name] = find_country(name)
                country = country_of[name]

                where.setdefault(name, []).append(phrase_sentiment)
                countries.setdefault(country, []).append(phrase_sentiment)
            else:
                # Nationality: record the sentiment for the group
                who.setdefault(name, []).append(phrase_sentiment)

    # Collapse the list of sentence sentiments of each key into its mean
    where = {key: dict_mean(value) for key, value in where.items()}
    who = {key: dict_mean(value) for key, value in who.items()}
    countries = {key: dict_mean(value) for key, value in countries.items()}

    return pd.Series([who, where, countries, vs_all])
 
# Run the location and sentiment extraction on every plot summary and store
# the four resulting values as new columns of the dataframe
summary_features = df_summaries.Plot_summaries.apply(find_movie_location_and_sentiment)
df_summaries[['who', 'where', 'country', 'sentiment']] = summary_features

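Since the geolocation takes a while due to the one-second wait between requests, we split the dataframe into subsets that are processed and saved separately, and later stitched together. This makes errors easier to handle, since a failure only affects one subset.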

In [None]:
# Setup subset sizes
maxval = len(df_summaries)
step = 1000

# Loop through the dataframe in subsets of `step` rows. range() also yields the
# start of the final, shorter subset, so the trailing rows are processed by the
# same code instead of a separate (and previously mis-sliced) special case.
for i, start in enumerate(range(0, maxval, step)):
    print("Step ", i)
    # Change the user agent to avoid being blocked by the API
    user_agent = 'user_me_{}'.format(randint(10000,99999))
    locator = Nominatim(user_agent=user_agent)
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    # Take an independent copy of the subset so the new columns are not
    # assigned on a slice of df_summaries (SettingWithCopyWarning)
    df_summaries_trunc = df_summaries.iloc[start:start + step].copy()

    # Process the subset and save it to a csv
    df_summaries_trunc[['who', 'where', 'country', 'sentiment']] = df_summaries_trunc.Plot_summaries.apply(find_movie_location_and_sentiment)
    df_summaries_trunc.to_csv('tmp_stitches/country_sentiment_stitch_'+ str(i) + '.csv', index=True)

In [12]:
# Stitch the subsets together

# Number of subset files written by the chunked run (indices 0 to 42)
n_stitches = 43

# Read every subset first and concatenate once, instead of growing the
# dataframe with one concat per file
stitch_frames = [
    pd.read_csv('tmp_stitches/country_sentiment_stitch_'+ str(i) + '.csv', header=0, index_col=0)
    for i in range(n_stitches)
]
final_df = pd.concat(stitch_frames, ignore_index=True)

# Check the final dataframe size for errors
print(len(final_df))

# Convert the read files to the correct format, mainly dictionaries.
# The dictionaries were written to csv as their string representation;
# literal_eval parses them back. Series.map is used column by column because
# DataFrame.applymap is deprecated in recent pandas versions.
dict_columns = ['who','where','country','sentiment']
final_df[dict_columns] = final_df[dict_columns].apply(lambda col: col.map(literal_eval))

42204


In [13]:
# Write the stitched dataframe (index included) to a single csv file
stitched_csv_path = 'tmp_stitches/country_sentiment_stitch.csv'
final_df.to_csv(stitched_csv_path, index=True)