This notebook follows the plan:
- Import the modules
- Import the "basic" data (movies and characters datasets from CMU), clean it and save it
- Extraction of the lemmatized version of the plot summaries from the corenlp processed data
- Processing of the summaries according to the gender
- Loading, cleaning of IMDb dataset
- Matching CMU and IMDb datasets

# Import the modules

In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
from time import time
import os
import gzip
import re

In [2]:
# Download the nltk resources requested below: the punkt tokenizer, stopwords,
# WordNet (+ omw-1.4), the averaged perceptron POS tagger and the VADER lexicon.
# Each call is a no-op when the package is already up to date (see cell output).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pierre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pierre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pierre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pierre\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Pierre\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Pierre\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already u

True

# Import the data

In [3]:
# File and folder names (all paths are relative to the notebook's working directory)
DATA_FOLDER = 'Data/'
# CMU metadata tables (tab-separated)
CHARACTER_DATASET = DATA_FOLDER + 'character.metadata.tsv'
MOVIE_DATASET = DATA_FOLDER + 'movie.metadata.tsv'

# Raw plot summaries and the folder holding their CoreNLP-processed versions
SUMMARIES_DATASET = DATA_FOLDER + 'plot_summaries.txt'
NLP_FOLDER = DATA_FOLDER + 'corenlp_plot_summaries/'
# NOTE(review): not referenced anywhere in the visible part of the notebook
DEFAULT_COMPRESSION = 'gzip'

In [4]:
# Function to load data
def load_metadata(path, column_names, header=None, low_memory=False):
    """Load a tab-separated file into a DataFrame with explicit column names.

    Parameters
    ----------
    path : str or file-like
        Location of the TSV file (anything accepted by ``pd.read_table``).
    column_names : list of str
        Names assigned to the columns.
    header : int or None, default None
        Row number holding the file's own header, or None when the file has
        no header row.
    low_memory : bool, default False
        Forwarded to ``pd.read_table``. False parses the file in one pass,
        which gives consistent dtypes per column.

    Returns
    -------
    pandas.DataFrame
    """
    # low_memory used to be accepted but silently ignored; forward it.
    return pd.read_table(path, header=header, names=column_names, low_memory=low_memory)

In [5]:
# Column names are assigned at load time (the files are read with header=None)
columns_character = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_release_date',
    'Character_name',
    'Actor_date_of_birth',
    'Actor_gender',
    'Actor_height_meters',
    'Actor_ethnicity_Freebase_ID',
    'Actor_name',
    'Actor_age_at_movie_release',
    'Freebase_character_actor_map_ID',
    'Freebase_character_ID',
    'Freebase_actor_ID',
]
columns_movie = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]

# Read both metadata tables with the names declared above
characters = load_metadata(path=CHARACTER_DATASET, column_names=columns_character)
movies = load_metadata(path=MOVIE_DATASET, column_names=columns_movie)

In [6]:
# Plot summaries: one tab-separated line per movie (id, plot), no header row
summaries = pd.read_table(
    SUMMARIES_DATASET,
    header=None,
    names=['id', 'plot'],
)

# Cleaning

## Problem of dates

We fix typos and absurd dates

In [7]:
# Typo in the release date: year 1010 instead of 2010 (present in both tables)
movies.loc[movies.Movie_release_date == '1010-12-02','Movie_release_date'] = '2010-12-02'
characters.loc[characters.Movie_release_date == '1010-12-02','Movie_release_date'] = '2010-12-02'

# Patch the birth date only. Indexing with the boolean mask alone would
# overwrite every column of the matching rows with '1971'.
characters.loc[characters.Actor_date_of_birth == '2050', 'Actor_date_of_birth'] = '1971'

# Drop rows whose birth date is absurd. Dates are still strings here, so the
# comparison is lexicographic; missing dates compare False and are kept.
absurd_birth = (characters.Actor_date_of_birth < '1500') | (characters.Actor_date_of_birth > '2030')
characters = characters.drop(characters[absurd_birth].index)

## Format of movie languages, genres and country

Convert the languages, genres and countries columns into plain lists of names, which are easier to work with.

In [8]:
def format_multiple(chain,deb,step):
    '''Cut `chain` at every double quote and return the pieces located at
    positions deb, deb+step, deb+2*step, ... as a list.'''
    pieces = chain.split('"')
    return pieces[slice(deb, None, step)]

In [9]:
# Replace each raw string by the list of values found at every 4th
# quote-delimited piece, starting from the 4th one
for multi_valued_column in ('Movie_genres', 'Movie_countries', 'Movie_languages'):
    parsed_values = movies[multi_valued_column].apply(format_multiple, deb=3, step=4)
    movies.loc[:, multi_valued_column] = parsed_values

In [10]:
# For each list-valued column, report how many movies ended up with an empty list
keys = ['Movie_languages','Movie_countries','Movie_genres']
for key in keys:
    has_no_value = movies[key].apply(len) == 0
    nb = len(movies[has_no_value])
    share = nb*100/len(movies)
    print('{nb} movies without {key} ({percentage:.2f}% of the dataset)'.format(nb=nb,key=key, percentage=share))

13866 movies without Movie_languages (16.96% of the dataset)
8154 movies without Movie_countries (9.98% of the dataset)
2294 movies without Movie_genres (2.81% of the dataset)


## Format for dates

For our study, we only keep the years from the dates.

In [11]:
# Release dates: parse, then keep the year only
movie_release = pd.to_datetime(movies['Movie_release_date'], format='%Y-%m-%d')
movies['Movie_release_date'] = movie_release.dt.year

character_release = pd.to_datetime(characters['Movie_release_date'], format='%Y-%m-%d')
characters['Movie_release_date'] = character_release.dt.year

# Birth dates: unparsable values are coerced to missing instead of raising
actor_birth = pd.to_datetime(
    characters['Actor_date_of_birth'],
    format='%Y-%m-%d',
    utc=True,
    errors='coerce',
)
characters['Actor_date_of_birth'] = actor_birth.dt.year

## Saving the new dataset

We pickle our data in order to reuse directly the cleaned data (and load it faster).

In [12]:
# Where and under which extension the cleaned tables are written
DESTINATION = './Data/'
EXT = '.pkl'
to_pickle_data = [characters,movies]
to_pickle_name = ['characters','movies']
# Each table goes to <DESTINATION><name><EXT>
for table, table_name in zip(to_pickle_data, to_pickle_name):
    table.to_pickle(DESTINATION+table_name+EXT)

# # To unpickle:
# characters = pd.read_pickle("./Data/characters.pkl") 
# movies = pd.read_pickle("./Data/movies.pkl")

In [13]:
# Find the `topx` most common genres
topx = 10
# explode() yields one row per (movie, genre) pair. Movies with an empty genre
# list become NaN, which value_counts() ignores. This replaces
# `apply(pd.Series).stack()`, which builds one Series per movie (slow) and
# warns on empty lists.
genres = movies.Movie_genres.explode().value_counts()[:topx]

print(genres.index)


  genres = movies.Movie_genres.apply(pd.Series).stack().value_counts()[:topx]


Index(['Drama', 'Comedy', 'Romance Film', 'Black-and-white', 'Action',
       'Thriller', 'Short Film', 'World cinema', 'Crime Fiction', 'Indie'],
      dtype='object')


In [14]:
# Map every top genre to the array of Wikipedia ids of the movies tagged with it
movies_id = {
    genre: movies.loc[
        movies.Movie_genres.map(lambda listed_genres: genre in listed_genres),
        'Wikipedia_movie_ID',
    ].values
    for genre in genres.index
}

# Persist the mapping so that it can be reloaded without recomputing it
with open(DATA_FOLDER + 'movies_id_per_genres.pkl', 'wb') as f:
    pickle.dump(movies_id, f,protocol=pickle.HIGHEST_PROTOCOL)

# Lemmatizing the summaries

We lemmatize the data (for example, *'is'* becomes *'be'*) so that words can be counted more reliably. To do so, we use the `corenlp_plot_summaries` files and extract from them the lemmatized versions of the movie summaries.

In [15]:
# Set to True to save the data
LEMMATIZE_SUMMARIES = False # Took ~7 mins to run (on i7-10875H CPU)

if LEMMATIZE_SUMMARIES:
    ext = '.xml.gz' # Extension name

    # Keep only regular files with the expected extension. Anything else would
    # be stored as an empty summary under a mangled id.
    summary_files = [
        filename for filename in os.listdir(NLP_FOLDER)
        if filename.endswith(ext) and os.path.isfile(os.path.join(NLP_FOLDER, filename))
    ]
    nb_files = len(summary_files)
    print('Number of summaries:',nb_files)

    dico_processed_summmaries = {} # Dictionary to store the processed summaries
    # Matches <lemma>(word)</lemma> in the corenlp data; the group captures the word
    regex = re.compile(r'<lemma>(.*?)</lemma>')

    deb = time() # Start timer
    count = 0 # Counter

    # Iteration over the files
    for filename in summary_files:
        path = os.path.join(NLP_FOLDER, filename) # Path to the file
        id_summary = filename[:-len(ext)] # id of the summary = filename without extension
        lemmas = [] # Lemmatized words of this summary, in reading order

        with gzip.open(path, 'rb') as f: # Opening the .gz file
            for line in f:
                txt = line.decode().strip() # Extract the line as txt
                lemmas.extend(match.group(1).lower() for match in regex.finditer(txt))

        # Each lemma is followed by one space (same layout as before), built in
        # one pass instead of repeated string concatenation
        dico_processed_summmaries[id_summary] = ''.join(lemma + ' ' for lemma in lemmas)
        count += 1

        # Evolution of the process
        if count%1000 == 0:
            print('{processed}/{tot} files processed --> {perc:.1f}% ({t:.1f} seconds since deb)'.format(processed=count,tot=nb_files,perc=count/nb_files*100,t=time()-deb))

    # Pickle the file
    with open(DATA_FOLDER + 'nlp_summaries.pkl', 'wb') as file:
        pickle.dump(dico_processed_summmaries, file, protocol=pickle.HIGHEST_PROTOCOL)

Let us try to extract the data:

In [16]:
# Reload the pickled {summary id: lemmatized text} dictionary
nlp_summaries = pd.read_pickle(DATA_FOLDER+'nlp_summaries.pkl')

# Peek at the first entry only (nothing is printed for an empty dictionary)
first_entry = next(iter(nlp_summaries.items()), None)
if first_entry is not None:
    key, value = first_entry
    print('Key:',key)
    print('Summary:\n',value[:200]+'...')

Key: 10000053
Summary:
 Fur trapper Jean La B te paddle he canoe through wild water towards the settlement in order to sell a load of fur . at the settlement a steamboat be landing and the trader and he foster-child Eve , ar...


# Separating sentences between sexes

The aim of this part is to separate sentences by gender in order to run a sentiment analysis later. To do so, we check whether a character played by a female actor or the pronoun *'she'* is present in a sentence, and if so we add the sentence to the female file. We do the same for characters played by male actors and the pronoun *'he'*. Note that, for example, the sentence *'She hates him'* becomes *'she hate he'* once lemmatized, so it mentions both genders and is stored in a separate "both" file.

This approach is not perfect: for example, in the sentences 'She likes butter. Indeed, the actress loves food.', only the first one will be added. It is nevertheless the best solution we could think of.

In [17]:
# Per-movie groups of (movie id, lower-cased character name, actor gender).
# Rows without a character name or gender are dropped BEFORE the string
# conversion. Converting first turns missing names into the literal 'nan',
# which dropna() no longer removes and which then matches any sentence
# containing the letters "nan".
characters_per_film = (
    characters
    .dropna(subset=['Character_name', 'Actor_gender'])
    .assign(
        # Put the columns in their correct type and lower-case the names
        Wikipedia_movie_ID=lambda df: df['Wikipedia_movie_ID'].astype(int),
        Character_name=lambda df: df['Character_name'].astype(str).str.lower(),
    )
    # Sort the dataframe by movie ID
    .sort_values(by=['Wikipedia_movie_ID'])
    # Group the dataframe by movie ID
    .groupby('Wikipedia_movie_ID')[['Wikipedia_movie_ID', 'Character_name', 'Actor_gender']]
)

In [18]:
# One row per lemmatized summary: integer movie id + lemmatized plot, ordered by id
df_lem = (
    pd.DataFrame(list(nlp_summaries.items()), columns = ['id','plot_lemmatized'])
    .astype({'id': int})
    .sort_values(by=['id'])
)
# Show the first 5 rows
df_lem.head()

Unnamed: 0,id,plot_lemmatized
27884,330,in order to prepare the role of a important ol...
26866,3217,"after be pull through a time portal , Ash Will..."
28281,3333,the film follow two juxtapose family : the Nor...
31566,3746,-lcb- -lcb- Hatnote -rcb- -rcb- in Los Angeles...
31793,3837,"in the American Old West of 1874 , constructio..."


In [19]:
# Cast the movie id of the raw summaries to int
summaries['id'] = summaries['id'].astype(int)
# Order the raw summaries by movie id
summaries = summaries.sort_values(by=['id'])
# Show the first 5 rows
summaries.head()

Unnamed: 0,id,plot
2100,330,In order to prepare the role of an important o...
6038,3217,"After being pulled through a time portal, Ash ..."
20555,3333,The film follows two juxtaposed families: the...
39373,3746,"{{Hatnote}} In Los Angeles, November 2019, ret..."
13609,3837,"In the American Old West of 1874, construction..."


In [20]:
# Set to True to save the data
SEPARATE_SENTENCES = False # Took ~1 min 20 to run (on i7-10875H CPU) with the original implementation

if SEPARATE_SENTENCES:
    # Splits every ORIGINAL summary sentence into male / female / both buckets,
    # using the lemmatized version of the same sentence to detect gender.
    count = 0
    count_ignored = 0
    dico_male = {}
    dico_female = {}
    dico_both = {}
    # Raw string: '\w' in a plain literal is an invalid escape sequence
    regexp = nltk.tokenize.RegexpTokenizer(r'\w+')

    # id -> plot lookups built once, instead of scanning both dataframes for
    # every movie. drop_duplicates keeps the first row of an id, as before.
    lem_plot_by_id = df_lem.drop_duplicates(subset='id').set_index('id')['plot_lemmatized'].to_dict()
    real_plot_by_id = summaries.drop_duplicates(subset='id').set_index('id')['plot'].to_dict()

    # Loop on subgroups
    for _, group in characters_per_film:
        # Get the movie id
        movie_id = group['Wikipedia_movie_ID'].iloc[0]
        female_sentences = []
        male_sentences = []
        both_sentences = []

        # Check if wikipedia movie id is in both kinds of summaries
        if movie_id in lem_plot_by_id and movie_id in real_plot_by_id:
            sentences_lem = lem_plot_by_id[movie_id].split('.') # Split into sentences
            sentences_real = real_plot_by_id[movie_id].split('.') # Split into sentences

            # Sentences are paired by position, so both splits must agree
            if len(sentences_lem) != len(sentences_real):
                count_ignored += 1
                continue

            # Gender of each character (first occurrence wins), resolved once per movie
            character_names = list(group['Character_name'])
            gender_by_character = {}
            for name, name_gender in zip(character_names, group['Actor_gender']):
                gender_by_character.setdefault(name, name_gender)

            # Loop on sentences
            for sentence_lem, sentence_real in zip(sentences_lem, sentences_real):
                sentence = sentence_lem
                # Loop on characters
                for character in character_names:
                    # The test is made on the untouched lemmatized sentence
                    if character in sentence_lem:
                        gender = gender_by_character[character]
                        # Replace Character name by pronoun
                        if gender == 'M':
                            sentence = sentence.replace(character, 'he')
                        elif gender == 'F':
                            sentence = sentence.replace(character, 'she')

                tokens = regexp.tokenize(sentence)

                # Find potential pronouns discriminative on gender
                he_index = 'he' in tokens
                she_index = 'she' in tokens

                # Check where to append the (original) sentence
                if she_index and he_index:
                    both_sentences.append(sentence_real)
                elif he_index:
                    male_sentences.append(sentence_real)
                elif she_index:
                    female_sentences.append(sentence_real)

        # Store in dictionary and increment counter
        if len(male_sentences) > 0:
            dico_male[movie_id] = male_sentences
        if len(female_sentences) > 0:
            dico_female[movie_id] = female_sentences
        if len(both_sentences) > 0:
            dico_both[movie_id] = both_sentences
        count += 1

        # Evolution of the process
        if count%1000 == 0:
            print('{processed} files processed'.format(processed=count))

    print('Ignored {count} files'.format(count=count_ignored))

    # Pickle the file
    with open(DATA_FOLDER + 'male_sentences.pkl', 'wb') as file:
        pickle.dump(dico_male, file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(DATA_FOLDER + 'female_sentences.pkl', 'wb') as file:
        pickle.dump(dico_female, file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(DATA_FOLDER + 'both_sentences.pkl', 'wb') as file:
        pickle.dump(dico_both, file, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Set to True to save the data
SEPARATE_SENTENCES_LEMMATIZE = False # Took ~1 min 05 to run (on i7-10875H CPU) with the original implementation

if SEPARATE_SENTENCES_LEMMATIZE:
    # Splits every LEMMATIZED summary sentence (with character names replaced
    # by pronouns) into male / female / both buckets.
    count = 0
    dico_male = {}
    dico_female = {}
    dico_both = {}
    # Raw string: '\w' in a plain literal is an invalid escape sequence
    regexp = nltk.tokenize.RegexpTokenizer(r'\w+')

    # id -> lemmatized plot lookup built once, instead of scanning df_lem for
    # every movie. drop_duplicates keeps the first row of an id, as before.
    lem_plot_by_id = df_lem.drop_duplicates(subset='id').set_index('id')['plot_lemmatized'].to_dict()

    # Loop on subgroups
    for _, group in characters_per_film:
        # Get the movie id
        movie_id = group['Wikipedia_movie_ID'].iloc[0]
        female_sentences = []
        male_sentences = []
        both_sentences = []

        # Check if wikipedia movie id is in the summaries
        if movie_id in lem_plot_by_id:
            sentences_lem = lem_plot_by_id[movie_id].split('.') # Split into sentences

            # Gender of each character (first occurrence wins), resolved once per movie
            character_names = list(group['Character_name'])
            gender_by_character = {}
            for name, name_gender in zip(character_names, group['Actor_gender']):
                gender_by_character.setdefault(name, name_gender)

            # Loop on sentences
            for sentence in sentences_lem:
                # Loop on characters
                for character in character_names:
                    # The test is made on the sentence as already rewritten
                    if character in sentence:
                        gender = gender_by_character[character]
                        # Replace Character name by pronoun
                        if gender == 'M':
                            sentence = sentence.replace(character, 'he')
                        elif gender == 'F':
                            sentence = sentence.replace(character, 'she')

                tokens = regexp.tokenize(sentence)

                # Find potential pronouns discriminative on gender
                he_index = 'he' in tokens
                she_index = 'she' in tokens

                # Check where to append the sentence
                if she_index and he_index:
                    both_sentences.append(sentence)
                elif he_index:
                    male_sentences.append(sentence)
                elif she_index:
                    female_sentences.append(sentence)

        # Store in dictionary and increment counter
        if len(male_sentences) > 0:
            dico_male[movie_id] = male_sentences
        if len(female_sentences) > 0:
            dico_female[movie_id] = female_sentences
        if len(both_sentences) > 0:
            dico_both[movie_id] = both_sentences
        count += 1

        # Evolution of the process
        if count%1000 == 0:
            print('{processed} files processed'.format(processed=count))

    # Pickle the file
    with open(DATA_FOLDER + 'male_sentences_lem.pkl', 'wb') as file:
        pickle.dump(dico_male, file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(DATA_FOLDER + 'female_sentences_lem.pkl', 'wb') as file:
        pickle.dump(dico_female, file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(DATA_FOLDER + 'both_sentences_lem.pkl', 'wb') as file:
        pickle.dump(dico_both, file, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
# Set to True to save the data
SEPARATE_SENTENCES_LEMMATIZE_GENRES = True # Took ~1 min 05 to run (on i7-10875H CPU) with the original implementation

if SEPARATE_SENTENCES_LEMMATIZE_GENRES:
    # Same split as for the whole corpus, but done once per top genre and
    # written to <genre>_{male,female,both}_sentences_lem.pkl.
    count = 0 # Running total over all genres (drives the progress messages)
    # Raw string: '\w' in a plain literal is an invalid escape sequence
    regexp = nltk.tokenize.RegexpTokenizer(r'\w+')

    # id -> lemmatized plot lookup built once, instead of scanning df_lem for
    # every movie. drop_duplicates keeps the first row of an id, as before.
    lem_plot_by_id = df_lem.drop_duplicates(subset='id').set_index('id')['plot_lemmatized'].to_dict()

    # The character table does not depend on the genre, so prepare it once.
    # Rows with a missing name or gender are dropped BEFORE the string
    # conversion; otherwise missing names become the literal 'nan' and are kept.
    characters_clean = (
        characters
        .dropna(subset=['Character_name', 'Actor_gender'])
        .assign(
            Wikipedia_movie_ID=lambda df: df['Wikipedia_movie_ID'].astype(int),
            Character_name=lambda df: df['Character_name'].astype(str).str.lower(),
        )
        .sort_values(by=['Wikipedia_movie_ID'])
    )

    for genre in genres.index:
        # Keep the characters of the movies of this genre, grouped by movie ID
        characters_genre = characters_clean[characters_clean['Wikipedia_movie_ID'].isin(movies_id[genre])]
        characters_genre = characters_genre.groupby('Wikipedia_movie_ID')[['Wikipedia_movie_ID', 'Character_name', 'Actor_gender']]

        dico_male = {}
        dico_female = {}
        dico_both = {}

        # Loop on subgroups
        for _, group in characters_genre:
            # Get the movie id
            movie_id = group['Wikipedia_movie_ID'].iloc[0]
            female_sentences = []
            male_sentences = []
            both_sentences = []

            # Check if wikipedia movie id is in the summaries
            if movie_id in lem_plot_by_id:
                sentences_lem = lem_plot_by_id[movie_id].split('.') # Split into sentences

                # Gender of each character (first occurrence wins), resolved once per movie
                character_names = list(group['Character_name'])
                gender_by_character = {}
                for name, name_gender in zip(character_names, group['Actor_gender']):
                    gender_by_character.setdefault(name, name_gender)

                # Loop on sentences
                for sentence in sentences_lem:
                    # Loop on characters
                    for character in character_names:
                        # The test is made on the sentence as already rewritten
                        if character in sentence:
                            gender = gender_by_character[character]
                            # Replace Character name by pronoun
                            if gender == 'M':
                                sentence = sentence.replace(character, 'he')
                            elif gender == 'F':
                                sentence = sentence.replace(character, 'she')

                    tokens = regexp.tokenize(sentence)

                    # Find potential pronouns discriminative on gender
                    he_index = 'he' in tokens
                    she_index = 'she' in tokens

                    # Check where to append the sentence
                    if she_index and he_index:
                        both_sentences.append(sentence)
                    elif he_index:
                        male_sentences.append(sentence)
                    elif she_index:
                        female_sentences.append(sentence)

            # Store in dictionary and increment counter
            if len(male_sentences) > 0:
                dico_male[movie_id] = male_sentences
            if len(female_sentences) > 0:
                dico_female[movie_id] = female_sentences
            if len(both_sentences) > 0:
                dico_both[movie_id] = both_sentences
            count += 1

            # Evolution of the process
            if count%1000 == 0:
                print('{processed} files processed'.format(processed=count))

        # Pickle the file
        with open(DATA_FOLDER + genre+'_male_sentences_lem.pkl', 'wb') as file:
            pickle.dump(dico_male, file, protocol=pickle.HIGHEST_PROTOCOL)
        with open(DATA_FOLDER + genre+'_female_sentences_lem.pkl', 'wb') as file:
            pickle.dump(dico_female, file, protocol=pickle.HIGHEST_PROTOCOL)
        with open(DATA_FOLDER + genre+'_both_sentences_lem.pkl', 'wb') as file:
            pickle.dump(dico_both, file, protocol=pickle.HIGHEST_PROTOCOL)

1000 files processed
2000 files processed
3000 files processed
4000 files processed
5000 files processed
6000 files processed
7000 files processed
8000 files processed
9000 files processed
10000 files processed
11000 files processed
12000 files processed
13000 files processed
14000 files processed
15000 files processed
16000 files processed
17000 files processed
18000 files processed
19000 files processed
20000 files processed
21000 files processed
22000 files processed
23000 files processed
24000 files processed
25000 files processed
26000 files processed
27000 files processed
28000 files processed
29000 files processed
30000 files processed
31000 files processed
32000 files processed
33000 files processed
34000 files processed
35000 files processed
36000 files processed
37000 files processed
38000 files processed
39000 files processed
40000 files processed
41000 files processed
42000 files processed
43000 files processed
44000 files processed
45000 files processed
46000 files process

## Analyse sentiments for each group

We run the sentiment analysis in this data-handling notebook because it takes a long time to compute.

In [25]:
def _load_sentence_frame(pickle_name):
    """Load one {movie id: [sentences]} pickle from DATA_FOLDER.

    Returns the raw dictionary and a DataFrame with the columns 'id',
    'sentences' and 'summary' (the sentences of a movie joined by spaces).
    """
    sentences_by_id = pd.read_pickle(DATA_FOLDER + pickle_name)
    frame = pd.DataFrame(list(sentences_by_id.items()), columns = ['id','sentences'])
    frame['summary'] = frame['sentences'].apply(' '.join)
    return sentences_by_id, frame


# Sentences mentioning only men, only women, and both
male_sentences_dict, male_sentences = _load_sentence_frame('male_sentences.pkl')
female_sentences_dict, female_sentences = _load_sentence_frame('female_sentences.pkl')
both_sentences_dict, both_sentences = _load_sentence_frame('both_sentences.pkl')

# Show the first 10 rows of the female sentences
female_sentences.head(10)

Unnamed: 0,id,sentences,summary
0,330,[In order to prepare the role of an important ...,In order to prepare the role of an important o...
1,3217,"[ However, Sheila is captured by a Flying Dead...","However, Sheila is captured by a Flying Deadi..."
2,3333,"[ Elsie takes Cameron's mother, who has travel...","Elsie takes Cameron's mother, who has travele..."
3,3746,[ Sent to the Tyrell Corporation to ensure tha...,Sent to the Tyrell Corporation to ensure that...
4,3947,"[ Increasingly curious, Jeffrey enters Dorothy...","Increasingly curious, Jeffrey enters Dorothy'..."
5,4227,"[ The widow , disdaining offers of marriage, d...","The widow , disdaining offers of marriage, de..."
6,4231,[Buffy Summers is introduced as a stereotypic...,Buffy Summers is introduced as a stereotypica...
7,4560,[ When an English soldier tries to rape Murron...,"When an English soldier tries to rape Murron,..."
8,4726,[ Vicki and Knox attend a benefit at Wayne Man...,Vicki and Knox attend a benefit at Wayne Mano...
9,4728,"[ Selina survives the fall, but it causes a ps...","Selina survives the fall, but it causes a psy..."


In [26]:
# Show the first 10 rows of the male sentences
male_sentences.head(10)

Unnamed: 0,id,sentences,summary
0,3217,"[After being pulled through a time portal, Ash...","After being pulled through a time portal, Ash ..."
1,3333,[ The film follows two juxtaposed families: th...,The film follows two juxtaposed families: the...
2,3746,"[{{Hatnote}} In Los Angeles, November 2019, re...","{{Hatnote}} In Los Angeles, November 2019, ret..."
3,3837,"[In the American Old West of 1874, constructio...","In the American Old West of 1874, construction..."
4,3947,[Jeffrey Beaumont returns to his logging home...,Jeffrey Beaumont returns to his logging home ...
5,4227,[ :By What Means Redmond Barry Acquired the St...,:By What Means Redmond Barry Acquired the Sty...
6,4231,[ Benny is turned but Oliver is saved by Merri...,Benny is turned but Oliver is saved by Merric...
7,4560,"[In the 13th century, after several years of p...","In the 13th century, after several years of po..."
8,4726,"[As a child, Bruce Wayne witnesses his parent...","As a child, Bruce Wayne witnesses his parents..."
9,4727,[When Batman and Robin get a tip that Commod...,When Batman and Robin get a tip that Commodo...


In [63]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
SAVE_SENTIMENTS = True # Takes ~1 min to run (on i7-10875H CPU)

if SAVE_SENTIMENTS:
    # nltk's VADER analyzer scores the joined sentences of every movie
    analyzer =  SentimentIntensityAnalyzer()

    # (output file, dataframe) pairs, handled in the order male, female, both
    sentiment_targets = [
        ('male_sentiments.pkl', male_sentences),
        ('female_sentiments.pkl', female_sentences),
        ('both_sentiments.pkl', both_sentences),
    ]

    # Add the dictionary of polarity scores as a new column of each dataframe
    for _, frame in sentiment_targets:
        frame['polarity'] = frame['summary'].apply(analyzer.polarity_scores)

    # Pickle the enriched dataframes
    for pickle_name, frame in sentiment_targets:
        with open(DATA_FOLDER + pickle_name, 'wb') as file:
            pickle.dump(frame, file, protocol=pickle.HIGHEST_PROTOCOL)

# Enriching the CMU dataset with IMDb dataset

## Loading the data and first glimpse

In [None]:
# Paths of the IMDb files used for the moment (gzip-compressed TSV files in DATA_FOLDER)
TITLE_BASICS_DATASET = DATA_FOLDER + 'title.basics.tsv.gz'
TITLE_RATINGS_DATASET = DATA_FOLDER + 'title.ratings.tsv.gz'

# Column names given to the two tables when they are loaded
columns_title_basics = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
columns_ratings = ['tconstIdentifier', 'averageRating', 'numVotes']

CLEAN_DATA = False # True to clean again the data, False to use the already pickled data

In [None]:
if CLEAN_DATA:
    # Load title_basics. The IMDb files start with a header line, so header=0
    # makes pandas discard it and use our column names instead. With the
    # default header=None that line is read as a data row.
    title_basics = load_metadata(TITLE_BASICS_DATASET, column_names=columns_title_basics, header=0)
    print("length of title_basics: ", len(title_basics))
    # NOTE(review): inside an `if` block this expression is evaluated but not rendered
    title_basics.head()

In [None]:
if CLEAN_DATA:
    # Load title_ratings. The IMDb files start with a header line, so header=0
    # makes pandas discard it and use our column names instead. With the
    # default header=None that line is read as a data row.
    ratings = load_metadata(TITLE_RATINGS_DATASET, column_names=columns_ratings, header=0)
    print("length of ratings: ", len(ratings))
    # NOTE(review): inside an `if` block this expression is evaluated but not rendered
    ratings.head()

## Cleaning the dataset

In [None]:
if CLEAN_DATA:
    # Keep only titleType == "movie" (get rid of videos, tv shows, tv episodes and shorts)
    title_basics_movies = title_basics[title_basics["titleType"] == "movie"]
    title_basics_movies_cleaned = (
        title_basics_movies
        # endYear does not apply to movies
        .drop(columns='endYear')
        # Replace \N by NaN. np.nan is used because the np.NaN alias was
        # removed in NumPy 2.0. Returning a new frame avoids mutating a
        # filtered slice in place.
        .replace('\\N', np.nan)
    )
    # Keep only the year of the start date
    title_basics_movies_cleaned['startYear'] = pd.to_datetime(title_basics_movies_cleaned['startYear'], format='%Y').dt.year
    # NOTE(review): inside an `if` block this expression is evaluated but not rendered
    title_basics_movies_cleaned.head()

## Saving the cleaned dataset

In [None]:
if CLEAN_DATA:
    # Fresh cleaning: persist the result as <DESTINATION>IMDb_title_movies<EXT>
    to_pickle_data = title_basics_movies_cleaned
    to_pickle_name = 'IMDb_title_movies'
    to_pickle_data.to_pickle(DESTINATION+to_pickle_name+EXT)
else: # for testing part
    # No cleaning requested: reuse the table pickled by an earlier run
    title_basics_movies_cleaned = pd.read_pickle("./Data/IMDb_title_movies.pkl")
    reloaded_start_year = pd.to_datetime(title_basics_movies_cleaned['startYear'], format='%Y')
    title_basics_movies_cleaned['startYear'] = reloaded_start_year.dt.year

## Merging IMDb and CMU datasets

We match the movies from one dataset to the films on the other dataset on the movie name, as the ids are different.

In order to avoid mismatched pairs due to small variations in the titles, we match films of the same year with almost identical titles. We create a dictionary that maps the index of each CMU film to the indices of its matched IMDb films.

In [None]:
# IMDb side: movies from 1910 onwards with a known start year
copy_IMDb = title_basics_movies_cleaned.copy()
copy_IMDb = copy_IMDb[copy_IMDb.startYear >= 1910].dropna(subset=['startYear'])
# CMU side: movies with a known box-office revenue and release year
copy_CMU = movies.dropna(subset=['Movie_box_office_revenue', 'Movie_release_date'])

In [None]:
# Number of candidate movies on each side (CMU first, then IMDb)
print(len(copy_CMU), len(copy_IMDb), sep='\n')

In [None]:
import re
common_words = {'a','an','and','the','of','at','in'}
punctuation = {'.',',','!',';','?',''}


def _title_tokens(title):
    """Return the set of lower-cased words of `title`, or None if it is not a string."""
    if not isinstance(title, str):
        return None
    return set(re.split('[ :,]', title.lower())).difference(punctuation)


def compare(df1,df2,col1_title,col2_title,col1_year,col2_year,threshold = 0.8, delta_year=1, max_rows=100):
    """Match the rows of `df1` to the rows of `df2` that have the same year
    and an almost identical title.

    Two titles match when the Jaccard similarity of their word sets
    (lower-cased, split on space / colon / comma, punctuation-only pieces
    removed) is strictly greater than `threshold`.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Tables to match.
    col1_title, col2_title : str
        Title column of `df1` / `df2`.
    col1_year, col2_year : str
        Year column of `df1` / `df2`.
    threshold : float, default 0.8
        Similarity that must be exceeded for two titles to match.
    delta_year : int, default 1
        Currently ignored: only rows with exactly the same year are compared.
    max_rows : int or None, default 100
        Only the first `max_rows` rows of `df1` are processed (the default
        keeps the historical 100-row cap). Pass None for the whole table.

    Returns
    -------
    dict
        Maps the index label of a `df1` row to the list of index labels of
        the matching `df2` rows, in `df2` order. Rows without a match are absent.
    """
    # Tokenise df2 once and bucket its rows by year, instead of filtering and
    # re-tokenising the whole table for every row of df1.
    candidates_by_year = {}
    for idx2, title2, year2 in zip(df2.index, df2[col2_title], df2[col2_year]):
        tokens2 = _title_tokens(title2)
        # Rows without a usable title or year can never match
        if tokens2 is None or pd.isna(year2):
            continue
        candidates_by_year.setdefault(year2, []).append((idx2, tokens2))

    matched = {}
    rows1 = zip(df1.index, df1[col1_title], df1[col1_year])
    for count, (idx1, title1, y1) in enumerate(rows1):
        if max_rows is not None and count >= max_rows:
            break
        tokens1 = _title_tokens(title1)
        if tokens1 is None: # missing title (e.g. parsed as NaN)
            continue
        for idx2, tokens2 in candidates_by_year.get(y1, ()):
            all_words = tokens1 | tokens2
            # Both titles reduced to nothing: the similarity is undefined
            if not all_words:
                continue
            if len(tokens1 & tokens2)/len(all_words) > threshold:
                matched.setdefault(idx1, []).append(idx2)
    return matched

In [None]:
from time import time

# Time the matching of the CMU movies against the IMDb movies
deb = time()
matched = compare(
    df1=copy_CMU,
    df2=copy_IMDb,
    col1_title='Movie_name',
    col2_title='primaryTitle',
    col1_year='Movie_release_date',
    col2_year='startYear',
)
end = time()
print('Time of execution:', end-deb)
matched

In [None]:
# Persist the {CMU index: [IMDb indices]} matching table
matching_table_path = DATA_FOLDER + 'matching_table.pkl'
with open(matching_table_path, 'wb') as file:
    pickle.dump(matched, file, protocol=pickle.HIGHEST_PROTOCOL)