In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
from itertools import chain
import warnings

import spacy
#spacy.cli.download("en_core_web_trf")

import gender_guesser.detector as gender
from allennlp.predictors.predictor import Predictor
# Archive of the AllenNLP "coref-spanbert-large" model; Predictor.from_path is given the URL directly
model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
predictor = Predictor.from_path(model_url)

# Path prefix of the data files read by the cells below (relative to the working directory)
data_folder = 'data/'

error loading _jsonnet (this is expected on Windows), treating C:\Users\alexi\AppData\Local\Temp\tmpvfdfaf83\config.json as plain json


In [7]:
# Load the movie metadata (tab-separated, no header row) and name its columns
movie_metadata = pd.read_csv(data_folder + "movie.metadata.tsv", sep='\t', header=None)
movie_metadata.columns = ['movie_ID','FB_ID','movie_name','release_date','box_office','length','languages','country','genres']

# Force integer type on the movie ID column, set it as index
movie_metadata['movie_ID'] = movie_metadata['movie_ID'].astype(int)
movie_metadata = movie_metadata.set_index('movie_ID')

# Drop the Freebase database ID, we don't need it
movie_metadata = movie_metadata.drop(columns=['FB_ID'])

# Remove movies with a non-defined release date.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
movie_metadata = movie_metadata[movie_metadata.release_date.notna()].copy()

# Only keep the year: the first four characters of the date string, as an integer
movie_metadata['release_date'] = movie_metadata['release_date'].astype(str).str[:4].astype('int64')

# Release year 1010 is a mistake, the movie was actually released in 2010.
# The replacement is restricted to release_date so that a genuine 1010 in
# box_office or length is not silently rewritten as well.
movie_metadata['release_date'] = movie_metadata['release_date'].replace(1010, 2010)

# Clean languages, country, genres columns: after splitting on the double quote,
# the entries kept are those at positions 3, 7, 11, ...
# (presumably the readable names of an {"id": "name", ...} string -- confirm against the raw file)
for column in ['languages', 'country', 'genres']:
    movie_metadata[column] = [raw.split('"')[3::4] for raw in movie_metadata[column]]

movie_metadata.head(10)

Unnamed: 0_level_0,movie_name,release_date,box_office,length,languages,country,genres
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
975900,Ghosts of Mars,2001,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
28463795,Brun bitter,1988,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]"
9363483,White Of The Eye,1987,,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
261236,A Woman in Flames,1983,,106.0,[German Language],[Germany],[Drama]
13696889,The Gangsters,1913,,35.0,"[Silent film, English Language]",[United States of America],"[Short Film, Silent film, Indie, Black-and-whi..."
18998739,The Sorcerer's Apprentice,2002,,86.0,[English Language],[South Africa],"[Family Film, Fantasy, Adventure, World cinema]"
10408933,Alexander's Ragtime Band,1938,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]"
9997961,Contigo y aquí,1974,,,[Spanish Language],[Argentina],"[Musical, Drama, Comedy]"
2345652,City of the Dead,1960,,76.0,[English Language],[United Kingdom],"[Horror, Supernatural]"


In [8]:
# Load character metadata (tab-separated, no header row)
character_metadata = pd.read_csv(data_folder + "character.metadata.tsv", sep='\t', header=None)

# Only keep movie_ID, release date, name, gender and age.
# .copy() makes the selection independent, so the column assignments below
# do not write into a slice of the full table (SettingWithCopyWarning).
character_metadata = character_metadata.iloc[:, [0, 2, 3, 5, 9]].copy()
character_metadata.columns = ['movie_ID','release_date','name','gender','age']
character_metadata['name'] = character_metadata['name'].astype(str)
character_metadata['movie_ID'] = character_metadata['movie_ID'].astype(int)
character_metadata = character_metadata.set_index('movie_ID')

# Drop rows without release date, then reduce the date to its year
# (first four characters of the date string, as an integer)
character_metadata = character_metadata[character_metadata['release_date'].notna()].copy()
character_metadata['release_date'] = character_metadata['release_date'].astype(str).str[:4].astype('int64')

# Release year 1010 is a mistake for 2010; only the release_date column is
# corrected so that other columns (e.g. age) are never touched by the replacement.
character_metadata['release_date'] = character_metadata['release_date'].replace(1010, 2010)

# Drop the characters without gender defined
character_metadata = character_metadata[character_metadata.gender.notna()]

character_metadata.sample(10)

Unnamed: 0_level_0,release_date,name,gender,age
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10996357,2005,,M,
28775066,1949,,M,
33548187,2012,Manoj,M,
24631539,1969,,F,37.0
23687637,1951,L'uomo del panino,M,41.0
9980412,2007,Tom Myers,M,62.0
20904216,1977,,M,
8119369,1947,,F,
10335449,2000,Frank,M,38.0
23687831,1954,Don Raffaele Scotto,M,


In [9]:
# Read every line of the plot summaries file; the with-block closes the file handle once the lines are read
with open(data_folder + 'plot_summaries.txt', 'r', encoding="utf8") as summary_file:
    summaries = summary_file.readlines()

# Separate movie ID and summary. Splitting at the first tab only keeps a summary
# that itself contains a tab in one piece (otherwise the 2-column frame cannot be built).
summaries = pd.DataFrame([summ.split("\t", 1) for summ in summaries], columns=['movie_ID','summary'])
summaries['summary'] = summaries.summary.replace(r'\n',' ', regex=True) # Replace the newline ending each line with a space
summaries['movie_ID'] = summaries['movie_ID'].astype(int)
summaries = summaries.set_index('movie_ID')

#Count number of words in each summary
def count_words_simple(x):
    """Return how many pieces ``x`` falls into when cut at every single space.

    Each space is a separator, so consecutive, leading or trailing spaces
    produce empty pieces that are counted too; an empty string counts as 1.
    """
    # n separators always delimit n + 1 pieces
    return x.count(" ") + 1

# Store the space-separated piece count of every summary in a new 'length' column
summaries['length'] = summaries['summary'].map(count_words_simple)

summaries.head()

Unnamed: 0_level_0,summary,length
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha...",26
31186339,The nation of Panem consists of a wealthy Capi...,781
20663735,Poovalli Induchoodan is sentenced for six yea...,505
2231378,"The Lemon Drop Kid , a New York City swindler,...",854
595909,Seventh-day Adventist Church pastor Michael Ch...,398


In [10]:
def check_USA_produced(countries):
    """Tell whether at least one entry of ``countries`` contains 'United States of America'.

    The check is a substring test on each entry; an empty collection gives False.
    """
    for country in countries:
        if 'United States of America' in country:
            return True
    return False

# Select only movies produced in the US
mask_usa = [check_USA_produced(x) for x in movie_metadata.country]

# Keep only movies released in 1940 or later (1940 itself is kept).
# Filtering with a boolean mask and taking an explicit copy replaces the former
# in-place drop on a frame that was itself derived by slicing movie_metadata.
usa_movie = movie_metadata[mask_usa]
usa_movie = usa_movie[usa_movie['release_date'] >= 1940].copy()

usa_movie.head()

Unnamed: 0_level_0,movie_name,release_date,box_office,length,languages,country,genres
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
975900,Ghosts of Mars,2001,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
6631279,Little city,1997,,93.0,[English Language],[United States of America],"[Romantic comedy, Ensemble Film, Comedy-drama,..."
77856,Mary Poppins,1964,102272727.0,139.0,[English Language],[United States of America],"[Children's/Family, Musical, Fantasy, Comedy, ..."
33420460,Keep the Change,1992,,97.0,[English Language],[United States of America],[Drama]


In [11]:
# Keep only the summaries whose movie ID is also in the set of movies of interest
# (usa_movie: US-produced, release year filtered in the previous cell)
summary_ids = set(np.unique(summaries.index))
movie_interest = list(summary_ids.intersection(usa_movie.index.values))
summaries_USA = summaries.loc[movie_interest]

In [12]:
# Select the summaries whose 'length' is at least 350
long_enough = summaries_USA['length'] >= 350
summaries_more_350words = summaries_USA[long_enough]

In [13]:
# Attach the release year of the US movies to the long summaries.
# The right join keeps one row per US movie, with NaN where no long summary exists...
merged_summary = summaries_more_350words.merge(usa_movie['release_date'], on='movie_ID', how='right')
# ...and those rows without a summary are dropped again here
merged_summary = merged_summary.dropna(subset=['summary'])

In [14]:
from nlp_pipeline import * #just one function ??

# Case-insensitive gender detector handed to the NLP pipeline
gender_guess = gender.Detector(case_sensitive=False)


def _read_first_column(file_name):
    """Return the first column of a header-less CSV of the data folder as a list."""
    table = pd.read_csv(data_folder + file_name, encoding="utf8", header=None)
    return list(table.iloc[:, 0])


# List of words to remove from the name of the character
words_remove = _read_first_column('remove_from_name.csv')

# List of interesting words describing characters
family_occupation = _read_first_column('family_occupation.csv')

In [14]:
# Movies whose character table holds exactly one row (rows without release_date were dropped
# when the table was loaded, so the per-movie release_date count is the per-movie row count).
# The IDs are computed on the grouped counts directly: indexing character_metadata with a boolean
# Series that has a different index forces pandas to reindex the mask and emits a UserWarning.
# NOTE(review): these movies are presumably excluded because character_metadata.loc[i] returns a
# single row (Series) rather than a DataFrame for them -- confirm against process_summary.
characters_per_movie = character_metadata.groupby('movie_ID').count().release_date
single_character_movies = characters_per_movie[characters_per_movie == 1].index
remove_idx = single_character_movies.intersection(merged_summary.index)

#Movie IDs present both in the merged summaries and in the character metadata
idx = list(merged_summary.index.intersection(character_metadata.index))
#Remove problematic index (a set gives constant-time membership tests)
remove_ids = set(remove_idx)
idx = [x for x in idx if x not in remove_ids]

idx = idx[6369:6373]   #CHANGE INDEX BEFORE RUNNING, index + 1 !!!
process_summaries =  pd.DataFrame({'movie_id': idx})

#Apply nlp pipeline: one process_summary call per selected movie, result stored in the 'nlp' column
with warnings.catch_warnings():
    warnings.simplefilter("ignore") #AllenNLP uses a deprecated version of a PyTorch function, therefore we suppress warnings here.
    process_summaries['nlp'] = process_summaries.movie_id.apply(lambda i: process_summary(summaries.loc[i].summary, character_metadata.loc[i][:], predictor, words_remove, family_occupation, gender_guess))

  remove_idx=(character_metadata[character_metadata.groupby('movie_ID').count().release_date == 1].index.intersection(merged_summary.index))


In [15]:
# Save the processed batch inside the data folder (same location as before, now following data_folder)
process_summaries.to_pickle(data_folder + "6369_6372.pkl") #CHANGE NAME before saving, 100_200 (value 200 is included)

In [None]:
#pd.read_pickle("data/500.pkl").nlp[0]     reads dataframe of first summary

In [184]:
# Short example story used below as input of process_summary (text kept verbatim)
summary_story = "Once upon a time, there lived a young prince called Flynn Rider. He was known by all to be kind, caring, and handsome. However, on one fateful day, his step-brother Cyrus consumed by jealousy decided to imprison him in a high tower far far away from the family kingdom. Research go on for weeks without success: the prince has vanished. Rapunzel, an ADAventurer from a small hamlet at the border of the kingdom, likes to explore the deep forests with her chameleon Pascal. On one of her wanders, she discovers the high tower where dear Flynn is kept captive. Sensing a human presence in the tower, Pascal rushed to the top where he encountered Flynn. The chameleon highly squeaked due to his surprise which worried Rapunzel who joined him at the top by using her long hair as a rope and saved Flynn. She then denounced Cyrus and his atrocious behaviour who is sentenced to jail."

In [185]:
# Character table of the example story, read from the Excel sheet of the data folder
story_data_path = data_folder + "story_data.xlsx"
df_story = pd.read_excel(story_data_path)
df_story

Unnamed: 0,name,gender
0,Rapunzel,F
1,Flynn Rider,M
2,Pascal,M
3,Cyrus,M


In [495]:
# Run process_summary on the example story and its character table, with the same
# predictor, word lists and gender detector as for the movie summaries
df_result = process_summary(summary_story, df_story, predictor, words_remove, family_occupation, gender_guess)



"Once upon a time, there lived a young prince called Flynn Rider. He was known by all to be kind, caring, and handsome. However, on one fateful day, his step-brother Cyrus consumed by jealousy decided to imprison his brother in a high tower far far away from the family kingdom. Research go on for weeks without success: the prince has vanished. Rapunzel, an ADAventurer from a small hamlet at the border of the kingdom, likes to explore the deep forests with her chameleon Pascal. On one of her wanders, she discovers the high tower where dear Flynn is kept captive. Sensing a human presence in the tower, Pascal rushed to the top where he encountered Flynn. The chameleon highly squeaked due to his surprise which worried Rapunzel who joined him at the top by using her long hair as a rope and saved Flynn. She then denounced Cyrus and his atrocious behaviour who is sentenced to jail."

In [497]:
# Convert the 'active' and 'passive' entries with convert_string_to_list
# (presumably the pipeline returns the verb lists as strings -- confirm in nlp_pipeline)
df_result['active'] = df_result['active'].apply(convert_string_to_list)
df_result['passive'] = df_result['passive'].apply(convert_string_to_list)

# The 'description' column is not used afterwards
df_result = df_result.drop(['description'], axis = 1)

# NOTE: the former `df_result.set_index('name')` statement discarded its result, so it never
# changed the frame; it is removed and 'name' stays a regular column, exactly as before.
df_result

Unnamed: 0,name,mention,gender,active,passive
0,flynn rider,7,M,"[live, vanish]","[know, imprison, keep, encounter, save]"
1,pascal,5,M,"[rush+prep, encounter, squeak+prep]","[explore, join, save]"
2,rapunzel,4,F,"[like, explore, discover, join, denounce]",[worry]
3,cyrus,2,M,"[decide, imprison]","[denounce, sentence]"


In [498]:
# Verbs that are moved from the active list to the passive list when scoring a character
df_description_verbs = ['be', 'look', 'seem', 'sleep', 'appear', 'live']

def activeness_score(df_charac_NLP):
    """Compute the share of active verbs of one character.

    Parameters
    ----------
    df_charac_NLP : pandas.Series
        One row of the NLP result frame; must hold the list-valued fields
        'active' and 'passive'.

    Returns
    -------
    pandas.Series
        A copy of the row in which every verb of ``df_description_verbs`` has
        been moved from 'active' to the end of 'passive' (order preserved), plus
        a new field 'activeness_score' = len(active) / (len(active) + len(passive)).
        The score is NaN when both lists are empty.

    The input row and the lists it holds are left untouched: the lists are
    shared with the originating DataFrame, so appending to them in place
    would alter that frame and make a second run give different results.
    """
    active_verbs = list(df_charac_NLP['active'])
    descriptive = [verb for verb in active_verbs if verb in df_description_verbs]
    kept_active = [verb for verb in active_verbs if verb not in df_description_verbs]
    new_passive = list(df_charac_NLP['passive']) + descriptive

    scored = df_charac_NLP.copy()
    scored['active'] = kept_active
    scored['passive'] = new_passive

    # A character without any verb has no defined score; NaN instead of ZeroDivisionError
    total = len(kept_active) + len(new_passive)
    scored['activeness_score'] = len(kept_active) / total if total else float('nan')

    return scored

In [500]:
# Score every character: activeness_score is called once per row of the result frame
df_result = df_result.apply(activeness_score, axis='columns')
df_result

Unnamed: 0,name,mention,gender,active,passive,sum_,activeness_score
0,flynn rider,7,M,[vanish],"[know, imprison, keep, encounter, save, live]",8,0.142857
1,pascal,5,M,"[rush+prep, encounter, squeak+prep]","[explore, join, save]",6,0.5
2,rapunzel,4,F,"[like, explore, discover, join, denounce]",[worry],5,0.833333
3,cyrus,2,M,"[decide, imprison]","[denounce, sentence]",3,0.5
