### Important librairies

In [51]:
#Imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
from lxml import objectify
%matplotlib inline

# Analysis of the lexical fields
from empath import Empath 

data_folder = './data/'

### Import of the dataset


#### Movie metadata

In [52]:
### Movie metadata import
df_movie = pd.read_table(data_folder +'movie.metadata.tsv',header=None)
df_movie.columns=['Wikipedia_movie_ID' , 'Freebase_movie_ID', 'Movie_name' , 'Movie_date' , 'Movie_revenue' , 'Movie_runtime' , 'Movie_languages' , 'Movie_countries' , 'Movie_genres']
df_movie['Movie_countries'] = df_movie['Movie_countries'].apply( lambda x: list(eval(x).keys()))
df_movie['Movie_genres'] = df_movie['Movie_genres'].apply( lambda x: list(eval(x).values()) )


#Movie metadata of american movies
df_movie_usa = df_movie.loc[df_movie['Movie_countries'].explode().isin(['/m/09c7w0'])[df_movie['Movie_countries'].explode().isin(['/m/09c7w0'])].index]


#### Character metadata

In [53]:
### Character metadata import
df_character = pd.read_table(data_folder +'character.metadata.tsv',header=None)
df_character.columns=['Wikipedia_movie_ID' , 'Freebase_movie_ID' , 'Movie_date' , 'Character_name' , 'Actor_date_of_birth' , 'Actor_gender' , 'Actor_height' , 'Actor_ethnicity' , 'Actor_name' , 'Actor_age_at_movie_release' , 'Freebase_character_actor_ID' , 'Freebase_character_ID' , 'Freebase_actor_ID'  ]

#Correct negative or too high actor age
df_character['Actor_age_at_movie_release']=df_character['Actor_age_at_movie_release'].apply(lambda x: -x if x<0 else x)
df_character['Actor_age_at_movie_release']=df_character['Actor_age_at_movie_release'].apply(lambda x: float("nan") if x>130 else x)

#Character metadata of american movies
df_character_usa = df_character.merge(df_movie_usa['Freebase_movie_ID'], on='Freebase_movie_ID')
df_character_usa.to_csv('df_character_usa.csv')

#### Movie summary

In [54]:
#Dataframe with existing movie summary
wiki_id_list = []
for ctr,wiki_id in enumerate(list(df_movie_usa['Wikipedia_movie_ID'])) : 
    path = data_folder+'/corenlp_plot_summaries/'+str(wiki_id)+'.xml'
    if os.path.exists(path):
        wiki_id_list.append(wiki_id)
wiki_id_series = pd.Series(wiki_id_list, name='Wikipedia_movie_ID',dtype=object)

#Data frame (movie metadata) with only existing summaries 
df_movie_usa_summary=df_movie_usa.merge(wiki_id_series, on='Wikipedia_movie_ID')


#Data frame (character metadata) with only existing summaries
df_character_usa_summary=df_character_usa.merge(wiki_id_series, on='Wikipedia_movie_ID')


In [55]:
#Summary exploration
df=df_movie_usa_summary

#Initialization
lexicon = Empath()
df_lex_fields = pd.DataFrame()
dic_lex_fields=dict.fromkeys(df['Wikipedia_movie_ID'].values,[])

for ctr,movie in enumerate(list(df['Wikipedia_movie_ID'].values)) :

    path = data_folder+'/corenlp_plot_summaries/'+str(movie)+'.xml'
    if os.path.exists(path):
        #Extract dataframe from xml file and convert it into a dataframe
        df_summary=pd.DataFrame()
        df_summary = pd.read_xml(data_folder+'/corenlp_plot_summaries/'+str(movie)+'.xml' , xpath='//token',parser='lxml')
        df_summary.rename(columns={'id':'word_id'},inplace=True)
        df_summary.insert(0, "sentence_id",df_summary['word_id'].ne(df_summary['word_id'].shift()+1).cumsum(), True) #add sentence id by indexing sequences  
        
        ###Lexical field analysis
        summary_words=list(df_summary['word'].values) #all words of the summary in a list
        summary_words=[str(word) for word in summary_words] #convert all words to a string
        features_lex_fields = pd.Series([lexicon.analyze(list(summary_words), categories = ["feminine","sexist","sexiest","beauty","beautiful","positive_emotion","negative_emotion"])])
        #Store data into a dictionnary
        dic_lex_fields[movie]= features_lex_fields[0]
        #Add a column corresponding to the number of words in the summary
        dic_lex_fields[movie]['Number_of_words']=len(df_summary)
        
        
        
        ###Count of he/she        
        #Filter only the pronouns
        df_summary_pronouns= df_summary[df_summary.POS=='PRP']
        #Count the number of redundant words 
        personal_pronouns = df_summary_pronouns.groupby(['word'])['word'].count().sort_values(ascending=False)

        if 'he' in list(personal_pronouns.index):
            dic_lex_fields[movie]['he_count'] = personal_pronouns['he']
        else:
            dic_lex_fields[movie]['he_count'] = 0

        if 'she' in list(personal_pronouns.index):
            dic_lex_fields[movie]['she_count'] = personal_pronouns['she'] 
        else:
            dic_lex_fields[movie]['she_count'] = 0
        
        
        
        ###Principal characters according to the summary
        characters_name = df_summary[df_summary['NER'] == 'PERSON']['word'].value_counts().index
        if len(characters_name)>=2:
            dic_lex_fields[movie]['Principal_summary_character'] =  characters_name[0] 
            dic_lex_fields[movie]['Secondary_summary_character'] =  characters_name[1] 
        if len(characters_name)==1:
            dic_lex_fields[movie]['Principal_summary_character'] =  characters_name[0] 
            dic_lex_fields[movie]['Secondary_summary_character'] =  float('nan')    
        if len(characters_name)==0:
            dic_lex_fields[movie]['Principal_summary_character'] =  float('nan')
            dic_lex_fields[movie]['Secondary_summary_character'] =  float('nan')   

df_lex_fields = pd.DataFrame.from_dict(dic_lex_fields, orient='index')  
#df_lex_fields.insert(loc=0, column='Movie_name', value=df['Movie_name'].values)
df_lex_fields.insert(loc=1, column='Freebase_movie_ID', value=df['Freebase_movie_ID'].values)
df_usa_summary_processed=df_movie_usa_summary.merge(df_lex_fields, on='Freebase_movie_ID')
df_usa_summary_processed


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_date,Movie_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,feminine,...,sexiest,beauty,beautiful,positive_emotion,negative_emotion,Number_of_words,he_count,she_count,Principal_summary_character,Secondary_summary_character
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}",[/m/09c7w0],"[Thriller, Science Fiction, Horror, Adventure,...",1.0,...,0.0,0.0,0.0,0.0,7.0,396,1,0,Ballard,Williams
1,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}",[/m/09c7w0],"[Romantic comedy, Ensemble Film, Comedy-drama,...",3.0,...,0.0,0.0,0.0,2.0,2.0,257,2,5,Adam,Kate
2,11250635,/m/02r52hc,The Mechanical Monsters,,,,"{""/m/02h40lc"": ""English Language""}",[/m/09c7w0],"[Science Fiction, Adventure, Animation, Short ...",1.0,...,0.0,1.0,0.0,0.0,3.0,707,4,2,Lois,Clark
3,77856,/m/0kcn7,Mary Poppins,1964-08-27,102272727.0,139.0,"{""/m/02h40lc"": ""English Language""}",[/m/09c7w0],"[Children's/Family, Musical, Fantasy, Comedy, ...",1.0,...,0.0,1.0,0.0,8.0,5.0,1678,12,7,Mary,Banks
4,21926710,/m/05p45cv,White on Rice,2009,,82.0,{},[/m/09c7w0],"[Romantic comedy, Romance Film, Comedy, Indie]",1.0,...,0.0,1.0,0.0,2.0,0.0,125,3,0,Jimmy,Tak
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20784,7761830,/m/0kvgqb,Spaced Invaders,1990,15369573.0,100.0,"{""/m/02h40lc"": ""English Language""}",[/m/09c7w0],"[Alien Film, Science Fiction, Family Film, Com...",0.0,...,0.0,0.0,0.0,2.0,4.0,639,1,0,Bipto,Wrenchmuller
20785,26044505,/m/0b6m67n,The Flying Serpent,1946,,59.0,{},[/m/09c7w0],"[Thriller, B-movie, Horror]",0.0,...,0.0,0.0,0.0,0.0,3.0,37,1,0,Andrew,Forbes
20786,1918494,/m/0660qx,State and Main,2000-08-26,6944471.0,106.0,"{""/m/02bjrlw"": ""Italian Language"", ""/m/02h40lc...","[/m/0f8l9c, /m/09c7w0]","[Parody, Americana, Comedy]",1.0,...,0.0,0.0,0.0,2.0,1.0,252,1,1,Carla,Walt
20787,664006,/m/030xw6,Guilty as Sin,1993-06-04,22886222.0,107.0,{},[/m/09c7w0],"[Thriller, Erotic thriller, Psychological thri...",1.0,...,0.0,0.0,0.0,1.0,9.0,611,6,8,Greenhill,Haines


In [58]:
#df=df_movie_usa.merge(df_usa_summary_processed, on=['Wikipedia_movie_ID','Freebase_movie_ID','Movie_name','Movie_date','Movie_revenue','Movie_runtime','Movie_languages','Movie_countries','Movie_genres'], how='outer')
df=df_movie_usa.merge(df_usa_summary_processed, on=['Wikipedia_movie_ID','Freebase_movie_ID','Movie_name','Movie_date','Movie_revenue','Movie_runtime','Movie_languages'], how='outer')
df.drop('Movie_genres_y', inplace=True, axis=1)
df.drop('Movie_countries_y', inplace=True, axis=1)
df.rename(columns={"Movie_genres_x": "Movie_genres", "Movie_countries_x": "Movie_countries"},inplace=True)

#Export final dataframe for movie metadata
df.to_csv('df_movie_usa.csv')


In [59]:
df_movie_usa=pd.read_csv('df_movie_usa.csv')
df_character_usa=pd.read_csv('df_character_usa.csv')

## 1) Focus on the USA