Analyse the age gap between actors and actresses who play love interests on the big screen.

1. Find pairs of characters who are in a relationship
2. Get age of actors and actresses when they played the given character

In [6]:
import sys
sys.path.append("../../")

import bechdelai.data.wikipedia as wiki
import spacy
import outputformat as ouf
import pandas as pd

import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# 1. Find love interests 
Use plot text from Wikipedia to find characters who are in a relationship.

In [7]:
# Wikipedia page titles of the movies to analyse.
movies = [
    'Call Me by Your Name (film)',
    'The Holiday',
    'Fantastic Beasts: The Secrets of Dumbledore',
    'Beauty and the Beast (2017 film)',
]
# Map each page title to the text of its 'Plot' section.
plots = {
    title: wiki.get_section_text(title, ['Plot'])['Plot']
    for title in movies
}
    

In [8]:
# Load the en_core_web_md pipeline and render the named entities of every plot.
nlp = spacy.load("en_core_web_md")
for plot_text in plots.values():
    doc = nlp(plot_text)
    displacy.render(doc, style="ent")

In [9]:
def find_previous_sent(doc,sent):
    """Return the sentence of ``doc`` that ends exactly where ``sent`` starts.

    Parameters
    ----------
    doc : object exposing ``sents``, an iterable of sentence spans.
    sent : sentence span with ``start`` and ``end`` token offsets.

    Returns
    -------
    The preceding sentence span, or the empty string ``''`` when there is
    none: either ``sent`` is the first sentence (``sent.start == 0``) or no
    sentence of ``doc`` ends at ``sent.start``. Returning ``''`` in both
    cases lets callers safely use ``len(...)`` or truthiness on the result.
    """
    if sent.start == 0:
        return ''
    # Linear scan over the sentences; the previous one ends where `sent` starts.
    for candidate in doc.sents:
        if candidate.end == sent.start:
            return candidate
    print('No sentence was found')
    # Explicit empty result instead of an implicit None, which would break
    # callers that take the length of the returned value.
    return ''

# Phrase matcher for terms that hint at a romantic relationship.
# NOTE: with PhraseMatcher's default attribute only the exact token text
# matches, so inflected forms (e.g. "marries", "kissed") are not caught.
# Overlapping terms ("in love" / "love") each produce a match, so the same
# sentence can be reported more than once.
matcher = PhraseMatcher(nlp.vocab)
terms = ["have sex", "sleep together","in love","marry","love","kiss","date"]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

for movie, text in plots.items():
    ouf.boxtitle(movie)
    doc = nlp(text)
    # Each match is a (match_id, start token, end token) triple.
    for match_id, start, end in matcher(doc):
        # create the matched span and assign the match_id as a label
        span = Span(doc, start, end, label=match_id)
        previous_span = find_previous_sent(doc,span.sent)
        displacy.render(span.sent,"dep")

        # Characters named in the sentence containing the matched term.
        who = [ent.text for ent in span.sent.ents if ent.label_=="PERSON"]
        # Fewer than two names: the sentence may refer back to characters
        # (e.g. "the two"), so also look at the previous sentence. The
        # truthiness check skips both an empty result and None, where
        # len(None) would raise a TypeError.
        if len(who)<2 and previous_span:
            who.extend(
                ent.text for ent in previous_span.ents if ent.label_=="PERSON"
            )

        print(who)


╭─────────────────────────────╮
│ Call Me by Your Name (film) │
╰─────────────────────────────╯


['Elio', 'Oliver']


['Elio']


['Elio']


['Elio', 'Oliver', 'Elio']
╭─────────────╮
│ The Holiday │
╰─────────────╯


['Iris Simpkins', 'Jasper Bloom']


['Iris Simpkins', 'Jasper Bloom']


['Amanda', 'Graham']


['Arthur', 'Miles', 'Iris']
╭─────────────────────────────────────────────╮
│ Fantastic Beasts: The Secrets of Dumbledore │
╰─────────────────────────────────────────────╯


['Jacob', 'Queenie', 'Jacob', 'Tina Goldstein']
╭──────────────────────────────────╮
│ Beauty and the Beast (2017 film) │
╰──────────────────────────────────╯


[]


[]


['Belle', 'Maurice']


['Gaston']


['Gaston']


['Belle']


## Next steps
We are able to extract sentences that can help define the characters' relationships. The next step consists of extracting, from these sentences, the names of the characters who are in a relationship.

Challenges:
- Different sentence structures, as observed in all the previous examples
- Proper nouns may be missing from the sentence (ex: *Later, in a secluded spot, __the two__ kiss for the first time.*) -> consider the previous sentence
- There might be other proper nouns in the sentence, belonging to a character who is not in the relationship (ex: *__Jacob__ and __Queenie__ marry in the former's bakery in New York City, with most of the group and __Tina Goldstein__ in attendance.*) -> how to choose

# 2. Get age of actors and actresses 
Use TMDb data to relate each actor or actress to a character and to compute his/her age at the movie's release date

In [None]:
# To get movies data from TMDb, we need to find the movie's ID.

from datetime import datetime
import bechdelai.data.tmdb as tmdb
from bechdelai.data.display import show_movie_suggestions_get_id

# Search TMDb for the title and format the raw results as suggestions.
query = 'Beauty and the Beast'
suggestions = tmdb.format_results_for_suggestion(
    tmdb.search_movie_from_query(query)
)
# Show the top 3 suggestions and obtain the movie id from them.
movie_id = show_movie_suggestions_get_id(
    suggestions,
    top=3,
    verbose=True,
)

In [6]:
def age(birthdate,release_date):
    """Return the age, in whole years, reached on ``release_date``.

    Both arguments must expose ``year``, ``month`` and ``day`` attributes
    (e.g. ``datetime`` objects).
    """
    # The birthday of the release year has not happened yet when the release
    # (month, day) comes before the birth (month, day).
    birthday_pending = (birthdate.month, birthdate.day) > (release_date.month, release_date.day)
    elapsed_years = release_date.year - birthdate.year

    if birthday_pending:
        # Release before the birthday -> one year less.
        return elapsed_years - 1
    return elapsed_years

In [7]:
# Get the cast data and release date for a given movie.
movies_tmdb_id = [398818, 1581, 338953, 321612]  # ids corresponding to movies
movie_id = movies_tmdb_id[1]

# The release date is parsed from a 'YYYY-MM-DD' string into a datetime.
movie_details = tmdb.get_movie_details_from_id(movie_id)
release_date = datetime.strptime(movie_details['release_date'], '%Y-%m-%d')

# Keep only the cast members known for acting, with the columns used later on.
data = tmdb.get_movie_cast_from_id(movie_id)
full_cast = pd.DataFrame(data["cast"])
acting_only = full_cast[full_cast['known_for_department'] == 'Acting']
cast_df = acting_only[['gender', 'id', 'name', 'character']]



In [8]:
# Compute the age at release date for each cast member.
# The column starts out empty and is filled in one row at a time.
cast_df.insert(4, 'age_at_release', None)
for idx, member in cast_df.iterrows():
    person = tmdb.get_person_details_from_id(member['id'])
    birthday = person['birthday']
    try:
        birthdate = datetime.strptime(birthday, '%Y-%m-%d')
        member_age = age(birthdate, release_date)
    except TypeError:
        # strptime raises TypeError for a non-string birthday (e.g. None):
        # leave the age empty for that cast member.
        member_age = None
    cast_df.loc[idx, 'age_at_release'] = member_age
cast_df

Unnamed: 0,gender,id,name,character,age_at_release
0,1,204,Kate Winslet,Iris Simpkins,31.0
1,1,6941,Cameron Diaz,Amanda Woods,34.0
2,2,9642,Jude Law,Graham Simpkins,33.0
3,2,70851,Jack Black,Miles Dumont,37.0
4,2,3265,Eli Wallach,Arthur Abbott,91.0
5,2,12833,Edward Burns,Ethan,38.0
6,2,17328,Rufus Sewell,Jasper,39.0
7,1,17691,Miffy Englefield,Sophie,7.0
8,1,17692,Emma Pritchard,Olivia,
9,1,17693,Sarah Parish,Hannah,38.0
