In [48]:
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
 
from tqdm import tqdm
import re
import string
from itertools import combinations
from collections import Counter
 
 
from flair.models import SequenceTagger
from flair.data import Sentence

from fuzzywuzzy import fuzz
from collections import defaultdict
from fuzzywuzzy import process

[nltk_data] Downloading package stopwords to /home/julian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/julian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
# Folder holding the raw data files; every path below is relative to it.
data_folder = './Data/'

# Character metadata: tab-separated, no header row, so the column names are
# supplied explicitly. Dates are kept as raw values (parse_dates=False).
character_metadata = pd.read_csv(
    data_folder+'character.metadata.tsv',
    header=None,
    sep='\t',
    names=[
        'wikipedia_ID', 'freebase_ID', 'release_date', 'character_name',
        'actor_birthday', 'actor_gender', 'actor_height', 'actor_ethnicity',
        'actor_name', 'actor_age', 'freebase_character_actor_mapID',
        'freebase_character_ID', 'freebase_actor_ID',
    ],
    parse_dates=False,
)

# Movies table; only rows that actually have a plot summary are usable below.
df = pd.read_csv(data_folder+'us_movies',sep=',')
df = df[df['plot_summary'].notna()]

# Work on the first 10 movies only. The explicit .copy() detaches this sample
# from the filtered frame so that later cells can add columns to `df` without
# triggering pandas' SettingWithCopyWarning.
df = df[0:10].copy()
print(len(df))
df.head()

10


Unnamed: 0,wikipedia_ID,freebase_ID,name,release_date,box_office_revenue,runtime,languages,countries,genre,plot_summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th..."
5,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,['English Language'],['United States of America'],"['Romantic comedy', 'Ensemble Film', 'Comedy-d...","Adam, a San Francisco-based artist who works a..."
6,77856,/m/0kcn7,Mary Poppins,1964-08-27,102272727.0,139.0,['English Language'],['United States of America'],"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",The film opens with Mary Poppins perched in a...
11,156558,/m/014k4y,Baby Boy,2001-06-27,29381649.0,123.0,['English Language'],['United States of America'],"['Crime Fiction', 'Drama', 'Coming of age']",A young 20-year-old named Jody lives with his...
13,9548445,/m/02pjlrp,Archie: To Riverdale and Back Again,1990-05-06,,100.0,['English Language'],['United States of America'],['Comedy'],"Archie Andrews, fifteen years after graduating..."


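Optimized version of Aveek's code: extract each movie's main characters from the first sentences of its plot summary, then look up the gender of the actors who play them.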

In [101]:
# Load the Flair 'ner' SequenceTagger once at module level; extract_main_characters
# reuses this single instance for every summary instead of reloading the model.
tagger = SequenceTagger.load('ner')

def extract_main_characters(summary: str, nb_sentences=5, nb_characters=3, match_threshold=50):
    """Return the most frequently mentioned person names at the start of a summary.

    The first ``nb_sentences`` sentences of ``summary`` are tagged with the
    module-level ``tagger``. Entities whose first label is ``PER`` are kept,
    stripped of punctuation, lower-cased, and merged with an already seen name
    when their ``fuzz.ratio`` exceeds ``match_threshold``; mention counts of
    merged names are summed.

    Args:
        summary: Plot summary text.
        nb_sentences: Number of leading sentences to analyse.
        nb_characters: Maximum number of names to return.
        match_threshold: Two names are treated as the same character when
            their fuzzy ratio is strictly greater than this value.

    Returns:
        Lower-cased character names ordered by decreasing mention count
        (ties keep first-seen order), at most ``nb_characters`` long. An empty
        list when the summary has no sentences or no person entity is found.
    """
    # Extracting and tagging first nb_sentences from summary
    sentences = sent_tokenize(summary)[:nb_sentences]
    if not sentences:
        # Nothing to tag: avoid calling the model on an empty batch.
        return []
    tagged_sentences = [Sentence(sent) for sent in sentences]
    tagger.predict(tagged_sentences)

    # Built once instead of once per name.
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Extracting all person names from the tagged sentences, without punctuation
    names = []
    for sent in tagged_sentences:
        for entity in sent.to_dict(tag_type='ner')['entities']:
            if entity['labels'][0]['value'] != 'PER':
                continue
            name = entity['text'].translate(punctuation_table).strip()
            # A name made only of punctuation collapses to '' and must not be
            # counted as a character.
            if name:
                names.append(name)

    # Most-mentioned spellings are visited first, so they become the canonical
    # key that later, similar spellings are merged into.
    characters = {}
    for name, count in Counter(names).most_common():
        standardized_name = name.lower()

        # Adding up number of counts if the match is above the threshold
        for existing_name in characters:
            if fuzz.ratio(standardized_name, existing_name) > match_threshold:
                characters[existing_name] += count
                break
        else:
            # Adding name to character list if unique
            characters[standardized_name] = count

    # Converting from dictionary to a list ordered by decreasing count
    ordered_characters = Counter(characters).most_common()
    return [name for name, _ in ordered_characters[:nb_characters]]

2023-11-16 09:15:01,628 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [102]:
def find_main_characters_genders(movie_row, characters_df, min_confidence=50):
    """Look up the actor gender of each main character of one movie.

    Each name in ``movie_row['main characters']`` is fuzzy-matched against the
    ``character_name`` values of the rows of ``characters_df`` that share the
    movie's ``wikipedia_ID``. When the best match scores strictly above
    ``min_confidence``, the ``actor_gender`` of the first row carrying that
    character name is appended to the result.

    Args:
        movie_row: Mapping/row with ``'wikipedia_ID'`` and ``'main characters'``.
        characters_df: DataFrame with ``wikipedia_ID``, ``character_name`` and
            ``actor_gender`` columns.
        min_confidence: Minimum (exclusive) fuzzy score for a match to count.

    Returns:
        List of gender values, in the order of the main characters that were
        matched. Unmatched characters are skipped, so the list may be shorter
        than ``movie_row['main characters']`` and is not positionally aligned
        with it.
    """
    movie_ID = movie_row['wikipedia_ID']
    selected_character_metadata = characters_df.loc[characters_df['wikipedia_ID'] == movie_ID]

    # Missing character names cannot be string-matched; drop them before
    # handing the candidates to the fuzzy matcher.
    candidate_names = selected_character_metadata['character_name'].dropna()
    if candidate_names.empty:
        return []

    genders = []
    for name in movie_row['main characters']:
        # For a Series of choices, extractOne yields (matched value, score,
        # choice key) per the fuzzywuzzy documentation; None means no match.
        best_match = process.extractOne(name, candidate_names)
        if best_match is None:
            continue
        closest_character, confidence, _ = best_match

        if confidence > min_confidence:
            gender = selected_character_metadata.loc[
                selected_character_metadata['character_name'] == closest_character,
                'actor_gender',
            ].values[0]
            genders.append(gender)

    return genders


In [103]:
# Main characters found in the first five sentences of every plot summary.
main_characters = df['plot_summary'].apply(extract_main_characters, nb_sentences=5)
df['main characters'] = main_characters

# Row-wise gender lookup; it reads the 'main characters' column set just above.
main_character_genders = df.apply(
    find_main_characters_genders,
    axis=1,
    characters_df=character_metadata,
)
df['main character genders'] = main_character_genders
df.head()

Unnamed: 0,wikipedia_ID,freebase_ID,name,release_date,box_office_revenue,runtime,languages,countries,genre,plot_summary,characters,main characters,main character genders
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...","[(melanie ballard, 2), (desolation williams, 2...","[melanie ballard, desolation williams, jericho]","[F, M, M]"
5,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,['English Language'],['United States of America'],"['Romantic comedy', 'Ensemble Film', 'Comedy-d...","Adam, a San Francisco-based artist who works a...","[(adam, 5), (kate, 4), (nina, 3), (kevin, 2), ...","[adam, kate, nina]",[F]
6,77856,/m/0kcn7,Mary Poppins,1964-08-27,102272727.0,139.0,['English Language'],['United States of America'],"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",The film opens with Mary Poppins perched in a...,"[(edward, 2), (mary poppins, 1), (george banks...","[edward, mary poppins, george banks]","[F, M]"
11,156558,/m/014k4y,Baby Boy,2001-06-27,29381649.0,123.0,['English Language'],['United States of America'],"['Crime Fiction', 'Drama', 'Coming of age']",A young 20-year-old named Jody lives with his...,"[(jody, 2), (yvette, 2), (juanita, 1), (melvin...","[jody, yvette, juanita]","[M, F, F]"
13,9548445,/m/02pjlrp,Archie: To Riverdale and Back Again,1990-05-06,,100.0,['English Language'],['United States of America'],['Comedy'],"Archie Andrews, fifteen years after graduating...","[(archie andrews, 2), (betty, 2), (pam, 1), (v...","[archie andrews, betty, pam]",[]
