In [4]:
import pandas as pd
import numpy as np
import ast
import missingno as msno
from geopy.geocoders import Nominatim
import geopandas as gpd
import re
import pycountry_convert as pc
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns

import dataframes as RAW

In [2]:
from flair.nn import Classifier
from flair.data import Sentence

# Load Flair's 'ner-fast' classifier once at module level; the resulting
# `tagger` object is what extract_character_names_flair calls predict() on.
tagger = Classifier.load('ner-fast')

2023-12-06 11:32:50,993 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [121]:
def extract_character_names_flair(summary):
    """Return the text of every person entity Flair tags in ``summary``.

    The summary is wrapped in a Flair ``Sentence`` and tagged in place by the
    module-level ``tagger``. Every 'ner' span whose tag is 'PER' contributes
    its text, in the order ``get_spans`` yields the spans. Nothing is
    de-duplicated here, so a name mentioned several times can appear several
    times in the returned list.
    """
    tagged = Sentence(summary)

    # predict() annotates the Sentence object itself; nothing is returned.
    tagger.predict(tagged)

    return [span.text for span in tagged.get_spans('ner') if span.tag == 'PER']

def count_appearances(larger_string, string_list, whole_word=False):
    """Count case-insensitive occurrences of each string in ``larger_string``.

    Parameters
    ----------
    larger_string : str
        Text to search in.
    string_list : iterable of str
        Strings to look for. Each one becomes a key of the result, spelled
        exactly as given (matching itself ignores case).
    whole_word : bool, default False
        When False, plain non-overlapping substring counting is used, so
        'Kid' is also counted inside 'kidnapper'. When True, a match is only
        counted if it is not directly preceded or followed by a word
        character.

    Returns
    -------
    dict
        ``{search_string: count}`` in the iteration order of ``string_list``.
        An empty search string always maps to 0.
    """
    # Lower-case the text once instead of once per search string.
    haystack = larger_string.lower()

    appearances_dict = {}
    for search_string in string_list:
        needle = search_string.lower()

        if not needle:
            # str.count('') returns len(text) + 1, which is not a real count.
            count = 0
        elif whole_word:
            # Lookarounds instead of \b so names that start or end with
            # punctuation (e.g. 'M.K.') are still delimited correctly.
            pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
            count = len(re.findall(pattern, haystack))
        else:
            count = haystack.count(needle)

        appearances_dict[search_string] = count

    return appearances_dict

In [149]:
# Take a copy of the `summaries` frame exposed by the local `dataframes`
# module (imported as RAW), then preview its first rows.
summaries = RAW.summaries.copy()
summaries.head()

Unnamed: 0,Wiki ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [190]:
# Copy of the character table from the local `dataframes` module (RAW).
characters = RAW.character_data.copy()

# 'Role' is later filled with string labels ('Primary' / 'Secondary' /
# 'Missed'). Create it as an object column of NaN rather than a float64 one:
# writing strings into a float column relies on silent upcasting, which
# pandas has deprecated for setitem-like operations.
characters['Role'] = np.full(len(characters), np.nan, dtype=object)
characters.head()

Unnamed: 0,Wiki ID,Freebase ID,Release date,Character name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor name,Actor age at release,Map ID,Character ID,Actor ID,Role
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,


In [151]:
# Restrict the NER pass to the first 100 summaries.
sub_summaries = summaries.head(100).copy()

# One {detected name: mention count} dict per summary, in row order.
parsing_results = []

for summary_text in sub_summaries['Summary']:
    # De-duplicate the detected names before counting their mentions.
    detected_names = set(extract_character_names_flair(summary_text))
    parsing_results.append(count_appearances(summary_text, detected_names))

In [152]:
# Re-order every {name: count} mapping from most to least mentioned.
# sorted() is stable, so names with equal counts keep their relative order.
parsing_results = [
    dict(sorted(name_counts.items(), key=lambda pair: pair[1], reverse=True))
    for name_counts in parsing_results
]

# Attach the ordered mappings to their summaries, one dict per row.
sub_summaries['Characters'] = parsing_results

In [154]:
# Preview the first rows, including the 'Characters' column of name -> count dicts.
sub_summaries.head()

Unnamed: 0,Wiki ID,Summary,Characters
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...","{'Lyosha': 1, 'Shlykov': 1}"
1,31186339,The nation of Panem consists of a wealthy Capi...,"{'Katniss': 24, 'Peeta': 16, 'Rue': 11, 'Crane..."
2,20663735,Poovalli Induchoodan is sentenced for six yea...,"{'Induchoodan': 18, 'Menon': 12, 'Manapally': ..."
3,2231378,"The Lemon Drop Kid , a New York City swindler,...","{'Kid': 35, 'Charley': 18, 'Moran': 8, 'Nellie..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...,"{'Lindy': 7, 'Michael': 4, 'Azaria': 4, 'Chamb..."


In [114]:
def check_sum_percentage(dictionary, target_key, thresh):
    """Tell whether the running total up to ``target_key`` stays below a share.

    The values of ``dictionary`` are accumulated in the dictionary's own
    iteration order, from the first entry up to and including
    ``target_key``. Position in the dictionary decides what counts as
    "before" the target, not how the keys compare to each other.

    Parameters
    ----------
    dictionary : dict
        Mapping of keys to numeric values.
    target_key : hashable
        Key at which the accumulation stops (inclusive).
    thresh : float
        Fraction of the total sum to compare against.

    Returns
    -------
    bool
        True when the accumulated sum is strictly less than
        ``thresh * sum(dictionary.values())``.

    Raises
    ------
    KeyError
        If ``target_key`` is not a key of ``dictionary``.
    """
    if target_key not in dictionary:
        raise KeyError(f"The key '{target_key}' does not exist in the dictionary.")

    # Accumulate positionally; stop right after the target entry is added.
    sum_before_target = 0
    for key, value in dictionary.items():
        sum_before_target += value
        if key == target_key:
            break

    total_sum = sum(dictionary.values())

    # NOTE(review): True means the cumulative share is UNDER thresh; confirm
    # this is the polarity callers expect.
    return sum_before_target < thresh * total_sum

In [191]:
# Share of a summary's name mentions strictly above which a character is
# labelled 'Primary'.
PRIMARY_RATIO = 0.1


def _role_from_mentions(character, parsing_result, total):
    """Label one character name from a movie's {detected name: count} mapping."""
    # Split the character name into its words (name, surname, etc).
    name_parts = character.split()

    # A detected name contributes its count once as soon as ANY word of the
    # character name is a case-sensitive substring of it.
    # NOTE(review): short words (e.g. 'Le', 'Al') can match unrelated names.
    count = sum(
        value
        for key, value in parsing_result.items()
        if any(part in key for part in name_parts)
    )

    # Empty mapping (total == 0): the character is a miss.
    ratio = count / total if total != 0 else 0

    if ratio > PRIMARY_RATIO:
        return 'Primary'
    if ratio > 0:
        return 'Secondary'
    # The character was not matched to any name detected in the summary.
    return 'Missed'


# String labels are written into 'Role' below; make sure the column is
# object-typed so pandas does not have to upcast a float NaN column on
# assignment (deprecated behaviour for setitem-like operations).
characters['Role'] = characters['Role'].astype(object)

for _, summary_row in sub_summaries.iterrows():
    # Wiki ID of the movie to consider
    wiki_id = summary_row['Wiki ID']

    # Dictionary of the parsing results for this movie
    parsing_result = summary_row['Characters']

    # Total number of name mentions; identical for every character of the movie.
    total = sum(parsing_result.values())

    # Rows of this movie that carry a valid character name.
    named_mask = (characters['Wiki ID'] == wiki_id) & characters['Character name'].notna()
    if not named_mask.any():
        continue

    movie_names = characters.loc[named_mask, 'Character name']

    # Classify each distinct name once; rows sharing a name share the label.
    role_by_name = {
        name: _role_from_mentions(name, parsing_result, total)
        for name in movie_names.unique()
    }

    # Positional write (to_numpy) so the assignment does not depend on index
    # alignment.
    characters.loc[named_mask, 'Role'] = movie_names.map(role_by_name).to_numpy()

In [196]:
# Show every character row whose Role label is 'Primary'.
characters.loc[characters['Role'].eq('Primary')]

Unnamed: 0,Wiki ID,Freebase ID,Release date,Character name,Actor DOB,Actor gender,Actor height,Actor ethnicity,Actor name,Actor age at release,Map ID,Character ID,Actor ID,Role
4633,20663735,/m/051zjwb,2000,M.K. Menon,1935-12-10,M,,/m/0dryh9k,Thilakan,64.0,/m/059t6pp,/m/0h73lnb,/m/02hkvw,Primary
4639,20663735,/m/051zjwb,2000,Marancheri Induchoodan,1960-05-21,M,1.72,/m/0dryh9k,Mohanlal,39.0,/m/059t6p_,/m/0h8gtfl,/m/02fbpz,Primary
6869,8153846,/m/026tk34,1994-03-09,Serge Karamasov,1958-11-24,M,,,Alain Chabat,35.0,/m/02nzwsh,/m/02nzwsk,/m/0krrfy,Primary
6870,8153846,/m/026tk34,1994-03-09,Le commissaire Bialès,1948-02-29,M,,,Gérard Darmon,46.0,/m/02nzwvq,/m/02nzwvs,/m/03gsbhw,Primary
6871,8153846,/m/026tk34,1994-03-09,Odile Deray,1957-03-23,F,,,Chantal Lauby,36.0,/m/02nzww7,/m/02nzww9,/m/02nzwwk,Primary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434586,32163620,/m/09p7hch,1946-12-03,Jean Diego,1921-10-13,M,1.87,,Yves Montand,25.0,/m/0hyqdrz,/m/0hyqdqh,/m/0j976,Primary
434588,32163620,/m/09p7hch,1946-12-03,Raymond Lécuyer,1907-11-03,M,,,Raymond Bussières,39.0,/m/0hyqdsb,/m/0hyqdqs,/m/04glxnt,Primary
435536,15072401,/m/03hglxv,1936-01-17,Henrietta Lowell,1861-09-02,F,,,Henrietta Crosman,,/m/0cgp3br,/m/0n4kk8p,/m/0b6f572,Primary
435538,15072401,/m/03hglxv,1936-01-17,Alice Lowell,1912-12-30,F,1.60,,Rosina Lawrence,23.0,/m/0n4kkcz,/m/0n4kkd1,/m/05f28v,Primary


In [180]:
# Inspect the name -> count mapping stored for the movie with Wiki ID 31186339.
sub_summaries.loc[sub_summaries['Wiki ID'].eq(31186339), 'Characters'].values

array([{'Katniss': 24, 'Peeta': 16, 'Rue': 11, 'Crane': 4, 'Haymitch': 4, 'Cato': 4, 'Clove': 3, 'Snow': 2, 'Thresh': 2, 'Marvel': 1, 'Haymitch Abernathy': 1, 'Caesar Flickerman': 1, 'Primrose Everdeen': 1, 'Peeta Mellark': 1, 'Seneca Crane': 1, 'Glimmer': 1}],
      dtype=object)