In [None]:
# Some basic imports 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# and more advanced ones
import os, codecs
import spacy, nltk
from collections import Counter

In [None]:
# Load plot summaries.
# The built-in open() is used instead of codecs.open(), which is deprecated as of
# Python 3.14. Text mode also normalises "\r\n" line endings, so splitting the
# content on "\n" later does not leave a stray "\r" at the end of each summary.
with open(os.path.join('data/raw/', 'plot_summaries.txt'), encoding="utf8") as file:
    content = file.read()

# Directory where the cleaned movie/character tables are written
# (not referenced by the cells visible here).
DATA_PATH = 'data/clean/movies_char/'

import src.data.movies_char_data as MovieChar

In [None]:
# Build the movie metadata object from "movie.metadata.tsv", clean it, then run its pipeline.
# NOTE(review): the captured output prints "Movie : loaded 81740 rows !" twice, which
# suggests pipeline() repeats the load already done by clean_raw_data() — confirm whether
# the explicit clean_raw_data() call is still needed.
movie_df = MovieChar.MovieData("Movie", "movie.metadata.tsv", output_name = "movie_data_clean.csv")
movie_df.clean_raw_data()
movie_df.pipeline()

Movie : loaded 81740 rows !
Movie : loaded 81740 rows !
Movie : Clean data has been and saved to data/clean/movies_char/movie_data_clean.csv! (81740 rows)


In [None]:
# Build the character metadata object from "character.metadata.tsv", clean it, then run its pipeline.
# NOTE(review): the captured output prints both the "loaded" and the "Cleaning the raw data"
# messages twice, which suggests pipeline() repeats the work of clean_raw_data() — confirm
# whether the explicit clean_raw_data() call is still needed.
character_df = MovieChar.CharacterData("Character", "character.metadata.tsv", output_name = "character_data_clean.csv")
character_df.clean_raw_data()
character_df.pipeline()

Character : loaded 450668 rows !
Character : Cleaning the raw data
Character : loaded 450668 rows !
Character : Cleaning the raw data
Character : Clean data has been and saved to data/clean/movies_char/character_data_clean.csv! (171826 rows)


In [None]:
# NOTE(review): the wildcard import hides where merge_movies_characters_data (presumably
# provided by this module) and any other helper come from; prefer an explicit import once
# the full set of names used by the notebook is known.
from src.utils.movies_utils import *
# Combine the movie and character objects into one table; the captured output reports
# that duplicates are detected and removed during this step.
mov_char_data = merge_movies_characters_data(movie_df, character_df)

mov_char_data.head()

Duplicates found: 412 duplicates ! removing them...


Unnamed: 0,Wikipedia_movie_ID,Movie_name,Release_date,Revenue,Runtime,Languages,Countries,Genres,Character_name,Actor_DOB,Actor_gender,Actor_height,Actor_name,Actor_age
0,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,0 days 01:35:00,English,United States of America,"Mystery, Biographical film, Drama, Crime Drama",POLICE OFFICER,NaT,M,,ALLEN CUTLER,
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,0 days 01:35:00,English,United States of America,"Mystery, Biographical film, Drama, Crime Drama",REPORTER,1956-12-19,F,,ALICE BARRETT,43.0
2,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,0 days 01:35:00,English,United States of America,"Mystery, Biographical film, Drama, Crime Drama",FBI PROFILER ROBERT HANKS,1950-01-05,M,,ROBERT CATRINI,50.0
3,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,0 days 01:35:00,English,United States of America,"Mystery, Biographical film, Drama, Crime Drama",JOHN RAMSEY,1945-02-12,M,1.85,CLIFF DEYOUNG,55.0
4,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,0 days 01:35:00,English,United States of America,"Mystery, Biographical film, Drama, Crime Drama",PATSY RAMSEY,1964-07-12,F,1.63,JUDI EVANS,35.0


In [None]:
# Show every cast row of the film(s) whose title contains "Narasimham"
is_narasimham = mov_char_data['Movie_name'].str.contains('Narasimham')
mov_char_data[is_narasimham]

Unnamed: 0,Wikipedia_movie_ID,Movie_name,Release_date,Revenue,Runtime,Languages,Countries,Genres,Character_name,Actor_DOB,Actor_gender,Actor_height,Actor_name,Actor_age
3403,20663735,Narasimham,NaT,,0 days 02:55:00,Malayalam,India,"Musical, Action, Drama, Bollywood",BHARATHAN,1971-01-01,M,,KALABHAVAN MANI,28.0
3404,20663735,Narasimham,NaT,,0 days 02:55:00,Malayalam,India,"Musical, Action, Drama, Bollywood",NANDAGOPAL MARAR,1951-09-07,M,1.78,MAMMOOTTY,48.0
3405,20663735,Narasimham,NaT,,0 days 02:55:00,Malayalam,India,"Musical, Action, Drama, Bollywood",MARANCHERI INDUCHOODAN,1960-05-21,M,1.72,MOHANLAL,39.0


In [None]:
# Initialize the spacy model
# Loaded once at module level; tokenizer() and extract_names() reuse this pipeline.
nlp = spacy.load('en_core_web_sm')

In [None]:
def get_film_name(wiki_id):
    """Return the title of the film with the given Wikipedia ID, or None if absent.

    The ID is looked up in the module-level ``mov_char_data`` table and the
    ``Movie_name`` of the first matching row is returned.
    """
    titles = mov_char_data.loc[mov_char_data['Wikipedia_movie_ID'] == wiki_id, 'Movie_name']
    return None if titles.empty else titles.iloc[0]


In [None]:
def tokenizer(text):
    """Run ``text`` through the spaCy pipeline and return the token strings as a list."""
    return [tok.text for tok in nlp(text)]

In [None]:
def map_characters_to_actors(name, wiki_id=None, data=None):
    """Return the actor playing the first character whose name contains ``name``.

    Matching is a case-insensitive, whitespace-trimmed literal substring test
    against the ``Character_name`` column.

    Parameters
    ----------
    name : str
        Character name (or fragment of one) to look for.
    wiki_id : optional
        When given, only rows whose ``Wikipedia_movie_ID`` equals this value are
        searched. Without it the search spans every film in the table, so a short
        name can match a character from an unrelated film.
    data : pandas.DataFrame, optional
        Table to search; defaults to the module-level ``mov_char_data``.

    Returns
    -------
    The ``Actor_name`` of the first matching row, or ``'Unknown Actor'`` when
    nothing matches or ``name`` is blank.
    """
    if data is None:
        data = mov_char_data

    # Strip spaces and convert to lowercase to avoid case and whitespace issues
    name = name.strip().lower()
    if not name:
        # An empty string is a substring of everything; do not match the first row
        return 'Unknown Actor'

    if wiki_id is not None:
        data = data[data['Wikipedia_movie_ID'] == wiki_id]

    normalized = data['Character_name'].str.strip().str.lower()
    # regex=False keeps this a literal substring test; na=False skips missing names
    matches = normalized.str.contains(name, regex=False, na=False).to_numpy()
    if not matches.any():
        return 'Unknown Actor'

    return data.loc[matches, 'Actor_name'].iloc[0]

In [None]:
def extract_names(text):
    """Count PERSON entities in ``text`` and merge partial names into fuller ones.

    Returns a list of ``(name, count)`` tuples sorted by count, highest first.
    A name that is a substring of an already-kept longer name contributes its
    count to that longer name instead of getting its own entry.
    """
    # Tally every PERSON entity spaCy recognises in the text
    raw_counts = Counter(ent.text for ent in nlp(text).ents if ent.label_ == 'PERSON')

    merged = Counter()
    # Longest names first, so full names are registered before their fragments
    for candidate in sorted(raw_counts, key=len, reverse=True):
        # First already-kept name that contains this candidate, if any
        host = next((kept for kept in merged if candidate in kept), None)
        if host is None:
            merged[candidate] = raw_counts[candidate]
        else:
            merged[host] += raw_counts[candidate]

    # most_common() with no argument returns every (name, count) pair, highest count first
    return merged.most_common()
    

In [None]:
data = []
columns = ['Wikipedia ID','Film Name', 'Character Name','Actor Name', 'Count']

# Each line of the summaries file is one film entry: "<wikipedia id>\t<plot summary>"
plots = content.strip().split("\n")
# Limit processing to the first 5 lines
plots_to_process = plots[:5]
for plot in plots_to_process:
    # Split on the first tab only, so a summary that itself contains a tab is
    # kept instead of being silently skipped by the length check below
    film_data = plot.split("\t", 1)
    if len(film_data) != 2:
        continue

    wikipedia_id = int(film_data[0])
    plot_summary = film_data[1]

    # Look the film up before running spaCy, so no NLP pass is spent on a
    # summary whose film is missing from the metadata
    film_name = get_film_name(wikipedia_id)
    if not film_name:
        continue

    # NOTE(review): the actor lookup below is given only the name, so it is not
    # limited to this film's cast; in the captured output "Snow" resolves to
    # MEL BROOKS for The Hunger Games. Restrict the lookup to rows of wikipedia_id.
    for name, count in extract_names(plot_summary):
        actor_name = map_characters_to_actors(name)
        data.append([wikipedia_id, film_name, name, actor_name, count])

# Create a dataframe to store the data
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,Wikipedia ID,Film Name,Character Name,Actor Name,Count
0,31186339,The Hunger Games,Peeta Mellark,JOSH HUTCHERSON,5
1,31186339,The Hunger Games,Katniss,JENNIFER LAWRENCE,4
2,31186339,The Hunger Games,Haymitch Abernathy,WOODY HARRELSON,3
3,31186339,The Hunger Games,Seneca Crane,WES BENTLEY,2
4,31186339,The Hunger Games,Snow,MEL BROOKS,2
5,31186339,The Hunger Games,Rue,ROBERT ENGLUND,2
6,31186339,The Hunger Games,Thresh,MARK HAMILL,1
7,31186339,The Hunger Games,baker,KEVIN DOWNES,1
8,31186339,The Hunger Games,Cato,VINCENZO TALARICO,1
9,20663735,Narasimham,Manapally Madhavan Nambiar,Unknown Actor,5
