In [2]:
# External imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
from tqdm import tqdm
import xml.etree.ElementTree as ET
import gzip
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
# Restore the DataFrame persisted earlier with IPython's %store magic
%store -r character_movies_filtered_imdb
# Work on a copy; the cells below add their new columns to this copy
df_prepared_copie = character_movies_filtered_imdb.copy()
# Distinct wikiID values; later cells use them to select and iterate over the summaries
liste_wikiID_uniques = df_prepared_copie['wikiID'].unique()

## Extracting only the summaries of interest

Our primary dataset consists of 42,306 movie plot summaries from the English Wikipedia.
These summaries briefly describe the movie plots and include character descriptions. To analyze this data, we used the summaries already preprocessed with Stanford CoreNLP: the files are tagged and parsed, and each token is annotated with its named-entity label and coreference information.
After converting the data from XML files to a structured format, we can extract specific linguistic characteristics associated with each character of the movies that we filtered.

In [None]:
# Directory holding the gzipped CoreNLP summaries, named <wikiID>.xml.gz
source_directory = 'corenlp_plot_summaries'

# Destination directory for the decompressed summaries matching our wikiIDs
destination_directory = 'MovieSummaries/tried_summaries'

# Create the destination directory (no-op when it already exists)
os.makedirs(destination_directory, exist_ok=True)

# Get a list of files in the source directory
source_files = os.listdir(source_directory)

# Build the lookup once: testing membership against the array would rescan it
# for every file in the source directory.
wanted_wikiIDs = set(liste_wikiID_uniques)

archive_suffix = '.xml.gz'

for file_name in tqdm(source_files):
    # Ignore anything that is not a summary archive (hidden files, sub-folders, ...)
    if not file_name.endswith(archive_suffix):
        continue
    wiki_id_text = file_name[:-len(archive_suffix)]
    try:
        wiki_id = int(wiki_id_text)
    except ValueError:
        # The stem is not a number, so it cannot correspond to a wikiID
        continue
    if wiki_id not in wanted_wikiIDs:
        continue

    # Build complete source and destination file paths
    source_path = os.path.join(source_directory, file_name)
    destination_path = os.path.join(destination_directory, f"{wiki_id_text}.txt.xml")

    # Decompress the archive and write its text content to a .txt.xml file
    with gzip.open(source_path, 'rt', encoding='utf-8') as f_in, \
            open(destination_path, 'w', encoding='utf-8') as f_out:
        shutil.copyfileobj(f_in, f_out)

print("Extraction complete.")

In [None]:
# Number of entries currently present in the destination directory
print(len(os.listdir(destination_directory)))

### How does a big hit influence an actor's career in terms of the roles they can play?

<u>PART 1 : Analysis of actor's careers and their relationship with movies that reached big hits</u>
We want to compare the category of role (measured by the role's relative importance, in percent) that actors play before and after a big hit. Our method is as follows:
- Analysis of the plot summaries: we extract the characters mentioned in each synopsis, i.e. the entities carrying a PERSON label, and compute how often each name is repeated as a percentage of all mentions.
- Find the actors associated with these characters, check whether they ever played in a big-hit movie, and at what age.

<u>PART 2 : Defining actors' roles in function of their context in plot summaries</u>

A character can be described by the actions they perform and the characteristics attributed to them. To capture this, we based our method on the one presented in the paper *Learning Latent Personas of Film Characters* by David Bamman, Brendan O’Connor and Noah A. Smith, in which the words linked to a character are classified as follows.

The classification is based on the typed dependencies that link words (as governor or dependent) in the XML files:
- **Actions the character has done**: the verb has an "nsubj" dependency (nominal subject: a noun phrase which is the syntactic subject of a clause) or an "agent" dependency (agent: the complement of a passive verb, introduced by the preposition “by”, that performs the action) with the character's name
- **Actions he is subject to**: the verb has an "nsubjpass" (passive nominal subject, i.e. the subject of a passive clause), "iobj" (indirect object), "prep_*" (any dependency whose type starts with `prep_`) or "dobj" (direct object) dependency with the character's name
- **Attributes**: adjectives and common nouns that fall into one of these cases:
    - dependency = "nsubj" or "appos" / attribute = governor / character's name = dependent
    - dependency = "nsubj", "amod", "nn" or "appos" / attribute = dependent / character's name = governor
    
After associating each character with their actions and attributes, we run a sentiment analysis on these words to determine whether the character is described with mostly positive or mostly negative actions/attributes.

In [5]:
repertoire = 'MovieSummaries/tried_summaries'

# Dependency types linking a verb to a character: the character either performs
# the action (active) or undergoes it (passive). 'prep_*' types are handled
# separately with a prefix test.
active_dependencies = ["nsubj", "agent"]
passive_dependencies = ["dobj", "nsubjpass", "iobj"]
dependencies = active_dependencies + passive_dependencies

# NOTE(review): the loop starts at position 136 of the wikiID list. This looks
# like a leftover from resuming an interrupted run: on a fresh kernel the first
# 136 movies never receive the columns computed below. Set to 0 to process
# every movie -- confirm before changing.
FIRST_MOVIE_INDEX = 136

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def _is_name_part(fragment, full_name):
    """Return True when `fragment` occurs in `full_name` as whole word(s).

    A plain substring test (`fragment in full_name`) also matches pieces of
    words, e.g. "he" in "Catherine", which credits words to the wrong
    character. Padding both sides with spaces restricts the match to complete,
    space-delimited words.
    """
    return f" {fragment} " in f" {full_name} "


def _extract_person_entities(root):
    """Return the PERSON mentions of a summary, one string per mention.

    Consecutive PERSON tokens of the same sentence are merged into a single
    space-joined mention (e.g. "Jack" + "Ryan" -> "Jack Ryan").
    """
    entities = []
    current_entity = []
    prev_sentence_id = None
    prev_token_id = None

    for sentence in root.findall('.//sentence'):
        sentence_id = sentence.attrib.get('id')
        for token in sentence.findall('.//token'):
            if token.find('NER').text != "PERSON":
                continue
            token_id = int(token.get('id'))
            word = token.find('word').text

            # A new mention starts when the token does not directly follow the
            # previous PERSON token of the same sentence.
            starts_new_mention = (
                prev_sentence_id != sentence_id
                or prev_token_id is None
                or token_id != prev_token_id + 1
            )
            if starts_new_mention and current_entity:
                entities.append(' '.join(current_entity))
                current_entity = []

            current_entity.append(word)
            prev_sentence_id = sentence_id
            prev_token_id = token_id

    # Add the last mention if it exists
    if current_entity:
        entities.append(' '.join(current_entity))
    return entities


def _appearance_stats(entities, characters_names):
    """Compute how prominent each mentioned name is in a summary.

    Each mention is mapped to the first character name of the DataFrame that
    contains it as whole word(s); mentions matching no character keep their
    own text.

    Returns a pair of dicts keyed by name:
    - percentage of all mentions (rounded to 2 decimals),
    - count relative to the most mentioned name (rounded to 2 decimals).
    Both dicts are empty when the summary contains no PERSON mention.
    """
    counts = {}
    for entity in entities:
        normalized_name = next(
            (char_name for char_name in characters_names if _is_name_part(entity, char_name)),
            entity,
        )
        counts[normalized_name] = counts.get(normalized_name, 0) + 1

    if not counts:
        return {}, {}

    total_appearances = sum(counts.values())
    maximal_value = max(counts.values())
    percentages = {name: round((count / total_appearances) * 100, 2) for name, count in counts.items()}
    importance = {name: round((count / maximal_value), 2) for name, count in counts.items()}
    return percentages, importance


def _collect_character_words(root, characters_names, wikiID=None):
    """Gather, per character, the verbs and descriptive words linked to it.

    Returns three dicts mapping a character name to a list of words:
    active actions, passive actions and attributes. `wikiID` is only used to
    make the diagnostic message traceable.
    """
    active_actions = {}
    passive_actions = {}
    attributes = {}

    for sentence in root.findall('.//sentence'):
        sentence_id = sentence.attrib.get('id')
        # The token list does not change between dependencies: fetch it once
        all_words = sentence.findall('.//token')

        for dep in sentence.findall('.//collapsed-ccprocessed-dependencies/dep'):
            # Get information on the dependency
            dep_type = dep.get('type')
            governor = dep.find('governor').text
            dependent = dep.find('dependent').text
            dependent_id = int(dep.find('dependent').get('idx'))
            governor_id = int(dep.find('governor').get('idx'))

            # Index 0 is the artificial ROOT node of Stanford dependencies; it
            # has no token, and indexing with -1 would silently pick the last
            # word of the sentence.
            if governor_id == 0:
                continue

            # Get information on the governor
            governor_token = all_words[governor_id - 1]
            word = governor_token.find('word').text
            pos_tag = governor_token.find('POS').text

            if word != governor:
                print(f"error!! governor/token mismatch (wikiID={wikiID}, sentence={sentence_id}): "
                      f"{governor!r} != {word!r}")
            else:
                if pos_tag.startswith('V'):
                    # Decide whether the character performs or undergoes the verb
                    if dep_type in active_dependencies:
                        verb_target = active_actions
                    elif dep_type in passive_dependencies or dep_type.startswith('prep_'):
                        verb_target = passive_actions
                    else:
                        verb_target = None
                    if verb_target is not None:
                        for name in characters_names:
                            if _is_name_part(dependent, name):
                                verb_target.setdefault(name, []).append(word)

                # Governor describes the character (character is the dependent)
                if pos_tag in ("JJ", "VBG", "NN") and dep_type in ("nsubj", "appos"):
                    for name in characters_names:
                        if _is_name_part(dependent, name):
                            attributes.setdefault(name, []).append(word)

            # Dependent describes the character (character is the governor).
            # NOTE(review): no part-of-speech filter is applied on this side,
            # although the method description restricts attributes to
            # adjectives and common nouns -- confirm whether one is wanted.
            word = all_words[dependent_id - 1].find('word').text
            if dep_type in ("nsubj", "amod", "nn", "appos") and dependent == word:
                for name in characters_names:
                    if _is_name_part(governor, name):
                        attributes.setdefault(name, []).append(word)

    return active_actions, passive_actions, attributes


for wikiID in tqdm(liste_wikiID_uniques[FIRST_MOVIE_INDEX:]):
    document = os.path.join(repertoire, str(wikiID) + '.txt.xml')
    if not os.path.exists(document):
        continue

    # Access to XML file
    root = ET.parse(document).getroot()

    # Find the film's characters (rows without a character name are ignored)
    df_movie = df_prepared_copie[df_prepared_copie['wikiID'] == wikiID]
    characters_names = df_movie.dropna(subset=['charactName'])['charactName'].tolist()

    # Recomputed from scratch for every movie, so a summary without any PERSON
    # mention yields empty dicts instead of reusing the previous movie's values.
    entities = _extract_person_entities(root)
    appearance_percentages, appearance_importance = _appearance_stats(entities, characters_names)

    active_actions, passive_actions, attributes = _collect_character_words(root, characters_names, wikiID)

    # Fill in the DataFrame for every row of this movie
    for index, row in df_movie.iterrows():
        character = row['charactName']
        df_prepared_copie.loc[index, 'role_summary_percent'] = appearance_percentages.get(character, 0)
        df_prepared_copie.loc[index, 'role_importance'] = appearance_importance.get(character, 0)

        actif_v = " ".join(active_actions.get(character, []))
        pasif_v = " ".join(passive_actions.get(character, []))
        adj = " ".join(attributes.get(character, []))

        # VADER compound score of the words linked to the character
        df_prepared_copie.loc[index, 'comp_active'] = sia.polarity_scores(actif_v)["compound"]
        df_prepared_copie.loc[index, 'comp_pasive'] = sia.polarity_scores(pasif_v)["compound"]
        df_prepared_copie.loc[index, 'comp_attribut'] = sia.polarity_scores(adj)["compound"]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/xenia/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
100%|███████████████████████████████████████| 6008/6008 [06:02<00:00, 16.59it/s]


In [6]:
# Spot-check: show every row of the movie with wikiID 1617626
df_prepared_copie.loc[df_prepared_copie['wikiID'] == 1617626]

Unnamed: 0,wikiID,movieID,releaseDate,charactName,birth,gender,height,ethnicity,name_actor,age,...,languages,countries,genres,year,averageRating,role_summary_percent,role_importance,comp_active,comp_pasive,comp_attribut
37413,1617626,/m/05h12k,1994-08-03,,1930-12-26,M,1.85,,Donald Moffat,63.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,0.0,0.0,0.0,0.0,0.0
37414,1617626,/m/05h12k,1994-08-03,Jack Ryan,1942-07-13,M,1.85,/m/01qhm_,Harrison Ford,52.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,30.67,1.0,-0.0516,-0.3612,-0.1027
37415,1617626,/m/05h12k,1994-08-03,John Clark,1955-07-22,M,1.78,,Willem Dafoe,39.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,10.67,0.35,-0.8481,0.0,0.0
37416,1617626,/m/05h12k,1994-08-03,Cathy Muller Ryan,1947-08-24,F,1.7,,Anne Archer,46.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,0.0,0.0,-0.0516,-0.3612,-0.1027
37417,1617626,/m/05h12k,1994-08-03,,1931-01-17,M,1.87,/m/0x67,James Earl Jones,63.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,0.0,0.0,0.0,0.0,0.0
37418,1617626,/m/05h12k,1994-08-03,,1949-11-01,F,,,Belita Moreno,44.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,0.0,0.0,0.0,0.0,0.0
37419,1617626,/m/05h12k,1994-08-03,Captain Ramirez,1963-12-16,M,1.88,/m/09vc4s,Benjamin Bratt,30.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,0.0,0.0,0.0,0.0,0.0
37420,1617626,/m/05h12k,1994-08-03,Robert Ritter,1959-02-08,M,1.77,,Henry Czerny,35.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,12.0,0.39,-0.2732,0.0,0.3182
37421,1617626,/m/05h12k,1994-08-03,,1957-03-15,M,1.715,/m/02p4q5p,Joaquim de Almeida,37.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,0.0,0.0,0.0,0.0,0.0
37422,1617626,/m/05h12k,1994-08-03,,1951-11-16,M,1.75,,Miguel Sandoval,42.0,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio...",1994.0,6.9,0.0,0.0,0.0,0.0,0.0


In [7]:
# Export the working copy to CSV, without writing the index column
df_prepared_copie.to_csv('caracter.csv', index = False)
# NOTE(review): this stores `character_movies_filtered_imdb`, the frame restored
# at the top of the notebook, not the working copy `df_prepared_copie` that
# received the new columns -- confirm that this is intended.
%store character_movies_filtered_imdb

Stored 'character_movies_filtered_imdb' (DataFrame)
