In [1]:
# External imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
from tqdm import tqdm
import xml.etree.ElementTree as ET
import gzip
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [4]:
%store -r character_movies_filtered_imdb
df_prepared_copie = character_movies_filtered_imdb.copy()
df_prepared_copie = df_prepared_copie[['wikiID','releaseDate','charactName','name_movie']]

liste_wikiID_uniques = df_prepared_copie['wikiID'].unique()
# display(df_prepared)

## Extraction only of the summaries of interest to us

In [5]:
# Home directory
source_directory = 'corenlp_plot_summaries'

# Destination directory for files corresponding to wikiIDs
destination_directory = 'MovieSummaries/tried_summaries'

# Directory creation
if not os.path.exists(destination_directory):
    os.makedirs(destination_directory)

# Get a list of files in the source directory
source_files = os.listdir(source_directory)

# Loop through the files in the source directory
for files in tqdm(source_files):
    # Checks if the file matches a wikiID in the list
    wikiID = files.replace('.xml.gz', '')
    if int(wikiID) in liste_wikiID_uniques:
        # Build complete source and destination file paths
        source_path = os.path.join(source_directory, files)
        destination_path = os.path.join(destination_directory, f"{wikiID}.txt.xml")

        # Unzip the gzip file
        with gzip.open(source_path, 'rt', encoding='utf-8') as f_in:
            # Copy the contents to a .txt.xml file in the new directory
            with open(destination_path, 'w', encoding='utf-8') as f_out:
                shutil.copyfileobj(f_in, f_out)

print("Extraction complete.")
# print(len(os.listdir(destination_directory)))

100%|██████████████████████████████████████| 5964/5964 [00:08<00:00, 662.76it/s]

Extraction complete.





In [7]:
print(len(os.listdir(destination_directory)))

5707


In [None]:
repertoire= 'MovieSummaries/tried_summaries'
dependencies = ["nsubj", "agent", "dobj", "nsubjpass", "iobj", "prep_*"]

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

for wikiID in tqdm(liste_wikiID_uniques[136:]):
    document = os.path.join(repertoire, str(wikiID) + '.txt.xml')
    if os.path.exists(document):
        # Access to XML file
        tree = ET.parse(document)
        root = tree.getroot()
        
        # Find the film's characters
        df_movie = df_prepared_copie[df_prepared_copie['wikiID']==wikiID]
        df_movie = df_movie.dropna(subset=['charactName'])
        characters_names = df_movie['charactName'].tolist()

        ##### Finding the characters' names ie entities that have a PERSON label. ##### 
        # Lists to store sentence IDs, token IDs, and words
        sentences_id_PER = []
        tokens_id_PER = []
        words_PER = []
        
        entities = []
        current_entity = []
        prev_sentence_id = None
        prev_token_id = None
        
        
        # Retrieve each sentence from the summary
        for sentence in root.findall('.//sentence'):
            sentence_id = sentence.attrib.get('id')
            # Retrieve each word in the sentence
            for token in sentence.findall('.//token'):
                # Extract PER entities
                ner = token.find('NER').text
                if ner == "PERSON":
                    token_id = int(token.get('id'))
                    word = token.find('word').text
                    
                    # Group words into combined entities 
                    if prev_sentence_id != sentence_id or prev_token_id is None or token_id != prev_token_id + 1:
                        if current_entity:
                            entities.append(' '.join(current_entity))
                            current_entity = []
                    current_entity.append(word)
                    prev_sentence_id = sentence_id
                    prev_token_id = token_id
                    
        # Add the last entity if it exists
        if current_entity:
            entities.append(' '.join(current_entity))            

        normalized_entities = {}
        
        if len(entities) != 0:
            for name in entities:
                #Normalize each entity so that it is the same as the datafream entity
                normalized_name = next((char_name for char_name in characters_names if name in char_name), name)
                
                # Count their number of recurrences
                if normalized_name in normalized_entities:
                    normalized_entities[normalized_name] += 1
                else:
                    normalized_entities[normalized_name] = 1
                    
            # Calculate the total number of appearances
            total_appearances = sum(normalized_entities.values())

            # Calculate the percentage of appearances for each name
            appearance_percentages = {name: round((count / total_appearances) * 100, 2) for name, count in normalized_entities.items()}
            
            # Calculate the maximal number of appearances
            maximal_value = max(normalized_entities.values())
            
            # Calculate score of appearances for each name
            appearance_importance = {name: round((count / maximal_value), 2) for name, count in normalized_entities.items()}
                
        
        
        ##### Finding the polarity_scores for each name. ##### 
        active_actions = {}
        passive_actions = {}
        attributes = {}
        
        # Retrieve each sentence from the summary
        for sentence in root.findall('.//sentence'):
            sentence_id = sentence.attrib.get('id')
                
            for dep in sentence.findall('.//collapsed-ccprocessed-dependencies/dep'):
                all_words = sentence.findall('.//token')

                # Get information on dependencies
                dep_type = dep.get('type')
                governor = dep.find('governor').text
                dependent = dep.find('dependent').text
                dependent_id = int(dep.find('dependent').get('idx'))
                governor_id = int(dep.find('governor').get('idx'))

                # Get information on governor
                word = all_words[governor_id-1].find('word').text
                pos_tag = all_words[governor_id-1].find('POS').text
                
                if word != governor:
                    print('error!!')
                    
                # Find verbs with particular dependencies
                if pos_tag.startswith('V'):
                    if dep_type in dependencies:
                        # Find out if the verb is an action of a character
                        for name in characters_names:
                            if (dep_type == "nsubj" or dep_type == "agent") and (governor == word) and (dependent in name):
                                if name not in active_actions:
                                    active_actions[name] = []
                                active_actions[name].append(word)
                                
                            # Find out if the verb is an action on a character
                            elif (dep_type in ["dobj", "nsubjpass", "iobj", "prep_*"]) and (governor == word) and (dependent in name):
                                if name not in passive_actions:
                                    passive_actions[name] = []
                                passive_actions[name].append(word)

                # Find the word that describes a character
                if (pos_tag == "JJ" or pos_tag == "VBG" or pos_tag == "NN"):
                    for name in characters_names:#itérer sur tout les perso
                        if ((dep_type == "nsubj" or dep_type == "appos") and (governor == word) and (dependent in name)):
                            if name not in attributes:
                                attributes[name] = []
                            attributes[name].append(word)
                
                # Get information on dependent
                word = all_words[dependent_id-1].find('word').text
                pos_tag = all_words[dependent_id-1].find('POS').text
                # Find the word that describes a character
                for name in characters_names:
                    if (dep_type == "nsubj" or dep_type == "amod" or dep_type == "nn" or dep_type == "appos") and (governor in name) and (dependent == word):
                        if name not in attributes:
                            attributes[name] = []
                        attributes[name].append(word)
        
        # Complete fream data
        for index, row in df_movie.iterrows():
            df_prepared_copie.loc[index, 'role_summary_percent'] = appearance_percentages.get(row['charactName'], 0)
            df_prepared_copie.loc[index, 'role_importance'] = appearance_importance.get(row['charactName'], 0)
           
            actif_v = " ".join(active_actions.get(row['charactName'], []))
            pasif_v = " ".join(passive_actions.get(row['charactName'], []))
            adj = " ".join(attributes.get(row['charactName'], []))
    
            df_prepared_copie.loc[index, 'comp_active'] = sia.polarity_scores(actif_v)["compound"]
            df_prepared_copie.loc[index, 'comp_pasive'] = sia.polarity_scores(pasif_v)["compound"]
            df_prepared_copie.loc[index, 'comp_attribut'] = sia.polarity_scores(adj)["compound"]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/xenia/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
 97%|█████████████████████████████████████▉ | 5845/6018 [04:59<00:12, 13.67it/s]

In [None]:
df_prepared_copie[df_prepared['wikiID']==31186339]

In [None]:
df_prepared_copie.to_csv('caracter.csv')