In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import math
from scipy import stats
from sklearn import linear_model
from tqdm import tqdm
import openai

# Loading the data

In [2]:
movies = pd.read_pickle('../../../data/movies.pkl')
plot_summaries = pd.read_pickle('../../../data/plot_summaries.pkl')
tvtropes = pd.read_pickle('../../../data/tvtropes.pkl')

In [3]:
characters = pd.read_csv('../../../data/character.metadata.tsv', sep='\t')
characters_columns = ['Wikipedia_Movie_ID', 'Freebase_Movie_ID', 'Movie_Release_Date', 'Character_Name', 'Actor_DOB', 'Actor_Gender', 'Actor_Height', 'Actor_Ethnicity', 'Actor_Name', 'Actor_Age_At_Movie_Release', 'Freebase_Character_Actor_Map_ID', 'Freebase_character_ID', 'Freebase_Actor_ID']
characters.columns = characters_columns

# Convert movie release date to movie release year
movies['Movie_release_date'] = pd.to_datetime(movies['Movie_release_date'], format = 'mixed', errors='coerce', utc=True)
movies['Movie_Release_Year'] = movies['Movie_release_date'].dt.year
movies['Movie_Release_Year'] = movies['Movie_Release_Year'].fillna(0.0).astype(int)

# Reducing the dataframe to keep only rows of interest

We want to know wether the characters of the movies are positively or negatively seen. We can therefore remove all the rows for which we don't have any character, and all the rows that are just duplicates. 

In [4]:
characters_name_df = characters[['Character_Name', 'Wikipedia_Movie_ID']]
characters_name_df = characters_name_df.dropna()
print("Length of characters_name_df after removing empty values: ", len(characters_name_df))
# drop duplicates in the character names column
characters_name_df = characters_name_df.drop_duplicates(subset='Character_Name')
print("Length of characters_name_df after removing duplicates: ", len(characters_name_df))

Length of characters_name_df after removing empty values:  192793
Length of characters_name_df after removing duplicates:  126628


We can also remove all the rows of the characters that do not appear in the plot summary. To do this, we first merge the plot summaries dataframe and the character names dataframe based on their Wikipedia_Movie_ID.

In [5]:
plot_summaries = plot_summaries.rename(columns={'Wikipedia_movie_ID': 'Wikipedia_Movie_ID'})
names_and_plot = pd.merge(characters_name_df, plot_summaries, on='Wikipedia_Movie_ID', how='inner')

We now check for each character name wether it appears in the plot summary or not.

In [6]:
def check_character_in_plot(row):

    if pd.isna(row['Plot']) or pd.isna(row['Character_Name']):
        return False # If the plot or character name is missing, return False
    
    character_words = row['Character_Name'].split() # We want to check if at least a part of the 
                                                    # character name is in the plot, for example 
                                                    # "Watson" in "Dr. Watson". Note : no risk of
                                                    # splitting the '.' alone, the split function
                                                    # splits on spaces by default.

    for word in character_words:
        if word.lower() in row['Plot'].lower(): # We make sure to detect the character name, 
                                                # regardless of the case
            return True

    return False

names_and_plot['Character_in_plot'] = names_and_plot.apply(check_character_in_plot, axis=1)

Finally, we remove all the rows for which the character name does not appear in the plot summary.

In [7]:
names_and_plot = names_and_plot[names_and_plot['Character_in_plot'] == True]
names_and_plot = names_and_plot.drop(columns=['Character_in_plot'])

According to the movies genre analysis, we observed that certain specific genres have clear peaks during the period of the World War II. We will focus on these genres for our character analysis. 

TODO : Expliquer que l'idée c'est qu'on a vu qu'il y avait un pic pour certains genres de films. Maintenant qu'on a vu qu'il y a un effet sur le nombre de films dans certains genre, on veut voir si parmi ces films il y a un effet sur les personnages.

In [8]:
movies = movies.rename(columns={'Wikipedia_movie_ID': 'Wikipedia_Movie_ID'})
names_plot_genres = pd.merge(names_and_plot, movies[['Wikipedia_Movie_ID', 'Movie_genres', 'Movie_Release_Year', 'Movie_name']], on='Wikipedia_Movie_ID', how='inner')

In [9]:
def select_genre(df, genre):
    return df[df['Movie_genres'].apply(lambda x: x is not None and genre in x)]

def select_genres(df, genres):
    return df[df['Movie_genres'].apply(lambda x: x is not None and any(genre in x for genre in genres))]

def select_period(df, start, end):
    return df[(df['Movie_Release_Year'] >= start) & (df['Movie_Release_Year'] <= end)]

We are interested in the following genres, between 1930 and 1955 : Propaganda film, Combat Films, Suspense, Documentary, Psychological thriller, History, War film, Film noir.

In [10]:
selected_period = names_plot_genres.copy()
selected_period = select_period(selected_period, 1930, 1955)

selected_genres = select_genres(selected_period, ['Propaganda film', 'Combat Films', 'Suspense', 
                                                  'Documentary', 'Psychological thriller', 'History', 
                                                  'War film', 'Film noir'])

print("Final length of the dataset: ", len(selected_genres))

Final length of the dataset:  1116


For each characters of this preprocessed dataset, we want to know whether they are positively or negatively seen. The plot summary is not sufficient to do a meaningful sentiment analysis on the characters, so we will use an LLM that take into account extra information about the characters. We use the api of the LLM gpt, with the following prompt : 

"-1 means negatively seen character. 0 means neutral. 1 means positively seen character. Please rate {character_name} from the movie {movie_name}. You can use your knowledge about the movie. One word answer"

Note : we removed from this notebook the api key we are using because it is a confidential key.

In [11]:
openai.api_key = 'ENTER-A-KEY-HERE'

In [12]:
def get_character_perception(character_name, movie_name):
    prompt = f"-1 means negatively seen character. 0 means neutral. 1 means positively seen character. Please rate {character_name} from the movie {movie_name}. You can use your knowledge about the movie. One word answer"

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",  
            messages=[{"role": "user", "content": prompt}],
            max_tokens=50,  
            temperature=0.05  # Controls the randomness of the output, here we prefere a deterministic answer over an expressive one
        )

        perception_score = response.choices[0].message['content'].strip()

        try:
            score = int(perception_score)
            if -10 <= score <= 10:
                return score
            else:
                return "Error: Invalid score range"
        except ValueError:
            return "Error: Unable to interpret response"
    
    except openai.error.RateLimitError as e:
        return "Error: Rate limit exceeded. Please try again later."
    except Exception as e:
        return f"Error: {e}"
    

def add_perception_score(df):
    df['Perception_score'] = df.apply(lambda x: get_character_perception(x['Character_Name'], x['Movie_name']), axis=1)
    return df

In [13]:
# df_with_scores = add_perception_score(selected_genres)

We keep only the characters that are negatively perceived. 

In [14]:
df_with_scores = pd.read_csv('../../../data/dataset_after_api_call.csv')
villain_analysis = df_with_scores[df_with_scores['Perception_score'] < 0]

Among the negatively seen characters, we want to see the words and characteristics associated with these characters on the plot summaries. To do this, we use a traditional nlp procedure as seen in class.

In [15]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from wordcloud import WordCloud

In [104]:
def get_associated_words(character_name, plot):

    associated_words = []
    characters_name_parts = character_name.lower().split()

    nlp = spacy.load("en_core_web_sm") # load the spacy model in english
    nlp_coref = spacy.load("en_coreference_web_trf") # load the spacy model with coreference resolution
    nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
    nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])
    nlp.add_pipe("coref", source=nlp_coref)
    nlp.add_pipe("span_resolver", source=nlp_coref)

    doc = nlp(plot) # Tokenization
    coref_dict = doc.spans

    nb_clusters = int(len(coref_dict)/2)
    corref_clusters = []

    for i in range(nb_clusters):
        corref_list = coref_dict[f'coref_head_clusters_{i+1}']
        corref_list = [token.text for token in corref_list]
        corref_clusters.append(corref_list)

    for corref_words in corref_clusters:
        corref_words = [word.lower() for word in corref_words]
        if any(name_part in corref_words for name_part in characters_name_parts):
            corref_list = corref_words
            break

    corref_list = [word for word in corref_list if word not in nlp.Defaults.stop_words]
    characters_name_parts.extend(corref_list)
    characters_name_parts = list(set(characters_name_parts))

    sentences = [sent.text for sent in doc.sents]

    for sentence in sentences:

        sentence_text = sentence.lower()

        if any(name_part in sentence_text for name_part in characters_name_parts): # we want to focus on the sentences where the character appears
        
            for token in nlp(sentence_text):

                token_word = token.text
                token_head = token.head.text
                token_children = [child.text for child in token.children]

                # if token_word is refering the character, we want to keep the head and the children
                if token_word.lower() in characters_name_parts:
                    associated_words.append(token_head)
                    associated_words.extend(token_children)

                # if token_word is an adjective, we want to keep it if the head is refering the character
                if token.pos_ == 'ADJ' and token_head.lower() in characters_name_parts:
                    associated_words.append(token_word)

                # if one of the children is reffering the character, we want to keep the other children
                for child in token_children:
                    if child.lower() in characters_name_parts:
                        associated_words.extend(token_children)


    associated_words = [word.lower() for word in associated_words]
    associated_words.extend(characters_name_parts)

    # We want to remove from the list : character name, stopwords, punctuation, and duplicates
    associated_words = list(set(associated_words))
    associated_words = [word for word in associated_words if word.lower() != character_name.lower()]
    associated_words = [word for word in associated_words if word not in nlp.Defaults.stop_words]
    associated_words = [word for word in associated_words if word.isalpha()]

    return associated_words

# WAAAAAAOUUUUTCH

In [16]:
# PREPROCESSING

def get_associated_words(character_name, plot):

    nlp = spacy.load("en_core_web_sm") # load the spacy model in english
    nlp_coref = spacy.load("en_coreference_web_trf") # load the spacy model with coreference resolution
    nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
    nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])
    nlp.add_pipe("coref", source=nlp_coref)
    nlp.add_pipe("span_resolver", source=nlp_coref)

    plot = " ".join(plot.split()) # make sure there is only one space between words
    plot = plot.lower() # lowercase
    doc = nlp(plot) # Tokenization

    coref_dict = doc.spans
    
    associated_words = []
    characters_name_parts = character_name.lower().split()

    sentences = [sent.text for sent in doc.sents]

    for sentence in sentences:

        sentence_doc = nlp(sentence)
        sentence_text = sentence.lower()

        

        keep_sentence = (
            any(name_part in sentence_text for name_part in characters_name_parts) or
            any(token.pos_ == "PRON" and token._.in_coref for token in sentence_doc)
        )

        if character_name.lower() in sentence: # we want to focus on the sentences where the character appears
            for token in nlp(sentence):
                token_word = token.text
                token_head = token.head.text
                token_children = [child.text for child in token.children]

                # if token_word is the character name, we want to keep the head and the children
                if token_word.lower() == character_name.lower():
                    associated_words.append(token_head)
                    associated_words.extend(token_children)

                # if token_word is an adjective, we want to keep it if the head is the character name
                if token.pos_ == 'ADJ' and token_head.lower() == character_name.lower():
                    associated_words.append(token_word)

    # We want to remove from the list : character name, stopwords, punctuation, and duplicates
    associated_words = list(set(associated_words))
    associated_words = [word for word in associated_words if word.lower() != character_name.lower()]
    associated_words = [word for word in associated_words if word not in nlp.Defaults.stop_words]
    associated_words = [word for word in associated_words if word.isalpha()]
    
    return associated_words

# WATCHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

In [37]:
character_name = "Paul Watson"
plot = "Paul is brilliant. He is a journalist. He is amazing. This journalist loves pizzas. Alfred is a friend of Paul. He is a good friend."


associated_words = []
characters_name_parts = character_name.lower().split()

nlp = spacy.load("en_core_web_sm") # load the spacy model in english
nlp_coref = spacy.load("en_coreference_web_trf") # load the spacy model with coreference resolution
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])
nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)

doc = nlp(plot) # Tokenization
coref_dict = doc.spans

In [63]:
nb_clusters = int(len(coref_dict)/2)
corref_clusters = []

for i in range(nb_clusters):
    corref_list = coref_dict[f'coref_head_clusters_{i+1}']
    corref_list = [token.text for token in corref_list]
    corref_clusters.append(corref_list)

for corref_words in corref_clusters:
    corref_words = [word.lower() for word in corref_words]
    if any(name_part in corref_words for name_part in characters_name_parts):
        corref_list = corref_words
        break

corref_list = [word for word in corref_list if word not in nlp.Defaults.stop_words]
characters_name_parts.extend(corref_list)
characters_name_parts = list(set(characters_name_parts))

sentences = [sent.text for sent in doc.sents]

In [70]:
characters_name_parts

['watson', 'paul', 'journalist']

In [None]:
for sentence in sentences:

    sentence_text = sentence.lower()

    if any(name_part in sentence_text for name_part in characters_name_parts): # we want to focus on the sentences where the character appears

        nb = 1
        print("SENTENCE NUMBER", nb)
    
        for token in nlp(sentence_text):

            token_word = token.text
            token_head = token.head.text
            token_children = [child.text for child in token.children]

            print("TOKEN WORD", token_word)
            print("TOKEN HEAD", token_head)
            print("TOKEN CHILDREN", token_children) 
                

            # if token_word is refering the character, we want to keep the head and the children
            if token_word.lower() in characters_name_parts:
                associated_words.append(token_head)
                associated_words.extend(token_children)

                print(token_word, "is refering the character !")
                print("We aWWWWdd to the associated words the following words:", token_head, token_children)

            # if token_word is an adjective, we want to keep it if the head is refering the character
            if token.pos_ == 'ADJ' and token_head.lower() in characters_name_parts:
                associated_words.append(token_word)
                print(token_word, "is an adjective refering the character !")
                print("We add to the associated words the following word:", token_word)

            # if one of the children is reffering the character, we want to keep the other children
            for child in token_children:
                if child.lower() in characters_name_parts:
                    associated_words.extend(token_children)
                    print(child, "is refering the character !")
        nb += 1


associated_words = [word.lower() for word in associated_words]
associated_words.extend(characters_name_parts)
# We want to remove from the list : character name, stopwords, punctuation, and duplicates
associated_words = list(set(associated_words))
associated_words = [word for word in associated_words if word.lower() != character_name.lower()]
associated_words = [word for word in associated_words if word not in nlp.Defaults.stop_words]
associated_words = [word for word in associated_words if word.isalpha()]


SENTENCE NUMBER 1
TOKEN WORD paul
TOKEN HEAD is
TOKEN CHILDREN []
paul is refering the character !
We aWWWWdd to the associated words the following words: is []
TOKEN WORD is
TOKEN HEAD is
TOKEN CHILDREN ['paul', 'brilliant', '.']
paul is refering the character !
TOKEN WORD brilliant
TOKEN HEAD is
TOKEN CHILDREN []
TOKEN WORD .
TOKEN HEAD is
TOKEN CHILDREN []
SENTENCE NUMBER 1
TOKEN WORD he
TOKEN HEAD is
TOKEN CHILDREN []
TOKEN WORD is
TOKEN HEAD is
TOKEN CHILDREN ['he', 'journalist', '.']
journalist is refering the character !
TOKEN WORD a
TOKEN HEAD journalist
TOKEN CHILDREN []
TOKEN WORD journalist
TOKEN HEAD is
TOKEN CHILDREN ['a']
journalist is refering the character !
We aWWWWdd to the associated words the following words: is ['a']
TOKEN WORD .
TOKEN HEAD is
TOKEN CHILDREN []
SENTENCE NUMBER 1
TOKEN WORD this
TOKEN HEAD journalist
TOKEN CHILDREN []
TOKEN WORD journalist
TOKEN HEAD loves
TOKEN CHILDREN ['this']
journalist is refering the character !
We aWWWWdd to the associated wo

In [102]:
characters_name_parts

['watson', 'paul', 'journalist']

In [103]:
associated_words

['watson', 'paul', 'journalist', 'brilliant', 'loves', 'pizzas']

In [69]:
characters_name_parts

['watson', 'paul', 'journalist']

In [68]:
sentences

['Paul is brilliant.',
 'He is a journalist.',
 'He is amazing.',
 'This journalist loves pizzas.',
 'Alfred is a friend of Paul.',
 'He is a good friend.']

In [65]:
# for sentence in sentences:

#     sentence_doc = nlp(sentence)
#     sentence_text = sentence.lower()

#     if character_name.lower() in sentence: # we want to focus on the sentences where the character appears
#         for token in nlp(sentence):
#             token_word = token.text
#             token_head = token.head.text
#             token_children = [child.text for child in token.children]

#             # if token_word is the character name, we want to keep the head and the children
#             if token_word.lower() == character_name.lower():
#                 associated_words.append(token_head)
#                 associated_words.extend(token_children)

#             # if token_word is an adjective, we want to keep it if the head is the character name
#             if token.pos_ == 'ADJ' and token_head.lower() == character_name.lower():
#                 associated_words.append(token_word)

# # We want to remove from the list : character name, stopwords, punctuation, and duplicates
# associated_words = list(set(associated_words))
# associated_words = [word for word in associated_words if word.lower() != character_name.lower()]
# associated_words = [word for word in associated_words if word not in nlp.Defaults.stop_words]
# associated_words = [word for word in associated_words if word.isalpha()]


  with torch.cuda.amp.autocast(self._mixed_precision):


In [31]:
corref_list

['Alfred', 'He']

In [22]:
characters_name_parts

corref_list

corref_clusters

[['paul', 'he', 'he', 'paul'], ['journalist', 'journalist'], ['alfred', 'he']]

# BLAAAAAAAAAAAAAAAAAAAAAA

# DRAFT NLP

In [None]:
character_name = "Evelyn Sharp"
characters_name_parts = character_name.lower().split()

print(characters_name_parts)

In [None]:
character_name = "Evelyn Sharp"

plot = """The cunning detective, Evelyn Sharp, is tasked with solving a mysterious series of murders 
            in a small town. The local police are baffled, and there is tension in the air as each new 
            murder brings the town closer to chaos. The detective's sharp instincts lead her to suspect 
            the charming but secretive businessman, Victor Blackwell, who seems to be hiding something. 
            Meanwhile, Victor's associate, Lena Monroe, insists that he's innocent, but her odd behavior 
            raises suspicion. As Evelyn digs deeper, she uncovers a web of lies and deceit. Lena, however, 
            is not what she seems and may have a hidden agenda. Evelyn must navigate these lies carefully 
            while dealing with the town's growing fear. In the end, she discovers that the real mastermind 
            is Lena, whose jealousy and betrayal set the stage for the murders. As the truth comes to light, 
            Evelyn faces a moral dilemma — should she reveal the whole story, or protect some of the innocent 
            who were unwittingly involved?"""

nlp = spacy.load("en_coreference_web_trf") # load the spacy model in english
plot = " ".join(plot.split()) # make sure there is only one space between words
plot = plot.lower() # lowercase
doc = nlp(plot) # Tokenization

associated_words = []

sentences = [sent.text for sent in doc.sents]

for sentence in sentences:
    if character_name.lower() in sentence: # we want to focus on the sentences where the character appears
        for token in nlp(sentence):
            token_word = token.text
            token_head = token.head.text
            token_children = [child.text for child in token.children]


            # if token_word is the character name, we want to keep the head and the children
            if token_word.lower() == character_name.lower():
                associated_words.append(token_head)
                associated_words.extend(token_children)

            # if token_word is an adjective, we want to keep it if the head is the character name
            if token.pos_ == 'ADJ' and token_head.lower() == character_name.lower():
                associated_words.append(token_word)

            # if the character name is in a list of children, we want to keep the head and the other children
            if character_name.lower() in token_children:
                associated_words.append(token_head)
                associated_words.extend([child for child in token_children if child.lower() != character_name.lower()])

# We want to remove from the list : character name, stopwords, punctuation, and duplicates
associated_words = list(set(associated_words))
associated_words = [word for word in associated_words if word.lower() != character_name.lower()]
associated_words = [word for word in associated_words if word not in nlp.Defaults.stop_words]
associated_words = [word for word in associated_words if word.isalpha()]

print(associated_words)


# DRAFT NLP GPT

In [None]:
character_name = "Paul"

plot = "Paul is brilliant. He is a journalist. He is amazing. This journalist loves pizzas. Alfred is a friend of Paul. He is a good friend."

nlp = spacy.load("en_core_web_sm") # load the spacy model in english
nlp_corref = spacy.load("en_coreference_web_trf") # load the spacy model with coreference resolution
coref_clusters = nlp_corref(plot).spans

plot = " ".join(plot.split()) # make sure there is only one space between words
plot = plot.lower() # lowercase
doc = nlp(plot) # Tokenization

associated_words = []

sentences = [sent.text for sent in doc.sents]

for sentence in sentences:
    if character_name.lower() in sentence: # we want to focus on the sentences where the character appears
        for token in nlp(sentence):
            token_word = token.text
            token_head = token.head.text
            token_children = [child.text for child in token.children]


            # if token_word is the character name, we want to keep the head and the children
            if token_word.lower() == character_name.lower():
                associated_words.append(token_head)
                associated_words.extend(token_children)

            # if token_word is an adjective, we want to keep it if the head is the character name
            if token.pos_ == 'ADJ' and token_head.lower() == character_name.lower():
                associated_words.append(token_word)

            # if the character name is in a list of children, we want to keep the head and the other children
            if character_name.lower() in token_children:
                associated_words.append(token_head)
                associated_words.extend([child for child in token_children if child.lower() != character_name.lower()])

# We want to remove from the list : character name, stopwords, punctuation, and duplicates
associated_words = list(set(associated_words))
associated_words = [word for word in associated_words if word.lower() != character_name.lower()]
associated_words = [word for word in associated_words if word not in nlp.Defaults.stop_words]
associated_words = [word for word in associated_words if word.isalpha()]

print(associated_words)



In [None]:
import spacy

plot = "The glorious cats were startled by the ugly dog as it growled at them."
character_name = "cats"


nlp = spacy.load("en_coreference_web_trf")  # load the spacy model in English
plot = " ".join(plot.split())  # make sure there is only one space between words
plot = plot.lower()  # lowercase the text
doc = nlp(plot)  # Tokenization

associated_words = []

# Get sentences
sentences = [sent.text for sent in doc.sents]

# Process coreference (to handle pronouns and other referring terms)
coref_clusters = doc._.coref_clusters if doc._.has_annotation("coref_clusters") else []

# Generate a list of referring words (pronouns or common referring words)
referring_words = set()
for cluster in coref_clusters:
    for mention in cluster:
        referring_words.add(mention.text.lower())  # add referring words like "he", "she", etc.

referring_words.add(character_name.lower())  # Add the character's full name

# Iterate over sentences
for sentence in sentences:
    # Check if the sentence contains part of the character's name or a referring word
    if any(word in sentence.lower() for word in referring_words):
        for token in nlp(sentence):
            token_word = token.text
            token_head = token.head.text
            token_children = [child.text for child in token.children]

            # If token_word is the character name, we want to keep the head and the children
            if token_word.lower() == character_name.lower():
                associated_words.append(token_head)
                associated_words.extend(token_children)

            # If token_word is an adjective, we want to keep it if the head is the character name
            if token.pos_ == 'ADJ' and token_head.lower() == character_name.lower():
                associated_words.append(token_word)

# Remove unwanted words: character name, stopwords, punctuation, and duplicates
associated_words = list(set(associated_words))
associated_words = [word for word in associated_words if word.lower() != character_name.lower()]
associated_words = [word for word in associated_words if word not in nlp.Defaults.stop_words]
associated_words = [word for word in associated_words if word.isalpha()]




# DRAFT NLP

In [2]:
import spacy
import spacy_experimental

In [None]:
nlp = spacy.load("en_core_web_sm")
nlp_coref = spacy.load("en_coreference_web_trf")

nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)

doc = nlp("Paul is brilliant. He is a journalist. He is amazing. This journalist loves pizzas. Alfred is a friend of Paul. He is a good friend.")

In [37]:
dict = doc.spans

In [None]:
dict

In [None]:
# extract first value from the dictionary


In [None]:
character_name = "Paul"

nb_clusters = int(len(dict)/2)
corref_clusters = []

for i in range(nb_clusters):
    corref_list = dict[f'coref_head_clusters_{i+1}']
    corref_list = [token.text for token in corref_list]
    corref_clusters.append(corref_list)

for corref_words in corref_clusters:
    if character_name in corref_words:
        corref_list = corref_words





In [None]:
list_of_words_1 = ['Paul', 'brilliant', 'journalist', 'amazing', 'pizzas', 'Alfred', 'friend', 'good']
list_of_words_2 = ['Cat', 'unknown', 'pursuit']

dict_of_lists = {'list_1': list_of_words_1, 'list_2': list_of_words_2}

nb_values = len(dict_of_lists)

empty_list = []

for i in range(nb_values): # add the words of each list to the empty list
    empty_list.extend(dict_of_lists[f'list_{i+1}'])

print(empty_list)

In [None]:
names

In [None]:
doc2 = nlp("Paul is a brilliant journalist. He is amazing. This journalist loves pizzas")
doc2.spans

In [None]:
import coreferee


nlp = spacy.load("en_core_web_sm") # load the spacy model in english
nlp.add_pipe('coreferee')
plot = " ".join(plot.split()) # make sure there is only one space between words
plot = plot.lower() # lowercase
doc = nlp(plot) # Tokenization

associated_words = []
characters_name_parts = character_name.lower().split()

sentences = [sent.text for sent in doc.sents]

for sentence in sentences:

    sentence_doc = nlp(sentence)
    sentence_text = sentence.lower()

    keep_sentence = (
        any(name_part in sentence_text for name_part in characters_name_parts) or
        any(token.pos_ == "PRON" and token._.in_coref for token in sentence_doc)
    )

    

In [None]:
sentences

In [None]:
example_text = villain_analysis['Plot'].iloc[0]
print("length of the plot: ", len(example_text))
preprocessed_text = preprocess_text(example_text)
print("length of the preprocessed plot: ", len(preprocessed_text))

In [None]:
preprocessed_text

# DRAFT FURTHER CLEANING

In [10]:
# Add the column 'Movie_genres' from the movies dataframe to the names_and_plot dataframe

movies = movies.rename(columns={'Wikipedia_movie_ID': 'Wikipedia_Movie_ID'})
names_plot_genres = pd.merge(names_and_plot, movies[['Wikipedia_Movie_ID', 'Movie_genres']], on='Wikipedia_Movie_ID', how='inner')

In [None]:
names_plot_genres

# DRAFT 1