In [19]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import ast
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

In [20]:
# change this to the path where the doc with the extracted personas is 
personas_df = pd.read_csv("../war_action_movies_with_persons.csv")
personas_df.head()

Unnamed: 0,name,release_date,revenue,runtime,languages,country,genres,plot,Persona
0,Army of Darkness,1992.0,21502796.0,81.0,English,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti...","After being pulled through a time portal, Ash ...","{'Williams': {'lands'}, 'Arthur': {'war'}, 'As..."
1,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,"{'Ben Cameron': {'guilty', 'leading', 'holds',..."
2,Barry Lyndon,1975.0,20000000.0,187.0,"French , English , German","United States of America, United Kingdom","Costume drama, Film adaptation, Period piece, ...",:By What Means Redmond Barry Acquired the Sty...,"{'Redmond Barry': {'decides', 'reveals', 'refu..."
3,Buffy the Vampire Slayer,1992.0,16624456.0,86.0,English,United States of America,"Action, Horror, Comedy, Horror Comedy",Buffy Summers is introduced as a stereotypica...,"{'Buffy Summers': {'confronts', 'Slayer', 'rea..."
4,Braveheart,1995.0,211409945.0,175.0,"French , Latin , English , Gaelic",United States of America,"Biography, Adventure, History, War film, Actio...","In the 13th century, after several years of po...","{'Longshanks': {'implores', 'seeks', 'sees', '..."


All of the personas of one film are included in the row of the dataframe corresponding to that film. We want to have one row for each persona. We also want to have one column with the name of the persona and another with the associated words (without any punctuation marks)

In [21]:
# define a function to split the personas to the different lines

def split_personas(dataframe):
    new_rows = []
    # Iterate over each row in the DataFrame
    for index, row in dataframe.iterrows():
        persona_dict = ast.literal_eval(row["Persona"]) # take all of the personas in a film
        for persona, actions in persona_dict.items():
            new_row = row.drop("Persona") 
            persona_series = pd.Series(
                {
                    "persona": persona,  # Add the persona name
                    "actions": actions,  # Add the actions
                }
            )
            new_row = pd.concat([new_row, persona_series])
            new_rows.append(new_row)

    # Convert the list of new rows into a new DataFrame
    dataframe = pd.DataFrame(new_rows)

    dataframe["actions_clean"] = dataframe["actions"].map(
        lambda x: re.sub(r"[{}':,.!?]", "", " ".join(x))
    )
    dataframe["actions_clean"] = dataframe["actions_clean"].map(
        lambda x: x.lower()
    )
    dataframe = dataframe[
    dataframe["actions_clean"].str.split().str.len() > 1
]
    return dataframe

In [22]:
personas_df = split_personas(personas_df)

personas_df.head()

Unnamed: 0,name,release_date,revenue,runtime,languages,country,genres,plot,persona,actions,actions_clean
2,Army of Darkness,1992.0,21502796.0,81.0,English,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti...","After being pulled through a time portal, Ash ...",Ash Williams,"{rushes, finds, ducks, grabs, crashes, grows, ...",rushes finds ducks grabs crashes grows tries k...
3,Army of Darkness,1992.0,21502796.0,81.0,English,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti...","After being pulled through a time portal, Ash ...",Ash,"{buries, leads, starts, finds, slays, kills, d...",buries leads starts finds slays kills demands ...
4,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Ben Cameron,"{guilty, finds, make, leading, holds, idolizes...",guilty finds make leading holds idolizes rescu...
5,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Stoneman,"{younger, leaves, go}",younger leaves go
6,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Elsie Stoneman,"{leave, go, wants, looking, gains, warns, jump...",leave go wants looking gains warns jump frees ...


In [23]:
# apply filters to select only certain types of movies (country, genre ...)
personas_df = personas_df[
    personas_df["country"].str.contains("United States of America", na=False)
]
# Display the new DataFrame
personas_df.head()

Unnamed: 0,name,release_date,revenue,runtime,languages,country,genres,plot,persona,actions,actions_clean
2,Army of Darkness,1992.0,21502796.0,81.0,English,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti...","After being pulled through a time portal, Ash ...",Ash Williams,"{rushes, finds, ducks, grabs, crashes, grows, ...",rushes finds ducks grabs crashes grows tries k...
3,Army of Darkness,1992.0,21502796.0,81.0,English,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti...","After being pulled through a time portal, Ash ...",Ash,"{buries, leads, starts, finds, slays, kills, d...",buries leads starts finds slays kills demands ...
4,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Ben Cameron,"{guilty, finds, make, leading, holds, idolizes...",guilty finds make leading holds idolizes rescu...
5,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Stoneman,"{younger, leaves, go}",younger leaves go
6,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Elsie Stoneman,"{leave, go, wants, looking, gains, warns, jump...",leave go wants looking gains warns jump frees ...


In [None]:
# define a function to do all of the lda 
# input dataframe 

In [41]:
# define a function to do all of the lda 
# input dataframe, number of clusters
def run_lda(dataframe, nb_clusters):
    # LDA function from sklearn takes a list with the "documents" as input -> transform each persona in the dataframe to an entry in the document list
    documents_list = dataframe["actions_clean"].tolist()
    words = [word for string in documents_list for word in string.split()]
    word_counts = Counter(words)
    unique_words = [word for word, count in word_counts.items() if count == 1]
    updated_documents = [
    " ".join([word for word in string.split() if word not in unique_words])
    for string in documents_list
    ]
    documents_list = [string for string in updated_documents if len(string.split()) > 1]

    # tokenize and vectorize the document list
    tokenizer = RegexpTokenizer(r"\w+")
    vectorizer = CountVectorizer()  
    tfidf = TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1, 1),
        tokenizer=tokenizer.tokenize,
    )
    train_data = tfidf.fit_transform(documents_list)

    # Create LDA object
    model = LatentDirichletAllocation(n_components=nb_clusters)

    # Fit and Transform SVD model on data
    lda_matrix = model.fit_transform(train_data)
    # Get Components
    lda_components = model.components_
    feature_names = tfidf.get_feature_names_out()
    top_words_per_topic = {}
    for topic_idx, topic in enumerate(lda_components):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]  # Top 10 words for each topic
        top_words_per_topic[topic_idx] = top_words

    # Get Document-Topic Distribution
    doc_topic_distribution = lda_matrix  # Each document's topic distribution

    # Return results
    return {
        "document_topic_distribution": doc_topic_distribution,
        "topic_word_distribution": top_words_per_topic,
        "lda_model": model,
        "tfidf": tfidf,
    }



In [42]:

# run the LDA
lda_results = run_lda(personas_df, 20)

topic_word_distribution = lda_results["topic_word_distribution"]

for index, top_words in topic_word_distribution.items():
    print(f"Topic {index}: {', '.join(top_words)}")




Topic 0: says, fall, fly, tells, returns, man, manages, taking, work, discovers
Topic 1: leads, marries, track, trying, real, lives, refuses, died, watches, spy
Topic 2: dead, explains, suggests, hiding, allows, picks, rides, plays, tells, look
Topic 3: begins, finds, end, holds, enter, takes, kidnaps, responsible, love, able
Topic 4: tries, discovers, uses, follow, falls, advises, pulls, owns, missing, pull
Topic 5: begin, reveals, dies, tells, set, runs, remains, survives, takes, refuses
Topic 6: try, plan, revealed, late, carrying, wealthy, concludes, steal, walk, locate
Topic 7: shoots, kills, warns, meets, knocks, does, tells, confronts, kisses, changes
Topic 8: alive, admits, come, finds, commits, travels, threatens, teaches, kills, befriends
Topic 9: learn, realize, private, beautiful, survived, working, told, kill, knew, corrupt
Topic 10: making, send, convinces, keeps, looks, attend, carry, surprised, good, wears
Topic 11: named, wants, tells, old, calls, leave, agent, seeks, 

In [43]:
def match_personas_topic(dataframe, tfidf, model):

    for index, row in dataframe.iterrows():
        wordlist = dataframe.loc[index, "actions_clean"]
        wordlist = wordlist.split()
        word = wordlist[0]
        word_index = tfidf.vocabulary_.get(word)

        if word_index is not None:
        # Get the topic-word distribution from the LDA model
            topic_word_distribution = model.components_  # Shape (n_topics, n_words)

        # Get the distribution of the word across all topics
            word_topic_distribution = topic_word_distribution[:, word_index]

        # Find the topic with the highest probability for the word
            best_topic = np.argmax(word_topic_distribution)
            dataframe.loc[index, "topic"] = best_topic

In [None]:
personas_df["topic"] = None

match_personas_topic(personas_df, lda_results["tfidf"], lda_results["lda_model"])
personas_df.head()

Unnamed: 0,name,release_date,revenue,runtime,languages,country,genres,plot,persona,actions,actions_clean,topic
2,Army of Darkness,1992.0,21502796.0,81.0,English,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti...","After being pulled through a time portal, Ash ...",Ash Williams,"{rushes, finds, ducks, grabs, crashes, grows, ...",rushes finds ducks grabs crashes grows tries k...,0
3,Army of Darkness,1992.0,21502796.0,81.0,English,United States of America,"Cult, Horror, Stop motion, Costume drama, Acti...","After being pulled through a time portal, Ash ...",Ash,"{buries, leads, starts, finds, slays, kills, d...",buries leads starts finds slays kills demands ...,16
4,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Ben Cameron,"{guilty, finds, make, leading, holds, idolizes...",guilty finds make leading holds idolizes rescu...,10
5,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Stoneman,"{younger, leaves, go}",younger leaves go,15
6,The Birth of a Nation,1915.0,50000000.0,190.0,"Silent film, English",United States of America,"Silent film, Indie, Costume drama, Epic, Black...",The film follows two juxtaposed families: the...,Elsie Stoneman,"{leave, go, wants, looking, gains, warns, jump...",leave go wants looking gains warns jump frees ...,11


(4478,)
29
