In [1]:
import os
import json
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.metrics import adjusted_rand_score
from torch.utils.data import DataLoader
from torch.optim import Adam
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

In [2]:
bags_folder = '../data/char_bags' #Bag of words json file from build_char_word_bags.py output
tropes_file = '../data/tropes/tvtropes.clusters.txt' #tropes from CMU dataset

In [3]:
def load_character_bags(json_folder):
    character_bags = {}
    for filename in os.listdir(json_folder):
        if filename.endswith('.json'):
            with open(os.path.join(json_folder, filename)) as f:
                data = json.load(f)
                for character in data:
                    name = character['name']
                    bag_words = { "agent verb": [], "patient verb": [], "attribute": []}
                    for pair in character['bag']:
                        bag_words[pair[0]].append(pair[1])
                    
                    character_bags[name] = bag_words
    return character_bags

#load character bags of words
char_full = load_character_bags(bags_folder)

In [4]:
agent_texts = [" ".join(data["agent verb"]) for data in char_full.values()]
patient_texts = [" ".join(data["patient verb"]) for data in char_full.values()]
attribute_texts = [" ".join(data["attribute"]) for data in char_full.values()]

agent_vectorizer = CountVectorizer()
patient_vectorizer = CountVectorizer()
attribute_vectorizer = CountVectorizer()

agent_matrix = agent_vectorizer.fit_transform(agent_texts)
patient_matrix = patient_vectorizer.fit_transform(patient_texts)
attribute_matrix = attribute_vectorizer.fit_transform(attribute_texts)

agent_features = agent_vectorizer.get_feature_names_out()
patient_features = patient_vectorizer.get_feature_names_out()
attribute_features = attribute_vectorizer.get_feature_names_out()

combined_features = list(agent_features) + list(patient_features) + list(attribute_features)
dt_matrix = hstack([agent_matrix, patient_matrix, attribute_matrix])

print("Number of agent features:", len(agent_vectorizer.get_feature_names_out()))
print("Number of patient features:", len(patient_vectorizer.get_feature_names_out()))
print("Number of attribute features:", len(attribute_vectorizer.get_feature_names_out()))
print("Document-term matrix shape:\n", dt_matrix.shape)

Number of agent features: 9291
Number of patient features: 8627
Number of attribute features: 16799
Document-term matrix shape:
 (41529, 34717)


In [10]:
def load_tropes(tropes_file):
    trope_dict = {}
    with open(tropes_file) as f:
        for line in f:
            trope, char_data = line.split("\t")
            char_info = json.loads(char_data)
            name = char_info['char']
            trope_dict[name] = trope
    return trope_dict

#load character tropes
tropes = load_tropes(tropes_file)
print(f"Sample size: {len(tropes)}")
print(f"Number of unique tropes: {len(set(tropes.values()))}")

Sample size: 434
Number of unique tropes: 72


In [14]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=72) 
persona_distributions = lda.fit_transform(dt_matrix)

for idx, topic in enumerate(lda.components_):
    top_indices = topic.argsort()[-5:][::-1]  # Top 5 words for this topic
    print(f"Persona {idx}: {[combined_features[i] for i in top_indices]}")

Persona 0: ['form', 'crush', 'meet', 'question', 'help']
Persona 1: ['officer', 'take', 'investigate', 'have', 'discover']
Persona 2: ['have', 'visit', 'lover', 'dump', 'have']
Persona 3: ['old', 'father', 'year', 'sister', 'mary']
Persona 4: ['have', 'king', 'advise', 'ben', 'brother']
Persona 5: ['friend', 'pregnant', 'widow', 'worker', 'have']
Persona 6: ['murder', 'kill', 'drug', 'star', 'dealer']
Persona 7: ['joe', 'arrest', 'have', 'agree', 'tell']
Persona 8: ['new', 'assist', 'focus', 'york', 'find']
Persona 9: ['little', 'arrive', 'first', 'have', 'leave']
Persona 10: ['girlfriend', 'have', 'be', 'go', 'tell']
Persona 11: ['join', 'help', 'contact', 'find', 'kill']
Persona 12: ['go', 'tell', 'tell', 'have', 'find']
Persona 13: ['miss', 'sister', 'pregnant', 'host', 'leave']
Persona 14: ['family', 'johnny', 'play', 'al', 'be']
Persona 15: ['wealthy', 'rescue', 'willing', 'stick', 'end']
Persona 16: ['queen', 'refuse', 'live', 'have', 'beat']
Persona 17: ['discover', 'find', 'go'

In [26]:
def get_overlapping_characters(character_bags, trope_dict):
    common_characters = set(character_bags.keys()).intersection(trope_dict.keys())
    filtered_bags = {name: character_bags[name] for name in common_characters}
    filtered_tropes = {name: trope_dict[name] for name in common_characters}
    return filtered_bags, filtered_tropes

#update the characaters and tropes to only include ones whose trope we know and who we have the bag of words for
char_filtered, tropes_filtered = get_overlapping_characters(char_full, tropes)

In [50]:
cluster_dict = {i: [] for i in range(len(set(tropes.values())))}

print("Prediction (Only showing characters who exist in tvtropes.clusters.txt:")
full_pred = persona_distributions.argmax(axis=1)
for name in list(char_filtered.keys()): 
    label = full_pred[list(char_full.keys()).index(name)]
    cluster_dict[label].append(name)
for cluster, characters in cluster_dict.items():
    print(f"Cluster {cluster}: {', '.join(characters)}")
print("\nGroundtruth:")
grouped_by_trope = {}
for name, trope in tropes_filtered.items():
    if trope not in grouped_by_trope:
        grouped_by_trope[trope] = []
    grouped_by_trope[trope].append(name)
for trope, characters in grouped_by_trope.items():
    print(f"{trope}: {', '.join(characters)}")

Prediction (Only showing characters who exist in tvtropes.clusters.txt:
Cluster 0: 
Cluster 1: Ted "Theodore" Logan, Han, Belle
Cluster 2: Karen
Cluster 3: 
Cluster 4: Obi-Wan Kenobi, Agatha, Guy
Cluster 5: Rooster, Leonard
Cluster 6: Dreyfus, George McFly, Hannibal Lecter
Cluster 7: Abu, Lucius Fox
Cluster 8: 
Cluster 9: Ratchet, T. E. Lawrence, Dennis
Cluster 10: 
Cluster 11: Sol
Cluster 12: Sharpay Evans, Beth, Conan the Barbarian, Dorian Gray, Nick, Beck, Elle Woods, Van Wilder, Max Goldman, Mia Thermopolis
Cluster 13: 
Cluster 14: Bo
Cluster 15: Anakin Skywalker
Cluster 16: Timothy, Apollo Creed, Cindy Campbell
Cluster 17: James Norrington, Sheriff Woody, Max Rockatansky, Ed, James Bond, Wolverine, Frank Martin
Cluster 18: Indiana Jones
Cluster 19: Eli
Cluster 20: Riley Poole, Sam Flynn
Cluster 21: Blade
Cluster 22: Brad
Cluster 23: Arthur Bach, Ike Clanton, Garth, Carl Fredricksen, Doctor Doom
Cluster 24: Jim
Cluster 25: Mia
Cluster 26: 
Cluster 27: Walter Burns, Tony Stark, Shau