In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
import ipywidgets as widgets

In [None]:
from IPython.display import clear_output
import json

In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn import metrics
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
from utils.clustering import get_lda_clusters, get_vocab, word_topics_clustering, sort_meaningful, get_trf_clusters, topic_count
from utils.clustering_evaluation import get_characters_with_tv_trop_info, variation_of_information, group_labels_by_clusters

In [None]:
def _parse_list_cell(cell):
    """Convert one CSV cell holding a stringified list of words into a list.

    The cell text is stripped of surrounding square brackets, single quotes
    are removed, and the remainder is split on ", ".  A cell with nothing
    left after stripping (e.g. "[]") yields an empty list rather than a
    list containing one empty string.
    """
    inner = cell.strip("[]").replace("'", "")
    return inner.split(", ") if inner else []


characters = pd.read_csv(
    'data/character_clusters.csv',
    index_col=0,
    # List columns come back from the CSV as plain strings, so each one
    # needs the same converter to be turned back into a list.
    converters={column: _parse_list_cell for column in ("adj", "active", "patient")},
)

In [None]:
# Column names are supplied explicitly through `names` when reading the
# tab-separated movie metadata file.
movies = pd.read_csv(
    'data/MovieSummaries/movie.metadata.tsv',
    names=[
        'wiki_id', 'freebase_id', 'title', 'release_date', 'revenue',
        'runtime', 'languages', 'countries', 'genres',
    ],
    sep='\t',
)

# Left-join the movie metadata onto the character rows, then keep only the
# rows whose movie revenue is above 1e7 (rows without revenue drop out too,
# because the comparison is False for missing values).
characters_and_movies = (
    characters
    .merge(movies, on='wiki_id', how='left')
    .loc[lambda merged: merged['revenue'] > 1e7]
)

In [None]:
# Preview only the identifying columns of the merged frame.
characters_and_movies.loc[:, ['wiki_id', 'title', 'character', 'cluster']]

In [None]:
# Select the characters whose movie title contains "Lord of the Rings".
# `na=False` treats a missing title as "no match" (a NaN in the mask would make
# the boolean indexing raise), and `regex=False` matches the text literally.
is_lor_title = characters_and_movies['title'].str.contains(
    'Lord of the Rings', na=False, regex=False
)
lor_characters = characters_and_movies.loc[
    is_lor_title,
    ['wiki_id', 'title', 'character', 'cluster', 'adj', 'active', 'patient'],
]
lor_characters

In [None]:
# Read the JSON inside a `with` block so the file handle is closed as soon as
# parsing finishes instead of being left open until garbage collection.
with open('data/words_by_topic.json', 'r') as f:
    topics_dict = json.load(f)

lda_components = np.load('data/lda_components.npy')

# One topic name per line of the description file.
with open('topics_description.txt', 'r') as f:
    topics_names = f.read().splitlines()

In [None]:
def topic_distribution(cluster, lda_components, topics_names, n_top=20, topics_per_feature=200):
    """Return the highest-weighted topics of one cluster with their weights.

    Each position ``i`` in the cluster's weight vector is labelled with a
    feature (``i // topics_per_feature`` selects one of 'adjective',
    'active verb', 'patient verb') and a topic name
    (``topics_names[i % topics_per_feature]``).

    Args:
        cluster: Index of the row of ``lda_components`` to inspect.
        lda_components: Array indexed as ``lda_components[cluster]`` to obtain
            a 1-D vector of weights.
        topics_names: Sequence of topic names, indexed modulo
            ``topics_per_feature``.
        n_top: Maximum number of topics to return (default 20, the value that
            was previously hard-coded).
        topics_per_feature: Number of consecutive positions belonging to each
            feature (default 200, the value that was previously hard-coded).

    Returns:
        dict mapping ``"<feature>: <topic name>"`` to the corresponding weight,
        inserted from the largest weight to the smallest.  It holds fewer than
        ``n_top`` entries when the weight vector is shorter than ``n_top`` or
        when two selected positions produce the same label.

    Raises:
        IndexError: if a selected position falls outside the three known
            features or outside ``topics_names``.
    """
    features = ['adjective', 'active verb', 'patient verb']

    cluster_components = lda_components[cluster]
    # argsort is ascending, so reverse it and keep the first n_top positions.
    top_topics = np.argsort(cluster_components)[::-1][:n_top]

    topic_to_probability = {}
    for i in top_topics:
        feature = features[i // topics_per_feature]
        label = feature + ': ' + topics_names[i % topics_per_feature]
        topic_to_probability[label] = cluster_components[i]
    return topic_to_probability

In [None]:
# Quick check: how many topics come back for cluster 0.
len(topic_distribution(cluster=0, lda_components=lda_components, topics_names=topics_names))

In [None]:
# For every cluster that contains a "Lord of the Rings" character, list the
# characters in it and plot the cluster's top topics.
# Iterating over the sorted unique values makes the output order deterministic.
for cluster_id in sorted(lor_characters['cluster'].unique()):
    print(lor_characters[lor_characters['cluster'] == cluster_id])

    topic_distr = topic_distribution(cluster_id, lda_components, topics_names)
    # Size the bar positions from the result itself: it can hold fewer than 20
    # entries, and a fixed arange(20) would then not match the bar widths.
    y_pos = np.arange(len(topic_distr))

    fig, ax = plt.subplots()
    ax.barh(y_pos, list(topic_distr.values()))
    ax.set_yticks(y_pos)
    ax.set_yticklabels(list(topic_distr.keys()))
    ax.invert_yaxis()  # largest weight at the top
    ax.set_xscale('log')
    ax.set_xlabel('topic weight (log scale)')
    ax.set_title(f'Top topics for cluster {cluster_id}')
    plt.show()
    plt.close(fig)  # release the figure so repeated iterations do not accumulate