In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import warnings
import string
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import umap
import ast

pd.options.mode.chained_assignment = None

In [3]:
def get_transition_matrix_efficient(df, word_to_index, d, initial_values=1):
    
    # Initialize the transition counts 
    transition_counts = np.ones((d, d), dtype=int) * initial_values
    
    # Loop through each utterance
    for utterance in df['utterance']:
        if type(utterance) != str:
            try:
                utterance = str(utterance)
            except:
                continue
            
        words = utterance.split()
        
        # Loop through each word and the previous word in the utterance
        for i in range(1, len(words)):
            word1 = words[i - 1]
            word2 = words[i]
            
            # Add to transition counts
            if word1 in word_to_index and word2 in word_to_index:
                index1 = word_to_index[word1]
                index2 = word_to_index[word2]
                transition_counts[index2, index1] += 1
                
    # Normalize the columns to construct the transition matrix
    column_sums = transition_counts.sum(axis=0, keepdims=True)
    transition_probabilities = transition_counts / column_sums
    
    return transition_probabilities, word_to_index


def tensor_trouble(df, word_to_index, d, initial_values=1):
    # Construct your emission matrix 
    
    # Initialize the tensor
    tensor = np.ones((d, 2, 2), dtype=int) * initial_values
    
    # For each episode 
    for episode, group in df.groupby('episode'):
        if type(group['utterance']) != str:
            group['utterance'] = group['utterance'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
            
        # Get words by utterance
        text = " ".join(group['utterance'].astype(str))
        words = text.split()
        
        # Get label for if it is host or not host
        roles = np.concatenate(group.apply(lambda row: [row['is_host']] * len(str(row['utterance']).split()), axis=1).values)
        
        # Construct dictionary 
        for i in range(2, len(words)):
            word = words[i - 2]
            if word not in word_to_index:  # Check if the word exists in the dictionary
                continue  # Skip this iteration if the word is not found

            # get the roles of the current word and the next two words
            next_role = roles[i - 1]
            following_role = roles[i]

            # get the index of the current word
            current_index = word_to_index[word]

            # get the index of the roles
            current_role_index = 0 if next_role else 1
            next_role_index = 0 if following_role else 1
            
            tensor[current_index, next_role_index, current_role_index] += 1
            
    return tensor, word_to_index


def make_emission(host, guest):
    emission = np.zeros((len(host), len(host), 2))
    emission[:, :, 0] = host
    emission[:, :, 1] = guest
    return np.swapaxes(emission, 0, 1)


def tensor_viterbi(obs, transition, emission, initial):
    
    # Initialize all parameters
    b_eps = 1e-25
    start_index = obs[0]
    obs = obs[1:]
    n = len(obs)
    d = transition.shape[0]
    eta = np.zeros((n, 2))
    backpointers = np.zeros((n, 2), dtype=int)
    eta[0] = np.log(initial) + np.log(emission[start_index, obs[1], :])
    obs = obs[1:]
    
    # Perform the viterbi algorith modified to work on tensors 
    for i in range(1, n - 1):
        
        b = emission[obs[i - 1], obs[i], :]
        
        if np.any(b == 0):
            
            zero_index = np.where(b == 0)
            b[zero_index] = b_eps
            
        eta_candidate = np.log(transition[obs[i - 1], :, :]) + np.log(b)[:, np.newaxis] + eta[i - 1][np.newaxis, :]
        eta[i] = np.max(eta_candidate, axis=1)
        backpointers[i] = np.argmax(eta_candidate, axis=1)
        
    state_sequence = np.zeros(n, dtype=int)
    state_sequence[-1] = np.argmax(eta[-1])
    
    for i in range(n - 2, -1, -1):
        state_sequence[i] = backpointers[i + 1, state_sequence[i + 1]]
        
    return state_sequence


def pad_to_match(a, b):
    """
    Pads the shorter array with its last element to match the length of the longer array.
    
    Args:
        a (np.array): First array for comparison.
        b (np.array): Second array for comparison.
        
    Returns:
        np.array, np.array: The two arrays modified to have equal lengths.
    """
    # If they are already the same size
    if len(a) == len(b):
        return a, b
    
    # If a needs padding
    elif len(a) > len(b):
        padding = np.full(len(a) - len(b), b[-1])
        b_padded = np.concatenate((b, padding))
        return a, b_padded
    
    # If b needs padding
    else:
        padding = np.full(len(b) - len(a), a[-1])
        a_padded = np.concatenate((a, padding))
        return a_padded, b
    
    
def filter_episodes(df, host_id=None):
    """
    Filters the episodes by host
    """
    
    df_filtered = df[df['host_id'] != -1]
    
    if host_id is None:  
        top_host = df_filtered.groupby('host_id')['episode'].nunique().idxmax()
        
    else:
        top_host = host_id
    
    # Filter the episodes by the specific host
    top_host_episodes = df_filtered[df_filtered['host_id'] == top_host]['episode'].unique()
    df_top_host_all_utterances = df[df['episode'].isin(top_host_episodes)]
    utterance_counts = df_top_host_all_utterances.groupby('episode')['utterance'].count()
    episodes_over_30 = utterance_counts[utterance_counts > 30].index
    df_top_host_over_30 = df_top_host_all_utterances[df_top_host_all_utterances['episode'].isin(episodes_over_30)]
    
    return df_top_host_over_30.reset_index(drop=True)

In [4]:
test_size = 0.2
embedding_path = 'archive/embed_df_with_hosts_filtered.csv'
utterance_path = 'archive/processed_utterances-2sp.csv'
umap_components = 10
initial_state_probabilities = np.array([.5, .5])

# Load the data
episode_embedding_df = pd.read_csv('archive/embed_df_with_hosts.csv')

# get the host_ids that have at least 100 episodes
host_ids = episode_embedding_df['host_id'].value_counts()[episode_embedding_df['host_id'].value_counts() > 100].index

episode_embedding_df = episode_embedding_df[episode_embedding_df['host_id'].isin(host_ids)]
# Convert the string into a list and then into an array of floats
episode_embedding_df['embedding'] = episode_embedding_df['embedding'].apply(lambda x: np.array(ast.literal_eval(x)))
utterance_df = pd.read_csv(utterance_path)


In [19]:
pd.set_option('display.max_colwidth', 1000)
display(utterance_df[['utterance', 'is_host']].head(10))

Unnamed: 0,utterance,is_host
0,good morning,True
1,good morning lulu,False
2,all right,True
3,whats the latest,True
4,well the latest is that the lawyer for the first whistleblower is tweeting that he is now representing a second whistleblower who he say ha firsthand knowledge of these event around the president and ukraine,False
5,the first whistleblower only had second and xxxxx knowledge,False
6,we also know that there are subpoena for white house document and state department document and personnel,False
7,we dont know how cooperative the administration will be with those request,False
8,we know that the former ambassador to ukraine marie xxxxx is expected to testify thursday,False
9,shes xxxxx removed because the president personal lawyer rudy giuliani and others did not feel she xxxxx loyal enough to the president,False


In [21]:
display(pd.read_csv('archive/utterances-2sp.csv')[['utterance', 'is_host']].head(15))

Unnamed: 0,utterance,is_host
0,"The impeachment inquiry picks up tomorrow where it left off Friday, with a subpoena sent to a White House that's used to ignoring congressional requests, a State Department that missed its own subpoena deadline and investigators poring over text messages between U.S. diplomats discussing what exactly the president wanted from Ukraine.",True
1,"Just this morning, the lawyer for the whistleblower, whose complaint is at the base of this inquiry, says he's representing another whistleblower.",True
2,There's are a lot of moving parts.,True
3,"Fortunately, NPR's Mara Liasson is here to help.",True
4,Good morning.,True
5,"Good morning, Lulu.",False
6,All right.,True
7,What's the latest?,True
8,"Well, the latest is that the lawyer for the first whistleblower is tweeting that he is now representing a second whistleblower, who he says has firsthand knowledge of these events around the president and Ukraine.",False
9,The first whistleblower only had second and thirdhand knowledge.,False


In [5]:

# get the number of unique hosts
host_ids = episode_embedding_df['host_id'].unique()
n_components = 6

episodes = episode_embedding_df['episode'].unique()
np.random.shuffle(episodes)
split_index = int(len(episodes) * 0.8)
train_episodes = episodes[:split_index]
test_episodes = episodes[split_index:]


X_train = episode_embedding_df[episode_embedding_df['episode'].isin(train_episodes)]['embedding']
X_test = episode_embedding_df[episode_embedding_df['episode'].isin(test_episodes)]['embedding']
y_train = episode_embedding_df[episode_embedding_df['episode'].isin(train_episodes)]['host_id']
y_test = episode_embedding_df[episode_embedding_df['episode'].isin(test_episodes)]['host_id']

# Convert the labels to integers
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Convert the lists of floats into numpy arrays
X_train = np.array([np.array(x) for x in X_train])
X_test = np.array([np.array(x) for x in X_test])
# Reduce dimensions to 20 with UMAP
umap_reducer = umap.UMAP(n_components=umap_components)
X_reduced = umap_reducer.fit_transform(X_train)


print(f"Fitting GMM with {n_components} components")
gmm = GaussianMixture(n_components=n_components)
gmm.fit(X_reduced)

# Predict cluster labels
train_cluster_labels = gmm.predict(X_reduced)

# reduce the dimensions of the test set
X_test_reduced = umap_reducer.transform(X_test)

# Predict the cluster labels of the test set
test_cluster_labels = gmm.predict(X_test_reduced)

# # create dictionary to map train cluster labels to embeddings
train_cluster_to_embedding = {cluster: [] for cluster in set(train_cluster_labels)}
for cluster, embedding in zip(train_cluster_labels, X_train):
    train_cluster_to_embedding[cluster].append(embedding)

# create dictionary to map embeddings to episode ids
embedding_to_episode = {tuple(embedding): episode for embedding, episode in zip(episode_embedding_df['embedding'], episode_embedding_df['episode'])}

# create dictionary to map train cluster labels to episoded ids
train_cluster_to_episode = {cluster: [] for cluster in set(train_cluster_labels)}
for cluster, embedding in zip(train_cluster_labels, X_train):
    train_cluster_to_episode[cluster].append(embedding_to_episode[tuple(embedding)])

# create dictionary to map test cluster labels to embeddings
test_cluster_to_embedding = {cluster: [] for cluster in set(test_cluster_labels)}
for cluster, embedding in zip(test_cluster_labels, X_test):
    test_cluster_to_embedding[cluster].append(embedding)


# create dictionary to map test cluster labels to episoded ids
test_cluster_to_episode = {cluster: [] for cluster in set(test_cluster_labels)}
for cluster, embedding in zip(test_cluster_labels, X_test):
    test_cluster_to_episode[cluster].append(embedding_to_episode[tuple(embedding)])


Fitting GMM with 6 components


In [56]:
results_dict = {episode: [] for episode in episode_embedding_df[episode_embedding_df['episode'].isin(test_episodes)]['episode'].unique()}

# Iterate through each unique test cluster label
for cluster in set(test_cluster_labels):
    print(f"Cluster {cluster}")
    # Get the embeddings of the cluster
    cluster_embeddings = test_cluster_to_embedding[cluster]
    # Get the episode ids of the test episodes in the cluster
    cluster_test_episodes = test_cluster_to_episode[cluster]

    # Get the episode ids of the train episodes in the cluster
    cluster_train_episodes = train_cluster_to_episode[cluster]

    # Filter the utterance df to only include the episodes in the cluster for training and testing
    filtered_df = utterance_df[utterance_df['episode'].isin(cluster_train_episodes + cluster_test_episodes)][['is_host', 'episode', 'utterance']]

    # Split each utterance into a list of words and explode the DataFrame to get a row per word
    words_series = filtered_df['utterance'].str.split().explode()
    unique_words = set(words_series)
    word_frequencies = words_series.value_counts().to_dict()
    word_indices = {word: i for i, word in enumerate(unique_words)}

    # Filter the filter df to only include the episodes in the cluster for training and testing
    training_df = filtered_df[filtered_df['episode'].isin(cluster_train_episodes)]
    testing_df = filtered_df[filtered_df['episode'].isin(cluster_test_episodes)]

    print(len(training_df))
    print(len(testing_df))

    # Get the transition matrix for the training data
    transition_matrix_host, word_to_index_host = get_transition_matrix_efficient(training_df[training_df['is_host'] == True], word_indices, len(unique_words))
    transition_matrix_guest, word_to_index_guest = get_transition_matrix_efficient(training_df[training_df['is_host'] == False], word_indices, len(unique_words))

    # Get the tensor for the training data
    tensor, word_to_index = tensor_trouble(training_df, word_indices, len(unique_words))

    # Normalize the tensor
    tensor_normalized = tensor / (tensor.sum(axis=1, keepdims=True))

    # Get the emission matrix for the training data
    emission_matrix = make_emission(transition_matrix_host, transition_matrix_guest)

    for episode in cluster_test_episodes:
        if episode not in results_dict:
            results_dict[episode] = []
        test_episode_df = testing_df[testing_df['episode'] == episode]
        test_episode_df['utterance'] = test_episode_df['utterance'].str.split()
        test_episode_df = test_episode_df.explode('utterance')
        test_label = test_episode_df['is_host'].to_numpy().astype(int)
        test_episode_words = ' '.join(test_episode_df['utterance'].astype(str)).split()
        test_episode_words = [word for word in test_episode_words if word != 'nan']
        test_episode_word_indices = [word_indices.get(word, -1) for word in test_episode_words if word in word_indices]
        obs = [0] + test_episode_word_indices  # Add start token index (e.g., 0) if your Viterbi expects it



        state_sequence = tensor_viterbi(obs, tensor_normalized, emission_matrix, initial_state_probabilities)
        # print(len(state_sequence))
        test_label_padded, state_sequence_padded = pad_to_match(test_label, state_sequence)

        # Now, calculate the accuracy
        accuracy = np.mean((state_sequence_padded == test_label_padded).astype(int))

        # Switch the labels if accuracy is less than 0.5
        if accuracy < 0.5:
            accuracy = 1 - accuracy
            state_sequence_padded = 1 - state_sequence_padded
        
        assume_0 = (test_label_padded == 1).astype(int)
        assume_0_accuracy = np.mean(assume_0)

        # Switch the labels if accuracy is less than 0.5
        if assume_0_accuracy < 0.5:
            assume_0_accuracy = 1 - assume_0_accuracy
            assume_0 = 1 - assume_0

        # Calculate the confusion matrices
        confusion = confusion_matrix(test_label_padded, state_sequence_padded)
        assume_0_confusion = confusion_matrix(test_label_padded, assume_0)

        # Append the confusion matrices to the results dictionary
        results_dict[episode].append({
            'accuracy': accuracy,
            'confusion': confusion,
            'assume_0_accuracy': assume_0_accuracy,
            'assume_0_confusion': assume_0_confusion,
            'method': 'Clustering'
        })

# for host_id in utterance_df['host_id'].unique():
#     print(f"Host {host_id}")
#     # Filter the utterance and embedding dataframes for this specific host
#     host_utterance_df = filter_episodes(utterance_df, host_id=host_id)

#     # Get the episodes related to this host
#     host_episodes = host_utterance_df['episode'].unique()

#     # Process the utterances as before
#     words_series = host_utterance_df['utterance'].str.split().explode()
#     unique_words = set(words_series)
#     word_frequencies = words_series.value_counts().to_dict()
#     word_indices = {word: i for i, word in enumerate(unique_words)}
    
    

#     training_df = host_utterance_df[host_utterance_df['episode'].isin(train_episodes)]
#     print(len(training_df))
#     testing_df = host_utterance_df[host_utterance_df['episode'].isin(test_episodes)]
#     print(len(testing_df))

#     # Calculate transition matrices and tensor for training data
#     transition_matrix_host, word_to_index_host = get_transition_matrix_efficient(training_df[training_df['is_host'] == True], word_indices, len(unique_words))
#     transition_matrix_guest, word_to_index_guest = get_transition_matrix_efficient(training_df[training_df['is_host'] == False], word_indices, len(unique_words))
#     tensor, word_to_index = tensor_trouble(training_df, word_indices, len(unique_words))
#     tensor_normalized = tensor / (tensor.sum(axis=1, keepdims=True))

#     # Emission matrix for training data
#     emission_matrix = make_emission(transition_matrix_host, transition_matrix_guest)

#     # Iterate through each episode to get all test words
#     for episode in testing_df['episode'].unique():
#         test_episode_df = testing_df[testing_df['episode'] == episode][['is_host', 'utterance']]
#         test_episode_df['utterance'] = test_episode_df['utterance'].str.split()
#         test_episode_df = test_episode_df.explode('utterance')
#         test_label = test_episode_df['is_host'].to_numpy().astype(int)
#         test_episode_words = ' '.join(test_episode_df['utterance'].astype(str)).split()
#         test_episode_words = [word for word in test_episode_words if word != 'nan']
#         test_episode_word_indices = [word_indices.get(word, -1) for word in test_episode_words if word in word_indices]
        
#         if not test_episode_word_indices:  # If the list is empty, skip to the next iteration
#             print(f"No words processed for episode {episode}, skipping...")
#             continue
        
#         obs = [0] + test_episode_word_indices  # Include start token

#         # Get state sequence from viterbi algorithm 
#         state_sequence = tensor_viterbi(obs, tensor_normalized, emission_matrix, initial_state_probabilities)
#         test_label_padded, state_sequence_padded = pad_to_match(test_label, state_sequence)

#         accuracy = np.mean((state_sequence_padded == test_label_padded).astype(int))

#         # Calculate the accuracy 
#         if accuracy < 0.5:
#             accuracy = 1 - accuracy
#             state_sequence_padded = 1 - state_sequence_padded

#         assume_0 = (test_label_padded == 1).astype(int)
#         assume_0_accuracy = np.mean(assume_0)

#         # Take the larger of the accuracies since binary 
#         if assume_0_accuracy < 0.5: 
#             assume_0_accuracy = 1 - assume_0_accuracy
#             assume_0 = 1 - assume_0

#         # Create the confusion matrix
#         confusion = confusion_matrix(test_label_padded, state_sequence_padded)
#         assume_0_confusion = confusion_matrix(test_label_padded, assume_0)

#         # For new episode add to results dict 
#         if episode not in results_dict or 'Host' not in [result['method'] for result in results_dict[episode]]:
#             results_dict[episode].append({
#                 'accuracy': accuracy,
#                 'confusion': confusion,
#                 'assume_0_accuracy': assume_0_accuracy,
#                 'assume_0_confusion': assume_0_confusion,
#                 'method': 'Host'
#             })
#         else:
#             print(f"Duplicate or existing entry found for episode {episode} under Host method.")

# # Save the results dictionary to a file
# with open('archive/results_dict.json', 'w') as f:
#     f.write(str(results_dict))


Cluster 0
266939
63963
Cluster 1
128443
31958
Cluster 2
132281
34792
Cluster 3
43765
10686
Cluster 4
136757
34008
Cluster 5
158971
39340


In [24]:
# cluster = 1

# cluster_embeddings = test_cluster_to_embedding[cluster]
# # Get the episode ids of the test episodes in the cluster
# cluster_test_episodes = test_cluster_to_episode[cluster]

# # Get the episode ids of the train episodes in the cluster
# cluster_train_episodes = train_cluster_to_episode[cluster]

# # Filter the utterance df to only include the episodes in the cluster for training and testing
# filtered_df = utterance_df[utterance_df['episode'].isin(cluster_train_episodes + cluster_test_episodes)][['is_host', 'episode', 'utterance']]

# # Split each utterance into a list of words and explode the DataFrame to get a row per word
# words_series = filtered_df['utterance'].str.split().explode()
# unique_words = set(words_series)
# word_frequencies = words_series.value_counts().to_dict()
# word_indices = {word: i for i, word in enumerate(unique_words)}

# # Filter the filter df to only include the episodes in the cluster for training and testing
# training_df = filtered_df[filtered_df['episode'].isin(cluster_train_episodes)]
# testing_df = filtered_df[filtered_df['episode'].isin(cluster_test_episodes)]

# print(len(training_df))
# print(len(testing_df))

# # Get the transition matrix for the training data
# transition_matrix_host, word_to_index_host = get_transition_matrix_efficient(training_df[training_df['is_host'] == True], word_indices, len(unique_words))
# transition_matrix_guest, word_to_index_guest = get_transition_matrix_efficient(training_df[training_df['is_host'] == False], word_indices, len(unique_words))

# # Get the tensor for the training data
# tensor, word_to_index = tensor_trouble(training_df, word_indices, len(unique_words))

# # Normalize the tensor
# tensor_normalized = tensor / (tensor.sum(axis=1, keepdims=True))

# # Get the emission matrix for the training data
# emission_matrix = make_emission(transition_matrix_host, transition_matrix_guest)

episode = cluster_test_episodes[6]


test_episode_df = testing_df[testing_df['episode'] == episode]
test_episode_df['utterance'] = test_episode_df['utterance'].str.split()
test_episode_df = test_episode_df.explode('utterance')
test_label = test_episode_df['is_host'].to_numpy().astype(int)
test_episode_words = ' '.join(test_episode_df['utterance'].astype(str)).split()
test_episode_words = [word for word in test_episode_words if word != 'nan']
test_episode_word_indices = [word_indices.get(word, -1) for word in test_episode_words if word in word_indices]
obs = [0] + test_episode_word_indices  # Add start token index (e.g., 0) if your Viterbi expects it



state_sequence = tensor_viterbi(obs, tensor_normalized, emission_matrix, initial_state_probabilities)
# print(len(state_sequence))
test_label_padded, state_sequence_padded = pad_to_match(test_label, state_sequence)

# Now, calculate the accuracy
accuracy = np.mean((state_sequence_padded == test_label_padded).astype(int))

# Switch the labels if accuracy is less than 0.5
if accuracy < 0.5:
    accuracy = 1 - accuracy
    state_sequence_padded = 1 - state_sequence_padded

assume_0 = (test_label_padded == 1).astype(int)
assume_0_accuracy = np.mean(assume_0)

# Switch the labels if accuracy is less than 0.5
if assume_0_accuracy < 0.5:
    assume_0_accuracy = 1 - assume_0_accuracy
    assume_0 = 1 - assume_0

# Calculate the confusion matrices
confusion = confusion_matrix(test_label_padded, state_sequence_padded)
assume_0_confusion = confusion_matrix(test_label_padded, assume_0)


In [12]:
np.set_printoptions(threshold=10000)

In [27]:
accuracy

0.8462897526501767

In [57]:
eps = results_dict.keys()
cluster_confusion = np.array([[0, 0], [0, 0]])
# host_confusion = np.array([[0, 0], [0, 0]])

cluster_accuracy = 0
# host_accuracy = 0

# Find the confusion matrices for cluster and host methods 
for e in eps:  
    if e not in results_dict or len(results_dict[e]) == 0:
        print(f"Episode {e} not found in results_dict")
        continue
    cluster = results_dict[e][0]
    # if len(results_dict[e]) > 1:
    #     host = results_dict[e][1]
        # host_confusion += host['confusion']
        # host_accuracy += host['accuracy']

    cluster_confusion += cluster['confusion']
    cluster_accuracy += cluster['accuracy']
# results_dict[421][0]['confusion']


Episode 60267 not found in results_dict


In [58]:
# normalize the confusion matrices
cluster_confusion = cluster_confusion / cluster_confusion.sum()
# host_confusion = host_confusion / host_confusion.sum()

print(f"Cluster accuracy: {cluster_accuracy / len(eps)}")
# print(f"Host accuracy: {host_accuracy / len(eps)}")
print(f"Cluster confusion matrix: {cluster_confusion}")
# print(f"Host confusion matrix: {host_confusion}")

Cluster accuracy: 0.7758748904872238
Cluster confusion matrix: [[0.69107441 0.01406547]
 [0.21694967 0.07791046]]
