In [9]:
import ast
import json
import re
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [10]:
def get_transition_matrix_efficient(df, word_to_index, d, initial_values=1):
    """Estimate a column-stochastic word-bigram matrix from utterances.

    Args:
        df: DataFrame with an 'utterance' column. Non-string values are
            converted with ``str`` before being split on whitespace.
        word_to_index: Mapping from word to its row/column index. Bigrams in
            which either word is missing from the mapping are ignored.
        d: Size of the (d, d) matrix.
        initial_values: Pseudo-count every cell starts with (additive smoothing).

    Returns:
        tuple: ``(transition_probabilities, word_to_index)`` where
        ``transition_probabilities[j, i]`` is the smoothed relative frequency
        of word ``j`` directly following word ``i`` (each column sums to 1),
        and ``word_to_index`` is the mapping that was passed in.
    """
    transition_counts = np.ones((d, d), dtype=int) * initial_values
    for utterance in df['utterance']:
        if not isinstance(utterance, str):
            try:
                utterance = str(utterance)
            except Exception:
                # Skip values that cannot be rendered as text, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue
        words = utterance.split()
        for previous_word, current_word in zip(words, words[1:]):
            previous_index = word_to_index.get(previous_word)
            current_index = word_to_index.get(current_word)
            if previous_index is None or current_index is None:
                continue
            # Rows index the following word, columns the preceding word.
            transition_counts[current_index, previous_index] += 1
    column_sums = transition_counts.sum(axis=0, keepdims=True)
    transition_probabilities = transition_counts / column_sums
    return transition_probabilities, word_to_index

def tensor_trouble(df, word_to_index, d, initial_values=1):
    """Count word-conditioned speaker-role transitions per episode.

    Each episode's utterances are concatenated into one token stream in row
    order, and every token carries the ``is_host`` value of its utterance.
    For each position ``i >= 2`` the cell
    ``tensor[word_index(words[i - 2]), role(i), role(i - 1)]`` is incremented,
    where a truthy ``is_host`` maps to role index 0 and a falsy one to 1.

    Args:
        df: DataFrame with 'episode', 'is_host' and 'utterance' columns.
            List-valued utterances are joined with spaces; everything else is
            converted with ``str``.
        word_to_index: Mapping from word to its index on the first axis.
            Positions whose word is not in the mapping are skipped.
        d: Size of the first axis.
        initial_values: Pseudo-count every cell starts with.

    Returns:
        tuple: ``(tensor, word_to_index)`` with ``tensor`` of shape (d, 2, 2)
        holding raw (unnormalised) counts.
    """
    tensor = np.ones((d, 2, 2), dtype=int) * initial_values
    for _, group in df.groupby('episode'):
        # Build the token stream and the per-token roles side by side so they
        # are aligned by construction, without writing into the group frame.
        words = []
        roles = []
        for is_host, utterance in zip(group['is_host'], group['utterance']):
            text = ' '.join(utterance) if isinstance(utterance, list) else str(utterance)
            tokens = text.split()
            words.extend(tokens)
            roles.extend([is_host] * len(tokens))

        for i in range(2, len(words)):
            word = words[i - 2]
            if word not in word_to_index:
                continue

            # NOTE(review): the word is taken two positions back while the
            # roles come from positions i - 1 and i; tensor_viterbi indexes the
            # tensor with the word at i - 1. Confirm the intended offset.
            previous_role_index = 0 if roles[i - 1] else 1
            following_role_index = 0 if roles[i] else 1
            tensor[word_to_index[word], following_role_index, previous_role_index] += 1
    return tensor, word_to_index

def make_emission(host, guest):
    """Stack the host and guest bigram matrices into one emission tensor.

    The result has shape (n, n, 2); slice ``[..., 0]`` is the transpose of
    ``host`` and slice ``[..., 1]`` is the transpose of ``guest``.
    """
    size = len(host)
    stacked = np.zeros((size, size, 2))
    for role_slot, matrix in enumerate((host, guest)):
        stacked[..., role_slot] = matrix
    # Swap the two word axes so the tensor is indexed [column, row, role].
    return stacked.transpose(1, 0, 2)

def tensor_viterbi(obs, transition, emission, initial):
    """Viterbi-decode the most likely state sequence for a word sequence.

    Args:
        obs: Sequence of word indices whose first element is the start token;
            the remaining elements are the observed words.
        transition: Array indexed ``[word, next_state, previous_state]``; the
            slice for the previous word is used when moving to the next word.
        emission: Array indexed ``[previous_word, word, state]``. Zero entries
            are treated as ``1e-25`` so their log is finite; the array itself
            is not modified.
        initial: Initial state probabilities, one entry per state.

    Returns:
        np.ndarray: Integer state index for every observed word (length
        ``len(obs) - 1``); empty when ``obs`` holds only the start token.
    """
    b_eps = 1e-25
    start_index = obs[0]
    words = obs[1:]
    n = len(words)
    if n == 0:
        return np.zeros(0, dtype=int)
    n_states = transition.shape[1]

    def log_emission(previous_word, word):
        # np.where builds a new array, so the caller's emission stays intact.
        b = emission[previous_word, word, :]
        return np.log(np.where(b == 0, b_eps, b))

    eta = np.zeros((n, n_states))
    backpointers = np.zeros((n, n_states), dtype=int)
    # The first observed word is emitted conditioned on the start token.
    eta[0] = np.log(initial) + log_emission(start_index, words[0])
    for i in range(1, n):
        # eta_candidate[next_state, previous_state]
        eta_candidate = (
            np.log(transition[words[i - 1], :, :])
            + log_emission(words[i - 1], words[i])[:, np.newaxis]
            + eta[i - 1][np.newaxis, :]
        )
        eta[i] = np.max(eta_candidate, axis=1)
        backpointers[i] = np.argmax(eta_candidate, axis=1)

    # Every row of eta is filled, so the backtrace starts from the true best
    # final state.
    state_sequence = np.zeros(n, dtype=int)
    state_sequence[-1] = np.argmax(eta[-1])
    for i in range(n - 2, -1, -1):
        state_sequence[i] = backpointers[i + 1, state_sequence[i + 1]]
    return state_sequence

def pad_to_match(a, b):
    """
    Equalise the lengths of two arrays by repeating the shorter one's last element.

    Args:
        a (np.array): First array.
        b (np.array): Second array.

    Returns:
        np.array, np.array: ``a`` and ``b`` in their original order; the longer
        one (or both, when the lengths already match) is returned as passed in,
        and the shorter one is extended at the end with copies of its final value.
    """
    gap = len(a) - len(b)
    if gap == 0:
        return a, b
    if gap > 0:
        # b is shorter: extend it by `gap` copies of its last element.
        return a, np.concatenate((b, np.full(gap, b[-1])))
    # a is shorter: extend it by `-gap` copies of its last element.
    return np.concatenate((a, np.full(-gap, a[-1]))), b

In [8]:
# --- Configuration -----------------------------------------------------------
test_size = 0.2
embedding_path = 'archive/embed_df_with_hosts_filtered.csv'
utterance_path = 'archive/processed_utterances-2sp.csv'
umap_components = 10
random_seed = 42  # shared by the split, UMAP and the GMM so a re-run reproduces the clusters
initial_state_probabilities = np.array([.5, .5])


def _parse_embedding(raw):
    """Turn one 'embedding' cell into a 1-D float array.

    Cells read back from CSV are strings such as "[0.1, 0.2]"; those are parsed
    with ``ast.literal_eval``. Strings that are not valid Python literals (for
    example a whitespace-separated numpy repr) are split on whitespace/commas.
    Non-string values are converted as they are.
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            raw = raw.strip().strip('[]').replace(',', ' ').split()
    return np.asarray(raw, dtype=float)


df = pd.read_csv(embedding_path)

# Use one mixture component per four hosts, but never fewer than one.
host_ids = df['host_id'].unique()
n_components = max(1, len(host_ids) // 4)

# Parse the embeddings into a (n_samples, n_features) float matrix.
X = np.vstack([_parse_embedding(cell) for cell in df['embedding']])
y = df['host_id'].values
episode_ids = df['episode'].values

# Split embeddings, labels and episode ids together so that each row keeps its
# episode id without having to look it up by embedding value afterwards.
X_train, X_test, y_train, y_test, episodes_train, episodes_test = train_test_split(
    X, y, episode_ids, test_size=test_size, random_state=random_seed
)

# Convert the labels to integers
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Reduce the embeddings to `umap_components` dimensions with UMAP
umap_reducer = umap.UMAP(n_components=umap_components, random_state=random_seed)
X_reduced = umap_reducer.fit_transform(X_train)


print(f"Fitting GMM with {n_components} components")
gmm = GaussianMixture(n_components=n_components, random_state=random_seed)
gmm.fit(X_reduced)

# Predict cluster labels
train_cluster_labels = gmm.predict(X_reduced)

# Reduce the dimensions of the test set with the reducer fitted on the training set
X_test_reduced = umap_reducer.transform(X_test)

# Predict the cluster labels of the test set
test_cluster_labels = gmm.predict(X_test_reduced)

# Map embeddings to episode ids (kept for lookups by embedding value)
embedding_to_episode = {tuple(vector): episode for vector, episode in zip(X, df['episode'])}

# Map train cluster labels to embeddings and to episode ids
train_cluster_to_embedding = {cluster: [] for cluster in set(train_cluster_labels)}
train_cluster_to_episode = {cluster: [] for cluster in set(train_cluster_labels)}
for cluster, embedding, episode in zip(train_cluster_labels, X_train, episodes_train.tolist()):
    train_cluster_to_embedding[cluster].append(embedding)
    train_cluster_to_episode[cluster].append(episode)

# Map test cluster labels to embeddings and to episode ids
test_cluster_to_embedding = {cluster: [] for cluster in set(test_cluster_labels)}
test_cluster_to_episode = {cluster: [] for cluster in set(test_cluster_labels)}
for cluster, embedding, episode in zip(test_cluster_labels, X_test, episodes_test.tolist()):
    test_cluster_to_embedding[cluster].append(embedding)
    test_cluster_to_episode[cluster].append(episode)

# One list of result records per episode, filled in by the evaluation loop
results_dict = {episode: [] for episode in df['episode'].unique()}

# Load the utterance df
utterance_df = pd.read_csv(utterance_path)


# Episodes with fewer usable tokens than this are skipped: there is nothing
# meaningful to decode from zero or one word.
min_tokens_per_episode = 2


def _build_vocabulary(utterances):
    """Map every distinct whitespace-separated word to an index.

    Words are sorted so that the indices (and therefore index 0, which is used
    as the start token) are the same on every run.
    """
    words = utterances.str.split().explode().dropna()
    return {word: index for index, word in enumerate(sorted(words.unique()))}


def _fit_role_model(training_df, word_indices):
    """Fit the normalised role-transition tensor and the emission tensor."""
    vocab_size = len(word_indices)
    transition_matrix_host, _ = get_transition_matrix_efficient(
        training_df[training_df['is_host'] == True], word_indices, vocab_size)
    transition_matrix_guest, _ = get_transition_matrix_efficient(
        training_df[training_df['is_host'] == False], word_indices, vocab_size)
    tensor, _ = tensor_trouble(training_df, word_indices, vocab_size)
    # Normalise over the "next role" axis.
    tensor_normalized = tensor / tensor.sum(axis=1, keepdims=True)
    emission_matrix = make_emission(transition_matrix_host, transition_matrix_guest)
    return tensor_normalized, emission_matrix


def _episode_observations(episode_df, word_indices):
    """Return word indices and is_host labels for one episode, token by token.

    Both sequences are built in the same pass, so dropping an unknown word or a
    missing utterance drops its label too and the two stay aligned.
    """
    token_indices = []
    token_labels = []
    for is_host, utterance in zip(episode_df['is_host'], episode_df['utterance']):
        if not isinstance(utterance, str):
            continue
        for word in utterance.split():
            index = word_indices.get(word)
            if index is not None:
                token_indices.append(index)
                token_labels.append(int(is_host))
    return token_indices, np.array(token_labels, dtype=int)


def _evaluate_episode(testing_df, episode, word_indices, tensor_normalized, emission_matrix, method):
    """Decode one test episode and return its result record, or None if it is too short."""
    test_episode_df = testing_df[testing_df['episode'] == episode]
    test_episode_word_indices, test_label = _episode_observations(test_episode_df, word_indices)
    if len(test_episode_word_indices) < min_tokens_per_episode:
        return None

    # Word index 0 is used as the start token expected by tensor_viterbi.
    obs = [0] + test_episode_word_indices
    state_sequence = tensor_viterbi(obs, tensor_normalized, emission_matrix, initial_state_probabilities)
    # Labels and states are aligned token by token; this is a no-op safeguard.
    test_label_padded, state_sequence_padded = pad_to_match(test_label, state_sequence)

    accuracy = np.mean(state_sequence_padded == test_label_padded)
    # State ids are arbitrary up to a swap, so score the better labelling.
    if accuracy < 0.5:
        accuracy = 1 - accuracy
        state_sequence_padded = 1 - state_sequence_padded

    # Constant baseline: predict a single class for every token.
    assume_0 = np.zeros_like(test_label_padded)
    assume_0_accuracy = np.mean(assume_0 == test_label_padded)
    if assume_0_accuracy < 0.5:
        assume_0_accuracy = 1 - assume_0_accuracy
        assume_0 = 1 - assume_0

    # Fix the label set so every confusion matrix is 2x2, even when an episode
    # contains only one class.
    confusion = confusion_matrix(test_label_padded, state_sequence_padded, labels=[0, 1])
    assume_0_confusion = confusion_matrix(test_label_padded, assume_0, labels=[0, 1])

    return {
        'accuracy': accuracy,
        'confusion': confusion,
        'assume_0_accuracy': assume_0_accuracy,
        'assume_0_confusion': assume_0_confusion,
        'method': method
    }


# Episode ids of every host, computed once instead of once per cluster
host_to_episodes = {
    host_id: set(episodes)
    for host_id, episodes in df.groupby('host_id', sort=False)['episode']
}
model_columns = ['episode', 'is_host', 'utterance']

# Iterate through each unique test cluster label
for cluster in set(test_cluster_labels):
    # Episode ids of the test and train episodes in the cluster; a cluster may
    # have no training episodes at all.
    test_episodes = test_cluster_to_episode[cluster]
    train_episodes = train_cluster_to_episode.get(cluster, [])

    # Utterances of every episode in the cluster (train and test); 'episode'
    # and 'is_host' are kept because the model fitting and the per-episode
    # filtering below need them.
    filtered_df = utterance_df[utterance_df['episode'].isin(train_episodes + test_episodes)][model_columns]

    # Vocabulary over the whole cluster
    word_indices = _build_vocabulary(filtered_df['utterance'])
    if not word_indices:
        continue

    training_df = filtered_df[filtered_df['episode'].isin(train_episodes)]
    testing_df = filtered_df[filtered_df['episode'].isin(test_episodes)]

    # Fit the cluster-level model on the training episodes
    tensor_normalized, emission_matrix = _fit_role_model(training_df, word_indices)

    for episode in test_episodes:
        result = _evaluate_episode(testing_df, episode, word_indices,
                                   tensor_normalized, emission_matrix, 'Clustering')
        if result is not None:
            results_dict[episode].append(result)

    # NOTE(review): the per-host models are fitted inside the cluster loop, so
    # each one is trained only on the host's episodes that fall in this
    # cluster's training set. Confirm this is intended rather than training on
    # all of the host's training episodes.
    for host_id, host_episodes in host_to_episodes.items():
        # Only hosts with test episodes in this cluster can produce results.
        host_test_episodes = [episode for episode in test_episodes if episode in host_episodes]
        if not host_test_episodes:
            continue

        # Utterances of every episode of this host
        host_utterance_df = utterance_df[utterance_df['episode'].isin(host_episodes)][model_columns]

        word_indices = _build_vocabulary(host_utterance_df['utterance'])
        if not word_indices:
            continue

        training_df = host_utterance_df[host_utterance_df['episode'].isin(train_episodes)]
        testing_df = host_utterance_df[host_utterance_df['episode'].isin(test_episodes)]

        # Fit the host-level model on the training episodes
        tensor_normalized, emission_matrix = _fit_role_model(training_df, word_indices)

        for episode in host_test_episodes:
            result = _evaluate_episode(testing_df, episode, word_indices,
                                       tensor_normalized, emission_matrix, 'Host')
            if result is not None:
                results_dict[episode].append(result)

# Save the results dictionary to a file as real JSON (the file has a .json
# extension, so it must be loadable with json.load).
def _json_default(value):
    """Convert numpy arrays and scalars, which json cannot encode, to plain Python."""
    if isinstance(value, (np.ndarray, np.generic)):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# JSON object keys must be strings; episode ids may be numpy integers.
results_serializable = {str(episode): records for episode, records in results_dict.items()}
with open('archive/results_dict.json', 'w') as f:
    json.dump(results_serializable, f, indent=2, default=_json_default)
