In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import os
import warnings
from scipy.sparse import vstack
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
import spacy
import re
from tqdm import tqdm
# Load the spaCy model for English. This needs to be installed via spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

warnings.filterwarnings("ignore")
from collections import Counter

In [56]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

word_threshold = 10

# Preprocessing function to be applied to the entire DataFrame
def preprocess_text(df):
    # Normalize text: replace dashes with spaces, remove non-alphabetic characters except spaces
    df['utterance'] = df['utterance'].str.replace('-', ' ').str.lower()
    df['utterance'] = df['utterance'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))
    return df

# Function to tokenize and lemmatize with word frequency filtering
def tokenize_and_lemmatize(text, word_frequencies):
    words = text.split()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in words]
    filtered_tokens = [word if word_frequencies.get(word, 0) >= word_threshold else 'xxxxx' for word in lemmatized_tokens]
    return ' '.join(filtered_tokens)

def preprocess_batch(batch, word_frequencies):
    # Apply tokenization and lemmatization with word filtering
    batch['utterance'] = batch['utterance'].apply(lambda x: tokenize_and_lemmatize(x, word_frequencies))
    # batch['is_question'] = batch['utterance'].str.contains(r'\?').astype(int)
    return batch

# Main preprocessing function
def preprocess_dataframe(dataframe, batch_size=60000):
    # Apply initial preprocessing
    dataframe = preprocess_text(dataframe)
    
    # Calculate word frequencies
    all_words = ' '.join(dataframe['utterance']).split()
    word_frequencies = Counter(all_words)
    
    # Initialize progress bar
    progress = tqdm(total=len(dataframe), desc="Processing batches")
    
    processed_batches = []
    for start_row in range(0, len(dataframe), batch_size):
        end_row = start_row + batch_size
        batch = dataframe.iloc[start_row:end_row]
        processed_batch = preprocess_batch(batch, word_frequencies)
        processed_batches.append(processed_batch)
        
        # Update progress
        progress.update(len(batch))
        
    progress.close()  # Ensure the progress bar is closed after processing
    
    return pd.concat(processed_batches)

# Load the DataFrame
df = pd.read_csv('archive/utterances-2sp.csv')

# Process the DataFrame
df = preprocess_dataframe(df)

# select rows with df['episode_order'] == 1 and df['turn_order'] == 1, turn the first word of the utterance into 'yyyyy'
df.loc[(df['episode_order'] == 1) & (df['turn_order'] == 0), 'utterance'] = df.loc[(df['episode_order'] == 1) & (df['turn_order'] == 0), 'utterance'].apply(lambda x: 'yyyyy ' + ' '.join(x.split()[1:]))


# Save the processed DataFrame
# df_processed.to_csv('archive/processed_utterances-2sp.csv', index=False)


Processing batches: 100%|██████████| 1240112/1240112 [00:36<00:00, 34383.80it/s]


In [57]:
def filter_episodes(df, host_id=None):
    # Exclude host with ID -1
    df_filtered = df[df['host_id'] != -1]

    if host_id is None:
        # Find the host with the most episodes
        top_host = df_filtered.groupby('host_id')['episode'].nunique().idxmax()
    else:
        top_host = host_id

    # Get all episodes hosted by the top host
    top_host_episodes = df_filtered[df_filtered['host_id'] == top_host]['episode'].unique()

    # Filter the DataFrame to only include utterances from these episodes
    df_top_host_all_utterances = df[df['episode'].isin(top_host_episodes)]

    # Count the number of utterances in each episode
    utterance_counts = df_top_host_all_utterances.groupby('episode')['utterance'].count()

    # Get the episodes with more than 30 utterances
    episodes_over_30 = utterance_counts[utterance_counts > 30].index

    # Filter the DataFrame to only include these episodes
    df_top_host_over_30 = df_top_host_all_utterances[df_top_host_all_utterances['episode'].isin(episodes_over_30)]

    df_top_host_over_30 = df_top_host_over_30.reset_index(drop=True)

    return df_top_host_over_30

In [58]:
df = filter_episodes(df)
display(df)

Unnamed: 0,episode,episode_order,turn_order,speaker_order,host_id,is_host,utterance
0,713,1,0,0,12,True,yyyyy u ambassador to ukraine say she wa pushe...
1,713,1,1,0,12,True,and the president ha decided to allow one u al...
2,713,1,2,0,12,True,now the two are exchanging gunfire with u troo...
3,713,2,0,0,12,True,ron elving join u now our senior washington ed...
4,713,2,1,0,12,True,ron thanks so much for being with u
...,...,...,...,...,...,...,...
94509,136261,18,1,1,12,False,and you know and our little girl is three and ...
94510,136261,19,0,0,105,True,well scott thank you very much for talking wit...
94511,136261,19,1,0,105,True,and next time you go out to dinner in london
94512,136261,20,0,1,12,False,i think were going to do indian take out actually


In [60]:
# Split each utterance into a list of words and explode the DataFrame to get a row per word
words_series = df['utterance'].str.split().explode()

# Convert the series to a set to get unique words
unique_words = set(words_series)

# get word frequencies
word_frequencies = words_series.value_counts().to_dict()

In [61]:
print(f'Number of unique words: {len(unique_words)}')
word_indices = {word: i for i, word in enumerate(unique_words)}

# sort the words by frequency and display the most common words
sorted_word_frequencies = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)
sorted_word_frequencies[:10]

Number of unique words: 19812


[('the', 70118),
 ('and', 41326),
 ('a', 39255),
 ('to', 38977),
 ('of', 36614),
 ('that', 27877),
 ('in', 26306),
 ('it', 20515),
 ('you', 19575),
 ('i', 18091)]

# B

In [62]:
episodes = df['episode'].unique()

# Split the filtered episodes into training and testing sets
train_episodes = np.random.choice(episodes, int(0.8 * len(episodes)), replace=False)
train = df[df['episode'].isin(train_episodes)]
test = df[~df['episode'].isin(train_episodes)]

train_host = train[train['is_host'] == True]
train_guest = train[train['is_host'] == False]

In [64]:
# select a random test episode
test_episode = np.random.choice(test['episode'].unique())
test_episode_df = test[test['episode'] == test_episode][['is_host', 'utterance']]

# Split utterances into words
test_episode_df['utterance'] = test_episode_df['utterance'].str.split()

# Expand the lists into separate rows, replicating the 'is_host' value for each word
test_episode_df = test_episode_df.explode('utterance')

# Now, each row in test_episode_df contains a single word and its corresponding 'is_host' value
test_label = test_episode_df['is_host'].to_numpy().astype(int)

# # get all of the words in the test episode
test_episode_words = ' '.join(test_episode_df['utterance']).split()

# # get the index of the words from unique_words

# # get the indices of the words in the test episode
test_episode_word_indices = [word_indices[word] for word in test_episode_words]

# save the test episode word indices
# np.save('archive/test_episode_word_indices.npy', test_episode_word_indices)

# get the in

In [65]:
def get_transition_matrix_efficient(df, word_to_index, d):
    # Initialize the transition counts matrix
    transition_counts = np.ones((d, d), dtype=int)
    
    # Iterate over each utterance
    for utterance in df['utterance']:
        if type(utterance) != str:
            try:
                utterance = str(utterance)
            except:
                continue

        words = utterance.split()
        for i in range(1, len(words)):
            word1 = words[i - 1]
            word2 = words[i]
            if word1 in word_to_index and word2 in word_to_index:  # Check if both words are in the index
                index1 = word_to_index[word1]
                index2 = word_to_index[word2]
                transition_counts[index2, index1] += 1  # Increment the count of the transition
                
    # # Handle columns with zero sum to avoid division by zero
    column_sums = transition_counts.sum(axis=0, keepdims=True)
    # zero_columns = column_sums == 0
    # column_sums[zero_columns] = 1  # Avoid division by zero by setting 0 sums to 1
    
    # Now make column stochastic
    transition_probabilities = transition_counts / column_sums

    # make 0 columns have 1 / d probability for each word
    # transition_probabilities[:, zero_columns.flatten()] = 1 / d
    
    return transition_probabilities, word_to_index

# Apply the efficient function to get the transition matrices
transition_matrix_host, word_to_index_host = get_transition_matrix_efficient(train_host, word_indices, len(unique_words))
transition_matrix_guest, word_to_index_guest = get_transition_matrix_efficient(train_guest, word_indices, len(unique_words))

In [66]:
# test if columns sum to 1
print(np.allclose(transition_matrix_host.sum(axis=0), 1))

# check for any NaN values
print(np.any(np.isnan(transition_matrix_host)))
print(np.any(np.isnan(transition_matrix_guest)))

# # save the transition matrices
# np.save('vector-data/transition_matrix_host.npy', transition_matrix_host)
# np.save('vector-data/transition_matrix_guest.npy', transition_matrix_guest)

True
False
False


# A

In [67]:
def tensor_trouble(df, word_to_index, d):
    
    tensor = np.ones((d, 2, 2), dtype=int)
    # Group by episodes and concatenate text with speaker roles
    for episode, group in df.groupby('episode'):
        # Flatten all text into one string per episode
        if type(group['utterance']) != str:
            group['utterance'] = group['utterance'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

        text = " ".join(group['utterance'].astype(str))
        words = text.split()

        # Replicate 'is_host' values for each word in the utterance
        roles = np.concatenate(group.apply(lambda row: [row['is_host']] * len(str(row['utterance']).split()), axis=1).values)
        
        for i in range(2, len(words)):
            # Get the current and next words
            word = words[i - 2]
            next_role = roles[i - 1]
            following_role = roles[i]

            # Get the current and next indices
            current_index = word_to_index[word]
            current_role_index = 0 if next_role else 1
            next_role_index = 0 if following_role else 1


            
            # Skip if any word is not in the index (unlikely given preprocessing, but safe practice)
            if current_index == -1:
                continue
            
            # Update the tensor based on speaker transitions
            tensor[current_index, next_role_index,current_role_index] += 1

    return tensor, word_to_index


tensor, word_to_index = tensor_trouble(train, word_indices, len(unique_words))

In [68]:
# normalize the tensor to be column stochastic for each layer
tensor_normalized = tensor / (tensor.sum(axis=1, keepdims=True))

tensor_normalized.shape



# check if the tensor is column stochastic, tensor_normalized.sum(axis=1) should be np.array([1., 1.])

# save the tensor
# np.save('vector-data/tensor.npy', tensor_normalized)


(19812, 2, 2)

In [69]:
# make emission matrix
def make_emission(host, guest):
    emission = np.zeros((len(host), len(host), 2))
    emission[:,:,0] = host
    emission[:,:,1] = guest

    # reorder the axis
    return np.swapaxes(emission, 0, 1)

In [70]:
def tensor_viterbi(obs, transition, emission, initial, structured = None):
    """Run the Viterbi algorithm with a conditioned tensor for the emission probabilities.
    Inputs:
        obs - ndarray (n,): observation sequence of indexes (includes a unique start token)
        transition - ndarray (d,2,2): transition tensor of probabilities (index, row, col)
        emission - ndarray (d,d,2): emission tensor of probabilities (index, row, col)
        initial - ndarray (2,): initial state probabilities
        structured - (optional) ndarray (n,): structured sequence of indexes

    Outputs:
        state_sequence - ndarray (n,): most likely state sequence of indexes
    """
    # Matt Ignore structured variable

    b_eps = 1e-25

    # Get the lengths and correct indices
    start_index = obs[0]
    obs = obs[1:]
    n = len(obs)
    d = transition.shape[0]
    
    # Initialize the viterbi matrix and the backpointers
    eta = np.zeros((n,2))
    backpointers = np.zeros((n,2), dtype=int)

    # Initialize the first row
    eta[0] = np.log(initial) + np.log(emission[start_index,obs[1],:])
    obs = obs[1:]

    # Loop through the rest of the rows
    for i in range(1,n-1):
        b = emission[obs[i-1],obs[i],:]
        # check if any of the emission probabilities are zero
        if np.any(b == 0):
            # find the index of the zero probability
            zero_index = np.where(b == 0)
            # replace the zero probability with a small epsilon value
            b[zero_index] = b_eps
            
        eta_candidate = np.log(transition[obs[i-1],:,:]) + np.log(b)[:,np.newaxis] + eta[i-1][np.newaxis, :]
        # eta_candidate = np.log(transition[obs[i-1],:,:]) * eta[i-1][np.newaxis, :] + np.log(emission[obs[i-1],obs[i],:])[:,np.newaxis] #### if statement goes here to replace 0 in the emission tensor
        eta[i] = np.max(eta_candidate, axis=1)
        backpointers[i] = np.argmax(eta_candidate, axis=1)

    # Backtrack
    state_sequence = np.zeros(n, dtype=int)
    state_sequence[-1] = np.argmax(eta[-1])
    for i in range(n-2,-1,-1):
        state_sequence[i] = backpointers[i+1,state_sequence[i+1]]

    # Return the state sequence
    return state_sequence


In [71]:
# load in the data
initial = np.array([0.9, 0.1])

In [72]:
emission = make_emission(transition_matrix_host, transition_matrix_guest)

In [82]:
# check if the word 'nan' is in test_episode_words
print('nan' in test_episode_words)

# how many times does 'nan' appear in test_episode_words
print(test_episode_words.count('nan'))

True
1


In [84]:
average_accuracy = 0
average_assume_0_accuracy = 0
accuracy_list = []
assume_0_accuracy_list = []
iters = len(test['episode'].unique())
for i in range(iters):

    # select a random test episode
    test_episode = np.random.choice(test['episode'].unique())
    test_episode_df = test[test['episode'] == test_episode][['is_host', 'utterance']]

    # Split utterances into words
    test_episode_df['utterance'] = test_episode_df['utterance'].str.split()

    # Expand the lists into separate rows, replicating the 'is_host' value for each word
    test_episode_df = test_episode_df.explode('utterance')

    # Now, each row in test_episode_df contains a single word and its corresponding 'is_host' value
    test_label = test_episode_df['is_host'].to_numpy().astype(int)

    # get all of the words in the test episode
    test_episode_words = ' '.join(test_episode_df['utterance'].astype(str)).split()

    # remove the word 'nan' from the test episode words
    test_episode_words = [word for word in test_episode_words if word != 'nan']

    # get the indices of the words in the test episode
    test_episode_word_indices = [word_indices[word] for word in test_episode_words]
    obs = test_episode_word_indices

    state_sequence = tensor_viterbi(obs, tensor_normalized, emission, initial)

    # get the accuracy of the viterbi algorithm
    accuracy = np.mean(state_sequence != test_label[1:])
    assume_0_accuracy = np.mean(np.zeros(len(test_label[1:])) != test_label[1:])
    if accuracy < 0.5:
        accuracy = 1 - accuracy
    if assume_0_accuracy < 0.5:
        assume_0_accuracy = 1 - assume_0_accuracy

    average_accuracy += accuracy
    average_assume_0_accuracy += assume_0_accuracy
    accuracy_list.append(accuracy)
    assume_0_accuracy_list.append(assume_0_accuracy)

print(f'Average Accuracy: {average_accuracy / iters}')
print(f'Average Assume 0 Accuracy: {average_assume_0_accuracy / iters}')
print(f'Variance of Accuracy: {np.var(accuracy_list)}')
print(f'Variance of Assume 0 Accuracy: {np.var(assume_0_accuracy_list)}')

Average Accuracy: 0.7625301157725464
Average Assume 0 Accuracy: 0.6786954007754585
Variance of Accuracy: 0.004524246132017364
Variance of Assume 0 Accuracy: 0.004798637574034621


In [79]:
# select a random test episode
test_episode = np.random.choice(test['episode'].unique())
test_episode_df = test[test['episode'] == test_episode][['is_host', 'utterance']]

# Split utterances into words
test_episode_df['utterance'] = test_episode_df['utterance'].str.split()

# Expand the lists into separate rows, replicating the 'is_host' value for each word
test_episode_df = test_episode_df.explode('utterance')

# Now, each row in test_episode_df contains a single word and its corresponding 'is_host' value
test_label = test_episode_df['is_host'].to_numpy().astype(int)

# # get all of the words in the test episode
test_episode_words = ' '.join(test_episode_df['utterance'].astype(str)).split()

# # get the index of the words from unique_words

# # get the indices of the words in the test episode
test_episode_word_indices = [word_indices[word] for word in test_episode_words]
obs = test_episode_word_indices

state_sequence = tensor_viterbi(obs, tensor_normalized, emission, initial)


# get the accuracy of the viterbi algorithm
accuracy = np.mean(state_sequence != test_label[1:])
assume_0_accuracy = np.mean(np.zeros(len(test_label[1:])) != test_label[1:])
if accuracy < 0.5:
    accuracy = 1 - accuracy
if assume_0_accuracy < 0.5:
    assume_0_accuracy = 1 - assume_0_accuracy

print(f'Accuracy: {accuracy}')
print(f'Episode: {test_episode}')

Accuracy: 0.8238719068413392
Episode: 77555
