In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import os
import warnings
from scipy.sparse import vstack
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
import spacy
import re
from tqdm import tqdm
from collections import Counter

In [10]:
# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")
warnings.filterwarnings("ignore")

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
word_threshold = 25

def aggressive_cleaning(text):
    """
    Cleans the text by handling NaN values and 'nan' strings, normalizing dashes, lowercasing,
    and removing non-alphabetic characters except spaces.
    """
    text = str(text)
    if text.lower() == 'nan' or pd.isna(text):
        return ''
    else:
        text = text.replace('-', ' ').lower().strip()
        text = re.sub(r'[^a-z\s]', '', text)
        return text

def tokenize_and_lemmatize(text, word_frequencies):
    """
    Tokenizes and lemmatizes the given text, applying a word frequency filter.
    Words below the threshold are replaced with 'xxxxx'.
    """
    words = text.split()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in words]
    filtered_tokens = [word if word_frequencies.get(word, 0) >= word_threshold else 'xxxxx' for word in lemmatized_tokens]
    return ' '.join(filtered_tokens)

def preprocess_dataframe(dataframe, batch_size=60000):
    """
    Applies preprocessing to the entire DataFrame, including cleaning text, tokenizing,
    lemmatizing, and filtering based on word frequencies.
    """
    # Apply aggressive cleaning
    dataframe['utterance'] = dataframe['utterance'].apply(aggressive_cleaning)
    
    all_words = ' '.join(dataframe['utterance']).split()
    word_frequencies = Counter(all_words)
    
    processed_batches = []
    progress = tqdm(total=len(dataframe), desc="Processing batches")
    
    for start_row in range(0, len(dataframe), batch_size):
        end_row = start_row + batch_size
        batch = dataframe.iloc[start_row:end_row]
        batch['utterance'] = batch['utterance'].apply(lambda x: tokenize_and_lemmatize(x, word_frequencies))
        processed_batches.append(batch)
        progress.update(len(batch))
    
    progress.close()
    return pd.concat(processed_batches)


def filter_episodes(df, host_id=None):
    df_filtered = df[df['host_id'] != -1]
    if host_id is None:
        top_host = df_filtered.groupby('host_id')['episode'].nunique().idxmax()
    else:
        top_host = host_id
    top_host_episodes = df_filtered[df_filtered['host_id'] == top_host]['episode'].unique()
    df_top_host_all_utterances = df[df['episode'].isin(top_host_episodes)]
    utterance_counts = df_top_host_all_utterances.groupby('episode')['utterance'].count()
    episodes_over_30 = utterance_counts[utterance_counts > 30].index
    df_top_host_over_30 = df_top_host_all_utterances[df_top_host_all_utterances['episode'].isin(episodes_over_30)]
    return df_top_host_over_30.reset_index(drop=True)

def get_transition_matrix_efficient(df, word_to_index, d):
    transition_counts = np.ones((d, d), dtype=int)
    for utterance in df['utterance']:
        if type(utterance) != str:
            try:
                utterance = str(utterance)
            except:
                continue
        words = utterance.split()
        for i in range(1, len(words)):
            word1 = words[i - 1]
            word2 = words[i]
            if word1 in word_to_index and word2 in word_to_index:
                index1 = word_to_index[word1]
                index2 = word_to_index[word2]
                transition_counts[index2, index1] += 1
    column_sums = transition_counts.sum(axis=0, keepdims=True)
    transition_probabilities = transition_counts / column_sums
    return transition_probabilities, word_to_index

def tensor_trouble(df, word_to_index, d):
    tensor = np.ones((d, 2, 2), dtype=int)
    for episode, group in df.groupby('episode'):
        if type(group['utterance']) != str:
            group['utterance'] = group['utterance'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
        text = " ".join(group['utterance'].astype(str))
        words = text.split()
        roles = np.concatenate(group.apply(lambda row: [row['is_host']] * len(str(row['utterance']).split()), axis=1).values)
        for i in range(2, len(words)):
            word = words[i - 2]
            if word not in word_to_index:  # Check if the word exists in the dictionary
                continue  # Skip this iteration if the word is not found
            next_role = roles[i - 1]
            following_role = roles[i]
            current_index = word_to_index[word]
            current_role_index = 0 if next_role else 1
            next_role_index = 0 if following_role else 1
            tensor[current_index, next_role_index, current_role_index] += 1
    return tensor, word_to_index

def make_emission(host, guest):
    emission = np.zeros((len(host), len(host), 2))
    emission[:, :, 0] = host
    emission[:, :, 1] = guest
    return np.swapaxes(emission, 0, 1)

def tensor_viterbi(obs, transition, emission, initial):
    b_eps = 1e-25
    start_index = obs[0]
    obs = obs[1:]
    n = len(obs)
    d = transition.shape[0]
    eta = np.zeros((n, 2))
    backpointers = np.zeros((n, 2), dtype=int)
    eta[0] = np.log(initial) + np.log(emission[start_index, obs[1], :])
    obs = obs[1:]
    for i in range(1, n - 1):
        b = emission[obs[i - 1], obs[i], :]
        if np.any(b == 0):
            zero_index = np.where(b == 0)
            b[zero_index] = b_eps
        eta_candidate = np.log(transition[obs[i - 1], :, :]) + np.log(b)[:, np.newaxis] + eta[i - 1][np.newaxis, :]
        eta[i] = np.max(eta_candidate, axis=1)
        backpointers[i] = np.argmax(eta_candidate, axis=1)
    state_sequence = np.zeros(n, dtype=int)
    state_sequence[-1] = np.argmax(eta[-1])
    for i in range(n - 2, -1, -1):
        state_sequence[i] = backpointers[i + 1, state_sequence[i + 1]]
    return state_sequence

def pad_to_match(a, b):
    """
    Pads the shorter array with its last element to match the length of the longer array.
    
    Args:
        a (np.array): First array for comparison.
        b (np.array): Second array for comparison.
        
    Returns:
        np.array, np.array: The two arrays modified to have equal lengths.
    """
    if len(a) == len(b):
        return a, b
    elif len(a) > len(b):
        padding = np.full(len(a) - len(b), b[-1])
        b_padded = np.concatenate((b, padding))
        return a, b_padded
    else:
        padding = np.full(len(b) - len(a), a[-1])
        a_padded = np.concatenate((a, padding))
        return a_padded, b

# Load and process the DataFrame
# df = pd.read_csv('archive/utterances-2sp.csv')
# df = preprocess_dataframe(df)
# df.loc[(df['episode_order'] == 1) & (df['turn_order'] == 0), 'utterance'] = df.loc[(df['episode_order'] == 1) & (df['turn_order'] == 0), 'utterance'].apply(lambda x: 'yyyyy ' + ' '.join(x.split()))

df = pd.read_csv('archive/processed_utterances-2sp.csv')
for host_id in range(1, 20):

    print(f'Host ID: {host_id}')


    filtered_df = filter_episodes(df, host_id=host_id)

    print(f'Number of episodes: {len(filtered_df["episode"].unique())}')
    print(f'Number of utterances: {len(filtered_df)}')

    # load the preprocessed data
    # filtered_df = pd.read_csv('archive/most_episodes_host.csv')

    # Split each utterance into a list of words and explode the DataFrame to get a row per word
    words_series = filtered_df['utterance'].str.split().explode()
    unique_words = set(words_series)
    word_frequencies = words_series.value_counts().to_dict()
    word_indices = {word: i for i, word in enumerate(unique_words)}

    # print the number of unique words
    print(f'Number of unique words: {len(unique_words)}')

    transition_matrix_host, word_to_index_host = get_transition_matrix_efficient(filtered_df[filtered_df['is_host'] == True], word_indices, len(unique_words))
    transition_matrix_guest, word_to_index_guest = get_transition_matrix_efficient(filtered_df[filtered_df['is_host'] == False], word_indices, len(unique_words))
    tensor, word_to_index = tensor_trouble(filtered_df, word_indices, len(unique_words))
    tensor_normalized = tensor / (tensor.sum(axis=1, keepdims=True))

    initial = np.array([0.5, 0.5])
    emission = make_emission(transition_matrix_host, transition_matrix_guest)

    episodes = filtered_df['episode'].unique()
    np.random.shuffle(episodes)
    split_index = int(len(episodes) * 0.8)
    train_episodes = episodes[:split_index]
    test_episodes = episodes[split_index:]

    test = filtered_df[filtered_df['episode'].isin(test_episodes)]

    average_accuracy = 0
    average_assume_0_accuracy = 0
    accuracy_list = []
    assume_0_accuracy_list = []
    iters = len(test['episode'].unique())

    for i in range(iters):
        test_episode = np.random.choice(test['episode'].unique())
        test_episode_df = test[test['episode'] == test_episode][['is_host', 'utterance']]
        test_episode_df['utterance'] = test_episode_df['utterance'].str.split()
        test_episode_df = test_episode_df.explode('utterance')
        test_label = test_episode_df['is_host'].to_numpy().astype(int)
        test_episode_words = ' '.join(test_episode_df['utterance'].astype(str)).split()
        test_episode_words = [word for word in test_episode_words if word != 'nan']
        test_episode_word_indices = [word_indices.get(word, -1) for word in test_episode_words if word in word_indices]
        obs = [0] + test_episode_word_indices  # Add start token index (e.g., 0) if your Viterbi expects it

        state_sequence = tensor_viterbi(obs, tensor_normalized, emission, initial)
        # print(len(state_sequence))
        test_label_padded, state_sequence_padded = pad_to_match(test_label, state_sequence)

        # Now, calculate the accuracy
        accuracy = np.mean((state_sequence_padded == test_label_padded).astype(int))
        assume_0_accuracy = np.mean((test_label_padded == 0).astype(int))
        accuracy = max(accuracy, 1 - accuracy)  # Adjust based on expected behavior
        assume_0_accuracy = max(assume_0_accuracy, 1 - assume_0_accuracy)

        average_accuracy += accuracy
        average_assume_0_accuracy += assume_0_accuracy
        accuracy_list.append(accuracy)
        assume_0_accuracy_list.append(assume_0_accuracy)

    print(f'Average Accuracy: {average_accuracy / iters}')
    print(f'Average Assume 0 Accuracy: {average_assume_0_accuracy / iters}')
    print(f'Variance of Accuracy: {np.var(accuracy_list)}')
    print(f'Variance of Assume 0 Accuracy: {np.var(assume_0_accuracy_list)}')

Host ID: 1
Number of episodes: 907
Number of utterances: 46765
Number of unique words: 12754
Average Accuracy: 0.8641932699763092
Average Assume 0 Accuracy: 0.6843869052940297
Variance of Accuracy: 0.005663392175876443
Variance of Assume 0 Accuracy: 0.007438823836104866
Host ID: 2
Number of episodes: 11
Number of utterances: 580
Number of unique words: 1504
Average Accuracy: 0.9763361738843809
Average Assume 0 Accuracy: 0.7651988870468934
Variance of Accuracy: 2.829849916219758e-07
Variance of Assume 0 Accuracy: 0.00027726496123296415
Host ID: 3
Number of episodes: 4
Number of utterances: 248
Number of unique words: 807
Average Accuracy: 0.9719626168224299
Average Assume 0 Accuracy: 0.602803738317757
Variance of Accuracy: 0.0
Variance of Assume 0 Accuracy: 0.0
Host ID: 4
Number of episodes: 23
Number of utterances: 1345
Number of unique words: 2731
Average Accuracy: 0.9689558003497833
Average Assume 0 Accuracy: 0.5935444375192593
Variance of Accuracy: 2.4927755934466272e-05
Variance of

In [23]:
df.to_csv('archive/processed_utterances-2sp.csv', index=False)

In [17]:
filtered_df.to_csv('archive/most_episodes_host.csv', index=False)