In [1]:
import numpy as np
import pandas as pd
import warnings
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter
from tqdm.notebook import tqdm
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from collections import Counter

warnings.filterwarnings("ignore")

In [4]:
# Module-level WordNet lemmatizer; lemmatize_and_filter reads this global
lemmatizer = WordNetLemmatizer()

# NOTE(review): not referenced anywhere in the visible code -- preprocess_dataframe
# filters with a frequency threshold of 10 instead. Confirm which value is intended.
word_threshold = 2

# Patterns are compiled once at import time and reused for every utterance
pattern_dash = re.compile(r'-')
pattern_non_alpha = re.compile(r'[^a-z\s]')

# Text normalizer applied to each utterance of the DataFrame
def preprocess_text(text):
    """Normalize a single utterance string.

    Steps, in order:
      1. lowercase the text,
      2. turn every '-' into a space so hyphenated words split apart,
      3. delete every character that is not a-z or whitespace
         (digits, punctuation, accented letters, ...).

    Parameters
    ----------
    text : str
        Raw utterance.

    Returns
    -------
    str
        The normalized utterance; whitespace is left as-is (not collapsed).
    """
    lowered = text.lower()
    dashes_as_spaces = pattern_dash.sub(' ', lowered)
    letters_only = pattern_non_alpha.sub('', dashes_as_spaces)
    return letters_only

def calculate_word_frequencies(df):
    """Count whitespace-delimited tokens across the 'utterance' column.

    All utterances are joined with single spaces into one string, which is
    then split on whitespace; every resulting token is tallied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an 'utterance' column of strings.

    Returns
    -------
    collections.Counter
        Mapping token -> number of occurrences.
    """
    corpus = ' '.join(df['utterance'])
    frequencies = Counter()
    frequencies.update(corpus.split())
    return frequencies

def filter_low_frequency_words(word_frequencies, threshold):
    """Keep only the words whose count is at least `threshold`.

    Parameters
    ----------
    word_frequencies : mapping
        word -> count.
    threshold : number
        Minimum count (inclusive) a word needs to be kept.

    Returns
    -------
    dict
        A new plain dict with the surviving word -> count pairs, in the
        input's iteration order.
    """
    surviving = {}
    for word, count in word_frequencies.items():
        if count >= threshold:
            surviving[word] = count
    return surviving

def lemmatize_and_filter(tokens, word_frequencies):
    """Lemmatize each token and mask the ones missing from `word_frequencies`.

    Every token is passed through the module-level `lemmatizer`; the
    resulting lemma (not the original token) is then looked up in
    `word_frequencies`. Lemmas that are absent are replaced by the
    placeholder 'xxxxx'.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one utterance.
    word_frequencies : container
        Anything supporting `in`; the vocabulary to keep.

    Returns
    -------
    list of str
        One entry per input token: the lemma, or 'xxxxx'.
    """
    masked = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        if lemma in word_frequencies:
            masked.append(lemma)
        else:
            masked.append('xxxxx')
    return masked

def preprocess_dataframe(dataframe, threshold=10):
    """Normalize, lemmatize and frequency-filter the 'utterance' column.

    The DataFrame is modified in place and also returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have an 'utterance' column of strings.
    threshold : int, optional
        Minimum corpus frequency a word needs to survive filtering.
        Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        The same object, with 'utterance' rewritten and an integer
        'is_question' column (1 if the raw utterance contained '?') added.
    """
    # Flag questions on the RAW text: preprocess_text deletes every character
    # outside a-z/whitespace, so no '?' is left to detect afterwards.
    is_question = dataframe['utterance'].str.contains('?', regex=False, na=False).astype(int)

    dataframe['utterance'] = dataframe['utterance'].apply(preprocess_text)
    word_frequencies = calculate_word_frequencies(dataframe)
    filtered_frequencies = filter_low_frequency_words(word_frequencies, threshold)
    dataframe['utterance'] = dataframe['utterance'].apply(
        lambda x: ' '.join(lemmatize_and_filter(x.split(), filtered_frequencies))
    )
    dataframe['is_question'] = is_question
    return dataframe

def get_transition_matrix(df, word_to_index, d):
    """Build a column-stochastic word-bigram matrix from `df['utterance']`.

    Each utterance is split on whitespace; words missing from
    `word_to_index` are dropped before consecutive pairs are formed. Pairs
    never span two utterances.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an 'utterance' column of strings.
    word_to_index : dict
        word -> integer index in [0, d).
    d : int
        Vocabulary size (side length of the matrix).

    Returns
    -------
    numpy.ndarray, shape (d, d)
        Entry [current, previous] is the fraction of bigrams starting with
        `previous` that continue with `current`. Columns for words that
        never start a bigram are filled with the uniform value 1/d.
    """
    counts = np.zeros((d, d), dtype=np.int32)

    for utterance in df['utterance']:
        known = [word_to_index[word] for word in utterance.split() if word in word_to_index]
        # Walk consecutive (previous, current) index pairs
        for previous, current in zip(known, known[1:]):
            counts[current, previous] += 1

    # Normalize each column by the number of bigrams that start with its word
    totals = counts.sum(axis=0)
    seen_as_previous = totals != 0
    with np.errstate(divide='ignore', invalid='ignore'):
        # Columns that are never divided keep the uniform 1/d they start with
        probabilities = np.divide(
            counts,
            totals,
            out=np.full((d, d), 1.0 / d),
            where=seen_as_previous,
        )

    return probabilities

def tensor_trouble(df, word_to_index, d):
    """Count word-conditioned speaker-role transitions per episode.

    For every episode, all utterances are flattened (in row order) into one
    word sequence, and each word inherits the 'is_host' value of its
    utterance. For each position i >= 2 the count at
    ``tensor[index(words[i-2]), role(i), role(i-1)]`` is incremented, where
    a role is encoded as 0 when 'is_host' is truthy and 1 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'episode', 'utterance' and 'is_host' columns. An
        utterance may be a string or a list of strings.
    word_to_index : dict
        word -> integer index in [0, d). Words that are missing (or mapped
        to -1) are skipped.
    d : int
        Vocabulary size.

    Returns
    -------
    tuple
        ``(tensor, word_to_index)`` where `tensor` is an integer ndarray of
        shape (d, 2, 2) laid out as [word, to_role, from_role], and
        `word_to_index` is the mapping that was passed in.
    """
    tensor = np.zeros((d, 2, 2), dtype=int)

    for _, group in df.groupby('episode'):
        words = []
        roles = []
        # Flatten the episode into parallel word / role sequences without
        # writing back into the groupby slice.
        for utterance, is_host in zip(group['utterance'], group['is_host']):
            if isinstance(utterance, list):
                tokens = ' '.join(utterance).split()
            else:
                tokens = str(utterance).split()
            words.extend(tokens)
            roles.extend([is_host] * len(tokens))

        for i in range(2, len(words)):
            # .get() so that an unknown word reaches the skip below instead
            # of raising KeyError
            word_index = word_to_index.get(words[i - 2], -1)
            if word_index == -1:
                continue

            from_role = 0 if roles[i - 1] else 1
            to_role = 0 if roles[i] else 1
            tensor[word_index, to_role, from_role] += 1

    return tensor, word_to_index


# make emission matrix
def make_emission(host, guest):
    """Stack the host and guest matrices into one emission tensor.

    Parameters
    ----------
    host, guest : array-like, shape (n, n)
        Per-speaker matrices; `n` is taken from ``len(host)``.

    Returns
    -------
    numpy.ndarray, shape (n, n, 2), float64
        ``result[i, j, 0] == host[j, i]`` and
        ``result[i, j, 1] == guest[j, i]`` -- i.e. the two matrices are
        placed along the last axis and the first two axes are swapped.
    """
    size = len(host)
    stacked = np.empty((size, size, 2))
    stacked[..., 0] = host
    stacked[..., 1] = guest

    # Exchange the first two axes, keep the speaker axis last
    return stacked.transpose(1, 0, 2)


def _log_with_floor(probabilities, floor):
    """Return the elementwise log of `probabilities`, using `floor` for exact zeros."""
    probabilities = np.asarray(probabilities, dtype=float)
    # np.where builds a new array, so the caller's data is never modified
    return np.log(np.where(probabilities == 0, floor, probabilities))


def tensor_viterbi(obs, transition, emission, initial, structured = None):
    """Run the Viterbi algorithm with word-conditioned transition and emission tensors.

    Inputs:
        obs - sequence (n+1,): observation indexes; obs[0] is the start token
        transition - ndarray (d,2,2): transition probabilities, indexed
            [previous observation, to_state, from_state]
        emission - ndarray (d,d,2): emission probabilities, indexed
            [previous observation, current observation, state]
        initial - ndarray (2,): initial state probabilities
        structured - (optional) currently ignored

    Outputs:
        state_sequence - ndarray (n,): most likely state for each observation
            after the start token (empty when there is none)

    Exact-zero probabilities are replaced by 1e-25 before taking logs so a
    single unseen event cannot drive every path score to -inf. The input
    arrays are not modified.

    NOTE(review): the transition slice is selected by the immediately
    preceding observation; confirm that this matches the way the transition
    tensor was estimated.
    """
    b_eps = 1e-25

    # Nothing to decode if only the start token (or nothing at all) was given
    if len(obs) < 2:
        return np.zeros(0, dtype=int)

    # Strip the start token exactly once; obs[t] is now the t-th decoded position
    start_index = obs[0]
    obs = obs[1:]
    n = len(obs)

    # Viterbi scores (log domain) and backpointers
    eta = np.zeros((n, 2))
    backpointers = np.zeros((n, 2), dtype=int)

    # First position: prior times emission of obs[0] given the start token
    eta[0] = _log_with_floor(initial, b_eps) + _log_with_floor(emission[start_index, obs[0], :], b_eps)

    # Fill every remaining row, including the last one used for backtracking
    for i in range(1, n):
        log_b = _log_with_floor(emission[obs[i-1], obs[i], :], b_eps)
        log_a = _log_with_floor(transition[obs[i-1], :, :], b_eps)

        # eta_candidate[to_state, from_state]
        eta_candidate = log_a + log_b[:, np.newaxis] + eta[i-1][np.newaxis, :]
        eta[i] = np.max(eta_candidate, axis=1)
        backpointers[i] = np.argmax(eta_candidate, axis=1)

    # Backtrack from the best final state
    state_sequence = np.zeros(n, dtype=int)
    state_sequence[-1] = np.argmax(eta[-1])
    for i in range(n-2, -1, -1):
        state_sequence[i] = backpointers[i+1, state_sequence[i+1]]

    return state_sequence



# Load the raw utterances (one row per utterance)
df = pd.read_csv('archive/utterances-2sp.csv')

# Normalize, lemmatize and frequency-filter the utterances (modifies df in place and returns it)
df_processed = preprocess_dataframe(df)

# Select rows with episode_order == 1 and turn_order == 0 and replace the first word of the utterance with 'yyyyy'
# (presumably the unique start token that tensor_viterbi expects at obs[0] -- confirm)
df_processed.loc[(df_processed['episode_order'] == 1) & (df_processed['turn_order'] == 0), 'utterance'] = df_processed.loc[(df_processed['episode_order'] == 1) & (df_processed['turn_order'] == 0), 'utterance'].apply(lambda x: 'yyyyy ' + ' '.join(x.split()[1:]))

print('Dataframe processed')

# Split each utterance into a list of words and explode the Series to get one entry per word
words_series = df_processed['utterance'].str.split().explode()

# Convert the series to a set to get unique words
unique_words = set(words_series)

# word -> integer index; the enumeration order of a set is arbitrary, so the
# indices are not guaranteed to be the same from run to run
word_indices = {word: i for i, word in enumerate(unique_words)}

episodes = df_processed['episode'].unique()

# Split the episodes into training (80%, sampled without replacement) and testing sets.
# No random seed is set in the visible code, so the split changes between runs.
train_episodes = np.random.choice(episodes, int(0.8 * len(episodes)), replace=False)
train = df_processed[df_processed['episode'].isin(train_episodes)]
test = df_processed[~df_processed['episode'].isin(train_episodes)]

# Per-speaker training subsets
train_host = train[train['is_host'] == True]
train_guest = train[train['is_host'] == False]

print('Dataframe split')

# One word-bigram transition matrix per speaker role
transition_matrix_host = get_transition_matrix(train_host, word_indices, len(unique_words))
transition_matrix_guest = get_transition_matrix(train_guest, word_indices, len(unique_words))

print('Transition matrices calculated')

# Word-conditioned role-transition counts, shape (vocabulary size, 2, 2)
tensor, word_to_index = tensor_trouble(train, word_indices, len(unique_words))

# Normalize along axis 1; the 1e-9 keeps all-zero slices from dividing by zero
tensor_normalized = tensor / (tensor.sum(axis=1, keepdims=True) +  1e-9)

print('Tensor calculated')

# Prior over the two states; index 0 is the host state in tensor_trouble's encoding (0 if is_host else 1)
initial = np.array([0.9, 0.1])

# Stack the host/guest bigram matrices into the emission tensor used by tensor_viterbi
emission = make_emission(transition_matrix_host, transition_matrix_guest)

print('Emission calculated')

# Running sums over the sampled test episodes
average_accuracy = 0
average_assume_0_accuracy = 0
iters = 200
print('Starting iterations')
for i in range(iters):

    # select a random test episode (drawn independently each iteration, so episodes can repeat)
    test_episode = np.random.choice(test['episode'].unique())
    test_episode_df = test[test['episode'] == test_episode][['is_host', 'utterance']]

    # Split utterances into words
    test_episode_df['utterance'] = test_episode_df['utterance'].str.split()

    # Expand the lists into separate rows, replicating the 'is_host' value for each word
    test_episode_df = test_episode_df.explode('utterance')

    # Now, each row in test_episode_df contains a single word and its corresponding 'is_host' value
    test_label = test_episode_df['is_host'].to_numpy().astype(int)

    # get all of the words in the test episode
    test_episode_words = ' '.join(test_episode_df['utterance'].astype(str)).split()


    #  get the indices of the words in the test episode
    test_episode_word_indices = [word_indices[word] for word in test_episode_words]
    obs = test_episode_word_indices

    # Decoded sequence has one state per observation after the first one
    state_sequence = tensor_viterbi(obs, tensor_normalized, emission, initial)


    # get the accuracy of the viterbi algorithm.
    # Labels use 1 for host (is_host cast to int) while tensor_trouble encodes host as
    # state 0, so `!=` counts the positions where prediction and label agree on the speaker.
    # test_label[1:] drops the label of the first word, which tensor_viterbi consumes as the start token.
    accuracy = np.mean(state_sequence != test_label[1:])
    assume_0_accuracy = np.mean(np.zeros(len(test_label[1:])) != test_label[1:])
    # NOTE(review): the two folds below map any per-episode score under 0.5 to 1 - score,
    # so the reported averages can never fall below 0.5 -- confirm this is intended.
    if accuracy < 0.5:
        accuracy = 1 - accuracy
    if assume_0_accuracy < 0.5:
        assume_0_accuracy = 1 - assume_0_accuracy
    # print(f'Accuracy: {accuracy}')
    # print(f'Episode: {test_episode}')
    # print(f'Length of state sequence: {len(state_sequence)}')
    # print(f'Percentage of host words: {np.mean(state_sequence)}')
    average_accuracy += accuracy
    average_assume_0_accuracy += assume_0_accuracy

print(f'Average Accuracy: {average_accuracy / iters}')
print(f'Average Assume 0 Accuracy: {average_assume_0_accuracy / iters}')

Dataframe processed
Dataframe split
Transition matrices calculated
Tensor calculated
