In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import os
import warnings
from scipy.sparse import vstack
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

warnings.filterwarnings("ignore")

In [3]:
# load data
df = pd.read_csv('archive/utterances-2sp.csv')

In [4]:
def episode_with_most_utterances(df):
    """
    Identifies the episode with the most rows (utterances).

    Parameters:
    df (pd.DataFrame): DataFrame containing the columns:
                       'episode', 'episode_order', 'turn_order',
                       'speaker_order', 'host_id', 'is_host', 'utterance'

    Returns:
    str: The episode identifier with the most utterances.
    """
    # Group by 'episode', count the rows in each group, then find the episode with the maximum count
    episode_counts = df.groupby('episode').size()
    max_utterances_episode = episode_counts.idxmax()
    
    return max_utterances_episode

largest_idx = episode_with_most_utterances(df)

In [5]:
# get the largest episode, column "episode" with index largest_idx
largest_episode = df[df['episode'] == largest_idx]

In [11]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the function for tokenizing and lemmatizing
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# Define the function to preprocess the data and create a new DataFrame of tokens
def create_token_dataframe(dataframe):
    # Process each utterance to tokenize and lemmatize
    dataframe['utterance'] = dataframe['utterance'].str.lower().apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    dataframe['tokens'] = dataframe['utterance'].apply(tokenize_and_lemmatize)
    # dataframe['num_tokens'] = [len(tokens) for tokens in dataframe['tokens']]
    
    # Explode the DataFrame to have each token as a separate row
    tokens_df = dataframe.explode('tokens').rename(columns={'tokens': 'token'})
    tokens_df = tokens_df.drop(columns=['utterance'])  # Optionally remove the original 'utterance' column

    return tokens_df

def build_vocabulary(tokens_df):
    """Builds a vocabulary from the tokens DataFrame."""
    vocabulary = {word: idx for idx, word in enumerate(tokens_df['token'].unique())}
    return vocabulary

def one_hot_encode(tokens_df, vocabulary):
    """
    One-hot encodes the tokens based on the provided vocabulary.
    
    Parameters:
    - tokens_df (pd.DataFrame): DataFrame where each row contains a token.
    - vocabulary (dict): A dictionary mapping words to indices.
    
    Returns:
    - csr_matrix: The one-hot encoded representation of the tokens.
    - list: A list of words corresponding to each row of the one-hot encoded matrix.
    """
    # Initialize a matrix of zeros with shape (number of tokens, vocabulary size)
    data = np.zeros((len(tokens_df), len(vocabulary)), dtype=int)
    
    # Prepare a list to store the labels (words) in order
    labels = []
    
    # For each token, set the appropriate element to 1
    for i, token in enumerate(tokens_df['token']):
        index = vocabulary[token]
        data[i, index] = 1
        labels.append(token)  # Add the word to the labels list

    # Convert to a sparse CSR matrix for efficiency
    one_hot_encoded_data = csr_matrix(data)
    return one_hot_encoded_data, labels


In [12]:
# Preprocess and vectorize the largest episode
tokens_df = create_token_dataframe(largest_episode)
vocabulary = build_vocabulary(tokens_df)
one_hot_encoded_data, labels = one_hot_encode(tokens_df, vocabulary)
one_hot_encoded_data.shape
# labels

(3666, 397)

In [13]:
obs = one_hot_encoded_data.toarray()
n = 2
m = obs.shape[1]

obs_flat = np.nonzero(obs)[1]

# hmm
h = hmm.CategoricalHMM(n_components=2, n_iter=200, tol=1e-4)
h.fit(obs_flat.reshape(-1, 1))
# get the hidden states
hidden_states = h.predict(obs_flat.reshape(-1, 1))

print(len(hidden_states))

# create labels from speaker_order column
labels = tokens_df['speaker_order'].values
print(len(labels))

# accuracy
accuracy = np.mean(hidden_states == labels)
print(accuracy)

3666
3666
0.506001091107474


In [14]:
1455402 / 3666

397.0