In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import os
import warnings
from scipy.sparse import vstack
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

warnings.filterwarnings("ignore")

In [77]:
# load data
df = pd.read_csv('archive/utterances-2sp.csv')

In [4]:
def episode_with_most_utterances(df):
    """
    Identifies the episode with the most rows (utterances).

    Parameters:
    df (pd.DataFrame): DataFrame containing the columns:
                       'episode', 'episode_order', 'turn_order',
                       'speaker_order', 'host_id', 'is_host', 'utterance'

    Returns:
    str: The episode identifier with the most utterances.
    """
    # Group by 'episode', count the rows in each group, then find the episode with the maximum count
    episode_counts = df.groupby('episode').size()
    max_utterances_episode = episode_counts.idxmax()
    
    return max_utterances_episode

largest_idx = episode_with_most_utterances(df)

In [71]:
# get the largest episode, column "episode" with index largest_idx
largest_episode = df[df['episode'] == largest_idx]

In [20]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the function for tokenizing and lemmatizing
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# Define the function to preprocess the data and create a new DataFrame of tokens
def create_token_dataframe(dataframe):
    # Process each utterance to tokenize and lemmatize
    dataframe['utterance'] = dataframe['utterance'].str.lower().apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    dataframe['tokens'] = dataframe['utterance'].apply(tokenize_and_lemmatize)
    # dataframe['num_tokens'] = [len(tokens) for tokens in dataframe['tokens']]
    
    # Explode the DataFrame to have each token as a separate row
    tokens_df = dataframe.explode('tokens').rename(columns={'tokens': 'token'})
    tokens_df = tokens_df.drop(columns=['utterance'])  # Optionally remove the original 'utterance' column

    return tokens_df

def build_vocabulary(tokens_df):
    """Builds a vocabulary from the tokens DataFrame."""
    vocabulary = {word: idx for idx, word in enumerate(tokens_df['token'].unique())}
    return vocabulary

def one_hot_encode(tokens_df, vocabulary):
    """
    One-hot encodes the tokens based on the provided vocabulary.
    
    Parameters:
    - tokens_df (pd.DataFrame): DataFrame where each row contains a token.
    - vocabulary (dict): A dictionary mapping words to indices.
    
    Returns:
    - csr_matrix: The one-hot encoded representation of the tokens.
    - list: A list of words corresponding to each row of the one-hot encoded matrix.
    """
    # Initialize a matrix of zeros with shape (number of tokens, vocabulary size)
    # data = np.zeros((len(tokens_df), len(vocabulary)), dtype=int)
    
    # Prepare a list to store the labels (words) in order
    labels = []
    
    # For each token, set the appropriate element to 1
    for i, token in enumerate(tokens_df['token']):
        # index = vocabulary[token]
        # data[i, index] = 1
        labels.append(token)  # Add the word to the labels list

    # Convert to a sparse CSR matrix for efficiency
    # one_hot_encoded_data = csr_matrix(data)
    return labels


In [62]:
import pandas as pd
import spacy
# Load the spaCy model for English. This needs to be installed via spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Lowercase and remove punctuation from text using vectorized operations."""
    text = text.str.lower()
    text = text.str.replace('[{}]'.format(string.punctuation), '', regex=True)
    return text

def tokenize_and_lemmatize_batch(text_series):
    """Tokenize and lemmatize a series of texts using spaCy."""
    # Process the texts as a batch using spaCy
    docs = list(nlp.pipe(text_series))
    # Extract lemmatized tokens for each document
    tokens = [[token.lemma_ for token in doc] for doc in docs]
    return tokens

def create_token_dataframe_optimized(dataframe):
    """Optimized DataFrame creation with tokenization and lemmatization."""
    # Preprocess the utterance column
    dataframe['utterance'] = preprocess_text(dataframe['utterance'])
    # Tokenize and lemmatize in batch mode
    dataframe['tokens'] = tokenize_and_lemmatize_batch(dataframe['utterance'])
    # Explode the DataFrame to have each token as a separate row
    tokens_df = dataframe.explode('tokens').rename(columns={'tokens': 'token'})
    tokens_df = tokens_df.drop(columns=['utterance'])  # Optionally remove the original 'utterance' column
    return tokens_df

def build_vocabulary(tokens_df):
    """Builds a vocabulary from the tokens DataFrame, using efficient pandas operations."""
    vocabulary = tokens_df['token'].drop_duplicates().reset_index(drop=True).to_dict()
    vocabulary = {v: k for k, v in vocabulary.items()}
    return vocabulary

def get_token_indices(tokens, vocabulary):
    """Get the indices of tokens based on the vocabulary."""
    indices = [vocabulary[token] for token in tokens]
    return indices

In [72]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function that will be used for tokenizing and lemmatizing
def tokenize_and_lemmatize(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# Modify the preprocess_and_vectorize_batch function
def preprocess_and_vectorize_batch(dataframe, vectorizer=None, batch_size=10000):
    if vectorizer is None:
        # Directly integrate tokenize_and_lemmatize with the vectorizer
        vectorizer = CountVectorizer(tokenizer=tokenize_and_lemmatize, preprocessor=None, lowercase=True)
        is_fit = False
    else:
        is_fit = True
    
    processed_batches = []
    for start_row in range(0, dataframe.shape[0], batch_size):
        batch = dataframe.iloc[start_row:start_row+batch_size].copy()
        batch['utterance'] = batch['utterance'].str.lower().apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

        if not is_fit:
            vectorized_batch = vectorizer.fit_transform(batch['utterance'])
            is_fit = True
        else:
            vectorized_batch = vectorizer.transform(batch['utterance'])
        
        processed_batches.append(vectorized_batch)
    
    vectorized_data = vstack(processed_batches)
    return vectorized_data, vectorizer

In [76]:
vectorized_data, vectorizer = preprocess_and_vectorize_batch(largest_episode)

In [63]:
# Preprocess and vectorize the largest episode
tokens_df = create_token_dataframe(largest_episode)
vocabulary = build_vocabulary(tokens_df)
labels = get_token_indices(tokens_df['token'], vocabulary)
label_set = set(labels)
labels_as_indices = [list(label_set).index(label) for label in labels]
print(labels_as_indices)
# one_hot_encoded_data.shape
# labels
obs = np.array([labels_as_indices])
n = 2
m = obs.shape[1]

# obs_flat = np.nonzero(obs)[1]

# hmm
h = hmm.CategoricalHMM(n_components=2, n_iter=200, tol=1e-4)
h.fit(obs.reshape(-1, 1))
# get the hidden states
hidden_states = h.predict(obs.reshape(-1, 1))

print(len(hidden_states))

# create labels from speaker_order column
labels = tokens_df['speaker_order'].values
print(len(labels))

# accuracy
accuracy = np.mean(hidden_states == labels)
print(accuracy)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 5, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 5, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 5, 9, 10, 11, 12, 13, 14, 15, 16, 11, 17, 10, 18, 19, 20, 21, 22, 15, 16, 11, 17, 10, 18, 19, 20, 21, 22, 15, 16, 11, 17, 10, 18, 19, 20, 21, 22, 15, 23, 10, 19, 24, 25, 26, 27, 28, 29, 11, 13, 30, 31, 32, 33, 15, 23, 10, 19, 24, 25, 26, 27, 28, 29, 11, 13, 30, 31, 32, 33, 15, 23, 10, 19, 24, 25, 26, 27, 28, 29, 11, 13, 30, 31, 32, 33, 7, 34, 35, 36, 37, 38, 39, 40, 32, 5, 41, 42, 43, 7, 34, 35, 36, 37, 38, 39, 40, 32, 5, 41, 42, 43, 7, 34, 35, 36, 37, 38, 39, 40, 32, 5, 41, 42, 43, 44, 45, 46, 47, 48, 11, 49, 50, 44, 45, 46, 47, 48, 11, 49, 50, 44, 45, 46, 47, 48, 11, 49, 50, 51, 52, 53, 54, 55, 56, 51, 52, 53, 54, 55, 56, 51, 52, 53, 54, 55, 56, 57, 57, 57, 58, 59, 58, 59, 58, 59, 5, 6, 60, 61, 62, 63, 15, 64, 55, 44, 45, 5, 6, 60, 61, 62, 63, 15, 64, 55, 44, 45, 5, 6, 60, 61, 62, 63, 15, 64, 55, 44, 45, 65, 44, 45, 65, 44, 45, 65, 44, 45, 66, 55, 67, 

In [54]:
def filter_df_by_host(df, host_id):
    """
    Filters the DataFrame to include all rows from episodes hosted by the specified host_id.
    
    Parameters:
    - df (pd.DataFrame): The original DataFrame with columns 'episode', 'episode_order', 'turn_order',
                         'speaker_order', 'host_id', 'is_host', 'utterance'.
    - host_id (int): The host ID to filter episodes by.
    
    Returns:
    - pd.DataFrame: Filtered DataFrame including all rows from episodes hosted by the given host_id.
    """
    # Identify episodes hosted by the given host_id
    hosted_episodes = df[(df['host_id'] == host_id) & (df['is_host'] == True)]['episode'].unique()
    
    # Filter the DataFrame for rows belonging to these episodes
    filtered_df = df[df['episode'].isin(hosted_episodes)]
    
    return filtered_df

# Example usage:
# Replace 123 with the actual host_id you're interested in
filtered_df = filter_df_by_host(df, host_id=12)
display(filtered_df)

Unnamed: 0,episode,episode_order,turn_order,speaker_order,host_id,is_host,utterance
14199,713,1,0,0,12,True,The U.S. ambassador to Ukraine says she was pu...
14200,713,1,1,0,12,True,And the president has decided to allow one U.S...
14201,713,1,2,0,12,True,Now the two are exchanging gunfire with U.S. t...
14202,713,2,0,0,12,True,"Ron Elving joins us now, our senior Washington..."
14203,713,2,1,0,12,True,"Ron, thanks so much for being with us."
...,...,...,...,...,...,...,...
1134639,129584,22,3,1,-1,False,She's the defending champion at the Olympics f...
1134640,129584,23,0,0,12,True,"Mr. Brown, thanks so much for your time."
1134641,129584,24,0,1,-1,False,All right.
1134642,129584,24,1,1,-1,False,Thank you as well.


In [52]:
filtered_df['episode'].nunique()

2032

In [68]:
# Preprocess and vectorize the largest episode
tokens_df = create_token_dataframe(filtered_df)
print('part 1')
vocabulary = build_vocabulary(tokens_df)
print('part 2')
labels = get_token_indices(tokens_df['token'], vocabulary)
# print(labels)
print('part 3')


obs = np.array([labels])
n = 2
m = obs.shape[1]

# obs_flat = np.nonzero(obs)[1]

# hmm
h = hmm.CategoricalHMM(n_components=n, n_iter=200, tol=1e-4)
h.fit(obs.reshape(-1, 1))
print('part 5')
# get the hidden states
hidden_states = h.predict(obs.reshape(-1, 1))

print(len(hidden_states))

# create labels from speaker_order column
labels = tokens_df['speaker_order'].values
print(len(labels))

# accuracy
accuracy = np.mean(hidden_states == labels)
print(accuracy)

part 1
part 2
part 5
part 6
part 7
1482808
1482808
0.5415286402555152
