In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import os
import nltk
import warnings

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings raised
# by the libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Load the utterance table from a CSV path relative to the working directory.
# Later cells group and filter this frame by its 'episode' column.
df = pd.read_csv('archive/utterances-2sp.csv')

In [7]:
def episode_with_most_utterances(df):
    """
    Return the 'episode' value that appears on the largest number of rows.

    Parameters:
    df (pd.DataFrame): Utterance table; only its 'episode' column is read.
                       Each row counts as one utterance.

    Returns:
    The 'episode' value with the highest row count (same type as the values
    stored in that column).
    """
    # size() yields the number of rows per episode; idxmax() then gives the
    # label of the group with the largest count.
    return df.groupby('episode').size().idxmax()

# Identifier of the episode with the most rows in `df`. Despite the name, this
# is a value taken from the 'episode' column, not a row position.
largest_idx = episode_with_most_utterances(df)

In [8]:
# Keep only the rows of df whose 'episode' value equals largest_idx
largest_episode = df.loc[df['episode'].eq(largest_idx)]

In [None]:
# Module-level WordNet lemmatizer, created once and used by tokenize_and_lemmatize.
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """
    Tokenize a string and lemmatize each resulting token.

    Parameters:
    text (str): Text to process.

    Returns:
    str: The lemmatized tokens joined by single spaces.

    Note: relies on the module-level `lemmatizer` and on the NLTK tokenizer
    and WordNet data being installed locally.
    """
    # Reach the tokenizer through the already-imported `nltk` package: the bare
    # name `word_tokenize` is not imported by the notebook's import cell, so
    # calling it directly raised NameError on a fresh kernel.
    tokens = nltk.word_tokenize(text)
    # lemmatize() is called without a part-of-speech argument, so the
    # lemmatizer's default is used for every token.
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)
def preprocess_and_vectorize_batch(dataframe, vectorizer=None, batch_size=10000):
    if vectorizer is None:
        vectorizer = CountVectorizer()
        is_fit = False
    else:
        is_fit = True
    
    processed_batches = []
    for start_row in range(0, dataframe.shape[0], batch_size):
        # Explicitly make a copy of the batch to avoid SettingWithCopyWarning
        batch = dataframe.iloc[start_row:start_row+batch_size].copy()
        batch['utterance'] = batch['utterance'].str.lower()
        batch['utterance'] = batch['utterance'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        batch['utterance'] = batch['utterance'].apply(tokenize_and_lemmatize)
        
        if not is_fit:
            vectorized_batch = vectorizer.fit_transform(batch['utterance'])
            is_fit = True
        else:
            vectorized_batch = vectorizer.transform(batch['utterance'])
        
        processed_batches.append(vectorized_batch)
    
    vectorized_data = vstack(processed_batches)