In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import os
import warnings
from scipy.sparse import vstack
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
import spacy
import re
# Load the spaCy model for English. This needs to be installed via spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

warnings.filterwarnings("ignore")
from collections import Counter

In [2]:
from tqdm.notebook import tqdm
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

word_threshold = 10

# Preprocessing function to be applied to the entire DataFrame
def preprocess_text(df):
    # Normalize text: replace dashes with spaces, remove non-alphabetic characters except spaces
    df['utterance'] = df['utterance'].str.replace('-', ' ').str.lower()
    df['utterance'] = df['utterance'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))
    return df

# Function to tokenize and lemmatize with word frequency filtering
def tokenize_and_lemmatize(text, word_frequencies):
    words = text.split()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in words]
    filtered_tokens = [word if word_frequencies.get(word, 0) >= word_threshold else 'xxxxx' for word in lemmatized_tokens]
    return ' '.join(filtered_tokens)

def preprocess_batch(batch, word_frequencies):
    # Apply tokenization and lemmatization with word filtering
    batch['utterance'] = batch['utterance'].apply(lambda x: tokenize_and_lemmatize(x, word_frequencies))
    batch['is_question'] = batch['utterance'].str.contains(r'\?').astype(int)
    return batch

# Main preprocessing function
def preprocess_dataframe(dataframe, batch_size=1000):
    # Apply initial preprocessing
    dataframe = preprocess_text(dataframe)
    
    # Calculate word frequencies
    all_words = ' '.join(dataframe['utterance']).split()
    word_frequencies = Counter(all_words)
    
    # Initialize progress bar
    progress = tqdm(total=len(dataframe), desc="Processing batches")
    
    processed_batches = []
    for start_row in range(0, len(dataframe), batch_size):
        end_row = start_row + batch_size
        batch = dataframe.iloc[start_row:end_row]
        processed_batch = preprocess_batch(batch, word_frequencies)
        processed_batches.append(processed_batch)
        
        # Update progress
        progress.update(len(batch))
        
    progress.close()  # Ensure the progress bar is closed after processing
    
    return pd.concat(processed_batches)

# Load the DataFrame
df = pd.read_csv('archive/utterances-2sp.csv')

# Process the DataFrame
df_processed = preprocess_dataframe(df)

# select rows with df['episode_order'] == 1 and df['turn_order'] == 1, turn the first word of the utterance into 'yyyyy'
df_processed.loc[(df_processed['episode_order'] == 1) & (df_processed['turn_order'] == 0), 'utterance'] = df_processed.loc[(df_processed['episode_order'] == 1) & (df_processed['turn_order'] == 0), 'utterance'].apply(lambda x: 'yyyyy ' + ' '.join(x.split()[1:]))


# Save the processed DataFrame
df_processed.to_csv('archive/processed_utterances-2sp.csv', index=False)


Processing batches:   0%|          | 0/1240112 [00:00<?, ?it/s]

In [3]:
# Split each utterance into a list of words and explode the DataFrame to get a row per word
words_series = df_processed['utterance'].str.split().explode()

# Convert the series to a set to get unique words
unique_words = set(words_series)

# get word frequencies
word_frequencies = words_series.value_counts().to_dict()

In [4]:
print(f'Number of unique words: {len(unique_words)}')

# sort the words by frequency and display the most common words
sorted_word_frequencies = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)
sorted_word_frequencies[:10]

Number of unique words: 25371


[('the', 941774),
 ('and', 543561),
 ('to', 524992),
 ('a', 523793),
 ('of', 488340),
 ('that', 392101),
 ('in', 352344),
 ('it', 285590),
 ('you', 275047),
 ('is', 251901)]

# B

In [16]:
df_processed = pd.read_csv('archive/processed_utterances-2sp.csv')
episodes = df_processed['episode'].unique()

# Split the filtered episodes into training and testing sets
train_episodes = np.random.choice(episodes, int(0.8 * len(episodes)), replace=False)
train = df_processed[df_processed['episode'].isin(train_episodes)]
test = df_processed[~df_processed['episode'].isin(train_episodes)]

train_host = train[train['is_host'] == True]
train_guest = train[train['is_host'] == False]

In [9]:
# select a random test episode
test_episode = np.random.choice(test['episode'].unique())
test_episode_df = test[test['episode'] == test_episode]

# get all of the words in the test episode
test_episode_words = ' '.join(test_episode_df['utterance']).split()

# get the index of the words from unique_words
word_indices = {word: i for i, word in enumerate(unique_words)}

# get the indices of the words in the test episode
test_episode_word_indices = [word_indices[word] for word in test_episode_words]

# save the test episode word indices
np.save('archive/test_episode_word_indices.npy', test_episode_word_indices)

# get the in

In [33]:
def get_transition_matrix_efficient(df, word_to_index, d):
    # Initialize the transition counts matrix
    transition_counts = np.zeros((d, d), dtype=int)
    
    # Iterate over each utterance
    for utterance in df['utterance']:
        if type(utterance) != str:
            try:
                utterance = str(utterance)
            except:
                continue

        words = utterance.split()
        for i in range(1, len(words)):
            word1 = words[i - 1]
            word2 = words[i]
            if word1 in word_to_index and word2 in word_to_index:  # Check if both words are in the index
                index1 = word_to_index[word1]
                index2 = word_to_index[word2]
                transition_counts[index2, index1] += 1  # Increment the count of the transition
                
    # Handle columns with zero sum to avoid division by zero
    column_sums = transition_counts.sum(axis=0, keepdims=True)
    zero_columns = column_sums == 0
    column_sums[zero_columns] = 1  # Avoid division by zero by setting 0 sums to 1
    
    # Now make column stochastic
    transition_probabilities = transition_counts / column_sums

    # make 0 columns have 1 / d probability for each word
    transition_probabilities[:, zero_columns.flatten()] = 1 / d
    
    return transition_probabilities, word_to_index

# Apply the efficient function to get the transition matrices
transition_matrix_host, word_to_index_host = get_transition_matrix_efficient(train_host, word_indices, len(unique_words))
transition_matrix_guest, word_to_index_guest = get_transition_matrix_efficient(train_guest, word_indices, len(unique_words))

In [34]:
transition_matrix_host.shape, transition_matrix_guest.shape

# test if columns sum to 1
print(np.allclose(transition_matrix_host.sum(axis=0), 1))

# # save the transition matrices
np.save('vector-data/transition_matrix_host.npy', transition_matrix_host)
np.save('vector-data/transition_matrix_guest.npy', transition_matrix_guest)

True


# A

In [40]:
def tensor_trouble(df):
    # Unique words and mapping
    all_words = df['utterance'].str.split().explode()
    word_to_index = {word: index for index, word in enumerate(all_words)}
    unique_words = all_words.unique()
    d = len(unique_words)
    
    # Map each unique word to an index
    word_to_index = {word: index for index, word in enumerate(unique_words)}
    
    # Initialize the tensor: 2x2xd
    tensor = np.zeros((d, 2, 2), dtype=int)
    
    # Group by episodes and concatenate text with speaker roles
    for episode, group in df.groupby('episode'):
        # Flatten all text into one string per episode
        if type(group['utterance']) != str:
            group['utterance'] = group['utterance'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

        text = " ".join(group['utterance'].astype(str))
        words = text.split()

        # Replicate 'is_host' values for each word in the utterance
        roles = np.concatenate(group.apply(lambda row: [row['is_host']] * len(str(row['utterance']).split()), axis=1).values)
        
        for i in range(2, len(words)):
            # Get the current and next words
            word = words[i - 2]
            next_role = roles[i - 1]
            following_role = roles[i]

            # Get the current and next indices
            current_index = word_to_index[word]
            current_role_index = 0 if next_role else 1
            next_role_index = 0 if following_role else 1


            
            # Skip if any word is not in the index (unlikely given preprocessing, but safe practice)
            if current_index == -1:
                continue
            
            # Update the tensor based on speaker transitions
            tensor[current_index, next_role_index,current_role_index] += 1

    return tensor, word_to_index


tensor, word_to_index = tensor_trouble(train)

In [43]:
# normalize the tensor to be column stochastic for each layer
tensor_normalized = tensor / tensor.sum(axis=1, keepdims=True)

# check if the tensor is column stochastic
print(np.allclose(tensor_normalized.sum(axis=1), 1))

# save the tensor
np.save('vector-data/tensor.npy', tensor_normalized)


# view the first layer of the tensor
tensor_normalized[200]

False


array([[0.97142857, 0.00542005],
       [0.02857143, 0.99457995]])

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

df_processed = pd.read_csv('archive/processed_utterances-2sp.csv')

# Replace NaN values in 'utterance' with an empty string
df_processed['utterance'].fillna("", inplace=True)

# Proceed with TF-IDF embedding
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_processed['utterance'])

# Step 2: Applying PCA for dimensionality reduction
# Standardizing the data before PCA
scaler = StandardScaler(with_mean=False)  # Use with_mean=False for sparse matrix compatibility
tfidf_matrix_std = scaler.fit_transform(tfidf_matrix)

pca = PCA(n_components=0.55)  # Keep 95% of the variance
pca_result = pca.fit_transform(tfidf_matrix_std.toarray())

# Step 3: Clustering the PCA-reduced episode vectors
kmeans = KMeans(n_clusters=5, random_state=42)  # Example: 5 clusters, adjust based on your analysis
clusters = kmeans.fit_predict(pca_result)

# Optionally, visualize the clustering (if 2 PCA components)
if pca_result.shape[1] == 2:
    plt.figure(figsize=(8, 6))
    plt.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters, cmap='viridis', marker='o')
    plt.title('PCA-reduced TF-IDF Clustering')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.show()

# Adding cluster labels to your DataFrame
df_processed['cluster'] = clusters

# Save the processed DataFrame with cluster labels
df_processed.to_csv('archive/clustered_utterances-2sp.csv', index=False)


: 