In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import os
import warnings
from scipy.sparse import vstack
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
import spacy
import re
# Load the spaCy model for English. This needs to be installed via spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

warnings.filterwarnings("ignore")
from collections import Counter

In [60]:
from tqdm.notebook import tqdm
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

word_threshold = 20

# Preprocessing function to be applied to the entire DataFrame
def preprocess_text(df):
    # Normalize text: replace dashes with spaces, remove non-alphabetic characters except spaces
    df['utterance'] = df['utterance'].str.replace('-', ' ').str.lower()
    df['utterance'] = df['utterance'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))
    return df

# Function to tokenize and lemmatize with word frequency filtering
def tokenize_and_lemmatize(text, word_frequencies):
    words = text.split()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in words]
    filtered_tokens = [word if word_frequencies.get(word, 0) >= word_threshold else 'xxxxx' for word in lemmatized_tokens]
    return ' '.join(filtered_tokens)

def preprocess_batch(batch, word_frequencies):
    # Apply tokenization and lemmatization with word filtering
    batch['utterance'] = batch['utterance'].apply(lambda x: tokenize_and_lemmatize(x, word_frequencies))
    batch['is_question'] = batch['utterance'].str.contains(r'\?').astype(int)
    return batch

# Main preprocessing function
def preprocess_dataframe(dataframe, batch_size=1000):
    # Apply initial preprocessing
    dataframe = preprocess_text(dataframe)
    
    # Calculate word frequencies
    all_words = ' '.join(dataframe['utterance']).split()
    word_frequencies = Counter(all_words)
    
    # Initialize progress bar
    progress = tqdm(total=len(dataframe), desc="Processing batches")
    
    processed_batches = []
    for start_row in range(0, len(dataframe), batch_size):
        end_row = start_row + batch_size
        batch = dataframe.iloc[start_row:end_row]
        processed_batch = preprocess_batch(batch, word_frequencies)
        processed_batches.append(processed_batch)
        
        # Update progress
        progress.update(len(batch))
        
    progress.close()  # Ensure the progress bar is closed after processing
    
    return pd.concat(processed_batches)

# Load the DataFrame
df = pd.read_csv('archive/utterances-2sp.csv')

# Process the DataFrame
df_processed = preprocess_dataframe(df)

# Save the processed DataFrame
df_processed.to_csv('archive/processed_utterances-2sp.csv', index=False)


Processing batches:   0%|          | 0/1240112 [00:00<?, ?it/s]

In [63]:
# Split each utterance into a list of words and explode the DataFrame to get a row per word
words_series = df_processed['utterance'].str.split().explode()

# Convert the series to a set to get unique words
unique_words = set(words_series)

# get word frequencies
word_frequencies = words_series.value_counts().to_dict()

In [64]:
print(f'Number of unique words: {len(unique_words)}')

# sort the words by frequency and display the most common words
sorted_word_frequencies = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)
sorted_word_frequencies[:10]

Number of unique words: 17549


[('the', 943818),
 ('and', 544649),
 ('to', 525141),
 ('a', 524645),
 ('of', 488369),
 ('xxxxx', 444125),
 ('that', 392136),
 ('in', 353176),
 ('it', 286517),
 ('you', 275173)]