In [1]:
# Prerequisites: install once into the kernel's environment (in Jupyter, prefer the %pip magic):
# pip install pandas numpy scipy scikit-learn spacy gensim nltk

In [2]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix, save_npz
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import spacy
import gensim.downloader as api

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the spaCy English model, downloading it only when it is not installed yet.
# spacy.load raises OSError if the model package cannot be found; calling
# spacy.cli.download unconditionally re-fetches the wheel on every run and
# triggers the "restart kernel" warning even when the model is already present.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 1.6 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# Pre-trained GloVe word vectors, obtained through gensim's downloader API
glove_model = api.load(name="glove-wiki-gigaword-50")

In [5]:
# Input CSVs to vectorize: one file per tokenizer variant, numbered 01.1 through 01.6
datasets = [
    f"01.{index}_df_{tokenizer}_token_stopwords_lemmatize.csv"
    for index, tokenizer in enumerate(
        ("word", "subword", "sentence", "bert", "tiktoken", "whitespace"),
        start=1,
    )
]

In [6]:
# Survey columns, grouped by how process_survey vectorizes them.

# Joined per row and fed to the TF-IDF / count vectorizer
short_text_columns = [
    # columns with the "_pre" suffix
    "api_usage_pre",
    "tableau_usage_pre",
    "ml_application_pre",
    "persona_explanation_pre",
    # columns with the "_post" suffix
    "tools_usage_post",
    "api_usage_post",
    "ml_application_post",
]

# Joined per row and embedded as an averaged GloVe vector
long_text_columns = [
    "data_collection_explanation_post",
    "data_analysis_explanation_post",
    "persona_building_explanation_post",
    "evaluation_explanation_post",
]

In [7]:
# Module-level accumulators filled by process_survey:
# output paths that were written, and input files whose processing raised an error
saved_files, failed_files = [], []

In [8]:
def vectorize_sentence_with_glove(sentence, model, vector_size=50):
    """Embed a sentence as the mean of the word vectors of its known tokens.

    Args:
        sentence: Text to embed. Anything that is not a non-blank string
            (e.g. None or a float NaN from pandas) is treated as having no
            known words.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]`` (e.g. the gensim vectors loaded in this notebook).
        vector_size: Length of the zero vector returned when no token is
            known. Only used when ``model`` has no ``vector_size`` attribute;
            otherwise the model's own dimensionality wins, so the fallback
            always has the same length as real embeddings.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all tokens
        found in ``model``, or a zero vector when none are found.
    """
    # Prefer the model's dimensionality so rows can always be stacked together.
    dim = getattr(model, "vector_size", vector_size)

    if not isinstance(sentence, str) or not sentence.strip():
        return np.zeros(dim)

    # make_doc runs only the tokenizer; the tagger/parser/NER stages of the
    # pipeline are not needed to read token.text.
    # NOTE(review): the lookup is case-sensitive; the GloVe vocabulary is
    # presumably lowercase - confirm inputs are lowercased upstream.
    known_tokens = [token.text for token in nlp.make_doc(sentence) if token.text in model]
    if not known_tokens:
        return np.zeros(dim)  # no known words: fall back to a zero vector
    return np.mean([model[word] for word in known_tokens], axis=0)

In [9]:
def _join_text_columns(df, columns):
    """Concatenate the given columns row-wise into one space-separated string per row."""
    # fillna first so missing answers become "" instead of the literal "nan",
    # then cast so numeric answers do not make str.join raise TypeError.
    return df[columns].fillna("").astype(str).agg(" ".join, axis=1)


def _vectorized_output_path(file_name):
    """Return the .npz path for file_name (foo.csv -> foo_vectorized.npz)."""
    # Only strip a trailing .csv extension; str.replace would also rewrite a
    # ".csv" occurring elsewhere in the path (e.g. in a directory name).
    root, ext = os.path.splitext(file_name)
    if ext.lower() != ".csv":
        root = file_name
    return f"{root}_vectorized.npz"


def process_survey(file_name, use_count_vectorizer=False, save=True):
    """Vectorize one survey CSV with TF-IDF (or counts) plus GloVe embeddings.

    The short-answer columns (``short_text_columns``) are joined per row and
    turned into a sparse bag-of-words matrix; the long-answer columns
    (``long_text_columns``) are joined per row and embedded with
    ``vectorize_sentence_with_glove``. Both matrices are stacked horizontally.

    Args:
        file_name: Path of the CSV file to read.
        use_count_vectorizer: Use ``CountVectorizer`` instead of
            ``TfidfVectorizer`` for the short-answer columns.
        save: When true, write the matrix next to the input file as
            ``<name>_vectorized.npz`` and record the path in ``saved_files``.

    Returns:
        The stacked sparse feature matrix, or ``None`` if any step failed.
        Failures are not raised: the error is printed and ``file_name`` is
        recorded in ``failed_files`` so the remaining datasets still run.
    """
    try:
        # Ensure file exists before proceeding
        if not os.path.exists(file_name):
            raise FileNotFoundError(f"File not found: {file_name}")

        df = pd.read_csv(file_name)
        print(f"Processing survey: {file_name}")

        # Ensure required columns exist (short group is checked first)
        for group_name, required in (
            ("short_text_columns", short_text_columns),
            ("long_text_columns", long_text_columns),
        ):
            missing_cols = [col for col in required if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing {group_name}: {missing_cols}")

        # Use either TF-IDF or CountVectorizer based on parameter
        if use_count_vectorizer:
            vectorizer = CountVectorizer()
        else:
            # norm=None turns off row normalisation, keeping the raw TF-IDF weights
            vectorizer = TfidfVectorizer(norm=None)

        short_response_vectors = vectorizer.fit_transform(
            _join_text_columns(df, short_text_columns)
        )

        # GloVe embeddings for long responses, one dense row per survey row
        glove_rows = [
            vectorize_sentence_with_glove(text, glove_model)
            for text in _join_text_columns(df, long_text_columns)
        ]
        # Convert to sparse so it can be stacked with the bag-of-words matrix
        glove_vectors = csr_matrix(np.vstack(glove_rows))

        # Stack both feature matrices
        final_feature_matrix = hstack([short_response_vectors, glove_vectors])

        # Validate before reporting success or saving
        if final_feature_matrix.shape[0] == 0 or final_feature_matrix.shape[1] == 0:
            raise ValueError(f"Vectorized matrix is empty for {file_name}")

        print(f"Vectorization complete for: {file_name}, Shape: {final_feature_matrix.shape}")

        # Save vectorized output in the same directory as the original file
        if save:
            output_file = _vectorized_output_path(file_name)
            save_npz(output_file, final_feature_matrix)
            # Re-running the cell must not list the same output twice
            if output_file not in saved_files:
                saved_files.append(output_file)
            print(f"Saved vectorized features to: {output_file}")

        return final_feature_matrix

    except Exception as e:
        # Deliberate best-effort: report, record the failure, and let the caller continue
        print(f"Error processing {file_name}: {e}")
        if file_name not in failed_files:
            failed_files.append(file_name)
        return None

In [10]:
# Run the vectorization over every configured dataset file, saving each result
for dataset in datasets:
    process_survey(dataset, use_count_vectorizer=False, save=True)

Processing survey: 01.1_df_word_token_stopwords_lemmatize.csv
Vectorization complete for: 01.1_df_word_token_stopwords_lemmatize.csv, Shape: (26, 339)
Saved vectorized features to: 01.1_df_word_token_stopwords_lemmatize_vectorized.npz
Processing survey: 01.2_df_subword_token_stopwords_lemmatize.csv
Vectorization complete for: 01.2_df_subword_token_stopwords_lemmatize.csv, Shape: (26, 579)
Saved vectorized features to: 01.2_df_subword_token_stopwords_lemmatize_vectorized.npz
Processing survey: 01.3_df_sentence_token_stopwords_lemmatize.csv
Vectorization complete for: 01.3_df_sentence_token_stopwords_lemmatize.csv, Shape: (26, 348)
Saved vectorized features to: 01.3_df_sentence_token_stopwords_lemmatize_vectorized.npz
Processing survey: 01.4_df_bert_token_stopwords_lemmatize.csv
Vectorization complete for: 01.4_df_bert_token_stopwords_lemmatize.csv, Shape: (26, 360)
Saved vectorized features to: 01.4_df_bert_token_stopwords_lemmatize_vectorized.npz
Processing survey: 01.5_df_tiktoken_tok

In [11]:
# Report which outputs were written and which inputs failed
def print_summary():
    """Print a recap built from the module-level saved_files and failed_files lists."""
    report = [
        "\n--- Processing Summary ---",
        f"Total files processed: {len(saved_files) + len(failed_files)}",
        f"Successfully saved: {len(saved_files)} files",
    ]
    report.extend(f"   - {path}" for path in saved_files)

    # The failure section only appears when at least one file failed
    if failed_files:
        report.append(f"\nFailed to process {len(failed_files)} files:")
        report.extend(f"   - {path}" for path in failed_files)

    print("\n".join(report))

# Print the final summary of the saved_files / failed_files entries
# recorded by process_survey while the datasets were processed
print_summary()


--- Processing Summary ---
Total files processed: 6
Successfully saved: 6 files
   - 01.1_df_word_token_stopwords_lemmatize_vectorized.npz
   - 01.2_df_subword_token_stopwords_lemmatize_vectorized.npz
   - 01.3_df_sentence_token_stopwords_lemmatize_vectorized.npz
   - 01.4_df_bert_token_stopwords_lemmatize_vectorized.npz
   - 01.5_df_tiktoken_token_stopwords_lemmatize_vectorized.npz
   - 01.6_df_whitespace_token_stopwords_lemmatize_vectorized.npz
