In [8]:
# pip install pandas numpy scipy scikit-learn spacy gensim nltk

In [9]:
import os

import gensim.downloader as api
import numpy as np
import pandas as pd
import spacy
from scipy.sparse import csr_matrix, hstack, save_npz
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
# Load the spaCy English model, downloading it only when it is not installed.
# An unconditional download re-fetches the package on every run and makes the
# notebook unusable offline even when the model is already present.
SPACY_MODEL_NAME = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 11.7 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
# Pre-trained GloVe word vectors, fetched through gensim's downloader API
glove_model = api.load(name="glove-wiki-gigaword-50")

In [12]:
# Survey columns, grouped by how their free-text answers are vectorized.

# Short answers: joined per row and fed to the bag-of-words vectorizer.
short_text_columns = [
    # pre-survey
    "api_usage_pre",
    "tableau_usage_pre",
    "ml_application_pre",
    "persona_explanation_pre",
    # post-survey
    "tools_usage_post",
    "api_usage_post",
    "ml_application_post",
]

# Long answers: joined per row and embedded with GloVe.
long_text_columns = [
    "data_collection_explanation_post",
    "data_analysis_explanation_post",
    "persona_building_explanation_post",
    "evaluation_explanation_post",
]

In [13]:
def vectorize_sentence_with_glove(sentence, model, vector_size=50, tokenizer=None):
    """Return the mean embedding of the words in ``sentence`` that ``model`` knows.

    Args:
        sentence: Raw text to embed.
        model: Embedding lookup supporting ``word in model`` and ``model[word]``
            (for example the object returned by ``gensim.downloader.load``).
        vector_size: Length of the zero vector returned when no token of
            ``sentence`` is found in ``model``.
        tokenizer: Optional callable that turns text into an iterable of
            objects exposing a ``.text`` attribute. When omitted, the
            notebook's global spaCy pipeline ``nlp`` is used.

    Returns:
        A 1-D numpy array: the element-wise mean of the matched word vectors,
        or ``np.zeros(vector_size)`` when nothing matched.
    """
    tokenize = nlp if tokenizer is None else tokenizer

    vectors = []
    for token in tokenize(sentence):
        text = token.text
        if text in model:
            vectors.append(model[text])
            continue
        # Fall back to the lowercase form: with an uncased vocabulary an exact
        # match silently drops capitalised tokens ("API", "Tableau", and any
        # sentence-initial word).
        lowered = text.lower()
        if lowered in model:
            vectors.append(model[lowered])

    if not vectors:
        return np.zeros(vector_size)  # no known words: neutral zero vector
    return np.mean(vectors, axis=0)

In [14]:
def process_survey(file_name, use_count_vectorizer=False, save=True):
    """Vectorize one survey CSV into a single sparse feature matrix.

    The columns in ``short_text_columns`` are joined per row and encoded with
    a bag-of-words vectorizer. The columns in ``long_text_columns`` are joined
    per row and encoded with ``vectorize_sentence_with_glove`` using the
    global ``glove_model``. The two matrices are stacked side by side.

    Args:
        file_name: Path of the survey CSV file to read.
        use_count_vectorizer: If True, encode short answers with
            ``CountVectorizer``; otherwise with ``TfidfVectorizer(norm=None)``.
        save: If True, write the stacked matrix with ``save_npz`` to
            ``<file_name without extension>_vectorized.npz``.

    Returns:
        The stacked sparse feature matrix, or ``None`` if any step raised.
        Errors (including missing required columns) are printed, not raised.
    """
    try:
        df = pd.read_csv(file_name)
        print(f"Processing survey: {file_name}")

        # Ensure required columns exist
        missing_cols = [col for col in short_text_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing short_text_columns: {missing_cols}")

        missing_cols = [col for col in long_text_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing long_text_columns: {missing_cols}")

        # Use either TF-IDF or CountVectorizer based on parameter
        if use_count_vectorizer:
            vectorizer = CountVectorizer()
        else:
            # norm=None disables per-row normalization, so the raw TF-IDF
            # weights are kept. (TF-IDF weights are non-negative either way.)
            vectorizer = TfidfVectorizer(norm=None)

        # astype(str) after fillna: a column parsed as numeric would otherwise
        # make " ".join fail, while missing cells still become "" (not "nan").
        short_text = df[short_text_columns].fillna("").astype(str).agg(" ".join, axis=1)
        short_response_vectors = vectorizer.fit_transform(short_text)

        # GloVe embeddings for long responses; kept in a local Series instead
        # of adding a throwaway column to the DataFrame.
        long_text = df[long_text_columns].fillna("").astype(str).agg(" ".join, axis=1)
        glove_rows = long_text.apply(
            lambda text: vectorize_sentence_with_glove(text, glove_model)
        )

        # Convert GloVe vectors to sparse format so both parts can be stacked
        glove_vectors = csr_matrix(np.vstack(glove_rows.tolist()))

        # Stack both feature matrices
        final_feature_matrix = hstack([short_response_vectors, glove_vectors])

        print(f"Vectorization complete for: {file_name}, Shape: {final_feature_matrix.shape}")

        # Save vectorized output if enabled
        if save:
            # splitext only touches the real extension; str.replace(".csv", ...)
            # rewrote every ".csv" occurrence in the path and did nothing for
            # other extensions.
            root, _ext = os.path.splitext(file_name)
            output_file = f"{root}_vectorized.npz"
            save_npz(output_file, final_feature_matrix)
            print(f"Saved vectorized features to: {output_file}")

        return final_feature_matrix

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None
        # so a batch of surveys can continue past a bad file.
        print(f"Error processing {file_name}: {e}")
        return None