In [5]:
# import libraries
import os
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import spacy

In [6]:
# Load spaCy's small English pipeline; vectorize_sentence_with_glove calls it
# to split text into tokens before looking them up in the embedding model.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Survey CSV files to process, given as paths relative to the working directory.
# NOTE(review): the names suggest one file per tokenization variant of the same
# stop-word-filtered, lemmatized dataset -- confirm against the preprocessing step.
survey_files = [
    "dataset_bert_token_stopwords_lemmatize.csv",
    "dataset_sentence_token_stopwords_lemmatize.csv",
    "dataset_subword_token_stopwords_lemmatize.csv",
    "dataset_tiktoken_token_stopwords_lemmatize.csv",
    "dataset_whitespace_token_stopwords_lemmatize.csv",
    "dataset_word_token_stopwords_lemmatize.csv",
]

In [8]:
# Free-text survey columns, split into two groups by answer length.
# Short answers are encoded with TF-IDF in process_survey.
short_text_columns = [
    "api_usage_pre",
    "tableau_usage_pre",
    "ml_application_pre",
    "persona_explanation_pre",
    "tools_usage_post",
    "api_usage_post",
    "ml_application_post",
]
# Long, open-ended answers are encoded with averaged GloVe embeddings.
long_text_columns = [
    "data_collection_explanation_post",
    "data_analysis_explanation_post",
    "persona_building_explanation_post",
    "evaluation_explanation_post",
]

In [9]:
# Load the pre-trained "glove-wiki-gigaword-50" GloVe word vectors
# (50 dimensions per word) through gensim's downloader API.
glove_model = api.load("glove-wiki-gigaword-50")

In [10]:
# Maps each survey file name to the feature matrix returned by process_survey
# (None when processing that file failed).
vectorized_surveys = {}

In [11]:
def vectorize_sentence_with_glove(sentence, model, vector_size=50, tokenizer=None):
    """
    Encode a piece of text as the mean of its word-embedding vectors.

    The text is tokenized, every token that has an entry in ``model`` is
    looked up, and the resulting vectors are averaged. Tokens without an
    entry are ignored.

    Parameters
    ----------
    sentence : str
        Text to encode. Non-string or blank input yields the zero vector.
    model : mapping-like
        Word-vector lookup supporting ``word in model`` and ``model[word]``
        (e.g. the gensim GloVe model loaded in this notebook).
    vector_size : int, default 50
        Length of the zero vector returned when no token is known; should
        match the dimensionality of ``model``.
    tokenizer : callable, optional
        Called as ``tokenizer(sentence)``; must yield objects with a ``.text``
        attribute. Defaults to the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    numpy.ndarray
        1-D array: the element-wise mean of the matched vectors, or
        ``np.zeros(vector_size)`` when nothing matched.
    """
    # Missing answers may arrive as NaN/None; treat them like empty text
    # instead of letting the tokenizer raise.
    if not isinstance(sentence, str) or not sentence.strip():
        return np.zeros(vector_size)

    if tokenizer is None:
        tokenizer = nlp  # resolved at call time from the notebook namespace

    vectors = []
    for token in tokenizer(sentence):
        text = token.text
        if text in model:
            vectors.append(model[text])
            continue
        # The glove-wiki-gigaword vocabulary is lowercase, so "Python" or
        # "API" would be dropped without this fallback. The exact form is
        # tried first so a cased model keeps working unchanged.
        lowered = text.lower()
        if lowered in model:
            vectors.append(model[lowered])

    if not vectors:
        return np.zeros(vector_size)  # no known words found
    return np.mean(vectors, axis=0)

In [12]:
def process_survey(file_name):
    """
    Vectorize a single survey dataset into one combined feature matrix.

    - The columns in ``short_text_columns`` are joined per row and encoded
      with TF-IDF (vocabulary fitted on this file only).
    - The columns in ``long_text_columns`` are joined per row and encoded as
      the mean GloVe embedding of their tokens.
    - Both encodings are stacked column-wise.

    Parameters
    ----------
    file_name : str
        Path of the survey CSV file to read.

    Returns
    -------
    scipy.sparse matrix or None
        CSR matrix with one row per CSV row; the TF-IDF columns come first,
        followed by the GloVe dimensions. ``None`` is returned (after
        printing the error) if any step fails, so a batch loop over several
        files can continue.
    """
    try:
        df = pd.read_csv(file_name)
        print(f"Processing survey: {file_name}")

        # Cast to str after filling NaNs: read_csv infers a numeric dtype for
        # any column whose answers are all numbers, and " ".join rejects
        # non-string items.
        short_text = df[short_text_columns].fillna("").astype(str).agg(" ".join, axis=1)
        long_text = df[long_text_columns].fillna("").astype(str).agg(" ".join, axis=1)

        tfidf_vectorizer = TfidfVectorizer()
        short_response_vectors = tfidf_vectorizer.fit_transform(short_text)

        # Take the embedding width from the model itself rather than
        # hard-coding 50, so swapping the GloVe variant needs no change here.
        embedding_size = glove_model.vector_size
        glove_vectors = np.vstack(
            [vectorize_sentence_with_glove(text, glove_model, embedding_size) for text in long_text]
        )

        # hstack defaults to COO, which cannot be row-indexed; request CSR
        # so the result can be sliced by callers.
        final_feature_matrix = hstack([short_response_vectors, glove_vectors], format="csr")

        print(f"Vectorization complete for: {file_name}, Shape: {final_feature_matrix.shape}")

        return final_feature_matrix

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None.
        print(f"Error processing {file_name}: {e}")
        return None


In [13]:
# Run the vectorization pipeline on every survey file, keyed by file name
vectorized_surveys.update(
    (path, process_survey(path)) for path in survey_files
)

Processing survey: dataset_bert_token_stopwords_lemmatize.csv
Vectorization complete for: dataset_bert_token_stopwords_lemmatize.csv, Shape: (26, 360)
Processing survey: dataset_sentence_token_stopwords_lemmatize.csv
Vectorization complete for: dataset_sentence_token_stopwords_lemmatize.csv, Shape: (26, 348)
Processing survey: dataset_subword_token_stopwords_lemmatize.csv
Vectorization complete for: dataset_subword_token_stopwords_lemmatize.csv, Shape: (26, 579)
Processing survey: dataset_tiktoken_token_stopwords_lemmatize.csv
Vectorization complete for: dataset_tiktoken_token_stopwords_lemmatize.csv, Shape: (26, 726)
Processing survey: dataset_whitespace_token_stopwords_lemmatize.csv
Vectorization complete for: dataset_whitespace_token_stopwords_lemmatize.csv, Shape: (26, 341)
Processing survey: dataset_word_token_stopwords_lemmatize.csv
Vectorization complete for: dataset_word_token_stopwords_lemmatize.csv, Shape: (26, 339)


In [14]:
# Save vectorized data for each survey as a compressed .npz next to the CSV
for survey_name, feature_matrix in vectorized_surveys.items():
    if feature_matrix is None:
        continue  # processing failed for this survey; nothing to write

    # Swap only the trailing extension; str.replace would also rewrite a
    # ".csv" that appears earlier in the path.
    output_file = os.path.splitext(survey_name)[0] + "_vectorized.npz"

    # A scipy sparse matrix passed to np.savez_compressed is stored as a
    # pickled 0-d object array, which np.load refuses to read unless
    # allow_pickle=True. Store a plain dense array under the same key instead.
    if hasattr(feature_matrix, "toarray"):
        dense_matrix = feature_matrix.toarray()
    else:
        dense_matrix = np.asarray(feature_matrix)

    np.savez_compressed(output_file, feature_matrix=dense_matrix)
    print(f"Vectorized data saved: {output_file}")

Vectorized data saved: dataset_bert_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: dataset_sentence_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: dataset_subword_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: dataset_tiktoken_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: dataset_whitespace_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: dataset_word_token_stopwords_lemmatize_vectorized.npz
