In [54]:
# pip install pandas numpy scipy scikit-learn spacy gensim nltk

In [55]:
# import libraries
import os
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import spacy

In [56]:
# Load the spaCy English model, downloading it only when it is not installed yet.
# spacy.load raises OSError when the package cannot be found; an unconditional
# download would reinstall the model (and ask for a kernel restart) on every run.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 6.0 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [66]:
# Survey exports to load, one CSV per tokenization variant
survey_files = [
    "01.1_df_word_token_stopwords_lemmatize.csv",
    "01.2_df_subword_token_stopwords_lemmatize.csv",
    "01.3_df_sentence_token_stopwords_lemmatize.csv",
    "01.4_df_bert_token_stopwords_lemmatize.csv",
    "01.5_df_tiktoken_token_stopwords_lemmatize.csv",
    "01.6_df_whitespace_token_stopwords_lemmatize.csv",
]

# Loaded DataFrames, keyed by the file name they were read from
dataframes = {}

# Read every listed file; missing or unreadable files are reported and skipped
for file in survey_files:
    if not os.path.exists(file):
        print(f"File not found: {file}")
        continue

    try:
        df = pd.read_csv(file)
        dataframes[file] = df
        print(f"Loaded: {file} (Rows: {df.shape[0]}, Columns: {df.shape[1]})")
    except Exception as e:
        print(f"Error loading {file}: {e}")

# Report how many of the requested files ended up in `dataframes`
print(f"\n Successfully loaded {len(dataframes)} out of {len(survey_files)} files.")

Loaded: 01.1_df_word_token_stopwords_lemmatize.csv (Rows: 26, Columns: 82)
Loaded: 01.2_df_subword_token_stopwords_lemmatize.csv (Rows: 26, Columns: 82)
Loaded: 01.3_df_sentence_token_stopwords_lemmatize.csv (Rows: 26, Columns: 82)
Loaded: 01.4_df_bert_token_stopwords_lemmatize.csv (Rows: 26, Columns: 82)
Loaded: 01.5_df_tiktoken_token_stopwords_lemmatize.csv (Rows: 26, Columns: 82)
Loaded: 01.6_df_whitespace_token_stopwords_lemmatize.csv (Rows: 26, Columns: 82)

 Successfully loaded 6 out of 6 files.


In [58]:
# The free-text survey columns are split into two lists based on answer length.
short_text_columns = [
    "api_usage_pre",
    "tableau_usage_pre",
    "ml_application_pre",
    "persona_explanation_pre",
    "tools_usage_post",
    "api_usage_post",
    "ml_application_post",
]
long_text_columns = [
    "data_collection_explanation_post",
    "data_analysis_explanation_post",
    "persona_building_explanation_post",
    "evaluation_explanation_post",
]

In [59]:
# Load the pre-trained "glove-wiki-gigaword-50" GloVe vectors (50-dimensional)
# through gensim's downloader API.
glove_model = api.load("glove-wiki-gigaword-50")

In [60]:
# Per-survey vectorization results, keyed by survey file name
vectorized_surveys = dict()

In [61]:
def vectorize_sentence_with_glove(sentence, model, vector_size=50):
    """
    Convert a piece of text into one averaged word-embedding vector.

    The text is split into tokens with the notebook-level spaCy pipeline
    ``nlp`` (tokenizer only), every token known to ``model`` is looked up, and
    the resulting word vectors are averaged element-wise. A token missing from
    ``model`` in its original casing is retried in lower case, because
    vocabularies such as ``glove-wiki-gigaword-50`` are uncased; tokens that
    are still unknown are ignored.

    Parameters
    ----------
    sentence : str
        Text to embed. Anything that is not a string (for example ``None`` or
        a pandas NaN) is treated as empty text.
    model : mapping-like
        Word-vector lookup that supports ``word in model`` and ``model[word]``.
    vector_size : int, default 50
        Length of the all-zero vector returned when no token is known to
        ``model``. It should equal the dimensionality of ``model`` so the
        results of several calls can be stacked.

    Returns
    -------
    numpy.ndarray
        Mean of the known tokens' vectors, or ``np.zeros(vector_size)`` when
        there are none.
    """
    if not isinstance(sentence, str):
        return np.zeros(vector_size)

    vectors = []
    # make_doc runs only the tokenizer; no other pipeline annotation is used here.
    for token in nlp.make_doc(sentence):
        word = token.text
        if word not in model:
            # Fall back to the lower-cased form before giving up on the token.
            word = word.lower()
            if word not in model:
                continue
        vectors.append(model[word])

    if not vectors:
        return np.zeros(vector_size)  # no known words: neutral zero vector
    return np.mean(vectors, axis=0)

In [62]:
def process_survey(file_name):
    """
    Build the combined feature matrix for one survey CSV.

    For every row, the short-answer columns are joined into one text and
    encoded with TF-IDF, and the long-answer columns are joined into one text
    and encoded as an averaged GloVe vector. The two encodings are stacked
    side by side into a single matrix.

    Returns the stacked matrix, or None (after printing the error) if any
    step raises.
    """
    try:
        survey_df = pd.read_csv(file_name)
        print(f"Processing survey: {file_name}")

        # Short answers: one space-joined text per row, missing values as ""
        short_text = survey_df[short_text_columns].fillna("").agg(" ".join, axis=1)
        short_response_vectors = TfidfVectorizer().fit_transform(short_text)

        # Long answers: one averaged 50-dimensional GloVe vector per row
        long_text = survey_df[long_text_columns].fillna("").agg(" ".join, axis=1)
        glove_rows = [
            vectorize_sentence_with_glove(text, glove_model, 50)
            for text in long_text
        ]
        glove_vectors = np.vstack(glove_rows)

        # TF-IDF columns first, GloVe columns after them
        final_feature_matrix = hstack([short_response_vectors, glove_vectors])
        print(f"Vectorization complete for: {file_name}, Shape: {final_feature_matrix.shape}")
        return final_feature_matrix

    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        return None


In [63]:
# Process each survey file
for survey_file in survey_files:
    vectorized_surveys[survey_file] = process_survey(survey_file)

Processing survey: 01.1_df_word_token_stopwords_lemmatize.csv
Vectorization complete for: 01.1_df_word_token_stopwords_lemmatize.csv, Shape: (26, 339)
Processing survey: 01.2_df_subword_token_stopwords_lemmatize.csv
Vectorization complete for: 01.2_df_subword_token_stopwords_lemmatize.csv, Shape: (26, 579)
Processing survey: 01.3_df_sentence_token_stopwords_lemmatize.csv
Vectorization complete for: 01.3_df_sentence_token_stopwords_lemmatize.csv, Shape: (26, 348)
Processing survey: 01.4_df_bert_token_stopwords_lemmatize.csv
Vectorization complete for: 01.4_df_bert_token_stopwords_lemmatize.csv, Shape: (26, 360)
Processing survey: 01.5_df_tiktoken_token_stopwords_lemmatize.csv
Vectorization complete for: 01.5_df_tiktoken_token_stopwords_lemmatize.csv, Shape: (26, 725)
Processing survey: 01.6_df_whitespace_token_stopwords_lemmatize.csv
Vectorization complete for: 01.6_df_whitespace_token_stopwords_lemmatize.csv, Shape: (26, 341)


In [64]:
# Save vectorized data for each survey
for survey_name, feature_matrix in vectorized_surveys.items():
    if feature_matrix is not None:
        output_file = survey_name.replace(".csv", "_vectorized.npz")
        np.savez_compressed(output_file, feature_matrix=feature_matrix)
        print(f"Vectorized data saved: {output_file}")

Vectorized data saved: 01.1_df_word_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: 01.2_df_subword_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: 01.3_df_sentence_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: 01.4_df_bert_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: 01.5_df_tiktoken_token_stopwords_lemmatize_vectorized.npz
Vectorized data saved: 01.6_df_whitespace_token_stopwords_lemmatize_vectorized.npz
