# Subword Tokenization


In [9]:
# import libraries

# pip install tokenizers

import spacy
import subprocess
import pandas as pd
from tokenizers import ByteLevelBPETokenizer

In [10]:
# Load dataset
file_path = "all_numeric_survey.csv" 
text_columns = ["tableau_usage_pre", "api_usage_pre", "ml_application_pre",
                    "persona_explanation_pre", "api_usage_pre", "ml_application_pre",
                    "data_collection_explanation_post", "data_analysis_explanation_post", "persona_building_explanation_post",
                    "evaluation_explanation_post", "tools_usage_post", "api_usage_post",
                    "ml_application_post"]

In [11]:
try:
    df = pd.read_csv(file_path)  # Load CSV file
    print("Dataset loaded successfully!")

    # Initialize and train the BPE tokenizer on dataset text
    tokenizer = ByteLevelBPETokenizer()
    
    # Combine all text from the selected columns for training
    all_text = df[text_columns].astype(str).values.flatten()
    tokenizer.train_from_iterator(all_text)

    # Function to apply BPE tokenization
    def tokenize_text(text):
        return " ".join(tokenizer.encode(text).tokens) if pd.notna(text) else ""

    # Apply BPE tokenization to each text column
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].astype(str).apply(tokenize_text)

    # Save the processed dataset
    output_file = "dataset_subword_token.csv"
    df.to_csv(output_file, index=False)
    print(f"Processed dataset saved as {output_file}")

except Exception as e:
    print(f"Error loading dataset: {e}")


Dataset loaded successfully!
Processed dataset saved as dataset_subword_token.csv


In [12]:
# Display the cleaned dataset
# Ensure all columns are displayed
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,start_date_pre,end_date_pre,ip_address_pre,duration_sec_pre,response_id_pre,LocationLatitude_pre,LocationLongitude_pre,DistributionChannel_pre,UserLanguage_pre,participant_id,age_pre,gender_pre,occupation_pre,teaching_marketing_pre,teaching_experience_pre,learning_style_pre,learning_format_pre,interaction_preference_pre,trusted_learning_method_pre,ai_familiarity_pre,ddp_familiarity_pre,data_sources_pre,persona_definition_pre,interactive_persona_pre,data_driven_persona_pre,dynamic_persona_pre,tableau_usage_pre,api_usage_pre,ml_application_pre,persona_explanation_pre,confirmation_pre,start_date_post,end_date_post,ip_address_post,duration_sec_post,response_id_post,LocationLatitude_post,LocationLongitude_post,DistributionChannel_post,UserLanguage_post,data_collection_explanation_post,data_analysis_explanation_post,persona_building_explanation_post,evaluation_explanation_post,ai_familiarity_post,ddp_familiarity_post,data_sources_post,persona_definition_post,interactive_persona_post,data_driven_persona_post,dynamic_persona_post,tools_usage_post,api_usage_post,ml_application_post,engagement_experience_post,interaction_quality_post,communication_clarity_post,trustworthiness_post,emotional_response_post,naturalness_post,effectiveness_post,comfort_level_post,personalization_post,mental_effort_post
0,2/13/25 0:10,2/13/25 0:16,193.166.113.18,372,R_8GBoA0i1k6yogS1,63.1198,21.6798,0,0,1,60,1,1,1,20,3,34,34,234,12,10,4,1,2,2,1,I Ġdo Ġnot Ġknow,I Ġ Ä ł d o Ġ Ä ł not Ġ Ä ł k now,I Ġ Ä ł d o Ġ Ä ł not Ġ Ä ł k now,I Ġdo Ġnot Ġknow,0,2025-02-13 0:56:11,2025-02-13 1:18:59,193.166.113.18,1368,R_8QL21bpDcdbdtJ4,63.1198,21.6798,0,0,"to Ġcollect Ġstud en s ' Ġdemographic Ġdata , ...",First Ġthe Ġdata Ġmust Ġbe Ġcle an ed . Ġto Ġa...,"After Ġclustering Ġand Ġcombining , Ġthe Ġpers...","The Ġevaluation Ġcan Ġbe Ġdone Ġe . g ., Ġwith...",4,4,All of the above,0,1,0,0,To Ġcommunicate Ġthe Ġpersonas Ġto Ġthe Ġusers...,B ec ause Ġrelevant Ġdata Ġcan Ġbe Ġobtained Ġ...,"They Ġcan Ġbe Ġused Ġe . g ., Ġin Ġclustering ...",5,1,7,2,2,1,6,6,1,4
1,2/13/25 0:09,2/13/25 0:16,193.166.113.32,409,R_8QG2UV0Zv0fKVB5,63.1198,21.6798,0,0,23,41,1,2,2,0,134,24,34,234,10,1,4,1,2,2,1,"I Ġdo Ġnot Ġknow Ġthese Ġtools , Ġso ĠI Ġcanno...",I Ġ Ä ł ha ve Ġ Ä ł no Ġ Ä ł c l ue Ġ . Ġ Ä ł,I Ġ Ä ł ha ve Ġ Ä ł no Ġ Ä ł c l ue Ġ .,I Ġhave Ġno Ġclue .,0,2025-02-13 0:56:02,2025-02-13 1:13:21,193.166.113.32,1039,R_8QrAaCxxXwbpxvL,63.1198,21.6798,0,0,"User Ġanalytics : ĠGoogle Ġanalytics , ĠCRM , ...",Segment ation Ġcan Ġbe Ġperform ed Ġso Ġthat Ġ...,One Ġway Ġto Ġcreate Ġstudent Ġpersonas Ġit Ġi...,1 ) ĠI Ġhave Ġnot Ġbeen Ġteaching Ġmarketing Ġ...,2,2,All of the above,0,1,0,0,To Ġillustr ate Ġ( and Ġupdate ) Ġstatic Ġpers...,I Ġhave Ġno Ġclue Ġwhat Ġis ĠAPI .,To Ġcreate Ġdynamic Ġpersonas . Ġ,4,2,4,4,2,4,4,2,2,6
2,2/13/25 0:09,2/13/25 0:17,193.166.113.31,462,R_8tHW5RDZd4yBkcx,63.1198,21.6798,0,0,22,45,1,2,1,3,13,34,34,234,11,10,13,1,5,2,5,I Ġdon 't Ġknow .,I Ġ Ä ł don Ġ ' t Ġ Ä ł k now Ġ Ä ł w hat Ġ Ä ...,I Ġ Ä ł don Ġ ' t Ġ Ä ł k now Ġ .,I Ġdon 't Ġknow .,0,2025-02-13 10:06:34,2025-02-13 10:19:38,193.166.113.31,784,R_2kRKZFGiWHUQidw,63.1198,21.6798,0,0,"I Ġneed Ġquantitative Ġdata , Ġwhich Ġcan Ġbe ...",S t at istical Ġanalyses Ġare Ġre qu ire d . Ġ...,"In Ġthis Ġstage , Ġcr af ting Ġthe Ġpersona Ġp...",This Ġis Ġa Ġcr u cial Ġstep Ġin Ġthe Ġpersona...,3,4,All of the above,0,2,0,1,To Ġvisual ise Ġthe Ġpersona .,I Ġdon t Ġknow Ġwhat ĠAPI Ġstands Ġfor .,I Ġdon 't Ġknow .,3,4,6,6,1,4,5,3,4,5
3,2/13/25 0:11,2/13/25 0:17,193.166.113.21,355,R_8462bTis2cTNcii,63.1198,21.6798,0,0,19,39,1,2,1,1,1234,34,34,234,12,1,4,1,5,2,5,,,,,0,2025-02-13 0:55:51,2025-02-13 1:11:01,193.166.113.21,909,R_2M3diy3SR17ual3,63.1198,21.6798,0,0,"Data Ġsources : ĠA t t end ance , Ġgroup Ġtask...","C lustering , Ġmat rix Ġfor Ġpatterns , ĠAI Ġo...",use Ġthe Ġdata Ġand Ġbuild Ġrepresentative Ġpe...,test Ġwith in Ġthe Ġexisting Ġn et wor k Ġto Ġ...,4,5,All of the above,0,1,0,0,data Ġanalysis Ġ,to Ġcollect Ġdata Ġ,To Ġa Ġgre at Ġde al . Ġcan Ġhelp Ġprocess Ġla...,6,6,6,4,2,3,6,1,4,4
4,2/13/25 0:09,2/13/25 0:18,193.166.117.7,551,R_824J8ZB9ZGfOu8t,63.1198,21.6798,0,0,15,38,2,2,2,0,124,4,34,2,12,1,5,5,5,5,5,-,-,-,-,0,2025-02-13 0:55:35,2025-02-13 1:19:16,193.166.117.7,1421,R_2KPeTVjwHtRdvU7,63.1198,21.6798,0,0,"- age , Ġjobs , Ġdemographic Ġdata , Ġhow Ġthe...",- c lustering Ġstudents Ġaccording Ġto Ġtheir ...,- ma king Ġdescriptive Ġprofile Ġposters Ġfor ...,- p ersonas Ġshould Ġnot Ġbecome Ġstatic Ġbut ...,4,2,All of the above,0,1,0,0,To Ġmake Ġposters Ġof Ġthe Ġpersona Ġso Ġthat ...,For Ġanalysis ? ĠI Ġdon 't Ġknow Ġwhat Ġis ĠAPI .,To Ġdevelop Ġpersonas Ġfurther Ġwhen Ġnew Ġinf...,4,2,3,3,1,4,3,6,1,6
