# BERT Tokenization


In [28]:
# pip install transformers
import spacy
import subprocess
import pandas as pd
from transformers import BertTokenizer

In [29]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def bert_tokenize(text):
    """
    Tokenize text using BERT's WordPiece tokenizer.
    :param text: str, input text
    :return: list, tokenized subwords
    """
    return tokenizer.tokenize(text)

In [30]:
# Load dataset
file_path = "all_numeric_survey.csv" 
text_columns = ["tableau_usage_pre", "api_usage_pre", "ml_application_pre",
                    "persona_explanation_pre", "api_usage_pre", "ml_application_pre",
                    "data_collection_explanation_post", "data_analysis_explanation_post", "persona_building_explanation_post",
                    "evaluation_explanation_post", "tools_usage_post", "api_usage_post",
                    "ml_application_post"]

In [31]:
try:
    df = pd.read_csv(file_path)  # Load CSV file
    print("Dataset loaded successfully!")

    # Apply lemmatization to each text column
    for col in text_columns:
        if col in df.columns:  # Ensure column exists before processing
            df[col] = df[col].astype(str).apply(bert_tokenize)

    # Save the processed dataset
    output_file = "dataset_bert_token.csv"
    df.to_csv(output_file, index=False)
    print(f"Processed dataset saved as {output_file}")

except Exception as e:
    print(f"Error loading dataset: {e}")

Dataset loaded successfully!
Processed dataset saved as dataset_bert_token.csv


In [32]:
# Display the cleaned dataset
# Ensure all columns are displayed
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,start_date_pre,end_date_pre,ip_address_pre,duration_sec_pre,response_id_pre,LocationLatitude_pre,LocationLongitude_pre,DistributionChannel_pre,UserLanguage_pre,participant_id,age_pre,gender_pre,occupation_pre,teaching_marketing_pre,teaching_experience_pre,learning_style_pre,learning_format_pre,interaction_preference_pre,trusted_learning_method_pre,ai_familiarity_pre,ddp_familiarity_pre,data_sources_pre,persona_definition_pre,interactive_persona_pre,data_driven_persona_pre,dynamic_persona_pre,tableau_usage_pre,api_usage_pre,ml_application_pre,persona_explanation_pre,confirmation_pre,start_date_post,end_date_post,ip_address_post,duration_sec_post,response_id_post,LocationLatitude_post,LocationLongitude_post,DistributionChannel_post,UserLanguage_post,data_collection_explanation_post,data_analysis_explanation_post,persona_building_explanation_post,evaluation_explanation_post,ai_familiarity_post,ddp_familiarity_post,data_sources_post,persona_definition_post,interactive_persona_post,data_driven_persona_post,dynamic_persona_post,tools_usage_post,api_usage_post,ml_application_post,engagement_experience_post,interaction_quality_post,communication_clarity_post,trustworthiness_post,emotional_response_post,naturalness_post,effectiveness_post,comfort_level_post,personalization_post,mental_effort_post
0,2/13/25 0:10,2/13/25 0:16,193.166.113.18,372,R_8GBoA0i1k6yogS1,63.1198,21.6798,0,0,1,60,1,1,1,20,3,34,34,234,12,10,4,1,2,2,1,"[i, do, not, know]","[[, ', i, ', ,, ', do, ', ,, ', not, ', ,, ', ...","[[, ', i, ', ,, ', do, ', ,, ', not, ', ,, ', ...","[i, do, not, know]",0,2025-02-13 0:56:11,2025-02-13 1:18:59,193.166.113.18,1368,R_8QL21bpDcdbdtJ4,63.1198,21.6798,0,0,"[to, collect, stud, ##ens, ', demographic, dat...","[first, the, data, must, be, cleaned, ., to, a...","[after, cluster, ##ing, and, combining, ,, the...","[the, evaluation, can, be, done, e, ., g, ., ,...",4,4,All of the above,0,1,0,0,"[to, communicate, the, persona, ##s, to, the, ...","[because, relevant, data, can, be, obtained, f...","[they, can, be, used, e, ., g, ., ,, in, clust...",5,1,7,2,2,1,6,6,1,4
1,2/13/25 0:09,2/13/25 0:16,193.166.113.32,409,R_8QG2UV0Zv0fKVB5,63.1198,21.6798,0,0,23,41,1,2,2,0,134,24,34,234,10,1,4,1,2,2,1,"[i, do, not, know, these, tools, ,, so, i, can...","[[, ', i, ', ,, ', have, ', ,, ', no, ', ,, ',...","[[, ', i, ', ,, ', have, ', ,, ', no, ', ,, ',...","[i, have, no, clue, .]",0,2025-02-13 0:56:02,2025-02-13 1:13:21,193.166.113.32,1039,R_8QrAaCxxXwbpxvL,63.1198,21.6798,0,0,"[user, analytics, :, google, analytics, ,, cr,...","[segment, ##ation, can, be, performed, so, tha...","[one, way, to, create, student, persona, ##s, ...","[1, ), i, have, not, been, teaching, marketing...",2,2,All of the above,0,1,0,0,"[to, illustrate, (, and, update, ), static, pe...","[i, have, no, clue, what, is, api, .]","[to, create, dynamic, persona, ##s, .]",4,2,4,4,2,4,4,2,2,6
2,2/13/25 0:09,2/13/25 0:17,193.166.113.31,462,R_8tHW5RDZd4yBkcx,63.1198,21.6798,0,0,22,45,1,2,1,3,13,34,34,234,11,10,13,1,5,2,5,"[i, don, ', t, know, .]","[[, ', i, ', ,, ', don, ', ,, "", ', "", ,, ', t...","[[, ', i, ', ,, ', don, ', ,, "", ', "", ,, ', t...","[i, don, ', t, know, .]",0,2025-02-13 10:06:34,2025-02-13 10:19:38,193.166.113.31,784,R_2kRKZFGiWHUQidw,63.1198,21.6798,0,0,"[i, need, quantitative, data, ,, which, can, b...","[statistical, analyses, are, required, ., thes...","[in, this, stage, ,, craft, ##ing, the, person...","[this, is, a, crucial, step, in, the, persona,...",3,4,All of the above,0,2,0,1,"[to, visual, ##ise, the, persona, .]","[i, don, ##t, know, what, api, stands, for, .]","[i, don, ', t, know, .]",3,4,6,6,1,4,5,3,4,5
3,2/13/25 0:11,2/13/25 0:17,193.166.113.21,355,R_8462bTis2cTNcii,63.1198,21.6798,0,0,19,39,1,2,1,1,1234,34,34,234,12,1,4,1,5,2,5,[nan],"[[, ', nan, ', ]]","[[, ', nan, ', ]]",[nan],0,2025-02-13 0:55:51,2025-02-13 1:11:01,193.166.113.21,909,R_2M3diy3SR17ual3,63.1198,21.6798,0,0,"[data, sources, :, attendance, ,, group, tasks...","[cluster, ##ing, ,, matrix, for, patterns, ,, ...","[use, the, data, and, build, representative, p...","[test, within, the, existing, network, to, see...",4,5,All of the above,0,1,0,0,"[data, analysis]","[to, collect, data]","[to, a, great, deal, ., can, help, process, la...",6,6,6,4,2,3,6,1,4,4
4,2/13/25 0:09,2/13/25 0:18,193.166.117.7,551,R_824J8ZB9ZGfOu8t,63.1198,21.6798,0,0,15,38,2,2,2,0,124,4,34,2,12,1,5,5,5,5,5,[-],"[[, ', -, ', ]]","[[, ', -, ', ]]",[-],0,2025-02-13 0:55:35,2025-02-13 1:19:16,193.166.117.7,1421,R_2KPeTVjwHtRdvU7,63.1198,21.6798,0,0,"[-, age, ,, jobs, ,, demographic, data, ,, how...","[-, cluster, ##ing, students, according, to, t...","[-, making, descriptive, profile, posters, for...","[-, persona, ##s, should, not, become, static,...",4,2,All of the above,0,1,0,0,"[to, make, posters, of, the, persona, so, that...","[for, analysis, ?, i, don, ', t, know, what, i...","[to, develop, persona, ##s, further, when, new...",4,2,3,3,1,4,3,6,1,6
