# Tiktoken (OpenAI, GPT)


In [16]:
# pip install tiktoken
import pandas as pd
import tiktoken

In [17]:
# Load the encoding used in GPT-4/GPT-3.5
enc = tiktoken.get_encoding("cl100k_base")  # OpenAI's standard tokenizer

def tiktoken_tokenize(text):
    """
    Tokenize text using OpenAI's Tiktoken.
    :param text: str, input text
    :return: list, tokenized tokens
    """
    return enc.encode(text)  # Returns token IDs (integers)

In [18]:
# Load dataset
file_path = "all_numeric_survey.csv" 
text_columns = ["tableau_usage_pre", "api_usage_pre", "ml_application_pre",
                    "persona_explanation_pre", "api_usage_pre", "ml_application_pre",
                    "data_collection_explanation_post", "data_analysis_explanation_post", "persona_building_explanation_post",
                    "evaluation_explanation_post", "tools_usage_post", "api_usage_post",
                    "ml_application_post"]

In [19]:
try:
    df = pd.read_csv(file_path)  # Load CSV file
    print("Dataset loaded successfully!")

    # Apply lemmatization to each text column
    for col in text_columns:
        if col in df.columns:  # Ensure column exists before processing
            df[col] = df[col].astype(str).apply(tiktoken_tokenize)

    # Save the processed dataset
    output_file = "dataset_tiktoken_token.csv"
    df.to_csv(output_file, index=False)
    print(f"Processed dataset saved as {output_file}")

except Exception as e:
    print(f"Error loading dataset: {e}")

Dataset loaded successfully!
Processed dataset saved as dataset_tiktoken_token.csv


In [20]:
# Display the cleaned dataset
# Ensure all columns are displayed
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,start_date_pre,end_date_pre,ip_address_pre,duration_sec_pre,response_id_pre,LocationLatitude_pre,LocationLongitude_pre,DistributionChannel_pre,UserLanguage_pre,participant_id,age_pre,gender_pre,occupation_pre,teaching_marketing_pre,teaching_experience_pre,learning_style_pre,learning_format_pre,interaction_preference_pre,trusted_learning_method_pre,ai_familiarity_pre,ddp_familiarity_pre,data_sources_pre,persona_definition_pre,interactive_persona_pre,data_driven_persona_pre,dynamic_persona_pre,tableau_usage_pre,api_usage_pre,ml_application_pre,persona_explanation_pre,confirmation_pre,start_date_post,end_date_post,ip_address_post,duration_sec_post,response_id_post,LocationLatitude_post,LocationLongitude_post,DistributionChannel_post,UserLanguage_post,data_collection_explanation_post,data_analysis_explanation_post,persona_building_explanation_post,evaluation_explanation_post,ai_familiarity_post,ddp_familiarity_post,data_sources_post,persona_definition_post,interactive_persona_post,data_driven_persona_post,dynamic_persona_post,tools_usage_post,api_usage_post,ml_application_post,engagement_experience_post,interaction_quality_post,communication_clarity_post,trustworthiness_post,emotional_response_post,naturalness_post,effectiveness_post,comfort_level_post,personalization_post,mental_effort_post
0,2/13/25 0:10,2/13/25 0:16,193.166.113.18,372,R_8GBoA0i1k6yogS1,63.1198,21.6798,0,0,1,60,1,1,1,20,3,34,34,234,12,10,4,1,2,2,1,"[40, 656, 539, 1440]","[58, 1272, 11, 220, 20744, 11, 220, 23033, 11,...","[58, 1272, 11, 220, 20744, 11, 220, 23033, 11,...","[40, 656, 539, 1440]",0,2025-02-13 0:56:11,2025-02-13 1:18:59,193.166.113.18,1368,R_8QL21bpDcdbdtJ4,63.1198,21.6798,0,0,"[998, 6667, 1707, 729, 6, 38462, 828, 11, 828,...","[5451, 279, 828, 2011, 387, 28822, 13, 311, 11...","[6153, 59454, 323, 35271, 11, 279, 32525, 2011...","[791, 16865, 649, 387, 2884, 384, 1326, 2637, ...",4,4,All of the above,0,1,0,0,"[1271, 19570, 279, 32525, 311, 279, 3932, 13, ...","[18433, 9959, 828, 649, 387, 12457, 505, 1124,...","[7009, 649, 387, 1511, 384, 1326, 2637, 304, 5...",5,1,7,2,2,1,6,6,1,4
1,2/13/25 0:09,2/13/25 0:16,193.166.113.32,409,R_8QG2UV0Zv0fKVB5,63.1198,21.6798,0,0,23,41,1,2,2,0,134,24,34,234,10,1,4,1,2,2,1,"[40, 656, 539, 1440, 1521, 7526, 11, 779, 358,...","[58, 1272, 11, 220, 21717, 11, 220, 22750, 11,...","[58, 1272, 11, 220, 21717, 11, 220, 22750, 11,...","[40, 617, 912, 31089, 13]",0,2025-02-13 0:56:02,2025-02-13 1:13:21,193.166.113.32,1039,R_8QrAaCxxXwbpxvL,63.1198,21.6798,0,0,"[1502, 28975, 25, 5195, 28975, 11, 41441, 11, ...","[21766, 367, 649, 387, 10887, 779, 430, 1855, ...","[4054, 1648, 311, 1893, 5575, 32525, 433, 374,...","[16, 8, 358, 617, 539, 1027, 12917, 8661, 1430...",2,2,All of the above,0,1,0,0,"[1271, 41468, 320, 438, 2713, 8, 1118, 29055, ...","[40, 617, 912, 31089, 1148, 374, 5446, 13]","[1271, 1893, 8915, 32525, 13, 220]",4,2,4,4,2,4,4,2,2,6
2,2/13/25 0:09,2/13/25 0:17,193.166.113.31,462,R_8tHW5RDZd4yBkcx,63.1198,21.6798,0,0,22,45,1,2,1,3,13,34,34,234,11,10,13,1,5,2,5,"[40, 1541, 956, 1440, 13]","[58, 1272, 11, 220, 10559, 16, 11, 220, 26067,...","[58, 1272, 11, 220, 10559, 16, 11, 220, 26067,...","[40, 1541, 956, 1440, 13]",0,2025-02-13 10:06:34,2025-02-13 10:19:38,193.166.113.31,784,R_2kRKZFGiWHUQidw,63.1198,21.6798,0,0,"[40, 1205, 47616, 828, 11, 902, 649, 387, 1245...","[16257, 40596, 29060, 527, 2631, 13, 4314, 299...","[644, 420, 6566, 11, 45167, 279, 29055, 21542,...","[2028, 374, 264, 16996, 3094, 304, 279, 29055,...",3,4,All of the above,0,2,0,1,"[1271, 9302, 1082, 279, 29055, 13]","[40, 15890, 1440, 1148, 5446, 13656, 369, 13]","[40, 1541, 956, 1440, 13]",3,4,6,6,1,4,5,3,4,5
3,2/13/25 0:11,2/13/25 0:17,193.166.113.21,355,R_8462bTis2cTNcii,63.1198,21.6798,0,0,19,39,1,2,1,1,1234,34,34,234,12,1,4,1,5,2,5,[19285],"[58, 5926, 5313, 60]","[58, 5926, 5313, 60]",[19285],0,2025-02-13 0:55:51,2025-02-13 1:11:01,193.166.113.21,909,R_2M3diy3SR17ual3,63.1198,21.6798,0,0,"[1061, 8336, 25, 80057, 11, 1912, 9256, 16865,...","[5176, 37794, 11, 6303, 369, 12912, 11, 15592,...","[817, 279, 828, 323, 1977, 18740, 32525, 369, ...","[1985, 2949, 279, 6484, 4009, 311, 1518, 422, ...",4,5,All of the above,0,1,0,0,"[695, 6492, 220]","[998, 6667, 828, 220]","[1271, 264, 2294, 3568, 13, 649, 1520, 1920, 3...",6,6,6,4,2,3,6,1,4,4
4,2/13/25 0:09,2/13/25 0:18,193.166.117.7,551,R_824J8ZB9ZGfOu8t,63.1198,21.6798,0,0,15,38,2,2,2,0,124,4,34,2,12,1,5,5,5,5,5,[12],"[58, 717, 60]","[58, 717, 60]",[12],0,2025-02-13 0:55:35,2025-02-13 1:19:16,193.166.117.7,1421,R_2KPeTVjwHtRdvU7,63.1198,21.6798,0,0,"[44041, 11, 7032, 11, 38462, 828, 11, 1268, 81...","[31717, 37794, 4236, 4184, 311, 872, 9021, 11,...","[28846, 53944, 5643, 39568, 369, 1855, 29055, ...","[29145, 300, 1288, 539, 3719, 1118, 719, 814, ...",4,2,All of the above,0,1,0,0,"[1271, 1304, 39568, 315, 279, 29055, 779, 430,...","[2520, 6492, 30, 358, 1541, 956, 1440, 1148, 3...","[1271, 2274, 32525, 4726, 994, 502, 2038, 9221...",4,2,3,3,1,4,3,6,1,6
