# Sentence Tokenization


In [8]:
# I had an issue with NLTK
# that is I opted for spacy
import spacy
import subprocess
import pandas as pd

In [9]:
# Function to download and load spaCy model
def download_spacy_model():
    try:
        spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading 'en_core_web_sm' model...")
        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
        print("Download complete!")

# Ensure spaCy model is available
download_spacy_model()

In [10]:
# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

In [11]:
def sentence_tokenization(text):
    """
    Function to tokenize text into sentences using spaCy.
    :param text: str, input text
    :return: list, tokenized sentences
    """
    doc = nlp(text)  # Process the text using spaCy
    return [sent.text for sent in doc.sents]  # Extract sentence tokens

In [12]:
# Load dataset
file_path = "all_numeric_survey_labels.csv" 
text_columns = ["tableau_usage_pre", "api_usage_pre", "ml_application_pre",
                    "persona_explanation_pre", "api_usage_pre", "ml_application_pre",
                    "data_collection_explanation_post", "data_analysis_explanation_post", "persona_building_explanation_post",
                    "evaluation_explanation_post", "tools_usage_post", "api_usage_post",
                    "ml_application_post"]

In [13]:
try:
    df = pd.read_csv(file_path)  # Load CSV file
    print("Dataset loaded successfully!")

    # Apply lemmatization to each text column
    for col in text_columns:
        if col in df.columns:  # Ensure column exists before processing
            df[col] = df[col].astype(str).apply(sentence_tokenization)

    # Save the processed dataset
    output_file = "01.3_df_sentence_token.csv"
    df.to_csv(output_file, index=False)
    print(f"Processed dataset saved as {output_file}")

except Exception as e:
    print(f"Error loading dataset: {e}")

Dataset loaded successfully!
Processed dataset saved as 01.3_df_sentence_token.csv


In [14]:
# Display the cleaned dataset
# Ensure all columns are displayed
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,start_date_pre,end_date_pre,ip_address_pre,duration_sec_pre,response_id_pre,LocationLatitude_pre,LocationLongitude_pre,DistributionChannel_pre,UserLanguage_pre,participant_id,age_pre,gender_pre,occupation_pre,teaching_marketing_pre,teaching_experience_pre,learning_style_pre,learning_format_pre,interaction_preference_pre,trusted_learning_method_pre,ai_familiarity_pre,ddp_familiarity_pre,data_sources_pre,persona_definition_pre,interactive_persona_pre,data_driven_persona_pre,dynamic_persona_pre,tableau_usage_pre,api_usage_pre,ml_application_pre,persona_explanation_pre,confirmation_pre,start_date_post,end_date_post,ip_address_post,duration_sec_post,response_id_post,LocationLatitude_post,LocationLongitude_post,DistributionChannel_post,UserLanguage_post,data_collection_explanation_post,data_analysis_explanation_post,persona_building_explanation_post,Unnamed: 43,evaluation_explanation_post,ai_familiarity_post,ddp_familiarity_post,data_sources_post,persona_definition_post,interactive_persona_post,data_driven_persona_post,dynamic_persona_post,tools_usage_post,api_usage_post,ml_application_post,engagement_experience_post,interaction_quality_post,communication_clarity_post,trustworthiness_post,emotional_response_post,naturalness_post,effectiveness_post,comfort_level_post,personalization_post,mental_effort_post,persona_definition_pre_grade,persona_definition_post_grade,persona_definition_grade_comparison,interactive_persona_pre_grade,interactive_persona_post_grade,interactive_persona_grade_comparison,data_driven_persona_pre_grade,data_driven_persona_post_grade,data_driven_persona_grade_comparison,dynamic_persona_pre_grade,dynamic_persona_post_grade,dynamic_persona_grade_comparison,data_collection_explanation_post_grade,data_analysis_explanation_post_grade,persona_building_explanation_post_grade,evaluation_explanation_post_grade,group_class
0,2/13/25 0:10,2/13/25 0:16,193.166.113.18,372,R_8GBoA0i1k6yogS1,63.1198,21.6798,0,0,1,60,1,1,1,20,3,34,34,234,12,10,4,1,2,2,1,[I do not know],[['I do not know']],[['I do not know']],[I do not know],0,2025-02-13 0:56:11,2025-02-13 1:18:59,193.166.113.18,1368,R_8QL21bpDcdbdtJ4,63.1198,21.6798,0,0,"[to collect studens' demographic data, data of...","[First the data must be cleaned., to achieve h...","[After clustering and combining, the personas ...",1,"[The evaluation can be done e.g., with traditi...",4,4,All of the above,0,1,0,0,"[To communicate the personas to the users., To...",[Because relevant data can be obtained from th...,"[They can be used e.g., in clustering and comb...",5,1,7,2,2,1,6,6,1,4,Correct,Not Correct,DECLINED,Correct,Correct,SAME,Correct,Not Correct,DECLINED,Correct,Not Correct,DECLINED,HIGH,LOW,HIGH,MEDIUM,Chatbot
1,2/13/25 0:09,2/13/25 0:16,193.166.113.32,409,R_8QG2UV0Zv0fKVB5,63.1198,21.6798,0,0,23,41,1,2,2,0,134,24,34,234,10,1,4,1,2,2,1,"[I do not know these tools, so I cannot answer.]",[['I have no clue.']],[['I have no clue.']],[I have no clue.],0,2025-02-13 0:56:02,2025-02-13 1:13:21,193.166.113.32,1039,R_8QrAaCxxXwbpxvL,63.1198,21.6798,0,0,"[User analytics: Google analytics, CRM, APIs e...",[Segmentation can be performed so that each pe...,[One way to create student personas it is to i...,2,[1) I have not been teaching marketing courses...,2,2,All of the above,0,1,0,0,[To illustrate (and update) static persona pro...,[I have no clue what is API.],[To create dynamic personas.],4,2,4,4,2,4,4,2,2,6,Correct,Not Correct,DECLINED,Correct,Correct,SAME,Correct,Not Correct,DECLINED,Correct,Not Correct,DECLINED,LOW,HIGH,LOW,HIGH,Chatbot
2,2/13/25 0:09,2/13/25 0:17,193.166.113.31,462,R_8tHW5RDZd4yBkcx,63.1198,21.6798,0,0,22,45,1,2,1,3,13,34,34,234,11,10,13,1,5,2,5,[I don't know.],"[[""I don't know what API stands for.""]]","[[""I don't know.""]]",[I don't know.],0,2025-02-13 10:06:34,2025-02-13 10:19:38,193.166.113.31,784,R_2kRKZFGiWHUQidw,63.1198,21.6798,0,0,"[I need quantitative data, which can be obtain...","[Statistical analyses are required., These inc...","[In this stage, crafting the persona profiles ...",3,[This is a crucial step in the persona buildin...,3,4,All of the above,0,2,0,1,[To visualise the persona.],[I dont know what API stands for.],[I don't know.],3,4,6,6,1,4,5,3,4,5,Correct,Not Correct,DECLINED,Not Correct,Not Correct,SAME,Correct,Not Correct,DECLINED,Not Correct,Correct,IMPROVED,MEDIUM,HIGH,HIGH,HIGH,Deepfake
3,2/13/25 0:11,2/13/25 0:17,193.166.113.21,355,R_8462bTis2cTNcii,63.1198,21.6798,0,0,19,39,1,2,1,1,1234,34,34,234,12,1,4,1,5,2,5,[nan],[['nan']],[['nan']],[nan],0,2025-02-13 0:55:51,2025-02-13 1:11:01,193.166.113.21,909,R_2M3diy3SR17ual3,63.1198,21.6798,0,0,"[Data sources: Attendance, group tasks evaluat...","[Clustering, matrix for patterns, AI or softwa...",[use the data and build representative persona...,4,[test within the existing network to see if th...,4,5,All of the above,0,1,0,0,[data analysis],[to collect data],"[To a great deal., can help process large amou...",6,6,6,4,2,3,6,1,4,4,Correct,Not Correct,DECLINED,Not Correct,Correct,IMPROVED,Correct,Not Correct,DECLINED,Not Correct,Not Correct,SAME,HIGH,MEDIUM,MEDIUM,MEDIUM,Chatbot
4,2/13/25 0:09,2/13/25 0:18,193.166.117.7,551,R_824J8ZB9ZGfOu8t,63.1198,21.6798,0,0,15,38,2,2,2,0,124,4,34,2,12,1,5,5,5,5,5,[-],[['-']],[['-']],[-],0,2025-02-13 0:55:35,2025-02-13 1:19:16,193.166.117.7,1421,R_2KPeTVjwHtRdvU7,63.1198,21.6798,0,0,"[-age, jobs, demographic data, how they behave...",[-clustering students according to their goals...,[-making descriptive profile posters for each ...,5,[-personas should not become static but they m...,4,2,All of the above,0,1,0,0,[To make posters of the persona so that everyo...,"[For analysis?, I don't know what is API.]",[To develop personas further when new informat...,4,2,3,3,1,4,3,6,1,6,Not Correct,Not Correct,SAME,Not Correct,Correct,IMPROVED,Not Correct,Not Correct,SAME,Not Correct,Not Correct,SAME,MEDIUM,MEDIUM,LOW,MEDIUM,Chatbot
