# Lemmatization   


In [1]:
# I had an issue with NLTK
# that is I opted for spacy
import spacy
import subprocess
import pandas as pd

In [2]:
# Function to download the spaCy model if not already installed
def download_spacy_model():
    try:
        spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading 'en_core_web_sm' model...")
        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
        print("Download complete!")

# Ensure the spaCy model is available
download_spacy_model()

In [3]:
def lemmatize_text(text):
    """
    Function to lemmatize text using spaCy.
    :param text: str, input text
    :return: str, lemmatized text
    """
    if pd.isna(text):  # Handle NaN values
        return ""

    doc = nlp(text)  # Process the text using spaCy
    return " ".join([token.lemma_ for token in doc])  # Apply lemmatization

In [4]:
# Load datasets
datasets = [
    "dataset_word_token.csv",
    "dataset_subword_token.csv",
    "dataset_sentence_token.csv",
    "dataset_bert_token.csv",
    "dataset_tiktoken_token.csv",
    "dataset_whitespace_token.csv"
] 

text_columns = ["tableau_usage_pre", "api_usage_pre", "ml_application_pre",
                    "persona_explanation_pre", "api_usage_pre", "ml_application_pre",
                    "data_collection_explanation_post", "data_analysis_explanation_post", "persona_building_explanation_post",
                    "evaluation_explanation_post", "tools_usage_post", "api_usage_post",
                    "ml_application_post"]

In [5]:
# Process each dataset
for dataset in datasets:
    try:
        file_path = dataset  # Assign dataset name directly

        # Load dataset
        df = pd.read_csv(file_path)
        print(f"Dataset {dataset} loaded successfully!")

        # Apply lemmatization to each column in the list
        for col in text_columns:
            if col in df.columns:  # Ensure column exists before processing
                df[col] = df[col].astype(str).apply(lemmatize_text)

        # Generate a unique name for the processed dataset
        output_file = file_path.replace(".csv", "_lemmatize.csv")

        # Save the processed dataset (overwrite old data)
        df.to_csv(output_file, index=False)
        print(f"Processed dataset saved as {output_file}")

    except Exception as e:
        print(f"Error loading dataset {dataset}: {e}")

Dataset dataset_word_token.csv loaded successfully!
Error loading dataset dataset_word_token.csv: name 'nlp' is not defined
Dataset dataset_subword_token.csv loaded successfully!
Error loading dataset dataset_subword_token.csv: name 'nlp' is not defined
Dataset dataset_sentence_token.csv loaded successfully!
Error loading dataset dataset_sentence_token.csv: name 'nlp' is not defined
Dataset dataset_bert_token.csv loaded successfully!
Error loading dataset dataset_bert_token.csv: name 'nlp' is not defined
Dataset dataset_tiktoken_token.csv loaded successfully!
Error loading dataset dataset_tiktoken_token.csv: name 'nlp' is not defined
Dataset dataset_whitespace_token.csv loaded successfully!
Error loading dataset dataset_whitespace_token.csv: name 'nlp' is not defined


In [6]:
# Display the cleaned dataset
# Ensure all columns are displayed
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,start_date_pre,end_date_pre,ip_address_pre,duration_sec_pre,response_id_pre,LocationLatitude_pre,LocationLongitude_pre,DistributionChannel_pre,UserLanguage_pre,participant_id,age_pre,gender_pre,occupation_pre,teaching_marketing_pre,teaching_experience_pre,learning_style_pre,learning_format_pre,interaction_preference_pre,trusted_learning_method_pre,ai_familiarity_pre,ddp_familiarity_pre,data_sources_pre,persona_definition_pre,interactive_persona_pre,data_driven_persona_pre,dynamic_persona_pre,tableau_usage_pre,api_usage_pre,ml_application_pre,persona_explanation_pre,confirmation_pre,start_date_post,end_date_post,ip_address_post,duration_sec_post,response_id_post,LocationLatitude_post,LocationLongitude_post,DistributionChannel_post,UserLanguage_post,data_collection_explanation_post,data_analysis_explanation_post,persona_building_explanation_post,evaluation_explanation_post,ai_familiarity_post,ddp_familiarity_post,data_sources_post,persona_definition_post,interactive_persona_post,data_driven_persona_post,dynamic_persona_post,tools_usage_post,api_usage_post,ml_application_post,engagement_experience_post,interaction_quality_post,communication_clarity_post,trustworthiness_post,emotional_response_post,naturalness_post,effectiveness_post,comfort_level_post,personalization_post,mental_effort_post
0,2/13/25 0:10,2/13/25 0:16,193.166.113.18,372,R_8GBoA0i1k6yogS1,63.1198,21.6798,0,0,1,60,1,1,1,20,3,34,34,234,12,10,4,1,2,2,1,"['I', 'do', 'not', 'know']","[""['I',"", ""'do',"", ""'not',"", ""'know']""]","[""['I',"", ""'do',"", ""'not',"", ""'know']""]","['I', 'do', 'not', 'know']",0,2025-02-13 0:56:11,2025-02-13 1:18:59,193.166.113.18,1368,R_8QL21bpDcdbdtJ4,63.1198,21.6798,0,0,"['to', 'collect', ""studens'"", 'demographic', '...","['First', 'the', 'data', 'must', 'be', 'cleane...","['After', 'clustering', 'and', 'combining,', '...","['The', 'evaluation', 'can', 'be', 'done', 'e....",4,4,All of the above,0,1,0,0,"['To', 'communicate', 'the', 'personas', 'to',...","['Because', 'relevant', 'data', 'can', 'be', '...","['They', 'can', 'be', 'used', 'e.g.,', 'in', '...",5,1,7,2,2,1,6,6,1,4
1,2/13/25 0:09,2/13/25 0:16,193.166.113.32,409,R_8QG2UV0Zv0fKVB5,63.1198,21.6798,0,0,23,41,1,2,2,0,134,24,34,234,10,1,4,1,2,2,1,"['I', 'do', 'not', 'know', 'these', 'tools,', ...","[""['I',"", ""'have',"", ""'no',"", ""'clue.']""]","[""['I',"", ""'have',"", ""'no',"", ""'clue.']""]","['I', 'have', 'no', 'clue.']",0,2025-02-13 0:56:02,2025-02-13 1:13:21,193.166.113.32,1039,R_8QrAaCxxXwbpxvL,63.1198,21.6798,0,0,"['User', 'analytics:', 'Google', 'analytics,',...","['Segmentation', 'can', 'be', 'performed', 'so...","['One', 'way', 'to', 'create', 'student', 'per...","['1)', 'I', 'have', 'not', 'been', 'teaching',...",2,2,All of the above,0,1,0,0,"['To', 'illustrate', '(and', 'update)', 'stati...","['I', 'have', 'no', 'clue', 'what', 'is', 'API.']","['To', 'create', 'dynamic', 'personas.']",4,2,4,4,2,4,4,2,2,6
2,2/13/25 0:09,2/13/25 0:17,193.166.113.31,462,R_8tHW5RDZd4yBkcx,63.1198,21.6798,0,0,22,45,1,2,1,3,13,34,34,234,11,10,13,1,5,2,5,"['I', ""don't"", 'know.']","[""['I',"", '""don\'t"",', ""'know',"", ""'what',"", ""...","[""['I',"", '""don\'t"",', ""'know.']""]","['I', ""don't"", 'know.']",0,2025-02-13 10:06:34,2025-02-13 10:19:38,193.166.113.31,784,R_2kRKZFGiWHUQidw,63.1198,21.6798,0,0,"['I', 'need', 'quantitative', 'data,', 'which'...","['Statistical', 'analyses', 'are', 'required.'...","['In', 'this', 'stage,', 'crafting', 'the', 'p...","['This', 'is', 'a', 'crucial', 'step', 'in', '...",3,4,All of the above,0,2,0,1,"['To', 'visualise', 'the', 'persona.']","['I', 'dont', 'know', 'what', 'API', 'stands',...","['I', ""don't"", 'know.']",3,4,6,6,1,4,5,3,4,5
3,2/13/25 0:11,2/13/25 0:17,193.166.113.21,355,R_8462bTis2cTNcii,63.1198,21.6798,0,0,19,39,1,2,1,1,1234,34,34,234,12,1,4,1,5,2,5,['nan'],"[""['nan']""]","[""['nan']""]",['nan'],0,2025-02-13 0:55:51,2025-02-13 1:11:01,193.166.113.21,909,R_2M3diy3SR17ual3,63.1198,21.6798,0,0,"['Data', 'sources:', 'Attendance,', 'group', '...","['Clustering,', 'matrix', 'for', 'patterns,', ...","['use', 'the', 'data', 'and', 'build', 'repres...","['test', 'within', 'the', 'existing', 'network...",4,5,All of the above,0,1,0,0,"['data', 'analysis']","['to', 'collect', 'data']","['To', 'a', 'great', 'deal.', 'can', 'help', '...",6,6,6,4,2,3,6,1,4,4
4,2/13/25 0:09,2/13/25 0:18,193.166.117.7,551,R_824J8ZB9ZGfOu8t,63.1198,21.6798,0,0,15,38,2,2,2,0,124,4,34,2,12,1,5,5,5,5,5,['-'],"[""['-']""]","[""['-']""]",['-'],0,2025-02-13 0:55:35,2025-02-13 1:19:16,193.166.117.7,1421,R_2KPeTVjwHtRdvU7,63.1198,21.6798,0,0,"['-age,', 'jobs,', 'demographic', 'data,', 'ho...","['-clustering', 'students', 'according', 'to',...","['-making', 'descriptive', 'profile', 'posters...","['-personas', 'should', 'not', 'become', 'stat...",4,2,All of the above,0,1,0,0,"['To', 'make', 'posters', 'of', 'the', 'person...","['For', 'analysis?', 'I', ""don't"", 'know', 'wh...","['To', 'develop', 'personas', 'further', 'when...",4,2,3,3,1,4,3,6,1,6
