# Lemmatization   


In [19]:
# I had an issue with NLTK,
# so I opted for spaCy instead
import subprocess
import sys

import pandas as pd
import spacy

In [20]:
# Function to download the spaCy model if not already installed
def download_spacy_model(model_name="en_core_web_sm"):
    """
    Make sure a spaCy model package can be loaded, downloading it if not.

    The model is probed with ``spacy.load``; an ``OSError`` from that call is
    treated as "model not installed" and triggers ``python -m spacy download``.

    :param model_name: str, name of the spaCy model package to check/download
    :return: None
    :raises subprocess.CalledProcessError: if the download command exits with
        a non-zero status (``check=True``)
    """
    try:
        spacy.load(model_name)
    except OSError:
        print(f"Downloading '{model_name}' model...")
        # Run the download with the interpreter that is executing this
        # notebook. A bare "python" is resolved through PATH and may point at
        # a different environment than the kernel's.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", model_name],
            check=True,
        )
        print("Download complete!")

# Ensure the spaCy model is available, then load it once.
# `nlp` is the shared pipeline object that lemmatize_text() calls for every cell value.
download_spacy_model()
nlp = spacy.load("en_core_web_sm")

In [21]:
def lemmatize_text(text):
    """
    Function to lemmatize text using spaCy.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``NaN`` or
    ``None``) yield an empty string. Any other non-string scalar is converted
    with ``str()`` before it is handed to the pipeline.

    :param text: str (or other scalar), input text
    :return: str, the lemma of every token joined by single spaces
    """
    if pd.isna(text):  # Handle NaN values
        return ""

    # Coerce numbers and other scalars so the pipeline always receives a str.
    if not isinstance(text, str):
        text = str(text)

    doc = nlp(text)  # Process the text using spaCy
    return " ".join(token.lemma_ for token in doc)  # Apply lemmatization

In [22]:
# Input CSV files to process (one per tokenization variant, judging by the file names).
# They are only listed here; reading happens in the processing loop.
datasets = [
    "dataset_word_token_stopwords.csv",
    "dataset_subword_token_stopwords.csv",
    "dataset_sentence_token_stopwords.csv",
    "dataset_bert_token_stopwords.csv",
    "dataset_tiktoken_token_stopwords.csv",
    "dataset_whitespace_token_stopwords.csv",
]

# Text columns to lemmatize in every dataset.
# NOTE(review): "api_usage_pre" and "ml_application_pre" were each listed twice;
# the repeats are removed so no column is lemmatized twice. If the duplicates
# were a copy-paste slip for other "_pre" columns, add those names here.
text_columns = [
    "tableau_usage_pre",
    "api_usage_pre",
    "ml_application_pre",
    "persona_explanation_pre",
    "data_collection_explanation_post",
    "data_analysis_explanation_post",
    "persona_building_explanation_post",
    "evaluation_explanation_post",
    "tools_usage_post",
    "api_usage_post",
    "ml_application_post",
]

In [23]:
# Process each dataset: read the CSV, lemmatize the configured text columns,
# and write the result next to the input with a "_lemmatize" suffix.
for dataset in datasets:
    try:
        file_path = dataset  # Assign dataset name directly

        # Load dataset
        df = pd.read_csv(file_path)
        print(f"Dataset {dataset} loaded successfully!")

        # Apply lemmatization to each column in the list.
        # dict.fromkeys() drops repeated names (keeping order) so a column
        # listed twice is not lemmatized twice.
        for col in dict.fromkeys(text_columns):
            if col in df.columns:  # Ensure column exists before processing
                # Convert per value, not with astype(str) on the whole column:
                # astype(str) turns NaN into the literal "nan", which then
                # bypasses lemmatize_text's missing-value check.
                df[col] = df[col].apply(
                    lambda value: lemmatize_text(
                        value if pd.isna(value) else str(value)
                    )
                )

        # Derive the output name from the input name
        output_file = file_path.replace(".csv", "_lemmatize.csv")

        # Save the processed dataset (replaces any previous output file)
        df.to_csv(output_file, index=False)
        print(f"Processed dataset saved as {output_file}")

    except Exception as e:
        # Best-effort: report the failure and continue with the next dataset.
        # The try block covers loading, lemmatizing and saving, so the message
        # says "processing" rather than "loading".
        print(f"Error processing dataset {dataset}: {e}")

Dataset dataset_word_token_stopwords.csv loaded successfully!
Processed dataset saved as dataset_word_token_stopwords_lemmatize.csv
Dataset dataset_subword_token_stopwords.csv loaded successfully!
Processed dataset saved as dataset_subword_token_stopwords_lemmatize.csv
Dataset dataset_sentence_token_stopwords.csv loaded successfully!
Processed dataset saved as dataset_sentence_token_stopwords_lemmatize.csv
Dataset dataset_bert_token_stopwords.csv loaded successfully!
Processed dataset saved as dataset_bert_token_stopwords_lemmatize.csv
Dataset dataset_tiktoken_token_stopwords.csv loaded successfully!
Processed dataset saved as dataset_tiktoken_token_stopwords_lemmatize.csv
Dataset dataset_whitespace_token_stopwords.csv loaded successfully!
Processed dataset saved as dataset_whitespace_token_stopwords_lemmatize.csv
