In [1]:
import os
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import load_npz

In [2]:
# Vectorized feature matrices (.npz), one file per tokenization strategy.
# Names follow "01.<n>_df_<tokenizer>_token_stopwords_lemmatize_vectorized.npz",
# numbered from 1 in the order the tokenizers are listed below.
survey_files = [
    f"01.{position}_df_{tokenizer}_token_stopwords_lemmatize_vectorized.npz"
    for position, tokenizer in enumerate(
        ["word", "subword", "sentence", "bert", "tiktoken", "whitespace"],
        start=1,
    )
]

In [3]:
# Label CSVs, listed in the same order as the vectorized .npz files so the two
# lists can be paired element by element.
# Names follow "01.<n>_df_<tokenizer>_token_stopwords_lemmatize.csv".
label_files = list(
    map(
        "01.{0[0]}_df_{0[1]}_token_stopwords_lemmatize.csv".format,
        enumerate(
            ("word", "subword", "sentence", "bert", "tiktoken", "whitespace"),
            start=1,
        ),
    )
)

In [4]:
# Label columns grouped by classification task. Each key is a task category and
# each value is the list of CSV columns that are modelled for that task.
#
# NOTE(review): "binary_correctness" and "comparison_labels" currently list the
# same eight pre/post grade columns, so any loop over this dict handles those
# columns twice -- confirm whether the two tasks are meant to differ.
label_columns = {
    # Pre/post grade columns: every persona topic crossed with both survey phases.
    **{
        task: [
            f"{topic}_{phase}_grade"
            for topic in (
                "persona_definition",
                "interactive_persona",
                "data_driven_persona",
                "dynamic_persona",
            )
            for phase in ("pre", "post")
        ]
        for task in ("binary_correctness", "comparison_labels")
    },
    # Post-survey grades of the free-text explanations, one per process step.
    "ai_grading": [
        f"{step}_explanation_post_grade"
        for step in (
            "data_collection",
            "data_analysis",
            "persona_building",
            "evaluation",
        )
    ],
    "group_classification": ["group_class"],
}

In [5]:
# Process each dataset: train one logistic-regression model per label column and
# collect the held-out predictions of EVERY model, so the saved CSV covers the
# whole run rather than only the last model that happened to be trained.
all_results = []

for npz_file, csv_file in zip(survey_files, label_files):
    if not (os.path.exists(npz_file) and os.path.exists(csv_file)):
        print(f"File not found: {npz_file} or {csv_file}")
        continue

    # Load the TF-IDF features once, as CSR for efficient row slicing.
    # (The file is deliberately not opened with np.load(allow_pickle=True):
    # the result was unused and unpickling can execute arbitrary code.)
    X_matrix = load_npz(npz_file).tocsr()

    # Load labels from the corresponding CSV file
    df = pd.read_csv(csv_file)
    print(f"Loaded: {npz_file} (TF-IDF shape: {X_matrix.shape}) and {csv_file} (Rows: {df.shape[0]})")

    # Features and labels are paired purely by row position, so a row-count
    # mismatch would silently attach labels to the wrong documents.
    if X_matrix.shape[0] != df.shape[0]:
        print(
            f"Skipping {npz_file}: feature rows ({X_matrix.shape[0]}) "
            f"do not match label rows ({df.shape[0]})"
        )
        continue

    # Train a separate model for each label column
    for category, label_cols in label_columns.items():
        for label_col in label_cols:
            if label_col not in df.columns:
                continue

            print(f"\nTraining model for: {label_col} (Category: {category})")

            # Keep only labelled rows, selected by POSITION because the sparse
            # matrix knows nothing about the DataFrame's index labels.
            labelled_rows = np.flatnonzero(df[label_col].notna().to_numpy())
            if len(labelled_rows) < 2:
                print(f"Skipping {label_col}: fewer than 2 labelled rows")
                continue

            y = df[label_col].iloc[labelled_rows]
            X_filtered = X_matrix[labelled_rows]

            # Train-test split
            X_train, X_test, y_train, y_test = train_test_split(
                X_filtered, y, test_size=0.2, random_state=42
            )

            # A classifier cannot be fitted on a single class; skip this label
            # instead of letting one degenerate column abort the whole run.
            if y_train.nunique() < 2:
                print(f"Skipping {label_col}: training split contains a single class")
                continue

            # Train logistic regression classifier
            model = LogisticRegression()
            model.fit(X_train, y_train)

            # Make predictions
            y_pred = model.predict(X_test)

            # Evaluate the model (zero_division=0 keeps the usual 0.0 score for
            # classes that are never predicted, without the warning noise)
            accuracy = accuracy_score(y_test, y_pred)
            print(f"Model Accuracy for {label_col}: {accuracy:.4f}")
            print(
                "Classification Report:\n",
                classification_report(y_test, y_pred, zero_division=0),
            )

            # Held-out predictions of this model, tagged with their origin so
            # rows stay identifiable once all models are concatenated.
            model_results = pd.DataFrame({
                "Dataset": npz_file,
                "Category": category,
                "Label Column": label_col,
                "Row": y_test.index.to_numpy(),
                "True Label": y_test.to_numpy(),
                "Predicted Label": y_pred,
            })
            # Carry participant IDs along when the label file provides them.
            if "Participant ID" in df.columns:
                model_results["Participant ID"] = df.loc[y_test.index, "Participant ID"].to_numpy()
            all_results.append(model_results)

            # Show a sample of model predictions
            print("\nSample of Model Classifications:")
            print(
                model_results[["Row", "True Label", "Predicted Label"]]
                .sample(min(10, len(model_results)))
            )

if all_results:
    df_results = pd.concat(all_results, ignore_index=True)

    # Show full classification results
    print("\nFull Classification Results:")
    print(df_results)

    # Save results to CSV for further analysis
    df_results.to_csv("classification_results.csv", index=False)
    print("\nClassification results saved to 'classification_results.csv'")

    # Display participants grouped by predicted label (if Participant ID exists)
    if 'Participant ID' in df_results.columns:
        grouped_results = (
            df_results
            .groupby(["Dataset", "Category", "Label Column", "Predicted Label"])["Participant ID"]
            .apply(list)
        )
        print("\nParticipants grouped by predicted class:")
        print(grouped_results)
else:
    # Nothing was trained (missing files or no usable label column): keep
    # df_results defined so later cells do not fail with a NameError.
    df_results = pd.DataFrame(
        columns=["Dataset", "Category", "Label Column", "Row", "True Label", "Predicted Label"]
    )
    print("\nNo models were trained; no classification results to save.")


Loaded: 01.1_df_word_token_stopwords_lemmatize_vectorized.npz (TF-IDF shape: (26, 339)) and 01.1_df_word_token_stopwords_lemmatize.csv (Rows: 26)

Training model for: persona_definition_pre_grade (Category: binary_correctness)


ValueError: Negative values in data passed to MultinomialNB (input X)