The current code does not use a classifier; it directly calculates participants’ knowledge progression based on their pre-test and post-test scores. This approach is more of a rule-based classification rather than using machine learning.

How It Works:
	•	Computes the total score difference for each participant.
	•	Applies a rule-based function to classify them into three groups:
	•	"Improved" → If post-test score is higher than pre-test.
	•	"Declined" → If post-test score is lower than pre-test.
	•	"Same Stage" → If post-test score did not change.

This is a deterministic approach based on numeric differences, not a machine learning classifier.


YOU can keep only labels , make them numeric and based on this data , create clusters

In [3]:
import numpy as np
import pandas as pd
import os

# Define survey files
survey_files = [
    "01.1_df_word_token_stopwords_lemmatize_vectorized.npz",
    "01.2_df_subword_token_stopwords_lemmatize_vectorized.npz",
    "01.3_df_sentence_token_stopwords_lemmatize_vectorized.npz",
    "01.4_df_bert_token_stopwords_lemmatize_vectorized.npz",
    "01.5_df_tiktoken_token_stopwords_lemmatize_vectorized.npz",
    "01.6_df_whitespace_token_stopwords_lemmatize_vectorized.npz"
]

# List of relevant columns for grading
columns_to_compare = [
    "persona_definition_pre_grade", "persona_definition_post_grade",
    "interactive_persona_pre_grade", "interactive_persona_post_grade",
    "data_driven_persona_pre_grade", "data_driven_persona_post_grade",
    "dynamic_persona_pre_grade", "dynamic_persona_post_grade"
]

def classify_progress(pre, post):
    if pre == "Correct" and post == "Not Correct":
        return "DECLINED"
    elif pre == "Not Correct" and post == "Correct":
        return "IMPROVED"
    else:
        return "SAME"

def analyze_group_performance(df):
    df["progress"] = df.apply(lambda row: classify_progress(row[columns_to_compare[0]], row[columns_to_compare[1]]), axis=1)
    for i in range(2, len(columns_to_compare), 2):
        df["progress"] += "-" + df.apply(lambda row: classify_progress(row[columns_to_compare[i]], row[columns_to_compare[i+1]]), axis=1)
    
    # Count occurrences per group
    group_summary = df.groupby("group_class")["progress"].apply(lambda x: x.str.split("-").explode().value_counts()).unstack().fillna(0)
    
    # Calculate percentage improvements
    group_summary["Total"] = group_summary.sum(axis=1)
    group_summary["IMPROVEMENT_RATE"] = group_summary["IMPROVED"] / group_summary["Total"]
    
    return group_summary

all_results = []

for file in survey_files:
    if os.path.exists(file):
        data = np.load(file, allow_pickle=True)
        df = pd.DataFrame(data["arr_0"], columns=data["columns"])
        group_performance = analyze_group_performance(df)
        group_performance["Dataset"] = file
        all_results.append(group_performance)

# Combine results from all datasets
final_results = pd.concat(all_results)

# Display the final results
target_columns = ["IMPROVED", "DECLINED", "SAME", "IMPROVEMENT_RATE", "Dataset"]
final_results = final_results[target_columns]
import ace_tools as tools
tools.display_dataframe_to_user(name="Group Performance Comparison", dataframe=final_results)


KeyError: 'arr_0 is not a file in the archive'