In [None]:
import pandas as pd

# Read the Qualtrics export from disk.
qualtrics_file = './dataset/EduRec_cleaned.csv'
qualtrics_df = pd.read_csv(qualtrics_file)

# Step 1: Skip the first two data rows and keep everything after them as
# participant data. NOTE(review): presumably those two rows are Qualtrics'
# extra header/metadata rows -- confirm against the export.
# `assign` returns a new frame, so `qualtrics_df` itself is left unmodified;
# `participant_id` is the 'Random ID' column cast to int.
qualtrics_trimmed = qualtrics_df.iloc[2:].assign(
    participant_id=lambda frame: frame['Random ID'].astype(int)
)

# Step 2: Map each timed question number to its pair of timing columns,
# (First Click column, Page Submit column). The listed order is the insertion
# order of the resulting dict, which fixes the row order of the task-time table.
task_time_questions = {
    number: (f'Q{number}_First Click', f'Q{number}_Page Submit')
    for number in (81, 82, 83, 85, 60, 86, 87, 88, 89, 90, 91, 92)
}

# Step 3: Build one long-format table of task times.
# For every timed question, produce a frame with one row per participant:
#   task_qns  -- the question number
#   task_time -- Page Submit minus First Click; values that cannot be parsed
#                as numbers become NaN (errors='coerce'), so the difference
#                is NaN for those rows.
task_times = []
for question_number, (click_column, submit_column) in task_time_questions.items():
    per_question = qualtrics_trimmed[
        ['participant_id', click_column, submit_column]
    ].assign(
        task_qns=question_number,
        task_time=lambda frame: (
            pd.to_numeric(frame[submit_column], errors='coerce')
            - pd.to_numeric(frame[click_column], errors='coerce')
        ),
    )
    task_times.append(per_question[['participant_id', 'task_qns', 'task_time']])

# Stack the per-question frames; each keeps its original row index.
task_times_df = pd.concat(task_times)

# Step 4: Collect feedback per variant. Each variant has seven feedback
# columns named <prefix>_1 ... <prefix>_7.
#   feedback_avg    -- row-wise mean of the seven answers after numeric
#                      coercion (unparseable answers become NaN)
#   feedback_scores -- the seven raw, unconverted cell values as a list
feedback_detailed = []
variant_prefixes = {'A': 'Q45', 'B': 'Q46', 'C': 'Q47', 'D': 'Q50'}
for variant, prefix in variant_prefixes.items():
    item_columns = [f"{prefix}_{item}" for item in range(1, 8)]
    per_variant = qualtrics_trimmed[['participant_id', *item_columns]].assign(
        variant=variant,
        feedback_avg=lambda frame: (
            frame[item_columns].apply(pd.to_numeric, errors='coerce').mean(axis=1)
        ),
        feedback_scores=lambda frame: (
            frame[item_columns].apply(pd.Series.tolist, axis=1)
        ),
    )
    feedback_detailed.append(
        per_variant[['participant_id', 'variant', 'feedback_avg', 'feedback_scores']]
    )

# One row per (participant, variant) across all four variants.
feedback_detailed_df = pd.concat(feedback_detailed)

# Step 5: Lookup table tying each timed question number to its variant and
# task type. Every variant has the same three task types, in the same order,
# each paired with one question number.
task_mapping = pd.DataFrame(
    [
        (variant_label, task_kind, question_number)
        for variant_label, question_numbers in (
            ('A', (81, 82, 83)),
            ('B', (85, 60, 86)),
            ('C', (87, 88, 89)),
            ('D', (90, 91, 92)),
        )
        for task_kind, question_number in zip(
            ('view class info', 'view personal info', 'view finance info'),
            question_numbers,
        )
    ],
    columns=['variant', 'task_type', 'task_qns'],
)

# Step 6: Attach variant and task type to every task-time row via its
# question number. A left join keeps every task-time row.
task_times_merged = pd.merge(
    task_times_df,
    task_mapping,
    how='left',
    on='task_qns',
)

# Step 7: Attach the participant's feedback for the row's variant. Again a
# left join, so task-time rows without matching feedback are retained.
final_detailed_df = pd.merge(
    task_times_merged,
    feedback_detailed_df,
    how='left',
    on=['participant_id', 'variant'],
)

# Persist the combined table without the row index.
final_detailed_df.to_csv('agg_metric_qualtrics.csv', index=False)