In [None]:
# Set seed for reproducibility
SEED = 42

import os
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here most likely only affects subprocesses launched later — confirm this is the intent.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Point matplotlib's config directory at ./configs/ under the current working directory.
# This is set before matplotlib is imported further down in this cell.
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

import warnings
# Two 'ignore' filters: FutureWarning first, then the base Warning category.
# The second filter covers every warning class, so it subsumes the first.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import random
import numpy as np
# Seed NumPy's global RNG and Python's `random` module with the same SEED.
np.random.seed(SEED)
random.seed(SEED)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plot defaults: seaborn 'whitegrid' style and a 15x8 default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (15, 8)

print("‚úÖ Libraries imported")

## 1. Load Data

In [None]:
# Read the raw (not yet preprocessed) CSVs: train labels, test features, and train
# features. The train features get the labels attached via a left join on
# `sample_index`, so every feature row is kept.
df_labels = pd.read_csv('pirate_pain_train_labels.csv')
df_test = pd.read_csv('pirate_pain_test.csv')
df_train = pd.read_csv('pirate_pain_train.csv').merge(df_labels, on='sample_index', how='left')

# Joint column names joint_00 .. joint_30, minus the two columns that were
# already dropped upstream (joint_30 and joint_02).
all_joints = [f'joint_{idx:02d}' for idx in range(31)]
excluded_joints = {'joint_30', 'joint_02'}
remaining_joints = [name for name in all_joints if name not in excluded_joints]

n_train_pirates = df_train['sample_index'].nunique()
n_test_pirates = df_test['sample_index'].nunique()

print(f"Total samples (train): {len(df_train)}")
print(f"Unique pirates (train): {n_train_pirates}")
print(f"Unique pirates (test): {n_test_pirates}")
print(f"\nJoint columns analyzed: {len(remaining_joints)}")
print("Excluded: joint_30 (zero variance), joint_02 (high correlation)")

df_train.head()

## 2. Outlier Detection

**Yoda's question:** "That one row, so different from all others... an error, or a rare jewel is it?"

We'll use multiple methods to detect outliers:
1. **Z-score method**: Statistical outliers (>3 standard deviations)
2. **IQR method**: Values outside the range Q1 − 1.5×IQR to Q3 + 1.5×IQR
3. **Isolation Forest**: ML-based anomaly detection

### 2.1 Z-Score Outlier Detection

In [None]:
# Absolute z-score of every joint reading (NaNs are omitted from the statistics)
joint_z = stats.zscore(df_train[remaining_joints], nan_policy='omit')
z_scores = np.abs(joint_z)

# A row is an outlier when at least one of its joints exceeds the threshold
outlier_threshold = 3
exceeds_threshold = z_scores > outlier_threshold
outlier_mask = exceeds_threshold.any(axis=1)
outlier_rows = df_train[outlier_mask]

n_rows = len(df_train)
n_outlier_rows = len(outlier_rows)

print(f"Z-Score Outlier Detection (threshold={outlier_threshold}):")
print("=" * 70)
print(f"Total outlier rows: {n_outlier_rows} / {n_rows} ({n_outlier_rows/n_rows*100:.2f}%)")
print(f"Outlier pirates: {outlier_rows['sample_index'].nunique()}")

# Share of each pain label's rows that were flagged
print("\nOutlier distribution by pain label:")
outlier_label_dist = outlier_rows['label'].value_counts()
label_totals = df_train['label'].value_counts()
for label, count in outlier_label_dist.items():
    total_label = int(label_totals[label])
    pct = (count / total_label) * 100
    print(f"  {label:10s}: {count:5d} outliers / {total_label:6d} total ({pct:.2f}%)")

# Per-joint outlier counts, largest first
outliers_per_joint = pd.Series(
    exceeds_threshold.sum(axis=0), index=remaining_joints
).sort_values(ascending=False)

print("\nTop 10 joints with most outliers:")
for joint, count in outliers_per_joint.head(10).items():
    print(f"  {joint}: {count} outliers")

In [None]:
# Visualize outliers per joint
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Bar plot of outliers per joint.
# The bars follow the order of `outliers_per_joint`, which is sorted by count
# (descending), so the tick labels must be read from its index. Labelling from
# `remaining_joints` would attach the wrong joint name to each bar.
tick_positions = range(0, len(outliers_per_joint), 3)
axes[0].bar(range(len(outliers_per_joint)), outliers_per_joint.values, color='coral', edgecolor='black')
axes[0].set_xticks(tick_positions)
axes[0].set_xticklabels([outliers_per_joint.index[i] for i in tick_positions], rotation=45)
axes[0].set_xlabel('Joint Column')
axes[0].set_ylabel('Number of Outliers (Z-score > 3)')
axes[0].set_title('Outlier Distribution Across Joints')
axes[0].grid(True, alpha=0.3, axis='y')

# Distribution of max Z-score per row, with the outlier threshold marked
max_z_scores = z_scores.max(axis=1)
axes[1].hist(max_z_scores, bins=50, edgecolor='black', alpha=0.7, color='skyblue')
axes[1].axvline(x=outlier_threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold = {outlier_threshold}')
axes[1].set_xlabel('Maximum Z-Score per Row')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution of Maximum Z-Scores')
axes[1].legend()
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print(f"\nüìä Interpretation:")
print(f"   - If outliers are EVENLY distributed ‚Üí likely measurement noise")
print(f"   - If outliers are CONCENTRATED in specific joints ‚Üí those joints may be problematic")
print(f"   - If outliers are CONCENTRATED in specific pain labels ‚Üí they might be informative!")

### 2.2 IQR (Interquartile Range) Method

In [None]:
# Per-joint quartiles and interquartile range
joint_values = df_train[remaining_joints]
Q1 = joint_values.quantile(0.25)
Q3 = joint_values.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: 1.5 * IQR beyond each quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an IQR outlier when any joint falls outside its fences
below_fence = joint_values < lower_bound
above_fence = joint_values > upper_bound
iqr_outliers = (below_fence | above_fence).any(axis=1)
iqr_outlier_rows = df_train[iqr_outliers]

n_iqr_rows = len(iqr_outlier_rows)
n_all_rows = len(df_train)

print("IQR Outlier Detection:")
print("=" * 70)
print(f"Total outlier rows: {n_iqr_rows} / {n_all_rows} ({n_iqr_rows/n_all_rows*100:.2f}%)")
print(f"Outlier pirates: {iqr_outlier_rows['sample_index'].nunique()}")

# How the IQR flags line up with the Z-score flags
both_methods = outlier_mask & iqr_outliers
only_zscore = outlier_mask & ~iqr_outliers
only_iqr = ~outlier_mask & iqr_outliers

print("\nOverlap between methods:")
print(f"  Detected by both Z-score AND IQR: {both_methods.sum()} rows")
print(f"  Only Z-score: {only_zscore.sum()} rows")
print(f"  Only IQR: {only_iqr.sum()} rows")

### 2.3 Are Outliers "Errors" or "Rare Jewels"?

In [None]:
from scipy.stats import chi2_contingency

# Are Z-score outliers tied to the pain label, or spread evenly across labels?
print("üîç Analyzing if outliers are informative or just noise...\n")

# Fraction of each label's rows that are flagged as outliers
pain_labels = df_train['label'].unique()
outlier_rates = {
    pain_label: (outlier_mask & (df_train['label'] == pain_label)).sum()
    / (df_train['label'] == pain_label).sum()
    for pain_label in pain_labels
}

print("Outlier rate by pain label (Z-score method):")
print("=" * 70)
ranked_rates = sorted(outlier_rates.items(), key=lambda pair: pair[1], reverse=True)
for label, rate in ranked_rates:
    print(f"  {label:10s}: {rate*100:.2f}% of samples are outliers")

# Chi-square test of independence on the [label] x [is_outlier] table
contingency_table = pd.crosstab(df_train['label'], outlier_mask)
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print("\nüìä Chi-square test:")
print(f"   Chi-square statistic: {chi2:.4f}")
print(f"   P-value: {p_value:.4e}")

# One decision at the 5% level drives both the verdict and the recommendation
if p_value < 0.05:
    verdict = "   ‚úÖ SIGNIFICANT! Outliers are associated with pain labels ‚Üí They are 'RARE JEWELS' (keep them!)"
    recommendation = "   Keep outliers - they contain information about pain patterns!"
else:
    verdict = "   ‚ö†Ô∏è NOT SIGNIFICANT. Outliers are randomly distributed ‚Üí They might be 'ERRORS' (consider removing)"
    recommendation = "   Consider removing extreme outliers - they are likely measurement noise"

print(verdict)
print("\n" + "=" * 70)
print("üí° Recommendation:")
print(recommendation)
print("=" * 70)

## 3. Feature Usefulness Analysis

**Yoda's wisdom:** "Not all that you see, is signal."

Analyze which joints are actually useful for predicting pain:
1. **Variance analysis**: Low variance = not useful
2. **Correlation with target**: No correlation = not useful
3. **ANOVA F-test**: Statistical significance
4. **Train/Test distribution mismatch**: Different distributions = problematic

### 3.1 Variance Analysis

In [None]:
# Variance of each joint column, plus an ascending-sorted copy for reporting
variances = df_train[remaining_joints].var()
variances_sorted = variances.sort_values()

print("Joint Variance Analysis:")
print("=" * 70)

# Report both ends of the sorted variances with the same formatting
variance_extremes = (
    ("\nLowest 10 variances (potentially useless features):", variances_sorted.head(10)),
    ("\nHighest 10 variances (most variable features):", variances_sorted.tail(10)),
)
for heading, subset in variance_extremes:
    print(heading)
    for joint, joint_variance in subset.items():
        print(f"  {joint}: {joint_variance:.6e}")

# Joints below this variance are flagged (joints with e-7 variances were noted earlier)
low_variance_threshold = 1e-6
low_variance_joints = [
    name for name, value in variances.items() if value < low_variance_threshold
]

print(f"\n‚ö†Ô∏è Joints with variance < {low_variance_threshold:.0e}:")
if not low_variance_joints:
    print("  None found")
else:
    for joint in low_variance_joints:
        print(f"  {joint}: {variances[joint]:.6e}")

In [None]:
# Two panels: sorted per-joint variances (log y-axis) and a histogram of log10 variances
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
ax_bars, ax_hist = axes

threshold_text = f'{low_variance_threshold:.0e}'
threshold_line_style = dict(color='red', linestyle='--', linewidth=2)

# Left panel: one bar per joint, in ascending variance order
bar_positions = range(len(variances_sorted))
ax_bars.bar(bar_positions, variances_sorted.values, color='steelblue', edgecolor='black')
ax_bars.axhline(y=low_variance_threshold,
                label=f'Low variance threshold: {threshold_text}',
                **threshold_line_style)
ax_bars.set_xticks(bar_positions[::3])
ax_bars.set_xticklabels(list(variances_sorted.index[::3]), rotation=45)
ax_bars.set_xlabel('Joint Column (sorted by variance)')
ax_bars.set_ylabel('Variance')
ax_bars.set_title('Variance Distribution Across Joints')
ax_bars.legend()
ax_bars.grid(True, alpha=0.3, axis='y')
ax_bars.set_yscale('log')  # log axis keeps the tiny variances visible

# Right panel: histogram of log10(variance); the small offset guards against log(0)
log_variances = np.log10(variances + 1e-12)
ax_hist.hist(log_variances, bins=30, edgecolor='black', alpha=0.7, color='coral')
ax_hist.axvline(x=np.log10(low_variance_threshold),
                label=f'Threshold: {threshold_text}',
                **threshold_line_style)
ax_hist.set_xlabel('Log10(Variance)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Log-Variances')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

### 3.2 ANOVA F-Test: Which Joints Differ Across Pain Labels?

In [None]:
from scipy.stats import f_oneway

# Map labels to integers for analysis
label_mapping = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
df_train['label_int'] = df_train['label'].map(label_mapping)

# Row masks per pain level; they do not depend on the joint, so build them once
label_masks = [df_train['label_int'] == label_code for label_code in [0, 1, 2]]

# One-way ANOVA per joint: does the joint's mean differ across pain levels?
anova_results = {}

for joint in remaining_joints:
    # Drop NaN readings per group: f_oneway propagates NaN into its result,
    # which would silently exclude the joint from both lists below.
    groups = [df_train.loc[mask, joint].dropna().values for mask in label_masks]

    # Dedicated names (not `f_stat` / `p_value`) so the loop does not rebind the
    # generic notebook-global `p_value` used by other cells.
    anova_f, anova_p = f_oneway(*groups)
    anova_results[joint] = {'f_statistic': anova_f, 'p_value': anova_p}

# Convert to DataFrame (one row per joint), most significant first
anova_df = pd.DataFrame(anova_results).T
anova_df = anova_df.sort_values('p_value')

print("ANOVA F-Test Results:")
print("=" * 70)
print("Tests if joint values differ significantly across pain labels\n")

print("Top 10 MOST discriminative joints (lowest p-values):")
for joint, row in anova_df.head(10).iterrows():
    sig = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else ""
    print(f"  {joint}: F={row['f_statistic']:.2f}, p={row['p_value']:.4e} {sig}")

print(f"\nBottom 10 LEAST discriminative joints (highest p-values):")
for joint, row in anova_df.tail(10).iterrows():
    print(f"  {joint}: F={row['f_statistic']:.2f}, p={row['p_value']:.4e}")

# Split joints at the 5% significance level
significant_joints = anova_df[anova_df['p_value'] < 0.05].index.tolist()
non_significant_joints = anova_df[anova_df['p_value'] >= 0.05].index.tolist()

print(f"\nüìä Summary:")
print(f"   Significant joints (p < 0.05): {len(significant_joints)} / {len(remaining_joints)}")
print(f"   Non-significant joints: {len(non_significant_joints)}")

if non_significant_joints:
    print(f"\n‚ö†Ô∏è Non-significant joints (candidates for removal):")
    for joint in non_significant_joints:
        print(f"  {joint}")

### 3.3 Train/Test Distribution Mismatch

In [None]:
from scipy.stats import ks_2samp


def _significance_stars(p):
    """Return '***', '**', '*' or '' for p below 0.001, 0.01, 0.05 respectively."""
    if p < 0.001:
        return "***"
    if p < 0.01:
        return "**"
    if p < 0.05:
        return "*"
    return ""


# Two-sample Kolmogorov-Smirnov test per joint:
# tests if train and test distributions are different
ks_results = {}

for joint in remaining_joints:
    # Joints absent from the test set cannot be compared; skip them
    if joint not in df_test.columns:
        continue

    # Drop NaN readings so missing values cannot distort the comparison.
    # Dedicated names (not `p_value`) so the loop does not rebind the generic
    # notebook-global `p_value` used by other cells.
    ks_stat, ks_p = ks_2samp(df_train[joint].dropna(), df_test[joint].dropna())
    ks_results[joint] = {'ks_statistic': ks_stat, 'p_value': ks_p}

# Convert to DataFrame (one row per joint), most different first
ks_df = pd.DataFrame(ks_results).T
ks_df = ks_df.sort_values('p_value')

print("Kolmogorov-Smirnov Test (Train vs Test Distribution):")
print("=" * 70)
print("Tests if train and test distributions are significantly different\n")

print("Top 10 joints with MOST DIFFERENT distributions (lowest p-values):")
for joint, row in ks_df.head(10).iterrows():
    sig = _significance_stars(row['p_value'])
    print(f"  {joint}: KS={row['ks_statistic']:.4f}, p={row['p_value']:.4e} {sig}")

# Identify problematic joints (you mentioned joint_13-17, 19-25)
problematic_range_1 = [f'joint_{str(i).zfill(2)}' for i in range(13, 18)]
problematic_range_2 = [f'joint_{str(i).zfill(2)}' for i in range(19, 26)]
problematic_joints = [j for j in problematic_range_1 + problematic_range_2 if j in remaining_joints]

print(f"\nüîç Your identified problematic joints (joint_13-17, 19-25):")
for joint in problematic_joints:
    if joint in ks_df.index:
        row = ks_df.loc[joint]
        sig = _significance_stars(row['p_value'])
        print(f"  {joint}: KS={row['ks_statistic']:.4f}, p={row['p_value']:.4e} {sig}")

# Count significantly different distributions
different_dist_joints = ks_df[ks_df['p_value'] < 0.05].index.tolist()

print(f"\nüìä Summary:")
print(f"   Joints with different train/test distributions (p < 0.05): {len(different_dist_joints)}")
print(f"   These joints may cause poor generalization!")

## 4. Combined Analysis: Which Joints to Remove?

In [None]:
# Issue flag column -> short tag used in the printed report (order fixes the tag order)
issue_tags = {
    'low_variance': 'LOW_VAR',
    'not_discriminative': 'NOT_DISCR',
    'different_distribution': 'DIFF_DIST',
}

# One row per joint: variance, ANOVA, KS and outlier metrics, plus a boolean flag
# per issue and their sum. Rows are ordered by number of issues, worst first.
analysis_df = (
    pd.DataFrame(index=remaining_joints)
    .assign(
        variance=variances,
        low_variance=lambda tbl: tbl['variance'] < low_variance_threshold,
        anova_f_stat=anova_df['f_statistic'],
        anova_p_value=anova_df['p_value'],
        not_discriminative=lambda tbl: tbl['anova_p_value'] >= 0.05,
        ks_statistic=ks_df['ks_statistic'],
        ks_p_value=ks_df['p_value'],
        different_distribution=lambda tbl: tbl['ks_p_value'] < 0.05,
        outlier_count=outliers_per_joint,
        num_issues=lambda tbl: tbl[list(issue_tags)].astype(int).sum(axis=1),
    )
    .sort_values('num_issues', ascending=False)
)

print("\n" + "=" * 70)
print("COMPREHENSIVE JOINT ANALYSIS")
print("=" * 70)

# Joints flagged on two or more criteria
print("\nüö® Joints with MULTIPLE issues (strong candidates for removal):")
problematic = analysis_df[analysis_df['num_issues'] >= 2]
if problematic.empty:
    print("  None found!")
else:
    for joint, row in problematic.iterrows():
        issues = [tag for flag, tag in issue_tags.items() if row[flag]]
        print(f"  {joint}: {row['num_issues']} issues ‚Üí {', '.join(issues)}")

# Joints flagged on exactly one criterion
print("\n‚ö†Ô∏è Joints with ONE issue (consider for removal):")
moderate = analysis_df[analysis_df['num_issues'] == 1]
if moderate.empty:
    print("  None found!")
else:
    for joint, row in moderate.iterrows():
        issues = [tag for flag, tag in issue_tags.items() if row[flag]]
        print(f"  {joint}: {', '.join(issues)}")

# Joints with no flags; only the first ten names are printed
print("\n‚úÖ Clean joints (no issues):")
clean = analysis_df[analysis_df['num_issues'] == 0]
clean_names = clean.index.tolist()
more_marker = '...' if len(clean) > 10 else ''
print(f"  {len(clean)} joints: {', '.join(clean_names[:10])}{more_marker}")

# Persist the full table next to the notebook
analysis_df.to_csv('joint_analysis_results.csv')
print("\nüíæ Detailed analysis saved to 'joint_analysis_results.csv'")

## 5. Final Recommendations

In [None]:
from scipy.stats import chi2_contingency

print("\n" + "="*70)
print("FINAL RECOMMENDATIONS - November 15 Clue")
print("="*70)

# Joints to definitely remove (2+ issues)
joints_to_remove = analysis_df[analysis_df['num_issues'] >= 2].index.tolist()

# Joints to consider removing (1 issue)
joints_to_consider = analysis_df[analysis_df['num_issues'] == 1].index.tolist()

# The outlier verdict must use the chi-square test of label vs. is-outlier.
# Do not read the bare `p_value` global here: the name is generic and cells that run
# between the chi-square test and this one may rebind it to a different test's result.
# Re-derive the p-value (element 1 of the result) from the stored contingency table.
outlier_chi2_p_value = chi2_contingency(contingency_table)[1]

print("\n1Ô∏è‚É£ OUTLIERS:")
if outlier_chi2_p_value < 0.05:
    print("   ‚úÖ KEEP outliers - they are 'rare jewels' (associated with pain labels)")
else:
    print("   ‚ö†Ô∏è Consider REMOVING extreme outliers - they are likely 'errors'")

print("\n2Ô∏è‚É£ JOINTS TO REMOVE (high confidence):")
if joints_to_remove:
    print(f"   Total: {len(joints_to_remove)} joints")
    print(f"   Joints: {joints_to_remove}")
    print("\n   Add to preprocessing.py:")
    print(f"   df = df.drop(columns={joints_to_remove})")
else:
    print("   None! All joints have at most 1 issue.")

print("\n3Ô∏è‚É£ JOINTS TO CONSIDER REMOVING (medium confidence):")
if joints_to_consider:
    print(f"   Total: {len(joints_to_consider)} joints")
    print(f"   Joints: {joints_to_consider}")
    print("   ‚Üí Test model performance with/without these")
else:
    print("   None")

# `clean` is the zero-issue subset built in the combined-analysis cell
print("\n4Ô∏è‚É£ CLEAN JOINTS (keep these):")
print(f"   Total: {len(clean)} joints")
print(f"   These are the 'signal' Yoda talks about!")

print("\n5Ô∏è‚É£ NEXT STEPS:")
print("   1. Test model WITHOUT problematic joints")
print("   2. Compare validation F1 score: before vs after removal")
print("   3. If F1 improves or stays same ‚Üí keep the cleaned version")
print("   4. If F1 drops significantly ‚Üí some 'fog' joints might contain useful info")

print("\n" + "="*70)
print("üí° 'Not all that you see, is signal' - Yoda")
print("="*70)

## 6. Export Cleaned Feature List

In [None]:
# Joints with fewer than two issues are kept
joints_to_keep = [
    name for name, issue_count in analysis_df['num_issues'].items() if issue_count < 2
]

print(f"‚úÖ Joints to KEEP: {len(joints_to_keep)} / {len(remaining_joints)}")
print(f"‚ùå Joints to REMOVE: {len(joints_to_remove)}")

# Write the removal list as a small Python-syntax snippet for the preprocessing step
export_lines = [
    '# Joints identified for removal by data cleaning analysis\n',
    '# Based on: low variance, non-discriminative, or train/test mismatch\n',
    f"joints_to_remove = {joints_to_remove}\n",
]
with open('joints_to_remove.txt', 'w') as f:
    f.write(''.join(export_lines))

# NOTE(review): the hint printed below tells the reader to exec() the file's contents,
# which runs whatever code the file holds. Consider a data-only format instead.
print("\nüíæ Saved to 'joints_to_remove.txt'")
print("   Import in preprocessing.py:")
print("   exec(open('joints_to_remove.txt').read())")
print("   df = df.drop(columns=joints_to_remove)")