In [2]:
# Setup: imports and dataset loading
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.utils import set_seed
import os

set_seed(42)

# Load dataset from HuggingFace
print("Loading dataset FutureMa/EvasionBench...")
ds = load_dataset("FutureMa/EvasionBench")
if isinstance(ds, dict):
    ds = ds[list(ds.keys())[0]]
df = ds.to_pandas()
print("Dataset shape:", df.shape)

# Save local copy if not exists
os.makedirs("data/raw", exist_ok=True)
if not os.path.exists("data/raw/evasionbench.parquet"):
    df.to_parquet("data/raw/evasionbench.parquet", index=False)
    print("Saved data/raw/evasionbench.parquet")

# Quick preview
print(df.columns.tolist())
df.head()

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Dataset Overview
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
print(f"\nTotal samples: {len(df):,}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nData types:\n{df.dtypes}")
print(f"\nMissing values:\n{df.isnull().sum()}")
print(f"\nSample row:")
df.head(1).T

In [None]:
# Label Distribution Analysis
print("=" * 60)
print("LABEL DISTRIBUTION")
print("=" * 60)

label_counts = df['eva4b_label'].value_counts()
label_pcts = df['eva4b_label'].value_counts(normalize=True) * 100

print("\nLabel counts:")
for label, count in label_counts.items():
    print(f"  {label}: {count:,} ({label_pcts[label]:.1f}%)")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
colors = {'direct': '#2ecc71', 'intermediate': '#f39c12', 'fully_evasive': '#e74c3c'}
ax1 = axes[0]
bars = ax1.bar(label_counts.index, label_counts.values, color=[colors.get(x, '#3498db') for x in label_counts.index])
ax1.set_xlabel('Evasion Label', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_title('Label Distribution (Bar Chart)', fontsize=14, fontweight='bold')
for bar, count, pct in zip(bars, label_counts.values, label_pcts.values):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 100, 
             f'{count:,}\n({pct:.1f}%)', ha='center', va='bottom', fontsize=10)
ax1.set_ylim(0, max(label_counts.values) * 1.15)

# Pie chart
ax2 = axes[1]
wedges, texts, autotexts = ax2.pie(label_counts.values, labels=label_counts.index, 
                                    autopct='%1.1f%%', colors=[colors.get(x, '#3498db') for x in label_counts.index],
                                    explode=[0.02, 0.02, 0.1], startangle=90)
ax2.set_title('Label Distribution (Pie Chart)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('notebooks/figures/01_label_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚ö†Ô∏è Class imbalance detected: 'fully_evasive' is only 3.7% of data")

In [None]:
# Text Length Analysis
print("=" * 60)
print("TEXT LENGTH ANALYSIS")
print("=" * 60)

# Calculate lengths
df['question_chars'] = df['question'].str.len()
df['question_words'] = df['question'].str.split().str.len()
df['answer_chars'] = df['answer'].str.len()
df['answer_words'] = df['answer'].str.split().str.len()

# Basic statistics
print("\nQuestion length statistics:")
print(f"  Characters - Mean: {df['question_chars'].mean():.1f}, Median: {df['question_chars'].median():.1f}, Std: {df['question_chars'].std():.1f}")
print(f"  Words - Mean: {df['question_words'].mean():.1f}, Median: {df['question_words'].median():.1f}, Std: {df['question_words'].std():.1f}")

print("\nAnswer length statistics:")
print(f"  Characters - Mean: {df['answer_chars'].mean():.1f}, Median: {df['answer_chars'].median():.1f}, Std: {df['answer_chars'].std():.1f}")
print(f"  Words - Mean: {df['answer_words'].mean():.1f}, Median: {df['answer_words'].median():.1f}, Std: {df['answer_words'].std():.1f}")

# Answer length by evasion category
print("\nAnswer length by evasion category:")
length_by_label = df.groupby('eva4b_label').agg({
    'answer_chars': ['mean', 'median', 'std'],
    'answer_words': ['mean', 'median', 'std']
}).round(1)
print(length_by_label)

In [None]:
# Text Length Visualizations
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Question length distribution
ax1 = axes[0, 0]
for label, color in colors.items():
    subset = df[df['eva4b_label'] == label]
    ax1.hist(subset['question_words'], bins=50, alpha=0.6, label=label, color=color)
ax1.set_xlabel('Question Length (words)', fontsize=11)
ax1.set_ylabel('Frequency', fontsize=11)
ax1.set_title('Question Length Distribution by Label', fontsize=12, fontweight='bold')
ax1.legend()
ax1.set_xlim(0, 200)

# Answer length distribution
ax2 = axes[0, 1]
for label, color in colors.items():
    subset = df[df['eva4b_label'] == label]
    ax2.hist(subset['answer_words'], bins=50, alpha=0.6, label=label, color=color)
ax2.set_xlabel('Answer Length (words)', fontsize=11)
ax2.set_ylabel('Frequency', fontsize=11)
ax2.set_title('Answer Length Distribution by Label', fontsize=12, fontweight='bold')
ax2.legend()
ax2.set_xlim(0, 500)

# Box plots - Answer length by label
ax3 = axes[1, 0]
df.boxplot(column='answer_words', by='eva4b_label', ax=ax3)
ax3.set_xlabel('Evasion Label', fontsize=11)
ax3.set_ylabel('Answer Length (words)', fontsize=11)
ax3.set_title('Answer Length by Evasion Category', fontsize=12, fontweight='bold')
plt.suptitle('')  # Remove automatic title

# Box plots - Question length by label
ax4 = axes[1, 1]
df.boxplot(column='question_words', by='eva4b_label', ax=ax4)
ax4.set_xlabel('Evasion Label', fontsize=11)
ax4.set_ylabel('Question Length (words)', fontsize=11)
ax4.set_title('Question Length by Evasion Category', fontsize=12, fontweight='bold')
plt.suptitle('')

plt.tight_layout()
plt.savefig('notebooks/figures/01_text_length_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Statistical Tests for Length Differences
from scipy import stats

print("=" * 60)
print("STATISTICAL TESTS: TEXT LENGTH HYPOTHESES")
print("=" * 60)

# H1: Evasive answers are longer than direct answers
print("\nüìä HYPOTHESIS 1: 'Evasive answers are longer than direct answers'")
print("-" * 60)

direct_lengths = df[df['eva4b_label'] == 'direct']['answer_words']
evasive_lengths = df[df['eva4b_label'] == 'fully_evasive']['answer_words']
intermediate_lengths = df[df['eva4b_label'] == 'intermediate']['answer_words']

# Kruskal-Wallis test (non-parametric)
stat, p_value = stats.kruskal(direct_lengths, intermediate_lengths, evasive_lengths)
print(f"\nKruskal-Wallis test (all 3 groups):")
print(f"  H-statistic: {stat:.2f}")
print(f"  p-value: {p_value:.2e}")
print(f"  Result: {'Significant difference (p < 0.05)' if p_value < 0.05 else 'No significant difference'}")

# Mann-Whitney U test (direct vs fully_evasive)
stat2, p_value2 = stats.mannwhitneyu(direct_lengths, evasive_lengths, alternative='two-sided')
print(f"\nMann-Whitney U test (direct vs fully_evasive):")
print(f"  U-statistic: {stat2:.2f}")
print(f"  p-value: {p_value2:.2e}")
print(f"  Result: {'Significant difference (p < 0.05)' if p_value2 < 0.05 else 'No significant difference'}")

# Mean comparison
print(f"\nMean answer lengths:")
print(f"  Direct: {direct_lengths.mean():.1f} words")
print(f"  Intermediate: {intermediate_lengths.mean():.1f} words")
print(f"  Fully Evasive: {evasive_lengths.mean():.1f} words")

# Conclusion
if p_value2 < 0.05:
    if evasive_lengths.mean() > direct_lengths.mean():
        print("\n‚úÖ H1 SUPPORTED: Evasive answers are significantly longer than direct answers")
    else:
        print("\n‚ùå H1 REJECTED: Evasive answers are significantly SHORTER than direct answers")
else:
    print("\n‚ö†Ô∏è H1 NOT SUPPORTED: No significant length difference detected")

In [None]:
# H2: Questions receiving evasive answers differ in structure/length
print("\nüìä HYPOTHESIS 2: 'Questions receiving evasive answers differ in length'")
print("-" * 60)

direct_q_lengths = df[df['eva4b_label'] == 'direct']['question_words']
evasive_q_lengths = df[df['eva4b_label'] == 'fully_evasive']['question_words']

# Mann-Whitney U test
stat_q, p_value_q = stats.mannwhitneyu(direct_q_lengths, evasive_q_lengths, alternative='two-sided')
print(f"\nMann-Whitney U test (question lengths):")
print(f"  U-statistic: {stat_q:.2f}")
print(f"  p-value: {p_value_q:.2e}")
print(f"  Result: {'Significant difference (p < 0.05)' if p_value_q < 0.05 else 'No significant difference'}")

print(f"\nMean question lengths:")
print(f"  Direct answers: {direct_q_lengths.mean():.1f} words")
print(f"  Evasive answers: {evasive_q_lengths.mean():.1f} words")

if p_value_q < 0.05:
    print("\n‚úÖ H2 SUPPORTED: Questions receiving evasive answers have significantly different lengths")
else:
    print("\n‚ö†Ô∏è H2 NOT SUPPORTED: No significant difference in question lengths")

In [None]:
# Data Quality Checks
print("=" * 60)
print("DATA QUALITY CHECKS")
print("=" * 60)

# 1. UID uniqueness
print("\n1. UID UNIQUENESS:")
unique_uids = df['uid'].nunique()
total_rows = len(df)
print(f"   Unique UIDs: {unique_uids:,} / {total_rows:,}")
if unique_uids == total_rows:
    print("   ‚úÖ All UIDs are unique")
else:
    print(f"   ‚ö†Ô∏è Found {total_rows - unique_uids} duplicate UIDs")

# 2. Empty strings
print("\n2. EMPTY STRING CHECK:")
empty_questions = (df['question'].str.strip() == '').sum()
empty_answers = (df['answer'].str.strip() == '').sum()
print(f"   Empty questions: {empty_questions}")
print(f"   Empty answers: {empty_answers}")
if empty_questions == 0 and empty_answers == 0:
    print("   ‚úÖ No empty strings found")

# 3. Duplicate detection (exact)
print("\n3. EXACT DUPLICATE CHECK:")
dup_questions = df['question'].duplicated().sum()
dup_answers = df['answer'].duplicated().sum()
dup_pairs = df.duplicated(subset=['question', 'answer']).sum()
print(f"   Duplicate questions: {dup_questions} ({dup_questions/len(df)*100:.2f}%)")
print(f"   Duplicate answers: {dup_answers} ({dup_answers/len(df)*100:.2f}%)")
print(f"   Duplicate Q&A pairs: {dup_pairs} ({dup_pairs/len(df)*100:.2f}%)")

# 4. Anomalous lengths (outliers)
print("\n4. OUTLIER DETECTION (using IQR method):")
for col in ['question_words', 'answer_words']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"   {col}: {outliers} outliers ({outliers/len(df)*100:.2f}%)")
    print(f"      Range: [{max(0, lower):.0f}, {upper:.0f}] words")

# 5. Very short/long texts
print("\n5. EXTREME LENGTH SAMPLES:")
print(f"   Shortest answer: {df['answer_words'].min()} words")
print(f"   Longest answer: {df['answer_words'].max()} words")
print(f"   Shortest question: {df['question_words'].min()} words")
print(f"   Longest question: {df['question_words'].max()} words")

In [None]:
# Summary Statistics Table
print("=" * 60)
print("SUMMARY STATISTICS TABLE")
print("=" * 60)

summary_stats = pd.DataFrame({
    'Metric': ['Total Samples', 'Direct', 'Intermediate', 'Fully Evasive',
               'Avg Question Length (words)', 'Avg Answer Length (words)',
               'Missing Values', 'Duplicate Q&A Pairs'],
    'Value': [f"{len(df):,}", 
              f"{label_counts['direct']:,} ({label_pcts['direct']:.1f}%)",
              f"{label_counts['intermediate']:,} ({label_pcts['intermediate']:.1f}%)",
              f"{label_counts['fully_evasive']:,} ({label_pcts['fully_evasive']:.1f}%)",
              f"{df['question_words'].mean():.1f}",
              f"{df['answer_words'].mean():.1f}",
              "0",
              f"{dup_pairs} ({dup_pairs/len(df)*100:.2f}%)"]
})

print(summary_stats.to_string(index=False))

# Save summary
summary_stats.to_csv('notebooks/figures/01_summary_statistics.csv', index=False)
print("\n‚úÖ Summary saved to notebooks/figures/01_summary_statistics.csv")

## Key Findings

### Data Quality
- ‚úÖ Dataset is clean with no missing values
- ‚úÖ All UIDs are unique
- ‚ö†Ô∏è Some duplicate Q&A pairs exist (~X%)
- ‚ö†Ô∏è Significant class imbalance: fully_evasive is only 3.7% of data

### Hypothesis Test Results
1. **H1: Evasive answers are longer than direct answers**
   - Result: [To be filled after execution]
   
2. **H2: Questions receiving evasive answers differ in length**
   - Result: [To be filled after execution]

### Next Steps
- Proceed to Notebook 02 for linguistic pattern analysis
- Investigate duplicate Q&A pairs
- Consider class imbalance handling strategies for modeling