# Data Exploration - Legal Text Decoder

This notebook explores the preprocessed dataset for the Legal Text Decoder project.

**Dataset Overview:**
- Hungarian legal texts (ÁSZF - Általános Szerződési Feltételek)
- Labeled for understandability on a scale of 1 (very hard to understand) to 5 (easy to understand)
- Data collected from multiple students using Label Studio

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

# Global plot appearance: matplotlib stylesheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(palette='husl')

## 1. Load the Preprocessed Data

In [None]:
# Read the combined, preprocessed dataset from disk
with open('../data/processed/all_data.json', mode='r', encoding='utf-8') as json_file:
    all_data = json.load(json_file)

# One row per record in the JSON list
df = pd.DataFrame(all_data)
n_total = len(df)
print(f"Total samples: {n_total}")
df.head()

In [None]:
# Quick overview: column names, row count, and number of distinct sources
info_lines = [
    "Dataset Info:",
    f"  Columns: {list(df.columns)}",
    f"  Total samples: {len(df)}",
    f"  Source folders: {df['source_folder'].nunique()}",
    f"  Source files: {df['source_file'].nunique()}",
]
print("\n".join(info_lines))

## 2. Label Distribution Analysis

In [None]:
# Label distribution
# Number of samples per label value, ordered by label.
label_counts = df['label'].value_counts().sort_index()

# Human-readable (Hungarian) names for the five understandability levels.
label_names = {
    1: '1-Nagyon nehezen érthető',
    2: '2-Nehezen érthető',
    3: '3-Többé/kevésbé megértem',
    4: '4-Érthető',
    5: '5-Könnyen érthető'
}

# Pin one colour to each label value (1-5). Passing the 5-colour palette
# positionally would shift colours whenever a label is absent from the data.
label_palette = dict(zip(range(1, 6), sns.color_palette('RdYlGn', 5)))
colors = [label_palette.get(label, 'gray') for label in label_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
bars = axes[0].bar(label_counts.index, label_counts.values, color=colors)
axes[0].set_xlabel('Understandability Label', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('Label Distribution', fontsize=14)
axes[0].set_xticks([1, 2, 3, 4, 5])
# Leave headroom so the two-line annotation above the tallest bar is not clipped.
axes[0].margins(y=0.15)

# Annotate each bar with its count and share of the dataset. bar_label pads in
# points, so the spacing is independent of the magnitude of the counts.
bar_annotations = [f'{count}\n({100*count/len(df):.1f}%)' for count in label_counts.values]
axes[0].bar_label(bars, labels=bar_annotations, padding=3, fontsize=10)

# Pie chart (same per-label colours as the bar chart)
axes[1].pie(label_counts.values, labels=[f'{i}' for i in label_counts.index],
            autopct='%1.1f%%', colors=colors, startangle=90)
axes[1].set_title('Label Proportions', fontsize=14)

plt.tight_layout()
plt.savefig('../notebook/label_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nLabel counts:")
for label, count in label_counts.items():
    # Fall back to the raw label value if it has no entry in label_names.
    print(f"  {label_names.get(label, str(label))}: {count} ({100*count/len(df):.1f}%)")

## 3. Text Length Analysis

In [None]:
# Derive per-sample length features (characters and whitespace-separated words);
# later cells read these two columns from df.
text_col = df['text']
df['text_length_chars'] = text_col.str.len()
df['text_length_words'] = text_col.str.split().str.len()

# Descriptive statistics for both length measures
char_stats = df['text_length_chars'].describe()
word_stats = df['text_length_words'].describe()

print("Text Length Statistics (characters):")
print(char_stats)

print("\nText Length Statistics (words):")
print(word_stats)

In [None]:
# Two-panel figure: overall length histogram (left) and per-label box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: histogram of character lengths with the median marked
char_lengths = df['text_length_chars']
median_chars = char_lengths.median()
hist_ax.hist(char_lengths, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(median_chars, color='red', linestyle='--', label=f'Median: {median_chars:.0f}')
hist_ax.set_title('Distribution of Text Lengths', fontsize=14)
hist_ax.set_xlabel('Text Length (characters)', fontsize=12)
hist_ax.set_ylabel('Count', fontsize=12)
hist_ax.legend()

# Right panel: character length grouped by label
df.boxplot(column='text_length_chars', by='label', ax=box_ax)
box_ax.set_title('Text Length by Label', fontsize=14)
box_ax.set_xlabel('Understandability Label', fontsize=12)
box_ax.set_ylabel('Text Length (characters)', fontsize=12)
# pandas adds a figure-level title when grouping with `by`; blank it out
fig.suptitle('')

fig.tight_layout()
fig.savefig('../notebook/text_length_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Text length by label
# One grouped aggregation instead of re-filtering the whole frame for every
# label; groupby yields the groups sorted by label value.
mean_len_by_label = df.groupby('label')['text_length_chars'].mean()

print("\nMean text length (chars) by label:")
for label, mean_len in mean_len_by_label.items():
    print(f"  Label {label}: {mean_len:.1f} chars")

## 4. Data Source Analysis

In [None]:
# Number of samples contributed by each source folder, largest first
folder_counts = df['source_folder'].value_counts()
positions = range(len(folder_counts))

fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(positions, folder_counts.values)
ax.set_xticks(positions)
ax.set_xticklabels(folder_counts.index, rotation=45, ha='right')
ax.set_xlabel('Source Folder (Student ID)', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
ax.set_title('Samples per Data Source', fontsize=14)

# Write each folder's count just above its bar
for bar, count in zip(bars, folder_counts.values):
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, bar.get_height() + 2, str(count),
            ha='center', va='bottom', fontsize=8)

fig.tight_layout()
fig.savefig('../notebook/samples_per_source.png', dpi=150, bbox_inches='tight')
plt.show()

n_folders = len(folder_counts)
print(f"\nTotal source folders: {n_folders}")
print(f"Samples per folder: min={folder_counts.min()}, max={folder_counts.max()}, mean={folder_counts.mean():.1f}")

In [None]:
# Per-source label mix: each row (source folder) is normalised to sum to 100 %
source_label_dist = pd.crosstab(df['source_folder'], df['label'], normalize='index').mul(100)

fig, ax = plt.subplots(figsize=(10, 12))
sns.heatmap(
    source_label_dist,
    annot=True,
    fmt='.0f',
    cmap='RdYlGn',
    cbar_kws={'label': 'Percentage'},
    ax=ax,
)
ax.set_title('Label Distribution (%) by Data Source', fontsize=14)
ax.set_xlabel('Understandability Label', fontsize=12)
ax.set_ylabel('Source Folder', fontsize=12)
fig.tight_layout()
fig.savefig('../notebook/label_by_source_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Sample Examples

In [None]:
# Show examples from each label
# Fixed seed so the displayed examples are identical on every run
# (an unseeded .sample() changes the output on each Restart & Run All).
SAMPLE_SEED = 42
# Longest excerpt printed per example, in characters.
MAX_PREVIEW_CHARS = 500

print("Sample texts for each understandability level:\n")
print("="*80)

for label in sorted(df['label'].unique()):
    # Draw one row for this label; .iloc[0] turns the 1-row frame into a Series.
    sample = df[df['label'] == label].sample(1, random_state=SAMPLE_SEED).iloc[0]
    # Fall back to the raw label value if it has no entry in label_names.
    print(f"\n[LABEL {label}: {label_names.get(label, str(label))}]")
    print("-"*80)
    text = sample['text']
    # Truncate long texts
    if len(text) > MAX_PREVIEW_CHARS:
        text = text[:MAX_PREVIEW_CHARS] + "..."
    print(text)
    print(f"\nSource: {sample['source_folder']}/{sample['source_file']}")
    # Report the full, untruncated length.
    print(f"Length: {len(sample['text'])} chars")
    print("="*80)

## 6. Train/Val/Test Split Verification

In [None]:
# Load the three split files into DataFrames keyed by split name
def _read_split(split_name):
    """Read one split's JSON file from the processed-data folder into a DataFrame."""
    with open(f'../data/processed/{split_name}.json', 'r', encoding='utf-8') as split_file:
        return pd.DataFrame(json.load(split_file))


splits = {split_name: _read_split(split_name) for split_name in ['train', 'val', 'test']}

# Size of each split, absolute and relative to the full dataset
print("Split sizes:")
for name, split_df in splits.items():
    n_rows = len(split_df)
    print(f"  {name}: {n_rows} ({100*n_rows/len(df):.1f}%)")

# Leakage check: count texts that appear verbatim in more than one split
train_texts, val_texts, test_texts = (
    set(splits[split_name]['text']) for split_name in ('train', 'val', 'test')
)

train_val_overlap = len(train_texts & val_texts)
train_test_overlap = len(train_texts & test_texts)
val_test_overlap = len(val_texts & test_texts)

print("\nData leakage check:")
for pair_name, n_shared in (
    ('Train-Val', train_val_overlap),
    ('Train-Test', train_test_overlap),
    ('Val-Test', val_test_overlap),
):
    print(f"  {pair_name} overlap: {n_shared}")

In [None]:
# Label distribution across splits
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Pin one colour to each label value (1-5), built once for all panels. Passing
# the palette positionally would shift colours in any split that lacks a label.
label_palette = dict(zip(range(1, 6), sns.color_palette('RdYlGn', 5)))

for ax, (name, split_df) in zip(axes, splits.items()):
    counts = split_df['label'].value_counts().sort_index()
    colors = [label_palette.get(label, 'gray') for label in counts.index]
    bars = ax.bar(counts.index, counts.values, color=colors)
    ax.set_xlabel('Label', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title(f'{name.capitalize()} Set (n={len(split_df)})', fontsize=14)
    ax.set_xticks([1, 2, 3, 4, 5])

    # Add percentage labels; bar_label pads in points, so the spacing is the
    # same regardless of how large the split is.
    pct_labels = [f'{100*count/len(split_df):.1f}%' for count in counts.values]
    ax.bar_label(bars, labels=pct_labels, padding=2, fontsize=9)

plt.tight_layout()
plt.savefig('../notebook/split_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Summary Statistics

In [None]:
# Summary table: (metric name, value) pairs collected in display order
char_lengths = df['text_length_chars']
summary_rows = [
    ('Total Samples', len(df)),
    ('Training Samples', len(splits['train'])),
    ('Validation Samples', len(splits['val'])),
    ('Test Samples', len(splits['test'])),
    ('Number of Labels', df['label'].nunique()),
    ('Source Folders', df['source_folder'].nunique()),
    ('Min Text Length (chars)', char_lengths.min()),
    ('Max Text Length (chars)', char_lengths.max()),
    ('Mean Text Length (chars)', f"{char_lengths.mean():.1f}"),
    ('Median Text Length (chars)', f"{char_lengths.median():.0f}"),
]

# Split the pairs into the two column lists the DataFrame is built from
metric_names, metric_values = zip(*summary_rows)
summary = {
    'Metric': list(metric_names),
    'Value': list(metric_values),
}

summary_df = pd.DataFrame(summary)
print("\nDataset Summary:")
print(summary_df.to_string(index=False))