# Data Splitting for Framing Drift Analysis

**Purpose:** Split LRC_articles.parquet into train/validation/test/drift/human_validation sets

**Splits:**
- Train: 2015-2016 (for model fine-tuning)
- Validation: 2017 H1 (for threshold tuning)
- Test: 2017 H2 (for model evaluation)
- Drift Analysis: 2018-2021 (for measuring framing drift)
- Human Validation: 100 stratified articles (for model validation)

In [None]:
import pandas as pd
import numpy as np
import os

# Make sure every output location exists before anything is written to it.
# exist_ok=True keeps this cell safe to re-run.
for output_dir in ('data/processed', 'data/human_labels'):
    os.makedirs(output_dir, exist_ok=True)

## Load Data

In [None]:
# Read the raw article table
df = pd.read_parquet('LRC_articles.parquet')

# Parse the publication date once, then derive the calendar fields that the
# temporal splits and the stratified sample rely on
df['date'] = pd.to_datetime(df['date'])
published = df['date'].dt
df['year'] = published.year
df['year_quarter'] = published.to_period('Q')

# Whitespace-token count per article (missing content counts as 0 words);
# used later to bin articles by length
tokens = df['content'].fillna('').str.split()
df['word_count'] = tokens.str.len()

# The frame's index doubles as the article identifier
df['article_id'] = df.index

# Quick sanity report on what was loaded
first_date = df['date'].min()
last_date = df['date'].max()
n_outlets = df['outlet_name'].nunique()
print(f"Total articles loaded: {len(df):,}")
print(f"Date range: {first_date} to {last_date}")
print(f"Outlets: {n_outlets}")

## Create Temporal Splits

In [None]:
# Carve the corpus into non-overlapping time windows.
# 2017 is cut at July 1st: H1 -> validation, H2 -> test.
h2_2017_start = '2017-07-01'
is_2017 = df['year'] == 2017

train_df = df[df['year'].isin([2015, 2016])].copy()
val_df = df[is_2017 & (df['date'] < h2_2017_start)].copy()
test_df = df[is_2017 & (df['date'] >= h2_2017_start)].copy()
drift_df = df[df['year'].isin(range(2018, 2022))].copy()

# Tag every row with the name of the split it belongs to
for split_name, split_frame in (
    ('train', train_df),
    ('validation', val_df),
    ('test', test_df),
    ('drift', drift_df),
):
    split_frame['split'] = split_name

print("Temporal Splits:")
print(f"Train (2015-2016):       {len(train_df):,} articles")
print(f"Validation (2017 H1):    {len(val_df):,} articles")
print(f"Test (2017 H2):          {len(test_df):,} articles")
print(f"Drift (2018-2021):       {len(drift_df):,} articles")

## Sample 100 Articles for Human Validation

Stratified by:
- Political bias (Left/Center/Right)
- Time period (Early = 2018, Mid = 2019–2020, Late = 2021)
- Article length in words (Short ≤ 400, Medium 401–800, Long > 800)

In [None]:
# --- Stratification columns -------------------------------------------------

# Collapse the five-point outlet rating into three strata.
# NOTE(review): 'Lean Left' is folded into Center while 'Lean Right' is folded
# into Right, which is asymmetric -- confirm this is intended before relying on
# the Left/Center/Right balance of the sample.
drift_df['bias_category'] = drift_df['bias'].map({
    'Left': 'Left',
    'Lean Left': 'Center',
    'Center': 'Center',
    'Lean Right': 'Right',
    'Right': 'Right'
})

# pd.cut bins are right-closed: (2017, 2018] -> Early (2018),
# (2018, 2020] -> Mid (2019-2020), (2020, 2022] -> Late (2021).
drift_df['time_period'] = pd.cut(
    drift_df['year'],
    bins=[2017, 2018, 2020, 2022],
    labels=['Early', 'Mid', 'Late']
)

# Word-count bins are also right-closed, so a 0-word article falls outside
# every bin, gets NaN, and is excluded by the dropna() below.
drift_df['length_bin'] = pd.cut(
    drift_df['word_count'],
    bins=[0, 400, 800, 100000],
    labels=['Short', 'Medium', 'Long']
)

# --- Stratified sample ------------------------------------------------------

STRATA_COLUMNS = ['bias_category', 'time_period', 'length_bin']
PER_STRATUM = 4      # 3 x 3 x 3 strata x 4 = at most 108 candidates
TARGET_SIZE = 100

# Only rows that could be assigned to all three strata are eligible
eligible_df = drift_df.dropna(subset=STRATA_COLUMNS)

# Draw up to PER_STRATUM articles from every populated stratum. Iterating over
# the groups (instead of groupby.apply) keeps the stratification columns in the
# result regardless of how the installed pandas treats grouping columns inside
# apply; observed=True skips category combinations that contain no articles.
stratum_samples = [
    stratum.sample(min(len(stratum), PER_STRATUM), random_state=42)
    for _, stratum in eligible_df.groupby(STRATA_COLUMNS, observed=True)
]
if stratum_samples:
    stratified_df = pd.concat(stratum_samples)
else:
    stratified_df = eligible_df.iloc[0:0]

# Trim to TARGET_SIZE. The cap must be bounded by the number of rows actually
# drawn: sparse strata can leave fewer than TARGET_SIZE candidates, and asking
# sample() for more rows than exist raises ValueError.
human_val_df = stratified_df.sample(
    n=min(TARGET_SIZE, len(stratified_df)),
    random_state=42
).copy()

human_val_df['split'] = 'human_validation'

print(f"\nHuman Validation Sample: {len(human_val_df)} articles")
print("\nBreakdown by Bias:")
print(human_val_df['bias_category'].value_counts())
print("\nBreakdown by Time Period:")
print(human_val_df['time_period'].value_counts())
print("\nBreakdown by Length:")
print(human_val_df['length_bin'].value_counts())

## Remove Human Validation Articles from Drift Set

In [None]:
# Articles reserved for human annotation must not also appear in the drift
# analysis set, so drop every row whose index was sampled above
sampled_for_humans = drift_df.index.isin(human_val_df.index)
drift_df = drift_df[~sampled_for_humans].copy()

print(f"Drift Analysis (after removing human val): {len(drift_df):,} articles")

## Save All Splits

In [None]:
# Write every split to data/processed/. The index is not written
# (index=False); the 'article_id' column added at load time carries the same
# values, so articles remain identifiable in the saved files.
split_outputs = {
    'train.parquet': train_df,
    'validation.parquet': val_df,
    'test.parquet': test_df,
    'drift_analysis.parquet': drift_df,
    'human_validation_sample.parquet': human_val_df,
}
for output_name, split_frame in split_outputs.items():
    split_frame.to_parquet(f'data/processed/{output_name}', index=False)

print("✅ Saved all parquet files to data/processed/")

## Create Annotation Template (CSV, without full text)

Create a CSV with the article metadata plus one empty column per frame listed in `best/frames.json`, to be filled in by annotators

In [None]:
import json

# Load frame names from model. The encoding is given explicitly so the file is
# read the same way on every platform.
with open('best/frames.json', 'r', encoding='utf-8') as f:
    frames = json.load(f)

print(f"Loaded {len(frames)} frame categories: {frames}")

# One row per sampled article, metadata only (no article text)
metadata_columns = ['article_id', 'outlet_name', 'bias', 'date', 'word_count']
annotation_df = human_val_df[metadata_columns].copy()
annotation_df['date'] = annotation_df['date'].dt.strftime('%Y-%m-%d')

# Add an empty column for each frame (annotators will fill with 0 or 1).
# Iterating `frames` yields the frame names -- list entries, or the keys if the
# JSON file holds an object.
for frame in frames:
    annotation_df[f'frame_{frame}'] = ''

# Free-form annotator fields
annotation_df['notes'] = ''
annotation_df['confidence'] = ''  # low/medium/high

# Save to CSV
annotation_df.to_csv('data/human_labels/annotation_template.csv', index=False)

print(f"\n✅ Created annotation template with {len(annotation_df.columns)} columns")
print(f"   - {len(metadata_columns)} metadata columns (article_id, outlet, bias, date, word_count)")
print(f"   - {len(frames)} frame columns (frame_*)")
print("   - 2 annotation columns (notes, confidence)")
print("\n📋 Annotation Workflow:")
print("   1. Open article: data/human_labels/articles/article_[article_id].txt")
print("   2. Read the article")
print("   3. Mark frames in annotation_template.csv (enter 0 or 1)")
print("   4. Add notes/confidence if needed")
print("   5. Save as annotation_[your_name].csv")

## Export Article Text Files for Annotators

Write each sampled article to its own .txt file (metadata header followed by the full text) for easier reading

In [None]:
# Create directory for article text files
os.makedirs('data/human_labels/articles', exist_ok=True)

header_rule = "=" * 70

# One file per sampled article: a metadata header followed by the full text.
# The file name uses article_id so it can be matched to the annotation CSV.
for _, row in human_val_df.iterrows():
    filename = f"data/human_labels/articles/article_{row['article_id']}.txt"

    # Build the header before opening the file so that a formatting error
    # cannot leave a half-written file behind
    header_lines = [
        header_rule,
        f"Article ID: {row['article_id']}",
        f"Outlet: {row['outlet_name']}",
        f"Political Bias: {row['bias']}",
        f"Date: {row['date'].strftime('%Y-%m-%d')}",
        f"Word Count: {row['word_count']}",
        header_rule,
    ]

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("\n".join(header_lines) + "\n\n")
        # Article content
        f.write(row['content'])

print(f"✅ Created {len(human_val_df)} text files in data/human_labels/articles/")

## Summary Statistics

In [None]:
# Final size of every split, plus the grand total across all five
divider = "=" * 60
all_splits = (train_df, val_df, test_df, drift_df, human_val_df)
total_articles = sum(len(split_frame) for split_frame in all_splits)

print(divider)
print("FINAL SPLIT SUMMARY")
print(divider)
print(f"Train:             {len(train_df):,} articles (2015-2016)")
print(f"Validation:        {len(val_df):,} articles (2017 H1)")
print(f"Test:              {len(test_df):,} articles (2017 H2)")
print(f"Drift Analysis:    {len(drift_df):,} articles (2018-2021)")
print(f"Human Validation:  {len(human_val_df):,} articles (sampled from drift)")
print(divider)
print(f"Total:             {total_articles:,}")
print(divider)

## Verification: Check for Overlaps

In [None]:
# Verify no overlaps between splits. Membership is compared on the frame
# index, which is the same value stored in the 'article_id' column.
train_ids = set(train_df.index)
val_ids = set(val_df.index)
test_ids = set(test_df.index)
drift_ids = set(drift_df.index)
human_val_ids = set(human_val_df.index)

pairs_to_check = [
    ('Train', train_ids, 'Val', val_ids),
    ('Train', train_ids, 'Test', test_ids),
    ('Train', train_ids, 'Drift', drift_ids),
    ('Val', val_ids, 'Test', test_ids),
    ('Val', val_ids, 'Drift', drift_ids),
    ('Test', test_ids, 'Drift', drift_ids),
    ('Drift', drift_ids, 'Human Val', human_val_ids),
]

print("Checking for overlaps between splits...")
overlapping_pairs = []
for left_name, left_ids, right_name, right_ids in pairs_to_check:
    shared = len(left_ids & right_ids)
    print(f"{left_name} ∩ {right_name}: {shared}")
    if shared:
        overlapping_pairs.append(f"{left_name} ∩ {right_name}")

# Every pair must be disjoint -- including Drift and Human Val, because the
# human validation articles were removed from the drift set before saving.
if overlapping_pairs:
    print(f"\n❌ Unexpected overlap in: {', '.join(overlapping_pairs)}")
else:
    print("\n✅ All overlaps are 0, as expected")