# Rebuild Temporal Split: 2010-2021

New split:
- Train: 2010-2017
- Test: 2018-2021
- Exclude: 2022-2025 (too recent)

In [None]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): presumably so project-local packages can be imported from this
# notebook; none are imported in the cells visible here - confirm it is still needed.
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path

# Show every column when a DataFrame is displayed instead of truncating wide tables.
pd.set_option('display.max_columns', None)

## 1. Load Data

In [None]:
# Cleaned paper table; its 'Year' column drives every filter and split below.
# NOTE: read_pickle executes pickle data, so only point it at trusted project files.
df = pd.read_pickle('../data/processed/cleaned_data.pkl')
print("Total papers:", len(df))

# Feature matrix, the three target vectors and the metadata table, loaded in
# the same order as they are named on the left-hand side.
X, y_classification, y_regression, y_regression_log, metadata = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/features/X_all.pkl',
        '../data/features/y_classification.pkl',
        '../data/features/y_regression.pkl',
        '../data/features/y_regression_log.pkl',
        '../data/features/metadata.pkl',
    )
)

print("Features:", X.shape)

## 2. Filter to 2010-2021

In [None]:
# Keep only papers from 2010-2021 (range() stops before its end value).
valid_years = list(range(2010, 2022))  # 2010-2021
valid_mask = df['Year'].isin(valid_years)

print("Papers by year (before filtering):")
print(df['Year'].value_counts().sort_index())

print(f"\nPapers to keep (2010-2021): {valid_mask.sum()}")
print(f"Papers to exclude (2022-2025): {(~valid_mask).sum()}")

# The "exclude" count above is every row outside 2010-2021, so it would also
# include papers older than 2010, newer than 2025, or with a missing Year.
# Surface those explicitly so the 2022-2025 label is never silently wrong.
recent_years = list(range(2022, 2026))  # 2022-2025
unexpected_excluded = (~valid_mask & ~df['Year'].isin(recent_years)).sum()
if unexpected_excluded:
    print(
        f"  WARNING: {unexpected_excluded} of the excluded papers are not from "
        "2022-2025 (Year before 2010, after 2025, or missing)"
    )

## 3. Filter All Data

In [None]:
# Apply the 2010-2021 year mask to every loaded object in one pass.
# valid_mask is a boolean Series built from df, so pandas matches it to each
# object by index label.
# NOTE(review): assumes X, the targets and metadata share df's index - confirm.
(
    X_filtered,
    y_class_filtered,
    y_reg_filtered,
    y_reg_log_filtered,
    metadata_filtered,
    df_filtered,
) = (
    table[valid_mask]
    for table in (X, y_classification, y_regression, y_regression_log, metadata, df)
)

print("Filtered dataset:", len(X_filtered), "papers")
print("\nYear distribution (after filtering):")
print(df_filtered['Year'].value_counts().sort_index())

## 4. Create New Temporal Split

In [None]:
# Year lists for the two sides of the split (range() stops before its end value).
train_years, test_years = list(range(2010, 2018)), list(range(2018, 2022))  # 2010-2017 / 2018-2021

# Boolean row selectors, both derived from the filtered paper table's Year column.
paper_years = df_filtered['Year']
train_mask = paper_years.isin(train_years)
test_mask = paper_years.isin(test_years)

# Cut every filtered object with the same pair of masks so rows stay in step.
X_train_temporal, X_test_temporal = X_filtered[train_mask], X_filtered[test_mask]
y_train_cls_temporal, y_test_cls_temporal = y_class_filtered[train_mask], y_class_filtered[test_mask]
# The regression split is taken from the log-transformed target, not the raw one.
y_train_reg_temporal, y_test_reg_temporal = y_reg_log_filtered[train_mask], y_reg_log_filtered[test_mask]
metadata_train, metadata_test = metadata_filtered[train_mask], metadata_filtered[test_mask]

# Report split sizes.
n_train_rows = len(X_train_temporal)
n_test_rows = len(X_test_temporal)
banner = "=" * 60
print(banner)
print("NEW TEMPORAL SPLIT")
print(banner)
print(f"\nTrain (2010-2017): {n_train_rows} papers")
print(f"Test (2018-2021): {n_test_rows} papers")
print(f"Total: {n_train_rows + n_test_rows} papers")

# Per-year paper counts on each side of the split.
print("\nTrain years:")
print(paper_years[train_mask].value_counts().sort_index())

print("\nTest years:")
print(paper_years[test_mask].value_counts().sort_index())

## 5. Validate Split

In [None]:
# Sanity-check the split before anything is written to disk.
# Every filtered row must land in exactly one of the two halves.
sizes_add_up = len(X_train_temporal) + len(X_test_temporal) == len(X_filtered)

# Check for overlap: a row present in both halves shows up as a shared index label.
train_indices = set(X_train_temporal.index)
test_indices = set(X_test_temporal.index)
overlap = train_indices & test_indices

print("Validation checks:")
print(f"  Train + Test = Total: {sizes_add_up}")
print(f"  No overlap: {not overlap}")

# Check target distributions
# NOTE(review): assumes the classification target is 0/1, so sum() is the
# number of positives and mean() their share - confirm.
print("\nClassification targets:")
print(f"  Train high-impact: {y_train_cls_temporal.sum()} ({y_train_cls_temporal.mean()*100:.1f}%)")
print(f"  Test high-impact: {y_test_cls_temporal.sum()} ({y_test_cls_temporal.mean()*100:.1f}%)")

print("\nRegression targets (log-transformed):")
print(f"  Train mean: {y_train_reg_temporal.mean():.2f}")
print(f"  Test mean: {y_test_reg_temporal.mean():.2f}")

# Fail loudly instead of only printing False, so a broken split is never saved
# by the next cell. Raised last so the diagnostics above are still shown.
if not sizes_add_up or overlap:
    raise ValueError(
        "Temporal split failed validation "
        f"(train + test == total: {sizes_add_up}, "
        f"overlapping index labels: {len(overlap)})"
    )

## 6. Save New Split

In [None]:
output_dir = Path('../data/features')

# Single source of truth for what gets written: the save loop and the printed
# listing below both iterate this mapping, so they cannot drift apart.
temporal_split_files = {
    'X_train_temporal.pkl': X_train_temporal,
    'X_test_temporal.pkl': X_test_temporal,
    'y_train_cls_temporal.pkl': y_train_cls_temporal,
    'y_test_cls_temporal.pkl': y_test_cls_temporal,
    'y_train_reg_temporal.pkl': y_train_reg_temporal,
    'y_test_reg_temporal.pkl': y_test_reg_temporal,
    'metadata_train.pkl': metadata_train,
    'metadata_test.pkl': metadata_test,
}

# Save temporal splits (overwrites any existing files of the same name).
for file_name, split_part in temporal_split_files.items():
    split_part.to_pickle(output_dir / file_name)

# The check mark was previously stored as mis-decoded UTF-8 bytes ("âœ“").
print("✓ All temporal split files saved!")
print(f"\nSaved to: {output_dir}")
for file_name in temporal_split_files:
    print(f"  - {file_name}")

## 7. Summary

In [None]:
# NOTE(review): 2545 and 3573 are hard-coded sizes from the previous 2015-2020
# split (presumably its train and test counts) - confirm against the old files.
old_split_total = 2545 + 3573
new_split_total = len(X_train_temporal) + len(X_test_temporal)

banner = "=" * 60
print(banner)
print("TEMPORAL SPLIT REBUILT")
print(banner)
print(f"\nOld split (2015-2020): {old_split_total} papers")
print(f"New split (2010-2021): {new_split_total} papers")
print(f"\nAdded papers: {new_split_total - old_split_total}")
# This is every row dropped by the year filter; it equals the 2022-2025 count
# only if the data has no papers outside 2010-2025 and no missing years.
print(f"\nExcluded (2022-2025): {len(df) - len(df_filtered)} papers")
print("\nReady for model retraining!")