# Investigate Missing Papers in Temporal Split

In [None]:
import pandas as pd
import numpy as np

# Load the full cleaned dataset and the train/test feature frames
df = pd.read_pickle('../data/processed/cleaned_data.pkl')
X_train = pd.read_pickle('../data/features/X_train_temporal.pkl')
X_test = pd.read_pickle('../data/features/X_test_temporal.pkl')

# Row counts; rows of df not accounted for by either split are "missing"
n_total = len(df)
n_train = len(X_train)
n_test = len(X_test)

print(f"Total papers: {n_total}")
print(f"Train papers: {n_train}")
print(f"Test papers: {n_test}")
print(f"Missing: {n_total - n_train - n_test}")

In [None]:
# Count rows per 'Year' value in the full dataset, ordered by year
print("\nYear distribution in full dataset:")
papers_per_year = df['Year'].value_counts()
print(papers_per_year.sort_index())

In [None]:
# Find which papers are in train/test, by index label
train_indices = set(X_train.index)
test_indices = set(X_test.index)
split_indices = train_indices | test_indices
all_indices = set(df.index)

# Labels present in df but in neither split
missing_indices = all_indices - split_indices

print(f"\nMissing papers: {len(missing_indices)}")

# Check what years the missing papers are from.
# Select with a boolean mask rather than df.loc[list(missing_indices)]:
# iterating a set yields labels in arbitrary order, which would make the
# row order of missing_papers (and any later .head() sample) unstable.
# The mask returns the same rows, in df's own order.
missing_papers = df[df.index.isin(missing_indices)]
print("\nYear distribution of MISSING papers:")
print(missing_papers['Year'].value_counts().sort_index())

In [None]:
# Compare the actual split sizes with what filtering df on 'Year' would give
train_years = [2015, 2016, 2017]
test_years = [2018, 2019, 2020]

# Rows of df whose 'Year' falls in each year list
in_train_years = df['Year'].isin(train_years)
expected_train = df[in_train_years]
in_test_years = df['Year'].isin(test_years)
expected_test = df[in_test_years]

# Train side: expected vs. actual row counts
n_expected_train = len(expected_train)
n_actual_train = len(X_train)
print(f"\nExpected train papers (2015-2017): {n_expected_train}")
print(f"Actual train papers: {n_actual_train}")
print(f"Difference: {n_expected_train - n_actual_train}")

# Test side: expected vs. actual row counts
n_expected_test = len(expected_test)
n_actual_test = len(X_test)
print(f"\nExpected test papers (2018-2020): {n_expected_test}")
print(f"Actual test papers: {n_actual_test}")
print(f"Difference: {n_expected_test - n_actual_test}")

In [None]:
# Show the first 20 rows of missing_papers, restricted to a few columns
preview_columns = ['EID', 'Title', 'Year', 'Citations']
print("\nSample of missing papers:")
missing_preview = missing_papers[preview_columns]
print(missing_preview.head(20))