# Data Quality Check for Cleaned Dataset

This notebook runs a comprehensive check of the cleaned dataset and its derived feature files for:
1. Missing values
2. Duplicates
3. Outliers
4. Data consistency
5. Feature integrity

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path

# Display settings for this notebook: never truncate columns, and show up to
# 100 rows before pandas starts eliding output.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## Load Data

In [None]:
# Load the cleaned table.
# NOTE: read_pickle can execute arbitrary code while unpickling, so these
# files must come from a trusted pipeline run.
df = pd.read_pickle('../data/processed/cleaned_data.pkl')
print(f"Cleaned data: {len(df)} rows × {len(df.columns)} columns")

# Load the feature matrix and both targets, in the same order as they are
# reported below.
X, y_class, y_reg = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/features/X_all.pkl',
        '../data/features/y_classification.pkl',
        '../data/features/y_regression.pkl',
    )
)

print(f"Feature matrix: {X.shape}")
print(f"Classification target: {len(y_class)}")
print(f"Regression target: {len(y_reg)}")

## 1. Basic Data Integrity

In [None]:
# Basic integrity: duplicate paper IDs, and row alignment between the cleaned
# table (df), the feature matrix (X) and both target vectors.
print("="*80)
print("BASIC DATA INTEGRITY")
print("="*80)

# Check for duplicates
# duplicated() marks the second and later occurrence of each EID, so the
# count is the number of surplus rows, not the number of affected EIDs.
duplicates = df.duplicated(subset='EID')
n_duplicates = duplicates.sum()
print(f"\nDuplicate EIDs: {n_duplicates}")
if n_duplicates > 0:
    print(f"  ⚠️  WARNING: Found {n_duplicates} duplicate papers")
    print("\nDuplicate papers:")
    print(df[duplicates][['EID', 'Title', 'Year']])
else:
    print("  ✓ No duplicates found")

# Check index alignment
print("\nIndex alignment:")
print(f"  df: {len(df)}")
print(f"  X: {len(X)}")
print(f"  y_class: {len(y_class)}")
print(f"  y_reg: {len(y_reg)}")

lengths_match = len(df) == len(X) == len(y_class) == len(y_reg)
# Equal lengths do not prove that rows line up: a shuffled or re-indexed
# feature file has the same length but pairs features with the wrong paper.
# Index.equals compares both the labels and their order. Objects without an
# index (e.g. a plain array) can only be length-checked.
labels_match = lengths_match and all(
    obj.index.equals(df.index)
    for obj in (X, y_class, y_reg)
    if hasattr(obj, 'index')
)

if labels_match:
    print("  ✓ All indices aligned")
elif lengths_match:
    print("  ⚠️  WARNING: Lengths match but index labels/order differ!")
else:
    print("  ⚠️  WARNING: Index mismatch!")

## 2. Missing Values

In [None]:
# Missing-value report for the key metadata columns of df and for every
# column of the feature matrix X.
print("="*80)
print("MISSING VALUES")
print("="*80)

# Check missing in key columns
key_columns = ['EID', 'Title', 'Year', 'Abstract', 'Citations',
               'Number of Authors', 'Number of Institutions', 'Number of Countries/Regions',
               'SNIP (publication year)', 'CiteScore (publication year)', 'SJR (publication year)']

# A key column that does not exist at all is a worse problem than one with
# gaps, so report it explicitly instead of skipping it silently.
absent_columns = [name for name in key_columns if name not in df.columns]
if absent_columns:
    print(f"\n⚠️  WARNING: {len(absent_columns)} key columns not present in data:")
    print(f"  {absent_columns}")

print("\nMissing values in key columns:")
total_rows = len(df)
missing_summary = []
for col in key_columns:
    if col not in df.columns:
        continue
    missing = df[col].isnull().sum()
    # Guard the percentage so an empty table does not divide by zero.
    missing_pct = (missing / total_rows) * 100 if total_rows else 0.0
    missing_summary.append({
        'Column': col,
        'Missing': missing,
        'Percentage': f"{missing_pct:.1f}%"
    })

missing_df = pd.DataFrame(missing_summary)
if missing_df.empty:
    print("  (none of the key columns are present)")
else:
    print(missing_df.to_string(index=False))

# Feature matrix missing values
print("\nMissing values in feature matrix:")
missing_features = X.isnull().sum()
total_missing = missing_features.sum()
if total_missing > 0:
    print(f"  ⚠️  WARNING: {total_missing} missing values")
    print("\nColumns with missing values:")
    print(missing_features[missing_features > 0])
else:
    print("  ✓ No missing values")

## 3. Citation Statistics

In [None]:
# Citation sanity checks: summary statistics, negative counts, and the
# extreme upper tail (above the 99.9th percentile).
print("=" * 80)
print("CITATION STATISTICS")
print("=" * 80)

citations = df['Citations']

print("\nCitation distribution:")
print(citations.describe())

# Check for negative citations
negative_mask = citations < 0
negative = negative_mask.sum()
if negative == 0:
    print("\n✓ No negative citations")
else:
    print(f"\n⚠️  WARNING: {negative} papers with negative citations!")
    print(df.loc[negative_mask, ['EID', 'Title', 'Citations']])

# Extreme outliers
p999 = citations.quantile(0.999)
extreme = df[citations > p999]
print(f"\nExtreme outliers (>99.9 percentile = {p999:.0f}):")
print(f"  Count: {len(extreme)}")
if not extreme.empty:
    print("\nTop 10 most cited papers:")
    ten_largest = df.nlargest(10, 'Citations')
    top_cited = ten_largest[['Title', 'Year', 'Citations']]
    print(top_cited.to_string())

## 4. Year Distribution

In [None]:
# Publication-year checks: per-year counts and any paper whose year falls
# outside the 2015-2020 window.
print("=" * 80)
print("YEAR DISTRIBUTION")
print("=" * 80)

print("\nPapers by year:")
year_counts = df['Year'].value_counts().sort_index()
print(year_counts)

# Check for unexpected years
expected_years = list(range(2015, 2021))
in_expected_range = df['Year'].isin(expected_years)
unexpected = df[~in_expected_range]
if unexpected.empty:
    print("\n✓ All years within expected range (2015-2020)")
else:
    print(f"\n⚠️  WARNING: {len(unexpected)} papers with unexpected years:")
    print(unexpected[['EID', 'Title', 'Year']])

## 5. Author Features

In [None]:
# Authorship checks: distributions of author / institution / country counts,
# papers with zero authors, and papers listing more institutions than authors.
print("=" * 80)
print("AUTHOR FEATURES")
print("=" * 80)

n_authors = df['Number of Authors']
n_institutions = df['Number of Institutions']

print("\nNumber of authors:")
print(n_authors.describe())

# Check for zero authors
zero_authors = (n_authors == 0).sum()
if zero_authors == 0:
    print("\n✓ No papers with 0 authors")
else:
    print(f"\n⚠️  WARNING: {zero_authors} papers with 0 authors!")

print("\nNumber of institutions:")
print(n_institutions.describe())

print("\nNumber of countries:")
print(df['Number of Countries/Regions'].describe())

# Check logical consistency
inconsistent = df[n_institutions > n_authors]
if inconsistent.empty:
    print("\n✓ No papers with more institutions than authors")
else:
    print(f"\n⚠️  WARNING: {len(inconsistent)} papers with more institutions than authors!")
    print(inconsistent[['Title', 'Number of Authors', 'Number of Institutions']].head())

## 6. Venue Features

In [None]:
# Venue metric checks: coverage, summary statistics and negative values for
# each journal-level metric column that exists in df.
print("=" * 80)
print("VENUE FEATURES")
print("=" * 80)

venue_cols = ['SNIP (publication year)', 'CiteScore (publication year)', 'SJR (publication year)']

for col in venue_cols:
    # Skip metrics that are not part of this dataset.
    if col not in df.columns:
        continue

    metric = df[col]
    non_null = metric.notna().sum()
    print(f"\n{col}:")
    print(f"  Non-null: {non_null} ({non_null/len(df)*100:.1f}%)")

    # Nothing to summarise when the column is entirely null.
    if non_null == 0:
        continue
    print(metric.describe())

    # Check for negative
    negative = (metric < 0).sum()
    if negative > 0:
        print(f"  ⚠️  WARNING: {negative} negative values!")

## 7. Text Features (Abstracts)

In [None]:
# Abstract text checks: length distribution, empty abstracts, and abstracts
# shorter than 50 characters.
print("=" * 80)
print("TEXT FEATURES")
print("=" * 80)

# Abstract lengths
# Missing abstracts count as length 0. The result is stored on df as a new
# 'abstract_length' column.
df['abstract_length'] = df['Abstract'].fillna('').str.len()
abstract_lengths = df['abstract_length']
print("\nAbstract lengths:")
print(abstract_lengths.describe())

# Empty abstracts
is_empty = abstract_lengths == 0
empty = is_empty.sum()
if empty == 0:
    print("\n✓ No empty abstracts")
else:
    print(f"\n⚠️  WARNING: {empty} papers with empty abstracts!")
    print(df.loc[is_empty, ['EID', 'Title', 'Year']])

# Very short abstracts
is_short = abstract_lengths < 50
short = is_short.sum()
if short > 0:
    print(f"\nℹ️  {short} papers with very short abstracts (<50 chars):")
    print(df.loc[is_short, ['Title', 'abstract_length', 'Abstract']].head())

## 8. Feature Matrix Validation

In [None]:
# Feature matrix checks: infinite values, NaN values, and features that carry
# no information because every row holds the same value.
print("="*80)
print("FEATURE MATRIX VALIDATION")
print("="*80)

# Infinite values
# isin() works for every column dtype, whereas np.isinf(X) raises TypeError
# as soon as X contains a non-numeric (object/string/category) column.
inf_mask = X.isin([np.inf, -np.inf])
inf_values = inf_mask.sum().sum()
if inf_values > 0:
    print(f"\n⚠️  WARNING: {inf_values} infinite values!")
    inf_cols = X.columns[inf_mask.any()]
    print(f"Columns with inf values: {list(inf_cols)}")
else:
    print("\n✓ No infinite values")

# NaN values
nan_values = X.isnull().sum().sum()
if nan_values > 0:
    print(f"\n⚠️  WARNING: {nan_values} NaN values!")
else:
    print("✓ No NaN values")

# Constant features
# Counting distinct non-null values (instead of testing std() == 0) also
# covers non-numeric columns and all-NaN columns, and does not depend on
# floating-point variance coming out as exactly zero.
constant_features = X.columns[X.nunique() <= 1]
if len(constant_features) > 0:
    print(f"\n⚠️  WARNING: {len(constant_features)} constant features (zero variance):")
    print(list(constant_features[:20]))
    if len(constant_features) > 20:
        print(f"... and {len(constant_features) - 20} more")
else:
    print("✓ No constant features")

## 9. Target Variable Validation

In [None]:
# Target checks: class balance of the classification target, its agreement
# with the 75th-percentile citation threshold, and whether the regression
# target is the raw Citations column.
print("="*80)
print("TARGET VARIABLE VALIDATION")
print("="*80)

# Classification target
n_high = y_class.sum()
# Count zeros by comparison rather than with `~y_class`: on an integer 0/1
# target `~` is bitwise NOT (~0 == -1, ~1 == -2), which yields a negative
# "count". The comparison is correct for both bool and integer targets.
n_low = (y_class == 0).sum()
high_share = y_class.mean()

print("\nClassification target:")
print(f"  High-impact (1): {n_high} ({high_share*100:.1f}%)")
print(f"  Low-impact (0): {n_low} ({(1-high_share)*100:.1f}%)")

threshold = df['Citations'].quantile(0.75)
print(f"  Threshold: {threshold:.0f} citations (75th percentile)")

# Check alignment
# Only the number of positives is compared here, not the row-by-row labels.
expected_high = (df['Citations'] >= threshold).sum()
if n_high != expected_high:
    print(f"\n⚠️  WARNING: Target mismatch! Expected {expected_high}, got {n_high}")
else:
    print("\n✓ Target aligned with 75th percentile")

# Regression target
print("\nRegression target:")
print(y_reg.describe())

# Check if matches citations
# Series.equals is strict: values, index and dtype must all agree, so an
# int-vs-float dtype difference alone is reported as a mismatch.
if not y_reg.equals(df['Citations']):
    print("\n⚠️  WARNING: Regression target doesn't match Citations!")
else:
    print("\n✓ Regression target matches Citations column")

## 10. Temporal Split Validation

In [None]:
# Temporal split checks: the train and test feature files must together
# account for every row of X and must not share any index label.
print("=" * 80)
print("TEMPORAL SPLIT VALIDATION")
print("=" * 80)

X_train = pd.read_pickle('../data/features/X_train_temporal.pkl')
X_test = pd.read_pickle('../data/features/X_test_temporal.pkl')

n_train = len(X_train)
n_test = len(X_test)

print("\nTemporal split:")
print(f"  Train (2015-2017): {n_train}")
print(f"  Test (2018-2020): {n_test}")
print(f"  Total: {n_train + n_test}")

# Check sum
if n_train + n_test != len(X):
    print(f"\n⚠️  WARNING: Train + Test ≠ Total ({n_train} + {n_test} ≠ {len(X)})")
else:
    print("\n✓ Train + Test = Total")

# Check overlap
train_indices = set(X_train.index)
test_indices = set(X_test.index)
overlap = train_indices.intersection(test_indices)
if overlap:
    print(f"\n⚠️  WARNING: {len(overlap)} papers in both train and test!")
else:
    print("✓ No overlap between train and test")

## Summary

In [None]:
# Closing banner and a reminder of how to read the output of the checks above.
print(
    "\n" + "=" * 80,
    "DATA QUALITY CHECK COMPLETE",
    "=" * 80,
    "\nReview the output above for any ⚠️  WARNING messages.",
    "All ✓ checks indicate the data is clean and ready for modeling.",
    sep="\n",
)