# Final Feature Engineering

Combine all feature sets and create the final modeling datasets:
1. Load all feature sets
2. Load cleaned data for metadata
3. Combine features
4. Add metadata and targets
5. Create temporal train/test split
6. Save final datasets

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path

# Remove the column display limit so wide feature frames are shown in full.
pd.set_option('display.max_columns', None)

## 1. Load All Features

In [None]:
# Directory that holds the feature pickles loaded in this step.
feature_dir = Path('../data/features')

# Required feature sets: a missing file raises FileNotFoundError and stops the cell.
text_features = pd.read_pickle(feature_dir / 'text_features.pkl')
venue_features = pd.read_pickle(feature_dir / 'venue_features.pkl')
author_features = pd.read_pickle(feature_dir / 'author_features.pkl')

# Optional title features: fall back to None when the pickle does not exist.
try:
    title_features = pd.read_pickle(feature_dir / 'title_features.pkl')
except FileNotFoundError:
    title_features, has_title_features = None, False
else:
    has_title_features = True

# Optional additional features, handled the same way.
try:
    additional_features = pd.read_pickle(feature_dir / 'additional_features.pkl')
except FileNotFoundError:
    additional_features, has_additional_features = None, False
else:
    has_additional_features = True

# Assemble the report in display order: shapes of what was loaded first,
# then a hint for each optional set that was not found.
report = [f"Text features (abstract): {text_features.shape}"]
if has_title_features:
    report.append(f"Text features (title): {title_features.shape}")
report.append(f"Venue features: {venue_features.shape}")
report.append(f"Author features: {author_features.shape}")
if has_additional_features:
    report.append(f"Additional features: {additional_features.shape}")
if not has_title_features:
    report.append("Title features: Not found (run notebook 20b first)")
if not has_additional_features:
    report.append("Additional features: Not found (run notebook 22b first)")

for line in report:
    print(line)

## 2. Load Cleaned Data for Metadata

In [None]:
# Cleaned paper table; later cells read the metadata columns, publication
# years and citation counts from it.
cleaned_data_path = '../data/processed/cleaned_data.pkl'
df = pd.read_pickle(cleaned_data_path)
print(f"Cleaned data: {df.shape}")

## 3. Combine All Features

In [None]:
# Combine all features into a single matrix X (one column block per feature set).
# Required sets always take part; optional ones only when they were loaded.
feature_list = [text_features, venue_features, author_features]
feature_list += [
    optional
    for optional in (title_features, additional_features)
    if optional is not None
]

# With axis=1, concat aligns rows on index labels using an outer join, so a
# feature set with a different index is padded with NaN instead of rejected.
X = pd.concat(feature_list, axis=1)

# Surface that silent padding: when the combined row count differs from an
# input's row count, that input did not cover every row of X.
misaligned_shapes = [frame.shape for frame in feature_list if len(frame) != len(X)]
if misaligned_shapes:
    print(
        f"WARNING: {len(misaligned_shapes)} feature set(s) with shapes "
        f"{misaligned_shapes} do not cover all {len(X)} combined rows; "
        "missing values were filled with NaN. Check that all feature indexes match."
    )

# Duplicate column names make X[name] return a DataFrame instead of a Series.
duplicate_columns = X.columns[X.columns.duplicated()].unique().tolist()
if duplicate_columns:
    print(
        f"WARNING: {len(duplicate_columns)} duplicated feature name(s), "
        f"e.g. {duplicate_columns[:5]}"
    )

print(f"Combined features shape: {X.shape}")
print(f"Total features: {X.shape[1]}")

## 4. Add Metadata and Targets

In [None]:
# Descriptive columns kept alongside the features; copied so that later
# changes to metadata do not touch df.
metadata_columns = ['EID', 'Title', 'Year', 'Scopus Source title']
metadata = df[metadata_columns].copy()

# Regression targets: raw citation counts and their log(1 + x) transform.
y_regression = df['Citations'].copy()
y_regression_log = np.log1p(y_regression)

# Classification target: 1 when a paper's citation count reaches the
# 75th percentile of all papers in df, otherwise 0.
threshold = y_regression.quantile(0.75)
y_classification = y_regression.ge(threshold).astype(int)

n_high_impact = y_classification.sum()
pct_high_impact = y_classification.mean() * 100

print("\nTarget statistics:")
print(f"Citations range: {y_regression.min()} - {y_regression.max()}")
print(f"Median citations: {y_regression.median():.0f}")
print(f"Top 25% threshold: {threshold:.0f}")
print(f"High-impact papers: {n_high_impact} ({pct_high_impact:.1f}%)")

## 5. Create Temporal Train/Test Split

In [None]:
# Publication years assigned to each side of the temporal split.
train_years = [2015, 2016, 2017]
test_years = [2018, 2019, 2020]

# Boolean row masks built from df['Year']; rows whose year is in neither
# list end up in no split.
train_mask = df['Year'].isin(train_years)
test_mask = df['Year'].isin(test_years)

# Fail early on an empty split (for example when Year values never match the
# integer year lists) instead of saving unusable datasets further down.
if not train_mask.any() or not test_mask.any():
    raise ValueError(
        f"Temporal split is empty (train={int(train_mask.sum())}, "
        f"test={int(test_mask.sum())}); check df['Year'] values and dtype "
        f"against {train_years} / {test_years}"
    )

# Apply the same masks to features, both targets and metadata so the rows
# of every train/test object correspond to each other.
X_train_temporal = X[train_mask]
X_test_temporal = X[test_mask]
y_train_cls_temporal = y_classification[train_mask]
y_test_cls_temporal = y_classification[test_mask]
y_train_reg_temporal = y_regression_log[train_mask]
y_test_reg_temporal = y_regression_log[test_mask]
metadata_train = metadata[train_mask]
metadata_test = metadata[test_mask]

# Derive the printed ranges from the year lists so the labels cannot drift
# from the configuration above.
train_label = f"{min(train_years)}-{max(train_years)}"
test_label = f"{min(test_years)}-{max(test_years)}"

# Rows that belong to neither window are dropped from the split; report them.
n_unassigned = int((~(train_mask | test_mask)).sum())

print("\nTemporal split:")
print(f"Train ({train_label}): {X_train_temporal.shape[0]} papers")
print(f"Test ({test_label}): {X_test_temporal.shape[0]} papers")
if n_unassigned:
    print(f"Excluded (year outside both windows): {n_unassigned} papers")
print(f"\nTrain high-impact: {y_train_cls_temporal.sum()} ({y_train_cls_temporal.mean()*100:.1f}%)")
print(f"Test high-impact: {y_test_cls_temporal.sum()} ({y_test_cls_temporal.mean()*100:.1f}%)")

## 6. Save All Datasets

In [None]:
# Destination directory for every pickle written by this step.
output_dir = Path('../data/features')

# File name -> object, listed in write order: the full datasets first,
# then the temporal train/test pieces.
datasets_to_save = {
    'X_all.pkl': X,
    'y_classification.pkl': y_classification,
    'y_regression.pkl': y_regression,
    'y_regression_log.pkl': y_regression_log,
    'metadata.pkl': metadata,
    'X_train_temporal.pkl': X_train_temporal,
    'X_test_temporal.pkl': X_test_temporal,
    'y_train_cls_temporal.pkl': y_train_cls_temporal,
    'y_test_cls_temporal.pkl': y_test_cls_temporal,
    'y_train_reg_temporal.pkl': y_train_reg_temporal,
    'y_test_reg_temporal.pkl': y_test_reg_temporal,
    'metadata_train.pkl': metadata_train,
    'metadata_test.pkl': metadata_test,
}

for filename, dataset in datasets_to_save.items():
    dataset.to_pickle(output_dir / filename)

print("All datasets saved to data/features/")

## Summary

In [None]:
# Final recap of the dataset sizes, feature groups, targets and split.

# Year ranges are derived from the split configuration so this summary
# cannot drift from train_years / test_years.
train_span = f"{min(train_years)}-{max(train_years)}"
test_span = f"{min(test_years)}-{max(test_years)}"

# Feature groups in display order; optional groups are None when not loaded.
feature_breakdown = [
    ("Text features from abstracts", text_features),
    ("Text features from titles", title_features),
    ("Venue features", venue_features),
    ("Author features", author_features),
    ("Additional features", additional_features),
]

print("=" * 60)
print("FINAL FEATURE ENGINEERING SUMMARY")
print("=" * 60)
print(f"Total papers: {len(X)}")
print(f"Total features: {X.shape[1]}")
for label, frame in feature_breakdown:
    if frame is not None:
        print(f"  - {label}: {frame.shape[1]}")
print("\nTargets:")
# The positive class is "citations >= threshold" (the 75th-percentile count),
# so the label states the cut-off rather than "Top N citations".
print(f"  - Classification: >= {threshold:.0f} citations (top 25%)")
print("  - Regression: Log-transformed citation counts")
print("\nTemporal validation:")
print(f"  - Train: {X_train_temporal.shape[0]} papers ({train_span})")
print(f"  - Test: {X_test_temporal.shape[0]} papers ({test_span})")
print("\nReady for modeling!")