# Feature Engineering: Additional Metadata Features

Extract additional features:
1. Open Access status
2. Publication type
3. Topic Prominence
4. Source type

In [None]:
# Add the parent directory to the module search path before the remaining
# imports run.
# NOTE(review): presumably this is for project-local modules; none are
# imported in this part of the notebook — confirm it is still needed.
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path

# Show every column when a DataFrame is displayed (None removes the cap).
pd.set_option('display.max_columns', None)

## Load Data

In [None]:
# Load the cleaned dataset from its pickle file and report its dimensions.
# NOTE: unpickling executes code stored in the file, so only load pickles
# that come from a trusted source.
cleaned_data_path = '../data/processed/cleaned_data.pkl'
df = pd.read_pickle(cleaned_data_path)

dataset_shape = df.shape
print(f"Dataset: {dataset_shape}")

## 1. Open Access Feature

In [None]:
# Start an empty feature table that shares its row index with df.
additional_features = pd.DataFrame(index=df.index)

# Binary flag: 1 when 'Open Access' holds any non-null value, 0 otherwise.
# NOTE(review): a placeholder string such as "" or "-" would also count as
# open access here — confirm how missing values are encoded in this column.
has_open_access_value = df['Open Access'].notna()
additional_features['is_open_access'] = has_open_access_value.astype(int)

open_access_flag = additional_features['is_open_access']
open_access_count = open_access_flag.sum()
open_access_share = open_access_flag.mean()
print(f"Open Access papers: {open_access_count}")
print(f"Percentage: {open_access_share*100:.1f}%")

## 2. Publication Type Features

In [None]:
# Show how many rows fall into each publication type before encoding.
publication_type = df['Publication type']
print("Publication type distribution:")
print(publication_type.value_counts())

# One indicator column per distinct publication type, named 'pubtype_<value>'.
# dummy_na=False means no separate indicator column is created for missing
# values.
pub_type_dummies = pd.get_dummies(
    publication_type,
    prefix='pubtype',
    dummy_na=False,
)

pub_type_column_names = list(pub_type_dummies.columns)
print(f"\nPublication type features created: {len(pub_type_column_names)}")
print(f"Columns: {pub_type_column_names}")

## 3. Topic Prominence Feature

In [None]:
# Parse the percentile column as numbers; any value that cannot be parsed
# becomes NaN (errors='coerce') and is imputed below.
additional_features['topic_prominence'] = pd.to_numeric(
    df['Topic Prominence Percentile'],
    errors='coerce'
)

print("Topic Prominence statistics:")
print(additional_features['topic_prominence'].describe())

print(f"\nMissing values: {additional_features['topic_prominence'].isna().sum()}")

# Fill missing values with the median of the observed values.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that form is chained
# assignment, which pandas deprecates and which leaves the DataFrame
# unchanged when Copy-on-Write is enabled.
# If every value is missing the median itself is NaN and nothing is filled;
# the count printed below makes that case visible.
median_prominence = additional_features['topic_prominence'].median()
additional_features['topic_prominence'] = (
    additional_features['topic_prominence'].fillna(median_prominence)
)

print(f"After imputation - Missing: {additional_features['topic_prominence'].isna().sum()}")

## 4. Source Type Features

In [None]:
# Show how many rows fall into each source type before encoding.
source_type = df['Source type']
print("Source type distribution:")
print(source_type.value_counts())

# One indicator column per distinct source type, named 'sourcetype_<value>'.
# dummy_na=False means no separate indicator column is created for missing
# values.
source_type_dummies = pd.get_dummies(
    source_type,
    prefix='sourcetype',
    dummy_na=False,
)

source_type_column_names = list(source_type_dummies.columns)
print(f"\nSource type features created: {len(source_type_column_names)}")
print(f"Columns: {source_type_column_names}")

## 5. Combine All Additional Features

In [None]:
# Combine all additional features
additional_features = pd.concat([
    additional_features,
    pub_type_dummies,
    source_type_dummies
], axis=1)

print(f"Total additional features: {additional_features.shape[1]}")
print(f"\nFeature columns:")
for col in additional_features.columns:
    print(f"  - {col}")

## 6. Save Features

In [None]:
# Ensure the output directory exists (including missing parents), then write
# the feature table there as a pickle and report where it went.
output_dir = Path('../data/features')
output_dir.mkdir(parents=True, exist_ok=True)

features_path = output_dir / 'additional_features.pkl'
additional_features.to_pickle(features_path)

print(f"✓ Additional features saved to: {features_path}")
print(f"Shape: {additional_features.shape}")

## Summary

In [None]:
# Print a recap: row count, total feature count, and how many columns each
# feature group contributed.
banner = "=" * 60
paper_count, feature_count = additional_features.shape
pub_type_feature_count = pub_type_dummies.shape[1]
source_type_feature_count = source_type_dummies.shape[1]

print(banner)
print("ADDITIONAL FEATURES SUMMARY")
print(banner)
print(f"Total papers: {paper_count}")
print(f"Additional features: {feature_count}")
print("\nFeature breakdown:")
print("  - Open Access: 1 feature")
print("  - Topic Prominence: 1 feature")
print(f"  - Publication Type: {pub_type_feature_count} features")
print(f"  - Source Type: {source_type_feature_count} features")
print("\n✓ Ready to combine with existing features!")