# Feature Engineering: Venue Features

Extract venue prestige metrics:
1. Load cleaned data
2. Parse venue metrics (SNIP, SJR, CiteScore)
3. Create additional venue features
4. Add field-weighted view metrics
5. Handle missing values
6. Save venue features

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path

# Do not cap the number of columns shown when a DataFrame is displayed.
pd.options.display.max_columns = None

## 1. Load Data

In [None]:
# Read the cleaned dataset pickle and report its shape.
cleaned_data_path = Path('../data/processed/cleaned_data.pkl')
df = pd.read_pickle(cleaned_data_path)
print(f"Dataset: {df.shape}")

## 2. Parse Venue Metrics

In [None]:
def safe_float(val):
    """Convert ``val`` to ``float``, returning NaN when conversion fails.

    Args:
        val: Any value to convert (e.g. a number or a numeric string).

    Returns:
        ``float(val)`` on success, otherwise ``np.nan``.
    """
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures map to NaN. A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit, making a long
        # ``apply`` impossible to interrupt cleanly.
        return np.nan

# Output feature name -> raw column in the cleaned data it is parsed from.
metric_sources = {
    'snip': 'SNIP (publication year)',
    'snip_percentile': 'SNIP percentile (publication year) *',
    'citescore': 'CiteScore (publication year)',
    'citescore_percentile': 'CiteScore percentile (publication year) *',
    'sjr': 'SJR (publication year)',
    'sjr_percentile': 'SJR percentile (publication year) *',
}

# Start from an empty frame aligned to df's index, then add one column per
# metric, passing every raw value through safe_float.
venue_features = pd.DataFrame(index=df.index)
for feature_name, source_column in metric_sources.items():
    venue_features[feature_name] = df[source_column].apply(safe_float)

print("Venue metric statistics:")
print(venue_features.describe())

## 3. Create Additional Venue Features

In [None]:
percentile_cols = ['snip_percentile', 'citescore_percentile', 'sjr_percentile']

# Row-wise mean of the three percentile metrics.
venue_features['avg_venue_percentile'] = venue_features[percentile_cols].mean(axis=1)

# 1 when at least one percentile metric is >= 90, otherwise 0.
# Comparisons against NaN are False, so missing percentiles never set the flag.
venue_features['is_top_journal'] = (
    venue_features[percentile_cols].ge(90).any(axis=1).astype(int)
)

# Weighted sum of the raw metrics (weights 0.33 / 0.33 / 0.34).
# A NaN in any one metric makes the composite NaN for that row.
snip_term = venue_features['snip'] * 0.33
citescore_term = venue_features['citescore'] * 0.33
sjr_term = venue_features['sjr'] * 0.34
venue_features['venue_score_composite'] = snip_term + citescore_term + sjr_term

top_journal_count = venue_features['is_top_journal'].sum()
mean_venue_percentile = venue_features['avg_venue_percentile'].mean()
print(f"\nTop journals: {top_journal_count}")
print(f"Average venue percentile: {mean_venue_percentile:.2f}")

## 4. Add Field-Weighted View Metrics

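**Leaky features removed**: the citation-based metrics `field_weighted_citation_impact`, `field_citation_average`, and `top_citation_percentile` are intentionally excluded; only view-based metrics are added below.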

In [None]:
# Output feature name -> column in the cleaned data. Values are copied as-is
# (unlike the venue metrics, they are not passed through safe_float).
view_metric_sources = {
    'field_weighted_view_impact': 'Field-Weighted View Impact',
    'views': 'Views',
}
for feature_name, source_column in view_metric_sources.items():
    venue_features[feature_name] = df[source_column]

print("\nField-weighted VIEW metrics added (citation-based metrics REMOVED)")
print(venue_features[list(view_metric_sources)].describe())

## 5. Handle Missing Values

In [None]:
print("Missing values before imputation:")
print(venue_features.isnull().sum())

# Fill gaps in each float64/int64 column with that column's median.
# NOTE(review): columns of any other dtype (e.g. object, float32, nullable
# Int64) are skipped and keep their missing values - confirm this is intended
# for the columns copied directly from the cleaned data.
imputable_dtypes = ['float64', 'int64']
for col in venue_features.columns:
    column = venue_features[col]
    if column.dtype not in imputable_dtypes:
        continue
    venue_features[col] = column.fillna(column.median())

print("\nMissing values after imputation:")
remaining_missing = venue_features.isnull().sum().sum()
print(remaining_missing)

## 6. Save Features

In [None]:
# Create the features directory if needed, then pickle the feature table there.
output_dir = Path('../data/features')
output_dir.mkdir(parents=True, exist_ok=True)
venue_features_path = output_dir / 'venue_features.pkl'

venue_features.to_pickle(venue_features_path)
print(f"Venue features saved to: {venue_features_path}")
print(f"Shape: {venue_features.shape}")

## Summary

In [None]:
# Banner, then row count, column count, and the name of every feature column.
banner = "=" * 50
print(banner)
print("VENUE FEATURES SUMMARY")
print(banner)

n_papers, n_features = venue_features.shape
print(f"Total papers: {n_papers}")
print(f"Venue features: {n_features} (should be 11, not 14)")
print(f"\nFeature list:")
for feature_name in venue_features.columns:
    print(f"  - {feature_name}")