# Feature Engineering: Author Features

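Extract author-related features:
1. Load cleaned data
2. Basic counts: number of authors, institutions, and countries/regions
3. Collaboration indicators: single author, international collaboration, multi-institution, authors per institution
4. Team size categories
5. Handle missing values
6. Save author features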

In [None]:
# Add the parent directory to the module search path.
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## 1. Load Data

In [None]:
# Load the cleaned dataset from its pickle file and report its dimensions.
cleaned_data_path = '../data/processed/cleaned_data.pkl'
df = pd.read_pickle(cleaned_data_path)
print(f"Dataset: {df.shape}")

## 2. Basic Author Features

In [None]:
# Feature column name -> source column in the cleaned dataset.
source_columns = {
    'num_authors': 'Number of Authors',
    'num_institutions': 'Number of Institutions',
    'num_countries': 'Number of Countries/Regions',
}

# Start from an empty frame that shares the dataset's index, then copy each
# raw count column across under its feature name.
author_features = pd.DataFrame(index=df.index)
for feature_name, source_name in source_columns.items():
    author_features[feature_name] = df[source_name]

print("Author feature statistics:")
print(author_features.describe())

## 3. Collaboration Features

In [None]:
# Shorthand references to the three raw count columns.
n_authors = author_features['num_authors']
n_institutions = author_features['num_institutions']
n_countries = author_features['num_countries']

# 0/1 indicator columns. A comparison against NaN evaluates to False, so rows
# with a missing count receive 0 in the corresponding flag.
author_features['is_single_author'] = n_authors.eq(1).astype(int)
author_features['is_international_collab'] = n_countries.gt(1).astype(int)
author_features['is_multi_institution'] = n_institutions.gt(1).astype(int)

# Authors per institution; an institution count of 0 is replaced by 1 so the
# denominator is never zero.
safe_institutions = n_institutions.replace(0, 1)
author_features['authors_per_institution'] = n_authors.div(safe_institutions)

print(f"\nSingle author papers: {author_features['is_single_author'].sum()}")
print(f"International collaborations: {author_features['is_international_collab'].sum()}")
print(f"Multi-institution papers: {author_features['is_multi_institution'].sum()}")

## 4. Team Size Categories

In [None]:
# Bucket papers by author count into three mutually exclusive 0/1 flags:
# small (<= 3), medium (4-10) and large (> 10). A NaN author count compares
# False everywhere, so such a row gets 0 in all three flags.
n_authors = author_features['num_authors']

author_features['team_size_small'] = n_authors.le(3).astype(int)
author_features['team_size_medium'] = (n_authors.gt(3) & n_authors.le(10)).astype(int)
author_features['team_size_large'] = n_authors.gt(10).astype(int)

print("Team size distribution:")
# The label previously contained the mis-decoded bytes of the "≤" character.
print(f"Small (≤3): {author_features['team_size_small'].sum()}")
print(f"Medium (4-10): {author_features['team_size_medium'].sum()}")
print(f"Large (>10): {author_features['team_size_large'].sum()}")

## 5. Handle Missing Values

In [None]:
# Report per-column NaN counts before anything is filled.
print("Missing values before imputation:")
print(author_features.isnull().sum())

# Median-impute every numeric column that actually contains NaN. Columns are
# picked with select_dtypes(include='number') instead of matching the dtype
# against the names 'float64'/'int64', so other numeric widths (e.g. int32,
# float32) are no longer silently skipped.
# NOTE(review): the indicator and ratio columns were derived before this step,
# so rows whose raw counts are filled here keep the flags computed from NaN.
# Consider imputing the raw counts before deriving features.
numeric_columns = author_features.select_dtypes(include='number').columns
for col in numeric_columns:
    column = author_features[col]
    if column.isna().any():
        author_features[col] = column.fillna(column.median())

# Total number of NaN cells remaining across the whole frame.
print("\nMissing values after imputation:")
print(author_features.isnull().sum().sum())

## 6. Save Features

In [None]:
# Make sure the feature directory exists (creating parents as needed), then
# pickle the feature frame into it.
output_dir = Path('../data/features')
output_dir.mkdir(parents=True, exist_ok=True)

features_path = output_dir / 'author_features.pkl'
author_features.to_pickle(features_path)

print(f"Author features saved to: {features_path}")
print(f"Shape: {author_features.shape}")

## Summary

In [None]:
# Print a short report: row count, feature count and the feature names.
banner = "=" * 50
paper_count, feature_count = author_features.shape

print(banner)
print("AUTHOR FEATURES SUMMARY")
print(banner)
print(f"Total papers: {paper_count}")
print(f"Author features: {feature_count}")
print("\nFeature list:")
for feature_name in author_features.columns:
    print(f"  - {feature_name}")