# NBA Fantasy Feature Engineering

This notebook implements the feature engineering methodology from Papageorgiou et al. (2024), creating features for NBA player performance prediction.

## Setup and Data Loading

In [None]:
import sys
sys.path.append('..')

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml

from src.data.processor import DataProcessor
from src.features.builder import FeatureBuilder

# Set plotting style
plt.style.use('seaborn')
%matplotlib inline

In [None]:
# Read the project configuration first; it does not depend on the CSV inputs
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Raw inputs: one table of game records and one table of player information
player_info = pd.read_csv('../data/raw/player_info.csv')
raw_games = pd.read_csv('../data/raw/all_games.csv')

# Quick sanity report on how much data was read
print(f"Loaded {len(raw_games)} game records for {len(player_info)} players")

## Initial Data Processing

Before feature engineering, we need to process the raw data and calculate fantasy points.

In [None]:
processor = DataProcessor()

# Compute fantasy points for every raw game record
games_df = processor.calculate_fantasy_points(raw_games)

# Parse the game dates so the per-player ordering below is chronological
games_df['GAME_DATE'] = pd.to_datetime(games_df['GAME_DATE'])
games_df = games_df.sort_values(by=['PLAYER_ID', 'GAME_DATE'])

# Histogram of the fantasy-points column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=games_df, x='FANTASY_POINTS', ax=ax)
ax.set_title('Distribution of Fantasy Points')
plt.show()

## Feature Engineering

Following the paper's methodology, we'll create three types of features:
1. Lag features (previous game performance)
2. Rolling statistics (moving averages)
3. Momentum indicators

In [None]:
builder = FeatureBuilder()

# Columns to lag, and how many games back each lag looks
lag_columns = ['FANTASY_POINTS', 'PTS', 'REB', 'AST', 'MIN']
lag_periods = [1, 3, 5, 7, 10]

features_df = builder.create_lag_features(games_df, columns=lag_columns, lags=lag_periods)

# Preview the first few generated lag column names
lag_feature_names = [name for name in features_df.columns if 'lag' in name]
print("Created lag features:")
print(lag_feature_names[:5])

In [None]:
# Rolling-window statistics over the selected stat columns
rolling_columns = ['FANTASY_POINTS', 'PTS', 'REB', 'AST']
rolling_windows = [3, 5, 7]

features_df = builder.create_rolling_features(
    features_df, columns=rolling_columns, windows=rolling_windows
)

# Preview the first few generated rolling column names
rolling_feature_names = [name for name in features_df.columns if 'rolling' in name]
print("\nCreated rolling features:")
print(rolling_feature_names[:5])

In [None]:
# Add momentum indicators on top of the lag and rolling features
features_df = builder.create_momentum_features(features_df)

# List every generated momentum column name
momentum_feature_names = [name for name in features_df.columns if 'momentum' in name]
print("\nCreated momentum features:")
print(momentum_feature_names)

## Feature Analysis

Let's analyze the relationships between our engineered features and fantasy points.

In [None]:
def plot_feature_correlations(df, target='FANTASY_POINTS', n_features=10):
    """Plot the features most positively correlated with the target.

    Args:
        df: DataFrame holding the target column and the candidate features.
        target: Name of the column every feature is correlated against.
        n_features: Maximum number of features to show.

    Raises:
        KeyError: If ``target`` is not a numeric column of ``df``.
    """
    # Restrict to numeric columns: DataFrame.corr() raises on string columns
    # in pandas >= 2.0, where numeric_only no longer defaults to True.
    numeric_df = df.select_dtypes(include=[np.number])

    correlations = (
        numeric_df.corr()[target]
        .drop(labels=target)   # remove the target by name, not by assumed sort position
        .dropna()              # constant columns have undefined correlation
        .sort_values(ascending=False)
        .head(n_features)
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    correlations.plot(kind='bar', ax=ax)
    ax.set_title(f'Top {n_features} Feature Correlations with {target}')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

plot_feature_correlations(features_df)

## Feature Selection

Remove highly correlated features to prevent multicollinearity.

In [None]:
def remove_correlated_features(df, threshold=0.95):
    """Drop numeric columns that are highly correlated with an earlier column.

    For every pair of numeric columns whose absolute correlation exceeds
    ``threshold``, the column that appears later in ``df`` is dropped and the
    earlier one is kept. Non-numeric columns are not part of the correlation
    check and are always retained.

    Args:
        df: DataFrame of candidate features.
        threshold: Absolute correlation above which the later column of a
            pair is removed.

    Returns:
        A new DataFrame without the dropped columns; ``df`` is not modified.
    """
    # Only numeric columns can be correlated; DataFrame.corr() raises on
    # string columns in pandas >= 2.0.
    corr_matrix = df.select_dtypes(include=[np.number]).corr().abs()

    # Keep the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # A column is dropped when any earlier column correlates above the threshold.
    exceeds = (upper > threshold).any(axis=0).to_numpy()
    to_drop = upper.columns[exceeds].tolist()

    print(f"Removing {len(to_drop)} highly correlated features")
    return df.drop(columns=to_drop)

# Remove correlated features
# Rebinds features_df to the pruned frame, so every later cell sees only the
# surviving columns.
# NOTE(review): features_df presumably still contains the target column
# (FANTASY_POINTS) at this point, so it takes part in the correlation check
# like any other column — confirm that is intended.
features_df = remove_correlated_features(features_df)
print(f"Final feature set shape: {features_df.shape}")

## Feature Importance Analysis

Use Random Forest to analyze feature importance.

In [None]:
from sklearn.ensemble import RandomForestRegressor

def plot_feature_importance(df, target='FANTASY_POINTS', n_features=15):
    """Plot Random Forest feature importances for predicting the target.

    Args:
        df: DataFrame holding the target column and the candidate features.
        target: Name of the column to predict.
        n_features: Number of top-ranked features to show.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
        ValueError: If no row has complete feature and target values.
    """
    # Prepare data: only numeric columns can be fed to the forest
    X = df.drop(columns=[target]).select_dtypes(include=[np.number])
    y = df[target]

    # Fit only on complete rows. Missing values (lag/rolling features
    # presumably leave NaN at the start of each player's history — confirm
    # against FeatureBuilder) otherwise make the fit fail.
    complete_rows = X.notna().all(axis=1) & y.notna()
    X = X.loc[complete_rows]
    y = y.loc[complete_rows]
    if X.empty:
        raise ValueError("No complete rows available to fit the feature importance model")

    # Train Random Forest; fixed seed keeps the ranking reproducible
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Rank features by importance and keep the top n_features
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    top_importance = importance.head(n_features)

    # Plot
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(top_importance['feature'], top_importance['importance'])
    ax.set_title(f'Top {n_features} Most Important Features')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

plot_feature_importance(features_df)

## Save Processed Features

In [None]:
# Make sure the output directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Save processed features
features_df.to_csv(processed_dir / 'features.csv', index=False)

# Save feature metadata: label each column by the first matching keyword in
# its name (checked in this order), falling back to 'basic'.
FEATURE_KINDS = ('lag', 'rolling', 'momentum')
feature_metadata = pd.DataFrame({
    'feature': features_df.columns,
    'type': [
        next((kind for kind in FEATURE_KINDS if kind in col), 'basic')
        for col in features_df.columns
    ]
})
feature_metadata.to_csv(processed_dir / 'feature_metadata.csv', index=False)

print("Features saved successfully!")

## Feature Summary

1. Total features retained after correlation filtering: `features_df.shape[1]`
2. Types of features (Markdown cells do not evaluate Python, so run each expression in a code cell to get the count):
   - Lag features: `sum('lag' in col for col in features_df.columns)`
   - Rolling features: `sum('rolling' in col for col in features_df.columns)`
   - Momentum features: `sum('momentum' in col for col in features_df.columns)`
   
Next steps:
1. Model training using these engineered features
2. Feature selection during model development
3. Performance evaluation with different feature sets