In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 02: Feature Engineering

We applied several transformations to enhance the quality of our dataset for predictive modeling:

- **Temporal features** (`days_since_start`, `week_of_year`, `vax_phase`) to capture time-driven effects.
- **Log transformation** (`log1p`) of skewed numeric columns (absolute skewness > 1), added as new `*_log` columns alongside the originals, to stabilize variance and help models learn better.
- **Correlation filtering**: For every pair of numeric features whose absolute pairwise correlation exceeds 0.9, dropped the later column of the pair to reduce multicollinearity and improve generalizability.

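This pipeline reduces redundancy among the numeric features so that our models learn from less strongly correlated signals; it does not by itself guarantee that the remaining features are independent or noise-free.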

In [19]:
# Read the contextually imputed CSVs (targets filled by XGBoost proxy models).
# One DataFrame is kept per prediction target.
target_variables = [
    'smoothed_wtested_positive_14d',
    'smoothed_wcovid_vaccinated',
]

# Target name -> CSV file holding that target's imputed dataset.
data_paths = {
    'smoothed_wtested_positive_14d': 'data/imputed_contextual_xgb_smoothed_wtested_positive_14d.csv',
    'smoothed_wcovid_vaccinated': 'data/imputed_contextual_xgb_smoothed_wcovid_vaccinated.csv',
}

# Target name -> loaded DataFrame; later cells replace these entries stage by stage.
datasets = {
    name: pd.read_csv(csv_file)
    for name, csv_file in data_paths.items()
}

In [21]:
# Step 1: Add Temporal Features
def add_temporal_features(df, vax_start='2021-01-15'):
    """Return a copy of ``df`` with calendar-derived columns added.

    ``time_value`` is parsed with ``pd.to_datetime`` and three columns are
    added (or overwritten if they already exist):

    - ``days_since_start``: whole days elapsed since the earliest
      ``time_value`` found in ``df`` itself.
    - ``week_of_year``: ISO week number of ``time_value``.
    - ``vax_phase``: 1 where ``time_value`` is on or after ``vax_start``,
      otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_value`` column that ``pd.to_datetime`` can parse.
    vax_start : str or datetime-like, default '2021-01-15'
        Cut-off used for ``vax_phase``. The default keeps the date that was
        previously hard-coded here.
        NOTE(review): presumably the start of the vaccination period for this
        data -- confirm the date against the data source.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    out = df.copy()
    dates = pd.to_datetime(out['time_value'])
    out['time_value'] = dates
    # The origin is this frame's own earliest date, so the values are relative
    # to whatever rows were passed in, not to a fixed calendar date.
    out['days_since_start'] = (dates - dates.min()).dt.days
    out['week_of_year'] = dates.dt.isocalendar().week
    out['vax_phase'] = (dates >= vax_start).astype(int)
    return out

# Replace each target's stored frame with its temporally enriched copy.
for target in target_variables:
    with_time = add_temporal_features(datasets[target])
    datasets[target] = with_time

In [23]:
# Step 2: Log-Transform Skewed Features
def transform_skewed_features(df, threshold=1, exclude=('is_pseudo',)):
    """Return a copy of ``df`` with ``log1p`` versions of skewed numeric columns.

    For every float/int column whose absolute skewness exceeds ``threshold``,
    a new column ``<col>_log`` holding ``np.log1p(<col>)`` is added. The
    original columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 1
        A column is a candidate when ``abs(skew) > threshold``.
    exclude : iterable of str, default ('is_pseudo',)
        Column names that are never transformed, even when skewed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional ``*_log`` columns.

    Notes
    -----
    Columns containing a value <= -1 are skipped: ``log1p`` is only finite
    for inputs greater than -1, so transforming them would add NaN / -inf
    values to the feature set.

    NOTE(review): the absolute value also selects left-skewed columns, for
    which a log transform does not reduce skew -- confirm this is intended.
    """
    out = df.copy()
    numeric = out.select_dtypes(include=[float, int])
    skewness = numeric.skew()
    candidates = skewness[skewness.abs() > threshold].index
    never_transform = set(exclude)
    for col in candidates:
        if col in never_transform:
            continue
        # Outside log1p's domain: skip rather than emit NaN / -inf features.
        if numeric[col].min() <= -1:
            continue
        out[f"{col}_log"] = np.log1p(numeric[col])
    return out

for target in target_variables:
    transformed = transform_skewed_features(datasets[target])
    # The target column is numeric too, so a skewed target gets a
    # `<target>_log` twin. That column is a deterministic function of the
    # label, and Step 4 only removes `target` itself from X, so keeping it
    # would leak the label into the features. Drop it here if it was created.
    datasets[target] = transformed.drop(columns=[f"{target}_log"], errors='ignore')

In [25]:
# Step 3: Drop Highly Correlated Features
def drop_high_corr_features(df, threshold=0.9):
    """Remove numeric columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of the float/int columns is computed and
    only the part strictly above the diagonal is inspected, so each pair is
    looked at once and the later column of a pair is the one removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored by the check and kept.
    threshold : float, default 0.9
        A column is dropped when its absolute correlation with any earlier
        numeric column is strictly greater than this value.

    Returns
    -------
    tuple
        ``(reduced_df, dropped)`` where ``dropped`` is the list of removed
        column names in column order.
    """
    abs_corr = df.select_dtypes(include=[float, int]).corr().abs()
    # True strictly above the diagonal; everything else is masked to NaN,
    # and NaN never compares greater than the threshold.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    too_high = abs_corr.where(above_diagonal).gt(threshold)
    flagged = too_high.any(axis=0)
    to_drop = [name for name, hit in flagged.items() if hit]
    return df.drop(columns=to_drop), to_drop

# Target name -> list of feature columns removed by the correlation filter.
dropped_features = {}
for target in target_variables:
    df = datasets[target]
    # Run the filter on predictors only. With the target inside the
    # correlation matrix, the target itself could be dropped (Step 4 would
    # then fail on `df[target]`), or a predictor could be dropped merely
    # because it correlates with the label.
    predictors = df.drop(columns=[target], errors='ignore')
    _, to_drop = drop_high_corr_features(predictors)
    df = df.drop(columns=to_drop)
    dropped_features[target] = to_drop
    datasets[target] = df

In [27]:
# Step 4: Split each dataset into a feature matrix X and a target vector y
final_data = {}
for target in target_variables:
    df = datasets[target]
    # Columns that must not appear in X; errors='ignore' skips any of them
    # that this frame does not contain.
    drop_cols = ['geo_value', 'state', 'time_value', 'is_pseudo', target]
    X = df.drop(columns=drop_cols, errors='ignore')
    y = df[target]
    final_data[target] = (X, y)

In [31]:
# Step 5: Write one feature-engineered CSV per target
# NOTE(review): this saves `datasets[target]` (every remaining column,
# including the ones Step 4 removes from X), not the `final_data` split --
# confirm that is what downstream consumers expect.
for target in target_variables:
    engineered = datasets[target]
    output_path = f"data/final_X_y_{target}.csv"
    engineered.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Saved: data/final_X_y_smoothed_wtested_positive_14d.csv
Saved: data/final_X_y_smoothed_wcovid_vaccinated.csv
