<div align="center">

# 4.0 Feature Engineering

</div>

## 4.1 Table of Contents<a id='4.1_Table_of_Contents'></a>
* [4.1 Table of Contents](#4.1_Table_of_Contents)
* [4.2 Introduction](#4.2_Introduction)
* [4.3 Library Imports](#4.3_Library_Imports)
* [4.4 Data Loading](#4.4_Data_Loading)
* [4.5 Feature Preparation](#4.5_Feature_Preparation)
  * [4.5.1 Categorical Encoding](#4.5.1_Categorical_Encoding)
  * [4.5.2 Numerical Transformation](#4.5.2_Numerical_Transformation)
  * [4.5.3 Datetime Features](#4.5.3_Datetime_Features)
  * [4.5.4 Text Features](#4.5.4_Text_Features)
  * [4.5.5 Feature Creation](#4.5.5_Feature_Creation)
  * [4.5.6 Feature Reduction](#4.5.6_Feature_Reduction)
* [4.6 Transformer Pipeline](#4.6_Transformer_Pipeline)
* [4.7 Summary](#4.7_Summary)

## 4.2 Introduction<a id='4.2_Introduction'></a>

Create modeling-ready features from the cleaned dataset:
- Encode categoricals
- Scale/transform numerics
- Extract datetime parts
- Optional interactions
- Prune highly correlated features
- Build a reusable preprocessing **Pipeline**

## 4.3 Library Imports<a id='4.3_Library_Imports'></a>

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import joblib

In [None]:
# Notebook display settings: never truncate columns, and widen printed frames.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 140),
):
    pd.set_option(option_name, option_value)

## 4.4 Data Loading<a id='4.4_Data_Loading'></a>

In [None]:
# Location of the input CSV, relative to the notebook's working directory.
# NOTE(review): presumably written by the preceding analysis notebook — confirm.
INPUT_PATH = "../data/processed/data_02_analyzed.csv"
# No dtypes or parse_dates are passed, so pandas infers every column type from the text.
df = pd.read_csv(INPUT_PATH)
print("Loaded:", INPUT_PATH, "| Shape:", df.shape)
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

## 4.5 Feature Preparation<a id='4.5_Feature_Preparation'></a>

In [None]:
# Set your target if supervised (else leave None)
target = None  # e.g., "price"

# Group the feature columns by dtype. The target (when set) is excluded from
# every group so it is never imputed, scaled, encoded, or pruned as a feature,
# which would leak the label into the preprocessing pipeline.
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != target]
cat_cols = [c for c in df.select_dtypes(include=['object', 'category']).columns if c != target]
dt_cols  = [c for c in df.select_dtypes(include=['datetime', 'datetimetz']).columns if c != target]

print("Numeric:", len(num_cols), "| Categorical:", len(cat_cols), "| Datetime:", len(dt_cols))

# Optionally parse obvious date columns by name (if read from CSV as strings)
DATE_CANDIDATES = []  # e.g., ['date','order_date','created_at']
for c in DATE_CANDIDATES:
    # Skip names that are absent, and never turn the target into a feature source.
    if c not in df.columns or c == target:
        continue
    # Unparseable values become NaT instead of raising.
    df[c] = pd.to_datetime(df[c], errors='coerce')
    if c not in dt_cols:
        dt_cols.append(c)
    # Once parsed, the column is neither a categorical nor a numeric feature;
    # leaving it in either list would feed datetimes to the wrong pipeline.
    for group in (cat_cols, num_cols):
        if c in group:
            group.remove(c)

print("After date parsing -> Datetime:", dt_cols)


### 4.5.1 Categorical Encoding<a id='4.5.1_Categorical_Encoding'></a>

In [None]:
# Dense one-hot output, written to work across scikit-learn versions: the
# keyword was renamed from `sparse` to `sparse_output` in 1.2 and the old
# spelling was removed in 1.4, where it raises TypeError.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows the old keyword
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode. Categories unseen during fit are ignored
# at transform time (handle_unknown='ignore') instead of raising.
cat_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

### 4.5.2 Numerical Transformation<a id='4.5.2_Numerical_Transformation'></a>

In [None]:
# Optional log transform helper (enable in pipeline if needed)
def safe_log1p(X: np.ndarray) -> np.ndarray:
    """Return ``log(1 + x)`` element-wise, treating negative inputs as zero.

    Parameters
    ----------
    X : array-like
        Numeric values. Any array-like accepted by NumPy works (ndarray,
        list, tuple, ...). The input is never modified.

    Returns
    -------
    Element-wise ``log1p`` of ``X`` after clipping values below 0 up to 0.
    NaN entries pass through as NaN.
    """
    # np.clip allocates its own result, so the caller's data stays untouched
    # without a separate defensive copy (which would double peak memory and
    # reject inputs that have no .copy() method).
    return np.log1p(np.clip(X, a_min=0, a_max=None))  # guard negatives

# Numeric preprocessing: impute -> (optional log) -> standardize -> drop constants.
num_pipeline = Pipeline(steps=[
    # Missing values are replaced with the per-column median.
    ("impute", SimpleImputer(strategy="median")),
    # Uncomment to apply safe_log1p (negatives clipped to 0, then log1p) before scaling.
    # ("log", FunctionTransformer(safe_log1p, feature_names_out="one-to-one")),
    ("scale", StandardScaler()),
    # NOTE: this step runs after standardization, where every non-constant column
    # has unit variance, so only threshold=0.0 (constant columns) is meaningful here.
    ("varthresh", VarianceThreshold(threshold=0.0))  # drop zero-variance
])

### 4.5.3 Datetime Features<a id='4.5.3_Datetime_Features'></a>

In [None]:
def extract_datetime_parts(df_in: pd.DataFrame, columns):
    """Return a copy of ``df_in`` with calendar parts added for each datetime column.

    For every name in ``columns`` that exists in the frame, the column is
    parsed with ``pd.to_datetime`` (unparseable values become NaT) and five
    new columns are appended: ``<name>_year``, ``<name>_month``,
    ``<name>_day``, ``<name>_dow`` (day of week) and ``<name>_week``
    (ISO week number, nullable ``Int64``). Names not present are skipped.
    The input frame is left unchanged.
    """
    result = df_in.copy()
    for name in columns:
        # Membership is checked against the growing frame on every pass.
        if name not in result.columns:
            continue
        accessor = pd.to_datetime(result[name], errors="coerce").dt
        parts = {
            "year": accessor.year,
            "month": accessor.month,
            "day": accessor.day,
            "dow": accessor.dayofweek,
            "week": accessor.isocalendar().week.astype("Int64"),
        }
        # Dict order is insertion order, so the new columns keep a fixed layout.
        for suffix, values in parts.items():
            result[f"{name}_{suffix}"] = values
    return result

# Derive calendar parts for every datetime column and register each new
# column as a numeric feature (once).
if dt_cols:
    df = extract_datetime_parts(df, dt_cols)
    derived_names = [
        f"{base}_{part}"
        for base in dt_cols
        for part in ("year", "month", "day", "dow", "week")
    ]
    for derived in derived_names:
        if derived not in num_cols and derived in df.columns:
            num_cols.append(derived)

print("Datetime-derived numeric columns added.")

### 4.5.4 Text Features<a id='4.5.4_Text_Features'></a>

In [None]:
# Text feature setup. Nothing is fitted or transformed in this cell; the
# vectorizer is only referenced by a commented-out entry in the ColumnTransformer.
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): assumes the dataset has a 'description' column — confirm before enabling.
TEXT_COLS = ['description']
# Vocabulary capped at 5000 terms, built from unigrams and bigrams.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

### 4.5.5 Feature Creation<a id='4.5.5_Feature_Creation'></a>

In [None]:
# Define simple interactions you want to create: (colA, colB, new_name)
INTERACTIONS = [
    # ("quantity", "unit_price", "total_price")
]

# Track what was actually created so the report below is truthful: an
# interaction whose source columns are missing is skipped, not "added".
added_interactions = []
skipped_interactions = []
for a, b, name in INTERACTIONS:
    if a not in df.columns or b not in df.columns:
        skipped_interactions.append(name)
        continue
    # Element-wise product of the two source columns.
    df[name] = df[a] * df[b]
    if name not in num_cols:
        num_cols.append(name)
    added_interactions.append(name)

print("Interactions added:", added_interactions)
if skipped_interactions:
    print("Interactions skipped (missing source column):", skipped_interactions)

### 4.5.6 Feature Reduction<a id='4.5.6_Feature_Reduction'></a>

In [None]:
def drop_high_corr(df_in: pd.DataFrame, cols, threshold=0.95):
    """Drop columns that are highly correlated with an earlier-listed column.

    Computes the absolute pairwise correlation of ``df_in[cols]`` and, looking
    only at the strict upper triangle, marks a column for removal when its
    correlation with any column that precedes it exceeds ``threshold``.

    Returns a tuple ``(frame, kept, dropped)``: the frame without the dropped
    columns, the names from ``cols`` that remain, and the dropped names.
    When ``cols`` is empty the inputs are returned as-is with an empty
    dropped list.
    """
    if not cols:
        return df_in, cols, []

    abs_corr = df_in[cols].corr(numeric_only=True).abs()
    # True strictly above the diagonal: each pair is inspected exactly once
    # and a column is never compared with itself.
    strict_upper = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(strict_upper)

    to_drop = []
    for name in upper.columns:
        if any(upper[name] > threshold):
            to_drop.append(name)

    doomed = set(to_drop)
    kept = [name for name in cols if name not in doomed]
    pruned = df_in.drop(columns=to_drop, errors="ignore")
    return pruned, kept, to_drop

# Drop each numeric column whose absolute correlation with an earlier-listed
# numeric column exceeds 0.95; `df` and `num_cols` are rebound to the pruned versions.
df, num_cols, dropped_corr = drop_high_corr(df, num_cols, threshold=0.95)
print("Dropped (high corr):", dropped_corr)

## 4.6 Transformer Pipeline<a id='4.6_Transformer_Pipeline'></a>

In [None]:
# Restrict the feature lists to columns that survived the earlier steps.
surviving = set(df.columns)
num_cols = [name for name in num_cols if name in surviving]
cat_cols = [name for name in cat_cols if name in surviving]

# One entry per feature family; columns not listed here are discarded.
column_steps = [
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
    # ("text", tfidf, TEXT_COLS)  # if using text features
    # NOTE: a text vectorizer expects 1-D input, so give it a single column
    # name (a string) rather than a list when enabling the line above.
]
preprocess = ColumnTransformer(transformers=column_steps, remainder="drop")

print("Numeric columns ->", len(num_cols))
print("Categorical columns ->", len(cat_cols))

# NOTE: Train/test split happens in the Modeling notebook to avoid leakage.

## 4.7 Summary<a id='4.7_Summary'></a>

In [None]:
# Save Featured Dataset
# What is written is the engineered DataFrame itself, not the output of the
# preprocessing pipeline (which is never applied in this cell).
output_path = "../data/processed/data_03_featured.csv"
# Create the destination directory if needed; exist_ok makes re-runs safe.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)  # index=False: the row index is not written as a column
print(f"✅ Featured dataset saved -> {output_path} | Shape: {df.shape}")

In [None]:
# Save Preprocess Pipeline
# NOTE: no fit call on `preprocess` appears in this notebook, so the artifact
# holds the pipeline definition only; it must be fitted after loading.
# NOTE(review): if the optional "log" step is enabled, the pickle refers to
# safe_log1p by name, so the loading code must be able to import or define it — confirm.
pipeline_path = "../models/preprocess_pipeline.joblib"
# Create the destination directory if needed; exist_ok makes re-runs safe.
os.makedirs(os.path.dirname(pipeline_path), exist_ok=True)
joblib.dump(preprocess, pipeline_path)
print(f"✅ Preprocess pipeline saved -> {pipeline_path}")