# 03 - Feature Engineering

Bu notebook'ta feature engineering ve class balancing yapacağız.

## Hedefler:
- Yeni feature'lar oluşturmak
- Feature selection yapmak
- SMOTE ile class balancing
- İyileştirilmiş model performansı

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

## 1. Load and Prepare Data

In [None]:
# Read the raw table from the project's data folder
df = pd.read_csv('../data/creditcard.csv')

# 'Class' is the label column; every remaining column is used as a feature
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so it is repeatable across runs.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 2. Feature Engineering

In [None]:
# Create new features
def create_features(df):
    """Return a copy of ``df`` extended with derived feature columns.

    Reads the ``Time``, ``Amount``, ``V1``, ``V2`` and ``V3`` columns and
    appends, in this order:

    - ``Hour``: ``Time / 3600`` modulo 24, kept as a fractional value
      (presumably ``Time`` is in seconds - TODO confirm against the dataset).
    - ``Day``: ``Time / 86400`` truncated to an integer.
    - ``Amount_log``: ``log1p`` of ``Amount``.
    - ``V1_V2`` and ``V1_V3``: element-wise products of the named columns.

    The input frame itself is left unmodified.
    """
    elapsed = df['Time']
    v1 = df['V1']

    # assign() works on a copy, so the caller's frame is never mutated
    return df.assign(
        Hour=(elapsed / 3600) % 24,
        Day=(elapsed / 86400).astype(int),
        Amount_log=np.log1p(df['Amount']),
        V1_V2=v1 * df['V2'],
        V1_V3=v1 * df['V3'],
    )

# Run the same transformation over both splits so their columns stay identical
X_train_fe, X_test_fe = (create_features(part) for part in (X_train, X_test))

# Report the widened column count and which columns are new
print(f"New feature count: {X_train_fe.shape[1]}")
print(f"Added features: {set(X_train_fe.columns) - set(X_train.columns)}")

## 3. Feature Scaling

In [None]:
# Standardize every column. The scaler's statistics are learned from the
# training split only and then reused, unchanged, for the test split.
scaler = StandardScaler()
scaler.fit(X_train_fe)

# Wrap the transformed arrays back into DataFrames that keep the original
# column names and row index
X_train_scaled, X_test_scaled = (
    pd.DataFrame(
        scaler.transform(frame),
        columns=frame.columns,
        index=frame.index
    )
    for frame in (X_train_fe, X_test_fe)
)

print("✅ Features scaled")

## 4. Handle Class Imbalance with SMOTE

In [None]:
# Count each label value in the untouched training labels (before resampling)
n_class0_before = (y_train == 0).sum()
n_class1_before = (y_train == 1).sum()

print("Before SMOTE:")
print(f"Class 0: {n_class0_before}")
print(f"Class 1: {n_class1_before}")
# Number of class-0 rows per class-1 row
print(f"Ratio: 1:{n_class0_before / n_class1_before:.0f}")

In [None]:
# Resample the scaled training split with SMOTE (sampling_strategy=0.5, fixed
# seed). Only the training data is resampled; the test split is not touched.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_smote, y_train_smote = resampled

# Label counts after resampling
n_class0_after = (y_train_smote == 0).sum()
n_class1_after = (y_train_smote == 1).sum()

print("\nAfter SMOTE:")
print(f"Class 0: {n_class0_after}")
print(f"Class 1: {n_class1_after}")
print(f"Ratio: 1:{n_class0_after / n_class1_after:.1f}")
print(f"\nNew training set size: {X_train_smote.shape}")

In [None]:
# Grouped bar chart comparing label counts before and after resampling.
# Each entry is (legend name, label series to count, bar colour).
fig = go.Figure()

for bar_name, label_series, bar_color in (
    ('Before SMOTE', y_train, 'lightblue'),
    ('After SMOTE', y_train_smote, 'darkblue'),
):
    fig.add_trace(go.Bar(
        name=bar_name,
        x=['Normal', 'Fraud'],
        y=[(label_series == 0).sum(), (label_series == 1).sum()],
        marker_color=bar_color
    ))

fig.update_layout(
    title='Class Distribution: Before vs After SMOTE',
    barmode='group',
    height=500
)
fig.show()

## 5. Train Model with Engineered Features

In [None]:
# Fit a logistic regression on the SMOTE-resampled, scaled training data.
# fit() returns the estimator itself, so construction and fitting are chained.
model_fe = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_smote, y_train_smote
)

print("✅ Model trained with feature engineering and SMOTE")

## 6. Evaluate Improved Model

In [None]:
# Score the held-out (scaled, non-resampled) test split
y_pred_proba = model_fe.predict_proba(X_test_scaled)[:, 1]  # second probability column
y_pred = model_fe.predict(X_test_scaled)

# ROC-AUC is computed from the probabilities, the report from the hard labels
roc_auc = roc_auc_score(y_test, y_pred_proba)
report_text = classification_report(y_test, y_pred, target_names=['Normal', 'Fraud'])

print("Classification Report:")
print(report_text)

print(f"\nROC-AUC Score: {roc_auc:.4f}")

## 7. Compare with Baseline

In [None]:
# Train baseline for comparison.
# Baseline = logistic regression on the original (non-engineered) columns,
# standardized, trained WITHOUT SMOTE. The comparison below therefore measures
# the combined effect of the new features and the resampling.
#
# A dedicated scaler is used here on purpose: re-fitting the shared `scaler`
# from the feature-scaling section on a different column set would silently
# overwrite its state and break any later (or out-of-order) reuse of it.
baseline_scaler = StandardScaler()

# Keep the original row index so these frames stay label-aligned with
# y_train / y_test, matching how the engineered frames are built.
X_train_base_scaled = pd.DataFrame(
    baseline_scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_test_base_scaled = pd.DataFrame(
    baseline_scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

baseline = LogisticRegression(random_state=42, max_iter=1000)
baseline.fit(X_train_base_scaled, y_train)
baseline_proba = baseline.predict_proba(X_test_base_scaled)[:, 1]
baseline_auc = roc_auc_score(y_test, baseline_proba)

# `roc_auc` comes from the evaluation cell above; the last line reports the
# relative change with respect to the baseline score.
print("Performance Comparison:")
print(f"Baseline ROC-AUC: {baseline_auc:.4f}")
print(f"Feature Engineered ROC-AUC: {roc_auc:.4f}")
print(f"Improvement: {((roc_auc - baseline_auc) / baseline_auc * 100):.2f}%")

## 8. Summary

### Feature Engineering Results:
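- Created new time-based, amount-based, and interaction features
- Applied SMOTE for class balancing (training split only)
- Compared ROC-AUC against a baseline logistic regression; see Section 7 for the measured change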

### Next Steps:
1. Try different models (Random Forest, XGBoost)
2. Hyperparameter tuning
3. Advanced feature selection

In [None]:
# Closing status message plus a pointer to the next notebook in the series;
# a single print call with a newline separator emits the same two lines.
print(
    "✅ Feature engineering completed!",
    "\nNext: Run 04_model_optimization.ipynb",
    sep="\n",
)