import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries imported above).
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" theme and a (12, 6) figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Report the library versions in use
print("Libraries imported successfully!")
for library_name, library in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_name} version: {library.__version__}")

## 1. Setup and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries imported above).
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" theme and a (12, 6) figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Report the library versions in use
print("Libraries imported successfully!")
for library_name, library in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_name} version: {library.__version__}")

In [None]:
# Interaction features
# Each new column is the string concatenation "<left>_<right>" of two existing
# categorical columns, so every observed pairing becomes its own category.
combo_specs = {
    'vendor_country_combo': ('Vendor', 'Country'),        # Vendor-Country
    'mode_country_combo': ('Shipment Mode', 'Country'),   # Shipment mode-Country
    'vendor_mode_combo': ('Vendor', 'Shipment Mode'),     # Vendor-Mode
}

for combo_name, (left_col, right_col) in combo_specs.items():
    # astype(str) turns missing values into text so the concatenation
    # never yields NaN
    df[combo_name] = df[left_col].astype(str) + '_' + df[right_col].astype(str)

# Report how many distinct pairings each combination produced
print("Interaction features created:")
for combo_name in combo_specs:
    print(f"- {combo_name}: {df[combo_name].nunique()} unique values")

In [None]:
# Preview the data: show the first five rows as the cell output
df.head(n=5)

## 2. Additional Feature Engineering

In [None]:
# Interaction features
# Each new column is the string concatenation "<left>_<right>" of two existing
# categorical columns, so every observed pairing becomes its own category.
combo_specs = {
    'vendor_country_combo': ('Vendor', 'Country'),        # Vendor-Country
    'mode_country_combo': ('Shipment Mode', 'Country'),   # Shipment mode-Country
    'vendor_mode_combo': ('Vendor', 'Shipment Mode'),     # Vendor-Mode
}

for combo_name, (left_col, right_col) in combo_specs.items():
    # astype(str) turns missing values into text so the concatenation
    # never yields NaN
    df[combo_name] = df[left_col].astype(str) + '_' + df[right_col].astype(str)

# Report how many distinct pairings each combination produced
print("Interaction features created:")
for combo_name in combo_specs:
    print(f"- {combo_name}: {df[combo_name].nunique()} unique values")

In [None]:
# Ratio features
weight_kg = df['Weight (Kilograms)']

# Weight to cost ratio (efficiency metric); the +1 keeps the denominator
# non-zero when the freight cost is 0
df['weight_to_cost_ratio'] = weight_kg / (df['Freight Cost (USD)'] + 1)

# Value density (value per weight); the +0.1 keeps the denominator non-zero
# when the weight is 0
df['value_density'] = df['Line Item Value'] / (weight_kg + 0.1)

# Lead time relative to the absolute delay (+1 keeps the denominator non-zero)
# NOTE(review): this feature is computed from delay_days, which the
# feature-selection cell in this notebook excludes alongside the target.
# It looks like a target leak -- confirm it is not fed to a model.
df['lead_time_ratio'] = df['lead_time_days'] / (df['delay_days'].abs() + 1)

print("\nRatio features created")

In [None]:
# Boolean features (stored as 0/1 integers)
# Thresholds for the "high value" / "heavy" flags are the 75th percentiles
# of the respective columns over the whole of df.
value_cutoff = df['Line Item Value'].quantile(0.75)
weight_cutoff = df['Weight (Kilograms)'].quantile(0.75)

# High value shipment indicator
df['is_high_value'] = df['Line Item Value'].gt(value_cutoff).astype(int)

# Heavy shipment indicator
df['is_heavy'] = df['Weight (Kilograms)'].gt(weight_cutoff).astype(int)

# Urgent shipment (air mode): exact match on the 'Air' label
df['is_air_shipment'] = df['Shipment Mode'].eq('Air').astype(int)

# Long lead time indicator: more than 30 days
df['is_long_lead_time'] = df['lead_time_days'].gt(30).astype(int)

# Report how many rows carry each flag
flag_labels = {
    'High value shipments': 'is_high_value',
    'Heavy shipments': 'is_heavy',
    'Air shipments': 'is_air_shipment',
    'Long lead time': 'is_long_lead_time',
}
print("\nBoolean features created:")
for flag_label, flag_col in flag_labels.items():
    print(f"- {flag_label}: {df[flag_col].sum():,}")

In [None]:
# Strategy for handling missing values
# For numerical columns: impute with median
# For categorical columns: create 'Unknown' category

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove target and ID columns from imputation
numerical_cols = [col for col in numerical_cols if col not in ['is_late', 'delay_days', 'ID']]

# Impute numerical columns with median
for col in numerical_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the in-place call acts on a temporary object, which warns
        # on recent pandas and leaves df unchanged under Copy-on-Write.
        df[col] = df[col].fillna(median_val)
        print(f"Imputed {col} with median: {median_val:.2f}")

# Impute categorical columns with 'Unknown'
for col in categorical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna('Unknown')
        print(f"Imputed {col} with 'Unknown'")


# Total count of missing cells left anywhere in df (excluded columns included)
print(f"\nMissing values after imputation: {df.isnull().sum().sum()}")

## 3. Handle Missing Values

In [None]:
# Check missing values
# Build a per-column table of missing counts and percentages, largest first.
missing_counts = df.isnull().sum()
missing_pct = (missing_counts / len(df) * 100).round(2)

missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing_counts,
    'Missing_Percentage': missing_pct,
}).sort_values('Missing_Count', ascending=False)

# Keep only the columns that actually have gaps
has_missing = missing_summary['Missing_Count'] > 0
missing_summary = missing_summary[has_missing]

print("Missing Values Summary:")
print("=" * 60)
if missing_summary.empty:
    print("No missing values found!")
else:
    # Show at most the 20 worst columns
    print(missing_summary.head(20))

In [None]:
# Strategy for handling missing values
# For numerical columns: impute with median
# For categorical columns: create 'Unknown' category

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove target and ID columns from imputation
numerical_cols = [col for col in numerical_cols if col not in ['is_late', 'delay_days', 'ID']]

# Impute numerical columns with median
for col in numerical_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the in-place call acts on a temporary object, which warns
        # on recent pandas and leaves df unchanged under Copy-on-Write.
        df[col] = df[col].fillna(median_val)
        print(f"Imputed {col} with median: {median_val:.2f}")

# Impute categorical columns with 'Unknown'
for col in categorical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna('Unknown')
        print(f"Imputed {col} with 'Unknown'")

# Total count of missing cells left anywhere in df (excluded columns included)
print(f"\nMissing values after imputation: {df.isnull().sum().sum()}")

## 4. Feature Selection - Identify Relevant Features

In [None]:
# Define features to use for ML
# Exclude: ID columns, target, delay_days, date columns, raw text fields

exclude_cols = [
    'ID', 'is_late', 'delay_days',
    'PQ First Sent to Client Date', 'PO Sent to Vendor Date',
    'Scheduled Delivery Date', 'Delivered to Client Date', 'Delivery Recorded Date',
    'PQ #', 'PO / SO #', 'ASN/DN #',
    'Item Description', 'Molecule/Test Type', 'Brand',  # High cardinality text
    'delivery_year_month'  # Period object
]

# Get all columns
all_cols = df.columns.tolist()

# Feature columns (exclude target and non-predictive)
feature_cols = [col for col in all_cols if col not in exclude_cols]

print(f"Total columns: {len(all_cols)}")
print(f"Feature columns: {len(feature_cols)}")
# Length of the exclusion list itself, whether or not every entry exists in df
print(f"Excluded columns: {len(exclude_cols)}")

# Separate numerical and categorical features.
# select_dtypes(np.number) matches every numeric width (int32, float32,
# uint8, ...) rather than only int64/float64, and is the same test the
# imputation and scaling cells use. Boolean columns are still not counted.
feature_frame = df[feature_cols]
numerical_features = feature_frame.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = feature_frame.select_dtypes(include=['object']).columns.tolist()

print(f"\nNumerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

## 5. Encode Categorical Variables

In [None]:
# Calculate correlation with target
# NOTE(review): this cell reads X_train and y_train, which are first assigned
# in the "Prepare Final Feature Matrix" section further down the notebook; it
# appears to be an out-of-order copy of the correlation cell in section 8.

# Work on a copy so the target column never ends up in X_train itself
X_train_with_target = X_train.copy()
X_train_with_target['is_late'] = y_train.values

# Correlation of every feature with the target, highest first; the target's
# own entry is dropped
target_column_corr = X_train_with_target.corr()['is_late']
correlations = target_column_corr.drop('is_late').sort_values(ascending=False)

section_rule = "=" * 60

# Head of the descending ranking
print("Top 20 Features Correlated with Delay:")
print(section_rule)
print(correlations.head(20))

# Tail of the descending ranking
print("\nTop 20 Features Negatively Correlated with Delay:")
print(section_rule)
print(correlations.tail(20))

In [None]:
# Visualize top correlations
plt.figure(figsize=(10, 8))

# First 15 and last 15 entries of the descending correlation ranking
strongest_positive = correlations.iloc[:15]
strongest_negative = correlations.iloc[-15:]
top_corr = pd.concat([strongest_positive, strongest_negative])

# One horizontal bar per feature, labelled with the feature name
bar_positions = range(len(top_corr))
plt.barh(bar_positions, top_corr.values)
plt.yticks(bar_positions, top_corr.index, fontsize=8)
plt.xlabel('Correlation with Delay')
plt.title('Top Features Correlated with Delivery Delays')

# Dashed reference line at zero correlation
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
plt.tight_layout()
plt.show()

In [None]:
# Create a copy for encoding so the original df stays untouched
df_encoded = df.copy()

# One-hot encode low cardinality columns
# (low_cardinality_cols is not assigned in the visible part of this notebook;
# presumably a list of column names built in another cell -- verify.)
n_low_cardinality = len(low_cardinality_cols)
if n_low_cardinality > 0:
    # drop_first=True omits the first level of each column
    df_encoded = pd.get_dummies(
        df_encoded,
        columns=low_cardinality_cols,
        prefix=low_cardinality_cols,
        drop_first=True,
    )
    print(f"One-hot encoding applied to {n_low_cardinality} columns")

print(f"Shape after one-hot encoding: {df_encoded.shape}")

In [None]:
# Target encode high cardinality columns
# Note: In production, fit on train set only to avoid leakage
# Here we'll do a simple train-test split first

# Create train/test split: 80/20, stratified on the target so both parts keep
# the same share of late deliveries; the fixed seed makes it repeatable
train_df, test_df = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=42,
    stratify=df_encoded['is_late'],
)

# Report the size of each split
for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} set: {split_frame.shape}")

# The two delay rates should be close to each other because of stratification
train_delay_rate = train_df['is_late'].mean()
test_delay_rate = test_df['is_late'].mean()
print(f"\nTrain delay rate: {train_delay_rate:.2%}")
print(f"Test delay rate: {test_delay_rate:.2%}")

In [None]:
# Apply target encoding to high cardinality columns
# (high_cardinality_cols is not assigned in the visible part of this notebook;
# presumably a list of column names built in another cell -- verify.)
n_high_cardinality = len(high_cardinality_cols)
if n_high_cardinality > 0:
    target_encoder = TargetEncoder(cols=high_cardinality_cols)

    # Fit on the training rows only, using the training target
    encoded_train = target_encoder.fit_transform(
        train_df[high_cardinality_cols],
        train_df['is_late'],
    )
    train_df[high_cardinality_cols] = encoded_train

    # Apply the already-fitted encoder to the test rows (no refit)
    encoded_test = target_encoder.transform(test_df[high_cardinality_cols])
    test_df[high_cardinality_cols] = encoded_test

    print(f"\nTarget encoding applied to {n_high_cardinality} columns")
    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")

## 6. Prepare Final Feature Matrix

In [None]:
# Separate features and target
# Get feature column names (excluding target and metadata)
target_and_id_cols = ['is_late', 'delay_days', 'ID']
metadata_cols = ['PQ #', 'PO / SO #', 'ASN/DN #', 'delivery_year_month']

# Engineered features computed from the outcome must not reach the model:
# lead_time_ratio divides by |delay_days| + 1, and delay_days is excluded
# above together with the target, so keeping the ratio would leak the target.
leaky_feature_cols = ['lead_time_ratio']

final_exclude = target_and_id_cols + leaky_feature_cols + [
    col for col in train_df.columns if 'Date' in col or col in metadata_cols
]

feature_columns = [col for col in train_df.columns if col not in final_exclude]

# Create feature matrices (same column list for train and test)
X_train = train_df[feature_columns]
y_train = train_df['is_late']

X_test = test_df[feature_columns]
y_test = test_df['is_late']

print("Final feature matrix:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"\nTotal features: {X_train.shape[1]}")

In [None]:
# Check for any remaining missing values or infinites
def _count_infinite(frame):
    """Return the number of +/-inf cells in the numeric columns of *frame*."""
    return np.isinf(frame.select_dtypes(include=[np.number])).sum().sum()


print("Data quality check:")
print(f"Missing values in X_train: {X_train.isnull().sum().sum()}")
print(f"Missing values in X_test: {X_test.isnull().sum().sum()}")
print(f"Infinite values in X_train: {_count_infinite(X_train)}")
print(f"Infinite values in X_test: {_count_infinite(X_test)}")

# Replace any infinites with NaN, then fill with 0
# (rebinding X_train / X_test to the cleaned frames)
infinite_values = [np.inf, -np.inf]
X_train = X_train.replace(infinite_values, np.nan).fillna(0)
X_test = X_test.replace(infinite_values, np.nan).fillna(0)

print("\nAfter cleanup:")
print(f"Missing values in X_train: {X_train.isnull().sum().sum()}")
print(f"Missing values in X_test: {X_test.isnull().sum().sum()}")

## 7. Scale Numerical Features

In [None]:
# Identify numerical columns in final feature matrix
numeric_part = X_train.select_dtypes(include=[np.number])
numerical_feature_cols = numeric_part.columns.tolist()

print(f"Numerical features to scale: {len(numerical_feature_cols)}")

# Initialize scaler
scaler = StandardScaler()

# Fit on train, transform both: the test set is scaled with the statistics
# learned from the training set only
scaled_train_values = scaler.fit_transform(X_train[numerical_feature_cols])
X_train[numerical_feature_cols] = scaled_train_values

scaled_test_values = scaler.transform(X_test[numerical_feature_cols])
X_test[numerical_feature_cols] = scaled_test_values

# Sanity check on the training set: average the per-column means and stds
scaled_train = X_train[numerical_feature_cols]
print("\nFeatures scaled using StandardScaler")
print(f"Mean of scaled features (should be ~0): {scaled_train.mean().mean():.6f}")
print(f"Std of scaled features (should be ~1): {scaled_train.std().mean():.6f}")

## 8. Feature Correlation Analysis

In [None]:
# Calculate correlation with target
# Work on a copy so the target column never ends up in X_train itself
X_train_with_target = X_train.copy()
X_train_with_target['is_late'] = y_train.values

# Correlation of every feature with the target, highest first; the target's
# own entry is dropped
target_column_corr = X_train_with_target.corr()['is_late']
correlations = target_column_corr.drop('is_late').sort_values(ascending=False)

section_rule = "=" * 60

# Head of the descending ranking
print("Top 20 Features Correlated with Delay:")
print(section_rule)
print(correlations.head(20))

# Tail of the descending ranking
print("\nTop 20 Features Negatively Correlated with Delay:")
print(section_rule)
print(correlations.tail(20))

In [None]:
# Visualize top correlations
plt.figure(figsize=(10, 8))

# First 15 and last 15 entries of the descending correlation ranking
strongest_positive = correlations.iloc[:15]
strongest_negative = correlations.iloc[-15:]
top_corr = pd.concat([strongest_positive, strongest_negative])

# One horizontal bar per feature, labelled with the feature name
bar_positions = range(len(top_corr))
plt.barh(bar_positions, top_corr.values)
plt.yticks(bar_positions, top_corr.index, fontsize=8)
plt.xlabel('Correlation with Delay')
plt.title('Top Features Correlated with Delivery Delays')

# Dashed reference line at zero correlation
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
plt.tight_layout()
plt.show()

In [None]:
# Check for highly correlated features (multicollinearity)
# Absolute pairwise correlations between the numerical features
corr_matrix = X_train[numerical_feature_cols].corr().abs()

# Find pairs with correlation > 0.9
# Only the upper triangle (col_pos > row_pos) is scanned, so every unordered
# pair is reported once and the diagonal is skipped.
corr_names = corr_matrix.columns
corr_values = corr_matrix.values
n_corr_features = len(corr_names)

high_corr_pairs = [
    (corr_names[row_pos], corr_names[col_pos], corr_values[row_pos, col_pos])
    for row_pos in range(n_corr_features)
    for col_pos in range(row_pos + 1, n_corr_features)
    if corr_values[row_pos, col_pos] > 0.9
]

print(f"\nHighly correlated feature pairs (>0.9): {len(high_corr_pairs)}")
if not high_corr_pairs:
    print("No highly correlated pairs found")
else:
    # Strongest pairs first, capped at ten lines of output
    strongest_pairs = sorted(high_corr_pairs, key=lambda pair: pair[2], reverse=True)
    print("\nTop 10 highly correlated pairs:")
    for feat1, feat2, corr in strongest_pairs[:10]:
        print(f"  {feat1} <-> {feat2}: {corr:.3f}")

## 9. Quick Feature Importance (Simple Model)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Train a simple random forest for feature importance
# 100 trees of depth <= 10, fixed seed, n_jobs=-1 as in scikit-learn's
# "use all processors" setting
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}

print("Training quick Random Forest for feature importance...")
rf_quick = RandomForestClassifier(**rf_settings)
rf_quick.fit(X_train, y_train)

# Get feature importances, most important first
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_quick.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Evaluate quick model on the held-out test set, scoring with the
# probability in column 1 of predict_proba
test_probabilities = rf_quick.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)

print(f"\nQuick RF Model ROC-AUC: {auc:.4f}")
print("\nTop 20 Important Features:")
print("=" * 60)
print(feature_importance.head(20))

In [None]:
# Visualize feature importance
plt.figure(figsize=(10, 8))

# The table is already sorted, so the first 20 rows are the top 20 features
top_features = feature_importance.head(20)
importance_positions = range(len(top_features))

# One horizontal bar per feature, labelled with the feature name
plt.barh(importance_positions, top_features['importance'].values)
plt.yticks(importance_positions, top_features['feature'].values, fontsize=9)
plt.xlabel('Feature Importance')
plt.title('Top 20 Features by Random Forest Importance')
plt.tight_layout()
plt.show()

## 10. Save Processed Data

In [None]:
# Save train and test sets
import os

# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs('../data/processed', exist_ok=True)

# index=False: the row index is not written; header=True keeps the target
# column name as the first line of the y_* files
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False, header=True)
y_test.to_csv('../data/processed/y_test.csv', index=False, header=True)

print("Processed data saved:")
print("  - ../data/processed/X_train.csv")
print("  - ../data/processed/X_test.csv")
print("  - ../data/processed/y_train.csv")
print("  - ../data/processed/y_test.csv")

In [None]:
# Save feature names for reference
# One row per feature: its column name and its dtype rendered as text
feature_dtype_names = X_train.dtypes.astype(str)
feature_names_df = pd.DataFrame({
    'feature_name': X_train.columns,
    'feature_type': feature_dtype_names,
})

# index=False: only the two columns above are written
feature_names_df.to_csv('../data/processed/feature_names.csv', index=False)

print("\nFeature names saved to: ../data/processed/feature_names.csv")

## 11. Summary

In [None]:
# Final report for the notebook: prints sizes, encodings, quality figures and
# the quick-model results collected by the cells above.
summary_rule = "=" * 80
print(summary_rule)
print("SUMMARY - FEATURE ENGINEERING")
print(summary_rule)

# NOTE(review): df.shape is read at this point, so the "Original columns"
# figure includes the columns added to df by the cells above.
n_input_rows, n_input_cols = df.shape
print("\n1. Input Data:")
print(f"   - Rows: {n_input_rows:,}")
print(f"   - Original columns: {n_input_cols}")

# Fixed descriptions, not derived from the data.
# NOTE(review): the time features (weekend, season) are not created in the
# visible cells of this notebook -- confirm where they come from.
print("\n2. Features Created:")
print("   - Interaction features: 3 (vendor_country, mode_country, vendor_mode)")
print("   - Ratio features: 3 (weight_to_cost, value_density, lead_time_ratio)")
print("   - Boolean features: 4 (high_value, heavy, air, long_lead_time)")
print("   - Time features: 2 (weekend, season)")

print("\n3. Encoding:")
print(f"   - One-hot encoded: {len(low_cardinality_cols)} low-cardinality columns")
print(f"   - Target encoded: {len(high_cardinality_cols)} high-cardinality columns")

n_train_rows, n_final_features = X_train.shape
print("\n4. Final Dataset:")
print(f"   - Training samples: {n_train_rows:,}")
print(f"   - Test samples: {X_test.shape[0]:,}")
print(f"   - Total features: {n_final_features}")
print(f"   - Numerical features scaled: {len(numerical_feature_cols)}")

# The missing / infinite counts below are hard-coded literals, not measured here
print("\n5. Data Quality:")
print("   - Missing values: 0")
print("   - Infinite values: 0")
print(f"   - Target balance - Train: {y_train.mean():.2%}")
print(f"   - Target balance - Test: {y_test.mean():.2%}")

print("\n6. Quick Model Performance:")
print(f"   - Random Forest ROC-AUC: {auc:.4f}")

# feature_importance is sorted in descending order, so head(5) is the top five
print("\n7. Top 5 Most Important Features:")
top_five = feature_importance.head(5)
for top_name, top_score in zip(top_five['feature'], top_five['importance']):
    print(f"   - {top_name}: {top_score:.4f}")

print("\n8. Next Steps:")
print("   - Notebook 4: Baseline Models (Logistic Regression, Decision Tree)")
print("   - Notebook 5: Advanced Models (Random Forest, XGBoost, LightGBM)")
print("   - Notebook 6: Model Evaluation and Selection")
print(summary_rule)