#### **1. Import Libraries**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Core libraries
import warnings
# Installs a blanket "ignore" filter, so no warning of any category is shown
# from here on.
# NOTE(review): this presumably also hides numerical RuntimeWarnings raised by
# the log transforms further down — consider narrowing the filter to the
# specific categories that are known to be noisy.
warnings.filterwarnings('ignore')

# Preprocessing libraries
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix


# Statistical libraries
from scipy import stats
from scipy.stats import zscore, skew

#### **2. Data Ingestion**

In [None]:
# Location of the raw CSV (local path or URL). It is left blank in this
# template and must be filled in before the cell is run.
url = ''
if not url:
    # Fail with an actionable message instead of the opaque file-not-found
    # error that read_csv raises for an empty path.
    raise ValueError("Set `url` to the dataset's CSV path or URL before loading.")

df = pd.read_csv(url)

# Only the final bare expression of a notebook cell is rendered, so the head
# preview is printed explicitly; the tail stays as the cell's last expression.
print(df.head())
df.tail()

#### **2. Preliminary Descriptive Analysis (PDA)**

In [None]:

# --- Shape and schema -------------------------------------------------------
print(f"Rows: {df.shape[0]:,}")
print(f"Columns: {df.shape[1]}")
df.info()  # info() prints by itself
# Bare expressions in the middle of a cell are evaluated and discarded, so
# every intermediate result below is printed explicitly.
print(df.dtypes)

# --- Duplicates -------------------------------------------------------------
print(f"Duplicate rows: {df.duplicated().sum()}")
df = df.drop_duplicates()

# --- Summary statistics -----------------------------------------------------
print(df.describe().T)
print(df['quality'].unique())

# Bucket the numeric quality score into a coarse label. Scores that are not
# keys of this mapping become NaN.
df['quality_label'] = df['quality'].map({
    3: 'worst',
    4: 'bad',
    5: 'good',
    6: 'good',
    7: 'best',
    8: 'best',
    9: 'best',
})

# --- Per-column distribution report ------------------------------------------
for column in df.columns:
    series = df[column]
    print(f"\n--- {column} ---")

    # Skewness and kurtosis are only defined for numeric data; calling them on
    # the string column 'quality_label' would raise a TypeError.
    is_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    if is_numeric:
        print(f"Skewness: {series.skew():.3f}")
        print(f"Kurtosis: {series.kurt():.3f}")

    # Frequency + proportion
    freq = series.value_counts()
    prop = series.value_counts(normalize=True) * 100
    summary = pd.DataFrame({'Count': freq, 'Percentage': prop.round(2)})
    print(summary)

#### **3. Log-Transform Skewed Variables (EDA Recommendation)**

In [None]:
# Log-transform skewed variables as recommended by EDA.
# For each listed column that exists in df, a new '<name>_log' column is added;
# the original column is left untouched.
print("=== LOG-TRANSFORMING SKEWED VARIABLES ===")
print("EDA identified these variables as right-skewed and recommended log transformation:")

# Variables to log-transform based on EDA findings
skewed_vars = ['x', 'y', 'z']

log_columns = []
for var in skewed_vars:
    if var not in df.columns:
        # Make the skip visible instead of silently doing nothing.
        print(f"✗ {var}: column not found in dataset, skipped")
        continue

    log_col = f'{var}_log'
    min_val = df[var].min()

    if min_val > 0:
        # Strictly positive data: the plain logarithm is defined everywhere.
        df[log_col] = np.log(df[var])
        print(f"✓ {var}: Applied log transformation")
    elif min_val > -1:
        # Zeros / small negatives: log(1 + x) is still defined for x > -1.
        df[log_col] = np.log1p(df[var])
        print(f"✓ {var}: Applied log1p transformation (had {min_val:.3f} minimum value)")
    else:
        # Values <= -1 would make log1p return -inf or NaN (silently, since
        # warnings are suppressed), so shift the data so its minimum maps to 0.
        df[log_col] = np.log1p(df[var] - min_val)
        print(f"✓ {var}: Applied shifted log1p transformation (had {min_val:.3f} minimum value)")

    log_columns.append(log_col)

    # Check skewness before and after. Missing values are dropped first
    # because scipy's skew() returns NaN as soon as one NaN is present.
    original_skew = skew(df[var].dropna())
    transformed_skew = skew(df[log_col].dropna())
    print(f"  Original skewness: {original_skew:.3f} → Transformed skewness: {transformed_skew:.3f}")

print(f"\nDataset shape after log transformation: {df.shape}")
print("New log-transformed columns:", log_columns)


#### **4. Outlier Treatment (EDA Recommendation)**

In [None]:
# Outlier treatment based on EDA recommendations.
# Every numeric feature is winsorised to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
print("=== OUTLIER TREATMENT (IQR-CAPPING METHOD) ===")
print("EDA recommended IQR-capping method for outlier treatment.\n")

# Define numerical columns (excluding target)
numerical_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != 'quality'
]

print(f"Treating outliers in {len(numerical_cols)} numerical features...")

# Apply IQR-capping method
outliers_capped = 0
for col in numerical_cols:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1

    if iqr == 0:
        # With Q1 == Q3 both bounds collapse onto a single value, so capping
        # would overwrite every other observation and turn the feature into a
        # constant. Leave such columns untouched.
        print(f"- {col}: IQR is 0, skipped (capping would flatten the column)")
        continue

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Count outliers before capping
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    outliers_before = int(is_outlier.sum())

    if outliers_before:
        # clip() caps both tails in one pass and leaves NaN values as they are.
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
        outliers_capped += outliers_before
        print(f"✓ {col}: Capped {outliers_before} outliers")

print(f"\nTotal outliers capped: {outliers_capped}")
print(f"Dataset shape after outlier treatment: {df.shape}")

### 5. Encoding (label encoding and one-hot encoding)

In [None]:
# Label encoding (binary columns): explicit value -> integer mappings.
label_map = {
    'has_job_experience': {'Y': 1, 'N': 0},
    'requires_job_training': {'Y': 1, 'N': 0},
    'full_time_position': {'Y': 1, 'N': 0},
    'case_status': {'Certified': 1, 'Denied': 0},
}

for col, mapping in label_map.items():
    if col not in df.columns:
        continue
    # Series.map() turns every value that is not a key of the mapping into
    # NaN, so report such values instead of losing them silently.
    unmapped = df[col].notna() & ~df[col].isin(list(mapping))
    if unmapped.any():
        print(f"! {col}: {int(unmapped.sum())} value(s) outside {list(mapping)} will become NaN")
    df[col] = df[col].map(mapping)

# One-hot encoding (multi-category columns). Defined once, outside the loop
# above, and restricted to the columns that actually exist so that a missing
# column is skipped here just as it is for label encoding.
onehot_cols = [
    'continent',
    'education_of_employee',
    'region_of_employment',
    'unit_of_wage',
]
present_onehot_cols = [col for col in onehot_cols if col in df.columns]

if present_onehot_cols:
    df = pd.get_dummies(df, columns=present_onehot_cols, drop_first=False, dtype=int)

print(f"Dataset shape after encoding: {df.shape}")
print("Transformation (encoding) complete.")

### 6. Data Splitting

In [None]:
print("=== STRATIFIED DATA SPLITTING ===")
print("EDA identified class imbalance - using stratified splitting to preserve class distribution")

# Name of the target column; used for the split and for every message below so
# the printed labels always match the column that is actually being split on.
target_col = 'Risk'

# Split features and target. The target is bound to lowercase `y`, which is
# the name every later statement uses.
X = df.drop(columns=target_col)
y = df[target_col]

# One-hot encode any remaining categorical feature columns.
X = pd.get_dummies(X, drop_first=True)

print(f"Selected features shape: {X.shape}")
print(f"Target variable: '{target_col}'")
print(f"Unique class distribution:\n{y.value_counts(normalize=True).round(3)}")

# ---------- First Split (80% train+val, 20% test) ----------
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------- Second Split (75% train, 25% val of 80%) ----------
# 25% of the remaining 80% gives an overall 60 / 20 / 20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# ---------- Summary ----------
n_total = len(X)
print("\n=== DATA SPLIT RESULTS ===")
print(f"Training set:   {X_train.shape} ({(X_train.shape[0] / n_total) * 100:.1f}%)")
print(f"Validation set: {X_val.shape} ({(X_val.shape[0] / n_total) * 100:.1f}%)")
print(f"Test set:       {X_test.shape} ({(X_test.shape[0] / n_total) * 100:.1f}%)")

# ---------- Class Distribution Verification ----------
print("\n=== CLASS DISTRIBUTION CHECK ===")
print(f"Training set {target_col} distribution:")
print(y_train.value_counts(normalize=True).round(3))
print(f"\nValidation set {target_col} distribution:")
print(y_val.value_counts(normalize=True).round(3))
print(f"\nTest set {target_col} distribution:")
print(y_test.value_counts(normalize=True).round(3))


### 7. Feature Scaling (Normalization / Standardization)

In [None]:
print("=== FEATURE SCALING (STANDARD SCALER) ===")
print("EDA recommended StandardScaler for distance-based models\n")

# Replace inf values, then impute missing values.
# The medians are learned from the training split only (after the infinities
# have been removed) and reused for validation and test, so no information
# from the held-out sets leaks into preprocessing.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_val = X_val.replace([np.inf, -np.inf], np.nan).fillna(train_medians)
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(train_medians)

# Initialize scaler and fit on training data only
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)

# Transform validation and test sets with the statistics learned from train
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Report summary stats.
# StandardScaler divides by the population standard deviation, so ddof=0 is
# used here; the pandas default (ddof=1) would be slightly off from 1.
train_mean = X_train_scaled.mean().mean()
train_std = X_train_scaled.std(ddof=0).mean()
print("✓ Scaling applied successfully!")
print(f"Training set scaled - Mean: {train_mean:.4f}, Std: {train_std:.4f}")
print(f"Validation set scaled - Mean: {X_val_scaled.mean().mean():.4f}, Std: {X_val_scaled.std(ddof=0).mean():.4f}")
print(f"Test set scaled - Mean: {X_test_scaled.mean().mean():.4f}, Std: {X_test_scaled.std(ddof=0).mean():.4f}")

# Verify scaling behavior
print("\nScaling verification:")
print(f"Training set - Mean ≈ 0: {abs(train_mean) < 0.01}")
print(f"Training set - Std ≈ 1: {abs(train_std - 1) < 0.01}")

print("\nFeature scaling complete.\n")

### 8. Save Preprocessed Data

In [None]:
print("SAVING PREPROCESSED DATA ")

# Scaled feature matrices, written without the index column.
feature_outputs = (
    (X_train_scaled, 'X_train_scaled.csv'),
    (X_val_scaled, 'X_val_scaled.csv'),
    (X_test_scaled, 'X_test_scaled.csv'),
)
for frame, path in feature_outputs:
    frame.to_csv(path, index=False)

# Matching target vectors, also written without the index column.
target_outputs = (
    (y_train, 'y_train.csv'),
    (y_val, 'y_val.csv'),
    (y_test, 'y_test.csv'),
)
for target, path in target_outputs:
    target.to_csv(path, index=False)

# Persist the fitted scaler so the same transform can be reapplied later.
import joblib
joblib.dump(scaler, 'scaler.pkl')

# Machine-readable record of what this notebook did.
# NOTE(review): 'original_shape' and 'final_shape' are both read from the
# current df, so they are always equal; the shape of the raw data is not kept
# anywhere in the visible code.
current_shape = df.shape
preprocessing_summary = dict(
    original_shape=current_shape,
    final_shape=current_shape,
    train_samples=X_train_scaled.shape[0],
    val_samples=X_val_scaled.shape[0],
    test_samples=X_test_scaled.shape[0],
    scaling_method='StandardScaler',
    outlier_treatment='IQR_capping',
    log_transformed=['x_log', 'y_log', 'z_log'],
)

import json
with open('preprocessing_summary.json', 'w') as summary_file:
    json.dump(preprocessing_summary, summary_file, indent=2)

# Confirmation plus the list of artifacts written above.
confirmation_lines = (
    "- Preprocessed data saved successfully!",
    "\nFiles created:",
    "- X_train_scaled.csv, X_val_scaled.csv, X_test_scaled.csv",
    "- y_train.csv, y_val.csv, y_test.csv",
    "- scaler.pkl",
    "- preprocessing_summary.json",
)
for line in confirmation_lines:
    print(line)


#### **9. Preprocessing Summary**

In [None]:
# Final preprocessing summary: prints dataset/split sizes followed by a fixed
# checklist of the steps this notebook performs.
# NOTE(review): both shape lines read the current df, so they always show the
# same value.
size_lines = [
    ".....PREPROCESSING SUMMARY BASED ON EDA INSIGHTS.....",
    f"Original dataset shape: {df.shape}",
    f"Final processed dataset shape: {df.shape}",
    f"Training samples: {X_train_scaled.shape[0]}",
    f"Validation samples: {X_val_scaled.shape[0]}",
    f"Test samples: {X_test_scaled.shape[0]}",
]

step_lines = [
    "\n.....PREPROCESSING STEPS COMPLETED (EDA-BASED).....",
    "> Data quality assessment (no missing values, duplicates handled)",
    "> Log-transformation of skewed variables (x, y, z)",
    "> Outlier treatment using IQR-capping method",
    "> Stratified data splitting (preserves class distribution)",
    "> StandardScaler applied (EDA recommendation for distance-based models)",
    "> Data export (ready for modeling)",
]

closing_lines = [
    "\n.....EDA EVIDENCE IMPLEMENTED.....",
    "> Skewed variables log-transformed as recommended",
    "\n Preprocessing completed successfully!!!!!!",
    ".....Ready for modeling phase with EDA-informed preprocessing",
]

for section in (size_lines, step_lines, closing_lines):
    for line in section:
        print(line)