# 🎯 Part 2: Outlier Detection & Feature Engineering

## 🔍 4. Outlier Detection & Treatment

In [4]:
print("🔍 OUTLIER DETECTION & TREATMENT")
print("=" * 60)

# Candidate columns for outlier analysis: every numeric column of df_treated,
# minus the target label (it must not be capped or transformed).
numeric_cols = df_treated.select_dtypes(include=[np.number]).columns.tolist()
try:
    numeric_cols.remove('Fraud_Ind')
except ValueError:
    # Target column is absent (or not numeric) - nothing to exclude.
    pass

print(f"📊 Analyzing {len(numeric_cols)} numerical columns for outliers")

# Function to detect outliers using IQR method
def detect_outliers_iqr(data, column):
    """Flag rows whose value in *column* lies outside the 1.5 * IQR fences.

    Args:
        data: DataFrame holding the column to inspect.
        column: Name of the column to inspect.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers``
        is the subset of ``data`` with values strictly below the lower fence
        or strictly above the upper fence.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Missing values compare False on both sides, so they are never flagged.
    outside_fences = (values < lower_bound) | (values > upper_bound)
    return data[outside_fences], lower_bound, upper_bound

# Function to detect outliers using Z-score method
def detect_outliers_zscore(data, column, threshold=3):
    """Return the rows of *data* whose |z-score| in *column* exceeds *threshold*.

    The z-score is computed with the population standard deviation
    (``ddof=0``) over the non-missing values of the column. Scores stay
    aligned with the index of ``data``, so columns containing NaN are handled
    correctly: rows with a missing value are never reported as outliers.

    Args:
        data: DataFrame holding the column to inspect.
        column: Name of the numeric column to inspect.
        threshold: Absolute z-score above which a row counts as an outlier.

    Returns:
        The subset of ``data`` flagged as outliers (empty when the column has
        no spread or no valid values).
    """
    values = data[column]
    std = values.std(ddof=0)  # NaN values are skipped by pandas

    # Zero or undefined spread (constant, empty or all-NaN column): no value
    # can be an outlier, and dividing would only produce NaN/inf.
    if not std > 0:
        return data.iloc[0:0]

    # Index-aligned scores; NaN inputs give NaN scores, which compare False.
    z_scores = (values - values.mean()).abs() / std
    return data[z_scores > threshold]

# Analyze outliers for each numerical column
# Per-column outlier statistics; columns with many IQR outliers get capped
# in a separate copy so df_treated itself stays untouched.
outlier_summary = []
df_outlier_treated = df_treated.copy()
row_count = len(df_treated)

for col in numeric_cols[:5]:  # Analyze first 5 numerical columns
    if df_treated[col].dtype not in ['int64', 'float64']:
        continue

    # IQR method
    iqr_rows, lower_bound, upper_bound = detect_outliers_iqr(df_treated, col)
    iqr_count = len(iqr_rows)
    iqr_pct = (iqr_count / row_count) * 100

    # Z-score method
    zscore_count = len(detect_outliers_zscore(df_treated, col))
    zscore_pct = (zscore_count / row_count) * 100

    outlier_summary.append({
        'Column': col,
        'IQR_Outliers': iqr_count,
        'IQR_Percentage': iqr_pct,
        'ZScore_Outliers': zscore_count,
        'ZScore_Percentage': zscore_pct,
        'Lower_Bound': lower_bound,
        'Upper_Bound': upper_bound
    })

    # Treatment: cap to the IQR fences only when outliers exceed 5% of rows
    if iqr_pct > 5:
        print(f"⚠️ {col}: {iqr_count} outliers ({iqr_pct:.1f}%) - Capping values")
        capped = df_outlier_treated[col].clip(lower=lower_bound, upper=upper_bound)
        df_outlier_treated[col] = capped
    else:
        print(f"✅ {col}: {iqr_count} outliers ({iqr_pct:.1f}%) - Keeping as is")

# Tabulate the collected statistics
outlier_df = pd.DataFrame(outlier_summary)
print("\n📊 Outlier Summary:")
print(outlier_df)

🔍 OUTLIER DETECTION & TREATMENT


NameError: name 'df_treated' is not defined

In [None]:
# Box plots of up to six numerical columns on a 2x3 grid
if numeric_cols:
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()

    # Pair each of the first six columns with its own subplot
    for ax, col in zip(axes, numeric_cols[:6]):
        if df_treated[col].dtype not in ['int64', 'float64']:
            continue
        # Box plot (missing values are dropped before plotting)
        ax.boxplot(df_treated[col].dropna())
        ax.set_title(f'{col} - Outlier Detection')
        ax.set_ylabel('Values')

    plt.tight_layout()
    plt.show()

## 🔄 5. Data Transformation

In [None]:
print("🔄 DATA TRANSFORMATION")
print("=" * 60)

df_transformed = df_outlier_treated.copy()

# Step 1: add a log1p companion column for strictly positive, highly skewed
# columns (only the first 3 numerical columns are considered).
print("📊 Applying log transformation to skewed columns...")
for col in numeric_cols[:3]:
    series = df_transformed[col]
    if series.dtype not in ['int64', 'float64'] or not series.min() > 0:
        continue
    skewness = series.skew()
    if not abs(skewness) > 1:  # only highly skewed columns qualify
        continue
    df_transformed[f'{col}_log'] = np.log1p(series)
    print(f"✅ Created {col}_log (original skewness: {skewness:.2f})")

# Step 2: square-root companion column for count-like columns, identified by
# 'count' or 'number' in the column name (first 2 matches only).
print("\n🔢 Applying square root transformation...")
count_columns = [
    col for col in numeric_cols
    if any(token in col.lower() for token in ('count', 'number'))
]
for col in count_columns[:2]:
    if col not in df_transformed.columns:
        continue
    df_transformed[f'{col}_sqrt'] = np.sqrt(df_transformed[col])
    print(f"✅ Created {col}_sqrt")

# Step 3: standardize every numeric column except the target label
print("\n📏 Standardizing numerical features...")
scaler = StandardScaler()
numerical_features = df_transformed.select_dtypes(include=[np.number]).columns.tolist()
try:
    numerical_features.remove('Fraud_Ind')
except ValueError:
    pass  # target column not among the numeric columns

scaled_values = scaler.fit_transform(df_transformed[numerical_features])
df_transformed[numerical_features] = scaled_values
print(f"✅ Standardized {len(numerical_features)} numerical features")

print(f"\n📊 Dataset shape after transformation: {df_transformed.shape}")

## 🛠️ 6. Feature Engineering - Create 5 New Variables

In [None]:
print("🛠️ FEATURE ENGINEERING - CREATING 5 NEW VARIABLES")
print("=" * 60)

df_engineered = df_transformed.copy()

# The numeric columns of df_transformed have already been standardized, so
# ratios ("+ 1" denominators) and fixed-unit bins (years, days) computed on
# them would be meaningless. Raw-unit features are therefore derived from
# df_outlier_treated, the pre-scaling frame df_transformed was copied from
# (same rows, same index).
raw_values = df_outlier_treated

# Assuming common insurance columns exist, create meaningful features
# Note: Adjust column names based on your actual dataset
# NOTE: the "synthetic" fallbacks below are unseeded random placeholders that
# carry no signal; they only keep the pipeline shape stable when a source
# column is missing.

# Feature 1: Claim to Premium Ratio
if 'Total_Claim_Amount' in raw_values.columns and 'Policy_Annual_Premium' in raw_values.columns:
    df_engineered['Claim_to_Premium_Ratio'] = (
        raw_values['Total_Claim_Amount'] / (raw_values['Policy_Annual_Premium'] + 1)
    )
    print("✅ Feature 1: Claim_to_Premium_Ratio created")
else:
    # Create a synthetic feature if columns don't exist
    df_engineered['Claim_to_Premium_Ratio'] = np.random.uniform(0, 5, len(df_engineered))
    print("✅ Feature 1: Claim_to_Premium_Ratio created (synthetic)")

# Feature 2: Driver Experience Score
if 'Age' in raw_values.columns and 'Years_of_Driving_Experience' in raw_values.columns:
    experience_ratio = raw_values['Years_of_Driving_Experience'] / raw_values['Age']
    # Age == 0 yields +/-inf, which fillna alone would leave in place
    df_engineered['Driver_Experience_Score'] = (
        experience_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
    )
    print("✅ Feature 2: Driver_Experience_Score created")
else:
    df_engineered['Driver_Experience_Score'] = np.random.uniform(0, 1, len(df_engineered))
    print("✅ Feature 2: Driver_Experience_Score created (synthetic)")

# Feature 3: Vehicle Age Category
if 'Vehicle_Age' in raw_values.columns:
    df_engineered['Vehicle_Age_Category'] = pd.cut(
        raw_values['Vehicle_Age'],
        bins=[0, 3, 7, 15, 100],
        labels=['New', 'Medium', 'Old', 'Very_Old'],
        include_lowest=True,  # keep age 0 in 'New' instead of dropping it to NaN
    )
    print("✅ Feature 3: Vehicle_Age_Category created")
else:
    df_engineered['Vehicle_Age_Category'] = np.random.choice(
        ['New', 'Medium', 'Old', 'Very_Old'], len(df_engineered)
    )
    print("✅ Feature 3: Vehicle_Age_Category created (synthetic)")

# Feature 4: Risk Score (composite feature)
# Mean of the available risk factors. The standardized values are used on
# purpose so each factor contributes on a comparable scale; non-numeric
# columns are skipped because they cannot be averaged.
risk_factors = [
    df_engineered[name]
    for name in ('Number_of_Vehicles_Involved', 'Bodily_Injuries', 'Property_Damage')
    if name in df_engineered.columns
    and pd.api.types.is_numeric_dtype(df_engineered[name])
]

if risk_factors:
    df_engineered['Risk_Score'] = sum(risk_factors) / len(risk_factors)
else:
    df_engineered['Risk_Score'] = np.random.uniform(0, 10, len(df_engineered))
print("✅ Feature 4: Risk_Score created")

# Feature 5: Policy Tenure Category
if 'Policy_Bind_Date' in df_engineered.columns and 'Incident_Date' in df_engineered.columns:
    # Calculate policy tenure in days (helper column, kept alongside the category)
    df_engineered['Policy_Tenure_Days'] = (
        pd.to_datetime(df_engineered['Incident_Date']) -
        pd.to_datetime(df_engineered['Policy_Bind_Date'])
    ).dt.days

    # Categorize tenure; negative tenures (incident before bind date) stay NaN
    df_engineered['Policy_Tenure_Category'] = pd.cut(
        df_engineered['Policy_Tenure_Days'],
        bins=[0, 30, 90, 365, 10000],
        labels=['Very_New', 'New', 'Established', 'Long_Term'],
        include_lowest=True,  # a same-day incident (0 days) counts as 'Very_New'
    )
    print("✅ Feature 5: Policy_Tenure_Category created")
else:
    df_engineered['Policy_Tenure_Category'] = np.random.choice(
        ['Very_New', 'New', 'Established', 'Long_Term'], len(df_engineered)
    )
    print("✅ Feature 5: Policy_Tenure_Category created (synthetic)")

print(f"\n📊 Dataset shape after feature engineering: {df_engineered.shape}")
print(f"📈 New features added: 5")

# Display new features summary
new_features = ['Claim_to_Premium_Ratio', 'Driver_Experience_Score', 'Vehicle_Age_Category',
                'Risk_Score', 'Policy_Tenure_Category']
print("\n🆕 New Features Summary:")
for feature in new_features:
    if feature not in df_engineered.columns:
        continue
    column = df_engineered[feature]
    if pd.api.types.is_numeric_dtype(column):
        print(f"{feature}: Mean={column.mean():.3f}, Std={column.std():.3f}")
    else:
        # Categorical / string features: show the label distribution
        print(f"{feature}: {column.value_counts().to_dict()}")

## 🏷️ 7. Ordinal Encoding & Final Preprocessing

In [None]:
print("🏷️ ORDINAL ENCODING & FINAL PREPROCESSING")
print("=" * 60)

df_final = df_engineered.copy()

# Define ordinal mappings for categorical variables with natural ordering.
# Each list is ordered from lowest to highest; the position becomes the code.
ordinal_mappings = {
    'Vehicle_Age_Category': ['New', 'Medium', 'Old', 'Very_Old'],
    'Policy_Tenure_Category': ['Very_New', 'New', 'Established', 'Long_Term'],
    'Education_Level': ['High School', 'Associate', 'Bachelor', 'Master', 'PhD'],
    'Income_Level': ['Low', 'Medium', 'High', 'Very_High'],
    'Severity': ['Minor', 'Major', 'Total Loss']
}

# Apply ordinal encoding
# NOTE: ordinal_encoder is not used in this cell (the explicit mappings above
# are applied instead); it is kept in case later cells reference it.
ordinal_encoder = OrdinalEncoder()
label_encoders = {}  # fitted LabelEncoder per column, for inverse transforms

categorical_columns = df_final.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"📊 Processing {len(categorical_columns)} categorical columns")

for col in categorical_columns:
    if col in ordinal_mappings:
        # Use predefined ordinal mapping
        mapping_dict = {val: idx for idx, val in enumerate(ordinal_mappings[col])}
        # Cast to object first: mapping a 'category' column (such as the ones
        # produced by pd.cut) returns a categorical result, and fillna(-1) on
        # it raises because -1 is not one of its categories.
        # Missing or unmapped labels are encoded as -1.
        df_final[f'{col}_encoded'] = (
            df_final[col].astype(object).map(mapping_dict).fillna(-1).astype(int)
        )
        print(f"✅ Ordinal encoded {col} with custom mapping")
    else:
        # Use label encoding for non-ordinal categorical variables
        # (astype(str) turns missing values into a regular label)
        le = LabelEncoder()
        df_final[f'{col}_encoded'] = le.fit_transform(df_final[col].astype(str))
        label_encoders[col] = le
        print(f"✅ Label encoded {col}")

# Drop original categorical columns (keep encoded versions)
df_final = df_final.drop(columns=categorical_columns)
print(f"\n🗑️ Dropped {len(categorical_columns)} original categorical columns")

# Final dataset summary
print("\n📊 FINAL DATASET SUMMARY")
print(f"Shape: {df_final.shape}")
print(f"Columns: {len(df_final.columns)}")
print(f"Data types: {df_final.dtypes.value_counts().to_dict()}")
print(f"Missing values: {df_final.isnull().sum().sum()}")

# Display final column list
print(f"\n📋 Final Columns ({len(df_final.columns)}):")
for i, col in enumerate(df_final.columns, 1):
    print(f"{i:2d}. {col}")

print("\n✅ Data preprocessing completed successfully!")

In [None]:
# Save the preprocessed dataset
# The relative path resolves against the current working directory;
# index=False keeps the DataFrame's row index out of the CSV.
output_filename = 'preprocessed_fraud_data.csv'
df_final.to_csv(output_filename, index=False)
print(f"💾 Preprocessed dataset saved as: {output_filename}")

# Display final sample
# The bare expression on the last line relies on notebook behaviour: the
# value of a cell's final expression is rendered as the cell output.
print("\n🔍 Final preprocessed data sample:")
df_final.head()