In [1]:
# StackOverflow Survey - Advanced Feature Engineering
# Handles complex categorical variables specific to developer survey data

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
print("🛠️ STACKOVERFLOW FEATURE ENGINEERING")
print("="*50)

# Load the cleaned dataset from previous analysis
df = pd.read_csv('stackoverflow_salary_clean.csv')
TARGET_VARIABLE = 'ConvertedCompYearly'

print(f"Dataset shape: {df.shape}")
print(f"Target variable: {TARGET_VARIABLE}")

# Create a copy for feature engineering
df_features = df.copy()

🛠️ STACKOVERFLOW FEATURE ENGINEERING
Dataset shape: (21782, 114)
Target variable: ConvertedCompYearly


In [8]:
# ================================================================================
# STEP 1: Handle Multi-Response Categorical Variables
# ================================================================================
print(f"\n🔗 STEP 1: MULTI-RESPONSE CATEGORICAL ENCODING")
print("="*50)

# Define multi-response columns (common in StackOverflow survey)
MULTI_RESPONSE_COLUMNS = [
    'DevType',                    # Multiple developer types
    'LanguageHaveWorkedWith',     # Programming languages
    'LanguageWantToWorkWith',     # Languages want to learn
    'DatabaseHaveWorkedWith',     # Databases used
    'DatabaseWantToWorkWith',     # Databases want to learn
    'PlatformHaveWorkedWith',     # Platforms used
    'PlatformWantToWorkWith',     # Platforms want to learn
    'WebframeHaveWorkedWith',     # Web frameworks used
    'WebframeWantToWorkWith',     # Web frameworks want to learn
    'MiscTechHaveWorkedWith',     # Other technologies
    'MiscTechWantToWorkWith',     # Other technologies want to learn
    'ToolsTechHaveWorkedWith',    # Tools and technologies
    'ToolsTechWantToWorkWith',    # Tools want to learn
    'NEWCollabToolsHaveWorkedWith', # Collaboration tools
    'NEWCollabToolsWantToWorkWith', # Collaboration tools want to learn
    'OpSysProfessional',          # Operating systems
    'OpSysPersonal',              # Personal OS
]

# Filter to only include columns that exist in dataset
multi_response_cols = [col for col in MULTI_RESPONSE_COLUMNS if col in df_features.columns]
print(f"Multi-response columns found: {len(multi_response_cols)}")
for col in multi_response_cols:
    print(f"  • {col}")

def encode_multi_response_column(df, column_name, min_frequency=50, max_features=20):
    """
    Encode multi-response categorical column (semicolon-separated values)
    
    Parameters:
    - df: DataFrame
    - column_name: Name of the column to encode
    - min_frequency: Minimum frequency for a category to be included
    - max_features: Maximum number of features to create
    
    Returns:
    - DataFrame with new binary columns
    """
    if column_name not in df.columns:
        return pd.DataFrame()
    
    print(f"\n🔧 Encoding {column_name}:")

    # Extract all unique values
    all_values = []
    for responses in df[column_name].dropna():
        if pd.notna(responses):
            values = [val.strip() for val in str(responses).split(';')]
            all_values.extend(values)

    # Count frequencies
    value_counts = pd.Series(all_values).value_counts()
    
    # Filter by minimum frequency and max features
    top_values = value_counts[value_counts >= min_frequency].head(max_features)
    
    print(f"  • Total unique values: {len(value_counts)}")
    print(f"  • Values with ≥{min_frequency} occurrences: {len(top_values)}")
    print(f"  • Creating {len(top_values)} binary features")
    
    # Create binary columns
    encoded_df = pd.DataFrame()
    for value in top_values.index:
        # Clean column name
        safe_name = re.sub(r'[^a-zA-Z0-9_]', '_', value)
        column_name_clean = f"{column_name}_{safe_name}"
        
        # Create binary feature
        encoded_df[column_name_clean] = df[column_name].apply(
            lambda x: 1 if pd.notna(x) and value in str(x) else 0
        )
    
    return encoded_df



🔗 STEP 1: MULTI-RESPONSE CATEGORICAL ENCODING
Multi-response columns found: 15
  • DevType
  • LanguageHaveWorkedWith
  • LanguageWantToWorkWith
  • DatabaseHaveWorkedWith
  • DatabaseWantToWorkWith
  • PlatformHaveWorkedWith
  • PlatformWantToWorkWith
  • WebframeHaveWorkedWith
  • WebframeWantToWorkWith
  • MiscTechHaveWorkedWith
  • MiscTechWantToWorkWith
  • ToolsTechHaveWorkedWith
  • ToolsTechWantToWorkWith
  • NEWCollabToolsHaveWorkedWith
  • NEWCollabToolsWantToWorkWith


In [9]:
# Apply multi-response encoding to key technology columns
tech_columns_to_encode = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith', 
    'WebframeHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'DevType'
]

for col in tech_columns_to_encode:
    if col in df_features.columns:
        encoded_features = encode_multi_response_column(df_features, col, min_frequency=100, max_features=15)
        df_features = pd.concat([df_features, encoded_features], axis=1)


🔧 Encoding LanguageHaveWorkedWith:
  • Total unique values: 49
  • Values with ≥100 occurrences: 15
  • Creating 15 binary features

🔧 Encoding DatabaseHaveWorkedWith:
  • Total unique values: 35
  • Values with ≥100 occurrences: 15
  • Creating 15 binary features

🔧 Encoding WebframeHaveWorkedWith:
  • Total unique values: 36
  • Values with ≥100 occurrences: 15
  • Creating 15 binary features

🔧 Encoding PlatformHaveWorkedWith:
  • Total unique values: 27
  • Values with ≥100 occurrences: 15
  • Creating 15 binary features

🔧 Encoding DevType:
  • Total unique values: 34
  • Values with ≥100 occurrences: 15
  • Creating 15 binary features


In [10]:
# ================================================================================
# STEP 2: Handle Simple Categorical Variables
# ================================================================================
print(f"\n📝 STEP 2: SIMPLE CATEGORICAL ENCODING")
print("="*50)

# Define simple categorical columns and their encoding strategy
SIMPLE_CATEGORICAL_COLUMNS = {
    'Country': 'frequency',           # High cardinality - use frequency encoding
    'EdLevel': 'ordinal',            # Ordinal - use custom ordinal encoding
    'Employment': 'onehot',          # Low cardinality - one-hot encode
    'CompanyType': 'onehot',         # Low cardinality - one-hot encode
    'OrgSize': 'ordinal',            # Ordinal - use custom ordinal encoding
    'WorkRemote': 'onehot',          # Low cardinality - one-hot encode
    'MainBranch': 'onehot',          # Low cardinality - one-hot encode
    'Gender': 'onehot',              # Low cardinality - one-hot encode
    'JobSat': 'ordinal',             # Ordinal satisfaction levels
    'CareerSat': 'ordinal',          # Ordinal satisfaction levels
    'MentalHealth': 'onehot',        # Low cardinality - one-hot encode
    'Ethnicity': 'frequency',        # High cardinality - use frequency encoding
    'OpSysProfessional': 'onehot',   # Low cardinality - one-hot encode
    'OpSysPersonal': 'onehot',       # Low cardinality - one-hot encode
}

# Filter to only include columns that exist
simple_categorical_cols = {k: v for k, v in SIMPLE_CATEGORICAL_COLUMNS.items() if k in df_features.columns}
print(f"Simple categorical columns found: {len(simple_categorical_cols)}")


📝 STEP 2: SIMPLE CATEGORICAL ENCODING
Simple categorical columns found: 6


In [13]:
def encode_ordinal_column(df, column_name, custom_order=None):
    """
    Encode ordinal categorical column with proper ordering
    """
    if column_name not in df.columns:
        return pd.Series()
    
    print(f"  🔢 Ordinal encoding: {column_name}")
    
    # Define custom orderings for common StackOverflow columns
    ordinal_mappings = {
        'EdLevel': {
            'Primary/elementary school': 1,
            'Secondary school': 2,
            'Some college/university study without earning a degree': 3,
            'Associate degree': 4,
            "Bachelor's degree": 5,
            "Master's degree": 6,
            'Professional degree': 7,
            'Doctorate': 8
        },
        'OrgSize': {
            'Just me - I am a freelancer, sole proprietor, etc.': 1,
            '2-9 employees': 2,
            '10-19 employees': 3,
            '20-99 employees': 4,
            '100-499 employees': 5,
            '500-999 employees': 6,
            '1,000-4,999 employees': 7,
            '5,000-9,999 employees': 8,
            '10,000 or more employees': 9
        },
        'JobSat': {
            'Very dissatisfied': 1,
            'Slightly dissatisfied': 2,
            'Neither satisfied nor dissatisfied': 3,
            'Slightly satisfied': 4,
            'Very satisfied': 5
        },
        'CareerSat': {
            'Very dissatisfied': 1,
            'Slightly dissatisfied': 2,
            'Neither satisfied nor dissatisfied': 3,
            'Slightly satisfied': 4,
            'Very satisfied': 5
        }
    }
    
    if custom_order:
        mapping = custom_order
    elif column_name in ordinal_mappings:
        mapping = ordinal_mappings[column_name]
    else:
        # Default: use alphabetical order
        unique_values = sorted(df[column_name].dropna().unique())
        mapping = {val: i+1 for i, val in enumerate(unique_values)}
    
    print(f"    • Mapping: {mapping}")
    encoded_series = df[column_name].map(mapping)
    
    return encoded_series

In [14]:
def encode_frequency_column(df, column_name):
    """
    Encode high-cardinality categorical column using frequency encoding
    """
    if column_name not in df.columns:
        return pd.Series()
    
    print(f"  📊 Frequency encoding: {column_name}")
    
    # Calculate frequency of each category
    frequency_map = df[column_name].value_counts().to_dict()
    encoded_series = df[column_name].map(frequency_map)
    
    print(f"    • Unique categories: {len(frequency_map)}")
    print(f"    • Top 5 categories: {dict(list(frequency_map.items())[:5])}")
    
    return encoded_series

In [15]:
def encode_onehot_column(df, column_name, max_categories=10):
    """
    One-hot encode categorical column
    """
    if column_name not in df.columns:
        return pd.DataFrame()
    
    print(f"  🎯 One-hot encoding: {column_name}")
    
    # Limit categories to prevent too many features
    value_counts = df[column_name].value_counts()
    if len(value_counts) > max_categories:
        top_categories = value_counts.head(max_categories).index
        df_limited = df[column_name].apply(
            lambda x: x if x in top_categories else 'Other'
        )
        print(f"    • Limited to top {max_categories} categories (+ Other)")
    else:
        df_limited = df[column_name]
    
    # Create dummy variables
    dummy_df = pd.get_dummies(df_limited, prefix=column_name, drop_first=True)
    print(f"    • Created {dummy_df.shape[1]} binary features")
    
    return dummy_df

In [16]:
# Apply encoding strategies
for col_name, strategy in simple_categorical_cols.items():
    print(f"\n🔧 Processing {col_name} (strategy: {strategy}):")
    
    if strategy == 'ordinal':
        encoded_series = encode_ordinal_column(df_features, col_name)
        if not encoded_series.empty:
            df_features[f'{col_name}_encoded'] = encoded_series
    
    elif strategy == 'frequency':
        encoded_series = encode_frequency_column(df_features, col_name)
        if not encoded_series.empty:
            df_features[f'{col_name}_frequency'] = encoded_series
    
    elif strategy == 'onehot':
        encoded_df = encode_onehot_column(df_features, col_name)
        if not encoded_df.empty:
            df_features = pd.concat([df_features, encoded_df], axis=1)


🔧 Processing Country (strategy: frequency):
  📊 Frequency encoding: Country
    • Unique categories: 156
    • Top 5 categories: {'United States of America': 4590, 'Germany': 2020, 'United Kingdom of Great Britain and Northern Ireland': 1375, 'Ukraine': 1044, 'France': 908}

🔧 Processing EdLevel (strategy: ordinal):
  🔢 Ordinal encoding: EdLevel
    • Mapping: {'Primary/elementary school': 1, 'Secondary school': 2, 'Some college/university study without earning a degree': 3, 'Associate degree': 4, "Bachelor's degree": 5, "Master's degree": 6, 'Professional degree': 7, 'Doctorate': 8}

🔧 Processing Employment (strategy: onehot):
  🎯 One-hot encoding: Employment
    • Limited to top 10 categories (+ Other)
    • Created 10 binary features

🔧 Processing OrgSize (strategy: ordinal):
  🔢 Ordinal encoding: OrgSize
    • Mapping: {'Just me - I am a freelancer, sole proprietor, etc.': 1, '2-9 employees': 2, '10-19 employees': 3, '20-99 employees': 4, '100-499 employees': 5, '500-999 employees

In [17]:
# ================================================================================
# STEP 3: Handle Numerical Variables and Create Interactions
# ================================================================================
print(f"\n📊 STEP 3: NUMERICAL FEATURE ENGINEERING")
print("="*50)

# Identify numerical columns
numerical_cols = df_features.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col != TARGET_VARIABLE]

print(f"Numerical columns: {len(numerical_cols)}")


📊 STEP 3: NUMERICAL FEATURE ENGINEERING
Numerical columns: 92


In [20]:
# Handle specific numerical columns with domain knowledge
if 'YearsCodePro' in df_features.columns:
    print(f"\n🔧 Engineering YearsCodePro:")

    # Convert to numeric, coerce errors to NaN
    df_features['YearsCodePro'] = pd.to_numeric(df_features['YearsCodePro'], errors='coerce')
    
    # Create experience categories
    df_features['Experience_Category'] = pd.cut(
        df_features['YearsCodePro'], 
        bins=[0, 2, 5, 10, 20, 50], 
        labels=['Junior', 'Mid', 'Senior', 'Lead', 'Expert']
    )
    
    # Create experience squared (for non-linear relationships)
    df_features['YearsCodePro_squared'] = df_features['YearsCodePro'] ** 2
    
    print(f"  • Created experience categories")
    print(f"  • Created experience squared feature")

if 'Age' in df_features.columns:
    print(f"\n🔧 Engineering Age:")
    
    # Convert to numeric, coerce errors to NaN
    df_features['Age'] = pd.to_numeric(df_features['Age'], errors='coerce')
    
    # Create age categories
    df_features['Age_Category'] = pd.cut(
        df_features['Age'], 
        bins=[0, 25, 35, 45, 55, 100], 
        labels=['Young', 'Early_Career', 'Mid_Career', 'Senior_Career', 'Veteran']
    )
    
    print(f"  • Created age categories")

if 'WorkWeekHrs' in df_features.columns:
    print(f"\n🔧 Engineering WorkWeekHrs:")
    
    # Convert to numeric, coerce errors to NaN
    df_features['WorkWeekHrs'] = pd.to_numeric(df_features['WorkWeekHrs'], errors='coerce')
    
    # Create work-life balance categories
    df_features['Work_Life_Balance'] = pd.cut(
        df_features['WorkWeekHrs'], 
        bins=[0, 35, 40, 45, 50, 100], 
        labels=['Part_Time', 'Standard', 'Moderate_Overtime', 'High_Overtime', 'Extreme_Overtime']
    )
    
    print(f"  • Created work-life balance categories")


🔧 Engineering YearsCodePro:
  • Created experience categories
  • Created experience squared feature

🔧 Engineering Age:
  • Created age categories


In [21]:
# Create interaction features between important variables
print(f"\n🔗 Creating interaction features:")

interaction_pairs = [
    ('YearsCodePro', 'EdLevel_encoded'),
    ('YearsCodePro', 'OrgSize_encoded'),
    ('Age', 'YearsCodePro'),
]

for col1, col2 in interaction_pairs:
    if col1 in df_features.columns and col2 in df_features.columns:
        # Multiplicative interaction
        df_features[f'{col1}_x_{col2}'] = df_features[col1] * df_features[col2]
        print(f"  • Created {col1} × {col2}")


🔗 Creating interaction features:
  • Created YearsCodePro × EdLevel_encoded
  • Created YearsCodePro × OrgSize_encoded
  • Created Age × YearsCodePro


In [22]:
# ================================================================================
# STEP 4: Feature Selection and Final Preparation
# ================================================================================
print(f"\n🎯 STEP 4: FEATURE SELECTION AND FINAL PREPARATION")
print("="*50)

# Remove original categorical columns that were encoded
original_categorical_cols = list(simple_categorical_cols.keys()) + multi_response_cols
original_categorical_cols = [col for col in original_categorical_cols if col in df_features.columns]

print(f"Removing {len(original_categorical_cols)} original categorical columns")

# Keep the target variable and remove original categoricals
columns_to_keep = [col for col in df_features.columns if col not in original_categorical_cols or col == TARGET_VARIABLE]
df_final = df_features[columns_to_keep].copy()

print(f"Dataset shape after feature engineering: {df_final.shape}")


🎯 STEP 4: FEATURE SELECTION AND FINAL PREPARATION
Removing 21 original categorical columns
Dataset shape after feature engineering: (21782, 189)


In [24]:
# Handle any remaining missing values
print(f"\nHandling missing values:")
missing_counts = df_final.isnull().sum()
columns_with_missing = missing_counts[missing_counts > 0]

if len(columns_with_missing) > 0:
    print(f"Columns with missing values: {len(columns_with_missing)}")
    
    for col in columns_with_missing.index:
        if df_final[col].dtype in ['float64', 'int64']:
            # Fill numerical columns with median
            df_final[col] = df_final[col].fillna(df_final[col].median())
        else:
            # Check if column is categorical
            if pd.api.types.is_categorical_dtype(df_final[col]):
                # Add 'Unknown' to the categories first if using it
                if len(df_final[col].mode()) > 0:
                    fill_value = df_final[col].mode().iloc[0]
                    df_final[col] = df_final[col].fillna(fill_value)
                else:
                    # Add 'Unknown' to categories before filling
                    df_final[col] = df_final[col].cat.add_categories(['Unknown'])
                    df_final[col] = df_final[col].fillna('Unknown')
            else:
                # For non-categorical columns
                df_final[col] = df_final[col].fillna(df_final[col].mode().iloc[0] if len(df_final[col].mode()) > 0 else 'Unknown')
    
    print(f"✅ Missing values handled")
else:
    print(f"✅ No missing values found")


Handling missing values:
Columns with missing values: 6
✅ Missing values handled


In [29]:
from sklearn.preprocessing import OneHotEncoder  # Fixed the import statement
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # Added imputer for handling missing values
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import pandas as pd

# First, check that X and y have the same number of rows
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Make sure categorical_cols and numeric_cols are defined correctly
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create preprocessing pipeline with imputation steps
# Remove the 'passthrough' step which is causing issues
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_cols),  # Simplified numeric pipeline
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# The beginning of your code remains the same until after pipeline.fit(X_train, y_train)

# Get feature importance
model = pipeline.named_steps['model']
preprocessor = pipeline.named_steps['preprocessor']

# Get feature names after transformation - CORRECTED APPROACH
# Use get_feature_names_out() method from the entire preprocessor
try:
    # For scikit-learn >= 1.0
    feature_names = preprocessor.get_feature_names_out()
except AttributeError:
    # For older scikit-learn versions
    # This is a more complex fallback approach
    feature_names = []
    
    # Add numeric feature names
    if len(numeric_cols) > 0:
        feature_names.extend(numeric_cols)
    
    # Add transformed categorical feature names
    if len(categorical_cols) > 0:
        ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
        cat_feature_names = ohe.get_feature_names_out(categorical_cols)
        feature_names.extend(cat_feature_names)

# Ensure feature_names length matches feature_importances_ length
if len(feature_names) != len(model.feature_importances_):
    print(f"Warning: Feature names length ({len(feature_names)}) doesn't match "
          f"feature importances length ({len(model.feature_importances_)})")
    # Use generic feature names as fallback
    feature_names = [f'feature_{i}' for i in range(len(model.feature_importances_))]

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nTop 20 most important features:")
print(feature_importance.head(20))

# Basic model performance
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"\nBaseline Random Forest performance:")
print(f"  • R² Score: {r2:.3f}")
print(f"  • Mean Absolute Error: ${mae:,.0f}")

X shape: (21782, 188), y shape: (21782,)

Top 20 most important features:
                                      feature  importance
2                              num__CompTotal    0.467174
88                     num__Country_frequency    0.315317
1                           num__YearsCodePro    0.031459
91                  num__YearsCodePro_squared    0.030669
6358        cat__Currency_BRL\tBrazilian real    0.018735
6376          cat__Currency_EUR European Euro    0.013996
6387    cat__Currency_ILS\tIsraeli new shekel    0.010748
6447  cat__Currency_USD\tUnited States dollar    0.009626
6363           cat__Currency_CHF\tSwiss franc    0.007960
6362       cat__Currency_CAD\tCanadian dollar    0.006307
6351     cat__Currency_AUD\tAustralian dollar    0.006269
6378        cat__Currency_GBP\tPound sterling    0.005770
6426          cat__Currency_PLN\tPolish zloty    0.004440
6371          cat__Currency_DKK\tDanish krone    0.004338
6435      cat__Currency_SGD\tSingapore dollar    0.00401

In [30]:
# ================================================================================
# STEP 6: Save Processed Data
# ================================================================================
print(f"\n💾 STEP 6: SAVING PROCESSED DATA")
print("="*50)

# Save the final dataset
df_final.to_csv('stackoverflow_features_engineered.csv', index=False)
print(f"✅ Feature engineered dataset saved: 'stackoverflow_features_engineered.csv'")

# Save feature importance
feature_importance.to_csv('feature_importance.csv', index=False)
print(f"✅ Feature importance saved: 'feature_importance.csv'")

# Save feature engineering summary
feature_summary = {
    'original_shape': df.shape,
    'final_shape': df_final.shape,
    'features_added': df_final.shape[1] - df.shape[1],
    'multi_response_columns_processed': len([col for col in multi_response_cols if col in df.columns]),
    'simple_categorical_columns_processed': len(simple_categorical_cols),
    'top_features': feature_importance.head(10)['feature'].tolist(),
    'baseline_r2_score': r2,
    'baseline_mae': mae
}

import json
with open('feature_engineering_summary.json', 'w') as f:
    json.dump(feature_summary, f, indent=2, default=str)

print(f"✅ Feature engineering summary saved: 'feature_engineering_summary.json'")



💾 STEP 6: SAVING PROCESSED DATA
✅ Feature engineered dataset saved: 'stackoverflow_features_engineered.csv'
✅ Feature importance saved: 'feature_importance.csv'
✅ Feature engineering summary saved: 'feature_engineering_summary.json'


In [31]:
# ================================================================================
# STEP 7: Feature Engineering Report
# ================================================================================
print(f"\n📋 FEATURE ENGINEERING REPORT")
print("="*50)

print(f"🎯 TRANSFORMATION SUMMARY:")
print(f"  • Original dataset: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"  • Final dataset: {df_final.shape[0]:,} rows × {df_final.shape[1]} columns")
print(f"  • Features added: {df_final.shape[1] - df.shape[1]}")
print(f"  • Multi-response columns processed: {len([col for col in multi_response_cols if col in df.columns])}")
print(f"  • Simple categorical columns processed: {len(simple_categorical_cols)}")

print(f"\n🏆 TOP 10 MOST IMPORTANT FEATURES:")
for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"  {i:2d}. {row['feature']}: {row['importance']:.3f}")

print(f"\n📊 BASELINE MODEL PERFORMANCE:")
print(f"  • R² Score: {r2:.3f} (explains {r2*100:.1f}% of salary variance)")
print(f"  • Mean Absolute Error: ${mae:,.0f}")
print(f"  • Model predicts salary within ${mae:,.0f} on average")

print(f"\n🚀 READY FOR ADVANCED MODELING!")
print(f"Next steps:")
print(f"  1. Advanced model training (Random Forest, Gradient Boosting, etc.)")
print(f"  2. Hyperparameter tuning")
print(f"  3. Model evaluation and comparison")
print(f"  4. Creative scenario predictions")
print(f"  5. Blog post writing with insights")

print(f"\n✅ FEATURE ENGINEERING COMPLETE!")


📋 FEATURE ENGINEERING REPORT
🎯 TRANSFORMATION SUMMARY:
  • Original dataset: 21,782 rows × 114 columns
  • Final dataset: 21,782 rows × 189 columns
  • Features added: 75
  • Multi-response columns processed: 15
  • Simple categorical columns processed: 6

🏆 TOP 10 MOST IMPORTANT FEATURES:
   1. num__CompTotal: 0.467
   2. num__Country_frequency: 0.315
   3. num__YearsCodePro: 0.031
   4. num__YearsCodePro_squared: 0.031
   5. cat__Currency_BRL	Brazilian real: 0.019
   6. cat__Currency_EUR European Euro: 0.014
   7. cat__Currency_ILS	Israeli new shekel: 0.011
   8. cat__Currency_USD	United States dollar: 0.010
   9. cat__Currency_CHF	Swiss franc: 0.008
  10. cat__Currency_CAD	Canadian dollar: 0.006

📊 BASELINE MODEL PERFORMANCE:
  • R² Score: 0.952 (explains 95.2% of salary variance)
  • Mean Absolute Error: $2,947
  • Model predicts salary within $2,947 on average

🚀 READY FOR ADVANCED MODELING!
Next steps:
  1. Advanced model training (Random Forest, Gradient Boosting, etc.)
  2. Hy