In [None]:
# Fix for wmic error in Windows
import os
os.environ["LOKY_MAX_CPU_COUNT"] = str(os.cpu_count())
print(f"Setting max CPU count to: {os.environ['LOKY_MAX_CPU_COUNT']}")

# For older joblib versions, you might also need:
os.environ["JOBLIB_TEMP_FOLDER"] = os.path.join(os.path.expanduser("~"), "temp_joblib")
if not os.path.exists(os.environ["JOBLIB_TEMP_FOLDER"]):
    os.makedirs(os.environ["JOBLIB_TEMP_FOLDER"])

In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn optuna xgboost lightgbm catboost imbalanced-learn category_encoders joblib

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.2.5-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting optuna
  Using cached optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp313-cp313-win_amd64.whl.metadata (1.5 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting category_encoders
  Using cached category_encoders-2.8.1-py3-none-any.whl.metadat

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import StackingClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from category_encoders import TargetEncoder, CatBoostEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import warnings
import re
import os
from datetime import datetime
import joblib

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
np.random.seed(42)

# ----------------- Step 1: Load and Prepare Data with Better Error Handling -----------------

def load_data():
    """Load datasets with enhanced validation and handling of edge cases."""
    try:
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        # Check for required columns
        expected_cols = {'obs', 'job_title', 'job_posted_date', 'job_state', 'feature_1'}
        if not expected_cols.issubset(train_df.columns):
            raise ValueError(f"Essential columns missing: {expected_cols - set(train_df.columns)}")
        
        # Ensure train data has target column
        if 'salary_category' not in train_df.columns:
            raise ValueError("Training data is missing target column 'salary_category'")
        
        # Check for consistent data types between train and test
        for col in test_df.columns:
            if col in train_df.columns and train_df[col].dtype != test_df[col].dtype:
                print(f"Warning: Column {col} has different types in train and test. Converting to consistent type.")
                # Try to make types consistent
                if pd.api.types.is_numeric_dtype(train_df[col]):
                    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')
                else:
                    test_df[col] = test_df[col].astype(str)
        
        print(f"Train data shape: {train_df.shape}")
        print(f"Test data shape: {test_df.shape}")
        
        return train_df, test_df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

# ----------------- Step 2: Enhanced Feature Engineering -----------------

def engineer_features(train_df, test_df, is_training=True):
    """Apply comprehensive feature engineering with better handling of edge cases."""
    
    # Make copies to avoid modifying original
    X_train = train_df.copy()
    X_test = test_df.copy()
    
    # Extract target if training
    if is_training and 'salary_category' in X_train.columns:
        y = X_train['salary_category'].copy()
        # Map categories to numerical values
        label_mapping = {'High': 2, 'Medium': 1, 'Low': 0}
        y = y.map(label_mapping)
        X_train = X_train.drop(columns=['salary_category'])
    else:
        y = None
    
    # Remove obs column
    if 'obs' in X_train.columns:
        X_train = X_train.drop(columns=['obs'])
    if 'obs' in X_test.columns:
        test_obs = X_test['obs'].copy()
        X_test = X_test.drop(columns=['obs'])
    else:
        test_obs = None
    
    # Define feature types
    numerical_cols = ['feature_2', 'feature_9', 'feature_12']
    categorical_cols = ['job_title', 'job_state', 'feature_1']
    boolean_cols = ['feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 
                   'feature_8', 'feature_10', 'feature_11']
    job_desc_cols = [f'job_desc_{str(i).zfill(3)}' for i in range(1, 301)]
    date_cols = ['job_posted_date']
    
    # ----- Handle missing values -----
    
    # For numerical columns
    num_imputer = KNNImputer(n_neighbors=5)
    for col in numerical_cols:
        if col in X_train.columns:
            # Convert to numeric first, handling non-numeric values
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
            X_test[col] = pd.to_numeric(X_test[col], errors='coerce')
            
            # Reshape for imputer
            X_train_reshaped = X_train[col].values.reshape(-1, 1)
            X_train[col] = num_imputer.fit_transform(X_train_reshaped).ravel()
            
            X_test_reshaped = X_test[col].values.reshape(-1, 1)
            X_test[col] = num_imputer.transform(X_test_reshaped).ravel()
    
    # For categorical columns
    for col in categorical_cols:
        if col in X_train.columns:
            X_train[col] = X_train[col].fillna('Unknown')
            X_test[col] = X_test[col].fillna('Unknown')
    
    # For boolean columns
    for col in boolean_cols:
        if col in X_train.columns:
            X_train[col] = X_train[col].fillna(0).astype(int)
            X_test[col] = X_test[col].fillna(0).astype(int)
    
    # For job description columns - better handling of sparse feature matrices
    for col in job_desc_cols:
        if col in X_train.columns:
            X_train[col] = X_train[col].fillna(0)
            X_test[col] = X_test[col].fillna(0)
    
    # ----- Job Title Feature Extraction -----
    
    if 'job_title' in X_train.columns:
        # Extract seniority level
        X_train['is_senior'] = X_train['job_title'].str.contains(
            'senior|sr|lead|principal|staff|architect|head', case=False).astype(int)
        X_test['is_senior'] = X_test['job_title'].str.contains(
            'senior|sr|lead|principal|staff|architect|head', case=False).astype(int)
        
        X_train['is_junior'] = X_train['job_title'].str.contains(
            'junior|jr|associate|entry|intern', case=False).astype(int)
        X_test['is_junior'] = X_test['job_title'].str.contains(
            'junior|jr|associate|entry|intern', case=False).astype(int)
        
        # Extract role type
        X_train['is_manager'] = X_train['job_title'].str.contains(
            'manager|director|lead|head', case=False).astype(int)
        X_test['is_manager'] = X_test['job_title'].str.contains(
            'manager|director|lead|head', case=False).astype(int)
        
        X_train['is_developer'] = X_train['job_title'].str.contains(
            'developer|engineer|programmer|coder', case=False).astype(int)
        X_test['is_developer'] = X_test['job_title'].str.contains(
            'developer|engineer|programmer|coder', case=False).astype(int)
        
        X_train['is_data'] = X_train['job_title'].str.contains(
            'data|scientist|analyst|analytics', case=False).astype(int)
        X_test['is_data'] = X_test['job_title'].str.contains(
            'data|scientist|analyst|analytics', case=False).astype(int)
        
        # Target encoding for job title
        if is_training:
            title_encoder = TargetEncoder()
            X_train['job_title_encoded'] = title_encoder.fit_transform(X_train[['job_title']], y)
            # Save encoder for test predictions
            joblib.dump(title_encoder, 'job_title_encoder.joblib')
        else:
            try:
                title_encoder = joblib.load('job_title_encoder.joblib')
                X_test['job_title_encoded'] = title_encoder.transform(X_test[['job_title']])
            except:
                print("Warning: Job title encoder not found. Using dummy values.")
                X_test['job_title_encoded'] = 0.5
        
        # Also use CatBoost encoder for job title
        if is_training:
            cb_encoder = CatBoostEncoder()
            X_train['job_title_cb_encoded'] = cb_encoder.fit_transform(X_train[['job_title']], y)
            joblib.dump(cb_encoder, 'job_title_cb_encoder.joblib')
        else:
            try:
                cb_encoder = joblib.load('job_title_cb_encoder.joblib')
                X_test['job_title_cb_encoded'] = cb_encoder.transform(X_test[['job_title']])
            except:
                print("Warning: CatBoost encoder not found. Using dummy values.")
                X_test['job_title_cb_encoded'] = 0.5
    
    # ----- Date Feature Engineering -----
    
    if 'job_posted_date' in X_train.columns:
        # Handle date format with robust error handling
        def parse_date(date_str):
            try:
                if pd.isna(date_str):
                    return pd.Timestamp('2020-01-01')
                return pd.to_datetime(date_str, format='%Y/%m')
            except:
                return pd.Timestamp('2020-01-01')
        
        X_train['parsed_date'] = X_train['job_posted_date'].apply(parse_date)
        X_test['parsed_date'] = X_test['job_posted_date'].apply(parse_date)
        
        # Extract year and month
        X_train['job_posted_year'] = X_train['parsed_date'].dt.year
        X_train['job_posted_month'] = X_train['parsed_date'].dt.month
        X_test['job_posted_year'] = X_test['parsed_date'].dt.year
        X_test['job_posted_month'] = X_test['parsed_date'].dt.month
        
        # Create cyclical features for month
        X_train['month_sin'] = np.sin(2 * np.pi * X_train['job_posted_month']/12)
        X_train['month_cos'] = np.cos(2 * np.pi * X_train['job_posted_month']/12)
        X_test['month_sin'] = np.sin(2 * np.pi * X_test['job_posted_month']/12)
        X_test['month_cos'] = np.cos(2 * np.pi * X_test['job_posted_month']/12)
        
        # Create quarter feature
        X_train['job_quarter'] = ((X_train['job_posted_month'] - 1) // 3) + 1
        X_test['job_quarter'] = ((X_test['job_posted_month'] - 1) // 3) + 1
        
        # Drop intermediate columns
        X_train = X_train.drop(columns=['parsed_date', 'job_posted_date'])
        X_test = X_test.drop(columns=['parsed_date', 'job_posted_date'])
    
    # ----- Geographic Features -----
    
    # Define regions
    us_regions = {
        'Northeast': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA'],
        'Midwest': ['OH', 'MI', 'IN', 'IL', 'WI', 'MN', 'IA', 'MO', 'ND', 'SD', 'NE', 'KS'],
        'South': ['DE', 'MD', 'DC', 'VA', 'WV', 'NC', 'SC', 'GA', 'FL', 'KY', 'TN', 'AL', 'MS', 'AR', 'LA', 'OK', 'TX'],
        'West': ['MT', 'ID', 'WY', 'CO', 'NM', 'AZ', 'UT', 'NV', 'CA', 'OR', 'WA', 'AK', 'HI']
    }
    
    if 'job_state' in X_train.columns:
        # Create region features
        for region, states in us_regions.items():
            X_train[f'region_{region}'] = X_train['job_state'].apply(lambda x: 1 if x in states else 0)
            X_test[f'region_{region}'] = X_test['job_state'].apply(lambda x: 1 if x in states else 0)
        
        # One-hot encode job_state
        if is_training:
            state_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            state_encoded = state_encoder.fit_transform(X_train[['job_state']])
            joblib.dump(state_encoder, 'state_encoder.joblib')
        else:
            state_encoder = joblib.load('state_encoder.joblib')
            state_encoded = state_encoder.transform(X_test[['job_state']])
        
        state_cols = [f'state_{col}' for col in state_encoder.get_feature_names_out(['job_state'])]
        if is_training:
            X_train_state = pd.DataFrame(state_encoded, columns=state_cols, index=X_train.index)
            X_train = pd.concat([X_train, X_train_state], axis=1)
        else:
            X_test_state = pd.DataFrame(state_encoded, columns=state_cols, index=X_test.index)
            X_test = pd.concat([X_test, X_test_state], axis=1)
    
    # ----- Feature_1 Encoding -----
    
    if 'feature_1' in X_train.columns:
        if is_training:
            feat1_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            feat1_encoded = feat1_encoder.fit_transform(X_train[['feature_1']])
            joblib.dump(feat1_encoder, 'feat1_encoder.joblib')
        else:
            feat1_encoder = joblib.load('feat1_encoder.joblib')
            feat1_encoded = feat1_encoder.transform(X_test[['feature_1']])
        
        feat1_cols = [f'feat1_{col}' for col in feat1_encoder.get_feature_names_out(['feature_1'])]
        if is_training:
            X_train_feat1 = pd.DataFrame(feat1_encoded, columns=feat1_cols, index=X_train.index)
            X_train = pd.concat([X_train, X_train_feat1], axis=1)
        else:
            X_test_feat1 = pd.DataFrame(feat1_encoded, columns=feat1_cols, index=X_test.index)
            X_test = pd.concat([X_test, X_test_feat1], axis=1)
    
    # ----- Numerical Feature Transformations -----
    
    # Apply power transformer for skewed numerical features
    if is_training:
        power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
        for col in numerical_cols:
            if col in X_train.columns:
                X_train[col] = power_transformer.fit_transform(X_train[[col]])
                joblib.dump(power_transformer, f'{col}_transformer.joblib')
    else:
        for col in numerical_cols:
            if col in X_test.columns and os.path.exists(f'{col}_transformer.joblib'):
                power_transformer = joblib.load(f'{col}_transformer.joblib')
                X_test[col] = power_transformer.transform(X_test[[col]])
    
    # Create polynomial features
    for col in numerical_cols:
        if col in X_train.columns:
            X_train[f'{col}_squared'] = X_train[col] ** 2
            X_train[f'{col}_cubed'] = X_train[col] ** 3
            X_test[f'{col}_squared'] = X_test[col] ** 2
            X_test[f'{col}_cubed'] = X_test[col] ** 3
    
    # Feature interactions between numerical features
    for i, col1 in enumerate(numerical_cols):
        for j, col2 in enumerate(numerical_cols):
            if i < j and col1 in X_train.columns and col2 in X_train.columns:
                X_train[f'{col1}_{col2}_interaction'] = X_train[col1] * X_train[col2]
                X_test[f'{col1}_{col2}_interaction'] = X_test[col1] * X_test[col2]
    
    # ----- Boolean Feature Engineering -----
    
    # Sum of boolean features
    if all(col in X_train.columns for col in boolean_cols):
        X_train['boolean_sum'] = X_train[boolean_cols].sum(axis=1)
        X_test['boolean_sum'] = X_test[boolean_cols].sum(axis=1)
        
        # Interactions between boolean features
        for i, col1 in enumerate(boolean_cols):
            for j, col2 in enumerate(boolean_cols):
                if i < j:
                    # AND interaction
                    X_train[f'{col1}_{col2}_and'] = X_train[col1] & X_train[col2]
                    X_test[f'{col1}_{col2}_and'] = X_test[col1] & X_test[col2]
                    
                    # OR interaction
                    X_train[f'{col1}_{col2}_or'] = X_train[col1] | X_train[col2]
                    X_test[f'{col1}_{col2}_or'] = X_test[col1] | X_test[col2]
    
    # ----- Job Description Feature Engineering -----
    # Check if job_desc columns exist
    if all(col in X_train.columns for col in job_desc_cols[:10]):  # Check first 10 columns
        # Calculate aggregate statistics for job descriptions
        X_train['job_desc_mean'] = X_train[job_desc_cols].mean(axis=1)
        X_train['job_desc_std'] = X_train[job_desc_cols].std(axis=1)
        X_train['job_desc_sum'] = X_train[job_desc_cols].sum(axis=1)
        X_train['job_desc_min'] = X_train[job_desc_cols].min(axis=1)
        X_train['job_desc_max'] = X_train[job_desc_cols].max(axis=1)
        X_train['job_desc_nonzero'] = (X_train[job_desc_cols] != 0).sum(axis=1)
        
        X_test['job_desc_mean'] = X_test[job_desc_cols].mean(axis=1)
        X_test['job_desc_std'] = X_test[job_desc_cols].std(axis=1)
        X_test['job_desc_sum'] = X_test[job_desc_cols].sum(axis=1)
        X_test['job_desc_min'] = X_test[job_desc_cols].min(axis=1)
        X_test['job_desc_max'] = X_test[job_desc_cols].max(axis=1)
        X_test['job_desc_nonzero'] = (X_test[job_desc_cols] != 0).sum(axis=1)
        
        # Advanced dimensionality reduction
        if is_training:
            # PCA for dense representation
            pca = PCA(n_components=25)
            job_desc_pca_train = pca.fit_transform(X_train[job_desc_cols])
            joblib.dump(pca, 'job_desc_pca.joblib')
            
            # SVD for sparse representation
            svd = TruncatedSVD(n_components=25)
            job_desc_svd_train = svd.fit_transform(X_train[job_desc_cols])
            joblib.dump(svd, 'job_desc_svd.joblib')
            
            # Add dimension reduction features to training data
            for i in range(25):
                X_train[f'pca_{i}'] = job_desc_pca_train[:, i]
                X_train[f'svd_{i}'] = job_desc_svd_train[:, i]
                
            # Clustering on job descriptions
            kmeans = KMeans(n_clusters=15, random_state=42, n_init=10)
            X_train['job_desc_cluster'] = kmeans.fit_predict(job_desc_pca_train)
            joblib.dump(kmeans, 'job_desc_kmeans.joblib')
            
            # Create one-hot encoding for clusters
            cluster_encoder = OneHotEncoder(sparse_output=False)
            cluster_encoded_train = cluster_encoder.fit_transform(X_train[['job_desc_cluster']])
            joblib.dump(cluster_encoder, 'cluster_encoder.joblib')
            
            cluster_cols = [f'cluster_{i}' for i in range(cluster_encoded_train.shape[1])]
            X_train_clusters = pd.DataFrame(cluster_encoded_train, columns=cluster_cols, index=X_train.index)
            X_train = pd.concat([X_train, X_train_clusters], axis=1)
            
        else:
            # Apply transformations to test data separately
            pca = joblib.load('job_desc_pca.joblib')
            job_desc_pca_test = pca.transform(X_test[job_desc_cols])
            
            svd = joblib.load('job_desc_svd.joblib')
            job_desc_svd_test = svd.transform(X_test[job_desc_cols])
            
            # Add dimension reduction features to test data
            for i in range(25):
                X_test[f'pca_{i}'] = job_desc_pca_test[:, i]
                X_test[f'svd_{i}'] = job_desc_svd_test[:, i]
            
            # Apply clustering to test data
            kmeans = joblib.load('job_desc_kmeans.joblib')
            X_test['job_desc_cluster'] = kmeans.predict(job_desc_pca_test)
            
            # Apply one-hot encoding to test clusters
            cluster_encoder = joblib.load('cluster_encoder.joblib')
            cluster_encoded_test = cluster_encoder.transform(X_test[['job_desc_cluster']])
            
            cluster_cols = [f'cluster_{i}' for i in range(cluster_encoded_test.shape[1])]
            X_test_clusters = pd.DataFrame(cluster_encoded_test, columns=cluster_cols, index=X_test.index)
            X_test = pd.concat([X_test, X_test_clusters], axis=1)
        
        # Drop original job description columns to reduce dimensionality
        X_train = X_train.drop(columns=job_desc_cols)
        X_test = X_test.drop(columns=job_desc_cols)

    
    # Drop original categorical columns now that they're encoded
    cols_to_drop = ['job_title', 'job_state', 'feature_1']
    X_train = X_train.drop(columns=[c for c in cols_to_drop if c in X_train.columns])
    X_test = X_test.drop(columns=[c for c in cols_to_drop if c in X_test.columns])
    
    # Ensure all columns in test exist in train
    for col in X_train.columns:
        if col not in X_test.columns:
            X_test[col] = 0
    
    # Ensure all columns in train exist in test
    for col in X_test.columns:
        if col not in X_train.columns:
            X_train[col] = 0
    
    # Make sure columns are in the same order
    X_test = X_test[X_train.columns]
    
    # Return prepared datasets
    if is_training:
        return X_train, X_test, y, test_obs
    else:
        return X_train, X_test, None, test_obs

# ----------------- Step 3: Advanced Model Building -----------------

def optimize_models(X, y):
    """Optimize multiple models with Optuna and proper cross-validation."""
    
    # Initialize stratified k-fold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    best_models = {}
    
    # Define models to optimize
    models_to_optimize = {
        'XGBoost': optimize_xgboost,
        'LightGBM': optimize_lightgbm,
        'CatBoost': optimize_catboost
    }
    
    # Run optimization for each model
    for model_name, optimizer_func in models_to_optimize.items():
        print(f"\nOptimizing {model_name}...")
        best_model = optimizer_func(X, y, cv)
        best_models[model_name] = best_model
        
        # Quick validation on full dataset
        cv_scores = cross_val_score(best_model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        print(f"{model_name} CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    
    return best_models

def optimize_xgboost(X, y, cv):
    """Optimize XGBoost hyperparameters with Optuna."""
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'random_state': 42,
            'n_jobs': -1
        }
        
        model = XGBClassifier(**params)
        scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        return scores.mean()
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    print(f"Best XGBoost Params: {study.best_params}")
    print(f"Best XGBoost CV Accuracy: {study.best_value:.4f}")
    
    return XGBClassifier(**study.best_params, n_jobs=-1)

def optimize_lightgbm(X, y, cv):
    """Optimize LightGBM hyperparameters with Optuna."""
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', -1, 15),
            'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'random_state': 42,
            'n_jobs': -1
        }
        
        model = LGBMClassifier(**params)
        scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        return scores.mean()
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    print(f"Best LightGBM Params: {study.best_params}")
    print(f"Best LightGBM CV Accuracy: {study.best_value:.4f}")
    
    return LGBMClassifier(**study.best_params, n_jobs=-1)

def optimize_catboost(X, y, cv):
    """Optimize CatBoost hyperparameters with Optuna."""
    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
            'border_count': trial.suggest_int('border_count', 32, 255),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10.0),
            'random_seed': 42,
            'thread_count': -1,
            'verbose': False
        }
        
        model = CatBoostClassifier(**params)
        scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        return scores.mean()
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=30)  # Fewer trials since CatBoost is slower
    print(f"Best CatBoost Params: {study.best_params}")
    print(f"Best CatBoost CV Accuracy: {study.best_value:.4f}")
    
    return CatBoostClassifier(**study.best_params, thread_count=-1, verbose=False)

def build_ensemble(best_models, X, y):
    """Build advanced ensemble models using optimized base models."""
    
    # Create a copy of best models to avoid modification
    models = {name: model for name, model in best_models.items()}
    
    # Add additional models for diversity
    models['RandomForest'] = RandomForestClassifier(
        n_estimators=500, max_depth=None, min_samples_split=2, max_features='sqrt', 
        n_jobs=-1, random_state=42
    )
    
    models['GradientBoosting'] = GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8,
        random_state=42
    )
    
    # Train additional models
    for name in ['RandomForest', 'GradientBoosting']:
        models[name].fit(X, y)
    
    # Create voting ensemble
    voting_clf = VotingClassifier(
        estimators=[(name, model) for name, model in models.items()],
        voting='soft'
    )
    voting_clf.fit(X, y)
    
    # Create stacking ensemble
    stacking_estimators = [(name, model) for name, model in models.items()]
    stacking_clf = StackingClassifier(
        estimators=stacking_estimators,
        final_estimator=LGBMClassifier(random_state=42),
        cv=5
    )
    stacking_clf.fit(X, y)
    
    # Create a meta-ensemble combining voting and stacking
    final_ensemble = VotingClassifier(
        estimators=[
            ('voting', voting_clf),
            ('stacking', stacking_clf),
            ('xgb', models['XGBoost']),
            ('lgbm', models['LightGBM']),
            ('catboost', models['CatBoost'])
        ],
        voting='soft'
    )
    final_ensemble.fit(X, y)
    
    return final_ensemble

# ----------------- Step 4: Training and Prediction -----------------

def run_training_pipeline():
    """Run the complete training pipeline."""
    
    print("\n=== Starting Training Pipeline ===\n")
    
    # Load data
    train_df, test_df = load_data()
    
    # Feature engineering
    print("\n=== Applying Feature Engineering ===\n")
    X_train, X_test, y_train, test_obs = engineer_features(train_df, test_df)
    
    print(f"Final training data shape: {X_train.shape}")
    print(f"Final test data shape: {X_test.shape}")
    
    # Handle class imbalance with advanced resampling
    print("\n=== Handling Class Imbalance ===\n")
    smote_tomek = SMOTETomek(random_state=42)
    X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
    
    print(f"Class distribution after resampling: {np.bincount(y_resampled)}")
    
    # Optimize models
    print("\n=== Optimizing Base Models ===\n")
    best_models = optimize_models(X_resampled, y_resampled)
    
    # Build ensemble
    print("\n=== Building Ensemble Model ===\n")
    final_model = build_ensemble(best_models, X_resampled, y_resampled)
    
    # Save model for future use
    joblib.dump(final_model, 'final_salary_model.joblib')
    
    # Generate predictions
    print("\n=== Generating Predictions ===\n")
    predictions = final_model.predict(X_test)
    
    # Map predictions back to categories
    reverse_mapping = {2: 'High', 1: 'Medium', 0: 'Low'}
    predictions_labels = np.array([reverse_mapping[p] for p in predictions])
    
    # Create submission
    submission = pd.DataFrame({
        'obs': test_obs,
        'salary_category': predictions_labels
    })
    
    # Save submission
    submission.to_csv('solution_format_optimized.csv', index=False)
    print("Submission saved to solution_format_optimized.csv")
    
    # Report prediction distribution
    print("\n=== Prediction Distribution ===\n")
    print(pd.Series(predictions_labels).value_counts())

if __name__ == "__main__":
    run_training_pipeline()


=== Starting Training Pipeline ===

Train data shape: (1280, 317)
Test data shape: (854, 316)

=== Applying Feature Engineering ===



  File "c:\Users\damod\anaconda3\envs\nova\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\damod\anaconda3\envs\nova\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\damod\anaconda3\envs\nova\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\d

Final training data shape: (1280, 207)
Final test data shape: (854, 207)

=== Handling Class Imbalance ===



[I 2025-04-23 12:00:31,272] A new study created in memory with name: no-name-1c4732d5-cae1-47a5-91ba-e1bfcd549fb1


Class distribution after resampling: [470 464 467]

=== Optimizing Base Models ===


Optimizing XGBoost...


[I 2025-04-23 12:00:49,691] Trial 0 finished with value: 0.7851550584646672 and parameters: {'n_estimators': 206, 'max_depth': 12, 'learning_rate': 0.01386483003497694, 'subsample': 0.5189482620995902, 'colsample_bytree': 0.7771534734245694, 'min_child_weight': 3, 'reg_alpha': 0.46879626630401383, 'reg_lambda': 0.0023964327775773207, 'gamma': 0.007263606880431229}. Best is trial 0 with value: 0.7851550584646672.
[I 2025-04-23 12:00:58,300] Trial 1 finished with value: 0.7865760040671074 and parameters: {'n_estimators': 703, 'max_depth': 4, 'learning_rate': 0.022257928168816395, 'subsample': 0.7025072396545351, 'colsample_bytree': 0.5721248479578469, 'min_child_weight': 5, 'reg_alpha': 1.0600698762731224e-06, 'reg_lambda': 7.688387441989346e-07, 'gamma': 4.378501771997995e-06}. Best is trial 1 with value: 0.7865760040671074.
[I 2025-04-23 12:01:08,773] Trial 2 finished with value: 0.7930045754956787 and parameters: {'n_estimators': 525, 'max_depth': 7, 'learning_rate': 0.010139193489889

Best XGBoost Params: {'n_estimators': 416, 'max_depth': 10, 'learning_rate': 0.021395966973405785, 'subsample': 0.6067322712431702, 'colsample_bytree': 0.6814542503368989, 'min_child_weight': 1, 'reg_alpha': 5.218669208291581e-07, 'reg_lambda': 1.3167441279030621e-05, 'gamma': 0.00029183970354077605}
Best XGBoost CV Accuracy: 0.8108


[I 2025-04-23 12:07:39,348] A new study created in memory with name: no-name-0a51a764-c341-4622-a299-d8ae79168d22


XGBoost CV Accuracy: 0.8066 ± 0.0215

Optimizing LightGBM...


[I 2025-04-23 12:07:43,821] Trial 0 finished with value: 0.795866802236909 and parameters: {'n_estimators': 153, 'max_depth': 9, 'num_leaves': 24, 'learning_rate': 0.08863913198207701, 'subsample': 0.8699591976712415, 'colsample_bytree': 0.9622912859361867, 'min_child_samples': 7, 'reg_alpha': 0.00010497486895947389, 'reg_lambda': 2.0716312962197775e-08}. Best is trial 0 with value: 0.795866802236909.
[I 2025-04-23 12:07:48,863] Trial 1 finished with value: 0.7666065073716319 and parameters: {'n_estimators': 423, 'max_depth': 14, 'num_leaves': 69, 'learning_rate': 0.16197990460426764, 'subsample': 0.8024319232388913, 'colsample_bytree': 0.9794666826475241, 'min_child_samples': 85, 'reg_alpha': 5.484729135755418e-08, 'reg_lambda': 2.3250611934612123e-07}. Best is trial 0 with value: 0.795866802236909.
[I 2025-04-23 12:07:54,410] Trial 2 finished with value: 0.8044280630401627 and parameters: {'n_estimators': 791, 'max_depth': -1, 'num_leaves': 42, 'learning_rate': 0.14090240224969738, '

Best LightGBM Params: {'n_estimators': 924, 'max_depth': -1, 'num_leaves': 88, 'learning_rate': 0.0365044652388992, 'subsample': 0.6821995349890481, 'colsample_bytree': 0.860825638749719, 'min_child_samples': 13, 'reg_alpha': 0.0001956541555372545, 'reg_lambda': 0.0993986288090497}
Best LightGBM CV Accuracy: 0.8094


[I 2025-04-23 12:17:50,362] A new study created in memory with name: no-name-9b41259c-cef4-4831-89e7-0716af16db24


LightGBM CV Accuracy: 0.8087 ± 0.0268

Optimizing CatBoost...


[I 2025-04-23 12:18:48,354] Trial 0 finished with value: 0.7708718861209964 and parameters: {'iterations': 999, 'depth': 8, 'learning_rate': 0.021221197565161665, 'l2_leaf_reg': 8.566338695147854e-07, 'random_strength': 0.001135744353318157, 'border_count': 80, 'bagging_temperature': 9.106355346448213}. Best is trial 0 with value: 0.7708718861209964.
[I 2025-04-23 12:21:37,336] Trial 1 finished with value: 0.7780249110320284 and parameters: {'iterations': 990, 'depth': 10, 'learning_rate': 0.09114802706071358, 'l2_leaf_reg': 6.639059738604977e-08, 'random_strength': 1.7217305987197302e-08, 'border_count': 39, 'bagging_temperature': 5.614674693773434}. Best is trial 1 with value: 0.7780249110320284.
[I 2025-04-23 12:27:34,354] Trial 2 finished with value: 0.7958566344687341 and parameters: {'iterations': 554, 'depth': 10, 'learning_rate': 0.023932204964302407, 'l2_leaf_reg': 0.5558911593918398, 'random_strength': 0.26957953519826655, 'border_count': 185, 'bagging_temperature': 1.0663331

Best CatBoost Params: {'iterations': 867, 'depth': 6, 'learning_rate': 0.03590896122595367, 'l2_leaf_reg': 0.001313516569453832, 'random_strength': 0.005522769745566168, 'border_count': 222, 'bagging_temperature': 0.04363932017906991}
Best CatBoost CV Accuracy: 0.8130
CatBoost CV Accuracy: 0.8116 ± 0.0158

=== Building Ensemble Model ===

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16349
[LightGBM] [Info] Number of data points in the train set: 1401, number of used features: 162
[LightGBM] [Info] Start training from score -1.092209
[LightGBM] [Info] Start training from score -1.105057
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16349
[LightGBM]

ValueError: Estimator names conflict with constructor arguments: ['voting']