In [1]:
# Fix for wmic error in Windows
import os
os.environ["LOKY_MAX_CPU_COUNT"] = str(os.cpu_count())
print(f"Setting max CPU count to: {os.environ['LOKY_MAX_CPU_COUNT']}")

# For older joblib versions, you might also need:
os.environ["JOBLIB_TEMP_FOLDER"] = os.path.join(os.path.expanduser("~"), "temp_joblib")
if not os.path.exists(os.environ["JOBLIB_TEMP_FOLDER"]):
    os.makedirs(os.environ["JOBLIB_TEMP_FOLDER"])

Setting max CPU count to: 20


In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn optuna xgboost lightgbm catboost imbalanced-learn category_encoders joblib

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import StackingClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from category_encoders import TargetEncoder, CatBoostEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import warnings
import re
import os
from datetime import datetime
import joblib

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
np.random.seed(42)

# Create output directories
def create_directories():
    """Create directories for saving models, features, results"""
    directories = ['models', 'features', 'results', 'submissions']
    
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f"Created directory: {directory}")

# Get timestamp for unique filenames
def get_timestamp():
    """Get a formatted timestamp for filenames"""
    return datetime.now().strftime("%Y%m%d_%H%M%S")

# ----------------- Step 1: Load and Prepare Data with Better Error Handling -----------------

def load_data():
    """Load datasets with enhanced validation and handling of edge cases."""
    try:
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        # Check for required columns
        expected_cols = {'obs', 'job_title', 'job_posted_date', 'job_state', 'feature_1'}
        if not expected_cols.issubset(train_df.columns):
            raise ValueError(f"Essential columns missing: {expected_cols - set(train_df.columns)}")
        
        # Ensure train data has target column
        if 'salary_category' not in train_df.columns:
            raise ValueError("Training data is missing target column 'salary_category'")
        
        # Check for consistent data types between train and test
        for col in test_df.columns:
            if col in train_df.columns and train_df[col].dtype != test_df[col].dtype:
                print(f"Warning: Column {col} has different types in train and test. Converting to consistent type.")
                # Try to make types consistent
                if pd.api.types.is_numeric_dtype(train_df[col]):
                    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')
                else:
                    test_df[col] = test_df[col].astype(str)
        
        print(f"Train data shape: {train_df.shape}")
        print(f"Test data shape: {test_df.shape}")
        
        return train_df, test_df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

# ----------------- Step 2: Enhanced Feature Engineering -----------------

def engineer_features(train_df, test_df, timestamp, is_training=True):
    """Apply comprehensive feature engineering with better handling of edge cases."""
    
    # Make copies to avoid modifying original
    X_train = train_df.copy()
    X_test = test_df.copy()
    
    # Extract target if training
    if is_training and 'salary_category' in X_train.columns:
        y = X_train['salary_category'].copy()
        # Map categories to numerical values
        label_mapping = {'High': 2, 'Medium': 1, 'Low': 0}
        y = y.map(label_mapping)
        X_train = X_train.drop(columns=['salary_category'])
    else:
        y = None
    
    # Remove obs column
    if 'obs' in X_train.columns:
        X_train = X_train.drop(columns=['obs'])
    if 'obs' in X_test.columns:
        test_obs = X_test['obs'].copy()
        X_test = X_test.drop(columns=['obs'])
    else:
        test_obs = None
    
    # Define feature types
    numerical_cols = ['feature_2', 'feature_9', 'feature_12']
    categorical_cols = ['job_title', 'job_state', 'feature_1']
    boolean_cols = ['feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 
                   'feature_8', 'feature_10', 'feature_11']
    job_desc_cols = [f'job_desc_{str(i).zfill(3)}' for i in range(1, 301)]
    date_cols = ['job_posted_date']
    
    # ----- Handle missing values -----
    
    # For numerical columns
    num_imputer = KNNImputer(n_neighbors=5)
    for col in numerical_cols:
        if col in X_train.columns:
            # Convert to numeric first, handling non-numeric values
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
            X_test[col] = pd.to_numeric(X_test[col], errors='coerce')
            
            # Reshape for imputer
            X_train_reshaped = X_train[col].values.reshape(-1, 1)
            X_train[col] = num_imputer.fit_transform(X_train_reshaped).ravel()
            
            X_test_reshaped = X_test[col].values.reshape(-1, 1)
            X_test[col] = num_imputer.transform(X_test_reshaped).ravel()
    
    # For categorical columns
    for col in categorical_cols:
        if col in X_train.columns:
            X_train[col] = X_train[col].fillna('Unknown')
            X_test[col] = X_test[col].fillna('Unknown')
    
    # For boolean columns
    for col in boolean_cols:
        if col in X_train.columns:
            X_train[col] = X_train[col].fillna(0).astype(int)
            X_test[col] = X_test[col].fillna(0).astype(int)
    
    # For job description columns - better handling of sparse feature matrices
    for col in job_desc_cols:
        if col in X_train.columns:
            X_train[col] = X_train[col].fillna(0)
            X_test[col] = X_test[col].fillna(0)
    
    # ----- Job Title Feature Extraction -----
    
    if 'job_title' in X_train.columns:
        # Extract seniority level
        X_train['is_senior'] = X_train['job_title'].str.contains(
            'senior|sr|lead|principal|staff|architect|head', case=False).astype(int)
        X_test['is_senior'] = X_test['job_title'].str.contains(
            'senior|sr|lead|principal|staff|architect|head', case=False).astype(int)
        
        X_train['is_junior'] = X_train['job_title'].str.contains(
            'junior|jr|associate|entry|intern', case=False).astype(int)
        X_test['is_junior'] = X_test['job_title'].str.contains(
            'junior|jr|associate|entry|intern', case=False).astype(int)
        
        # Extract role type
        X_train['is_manager'] = X_train['job_title'].str.contains(
            'manager|director|lead|head', case=False).astype(int)
        X_test['is_manager'] = X_test['job_title'].str.contains(
            'manager|director|lead|head', case=False).astype(int)
        
        X_train['is_developer'] = X_train['job_title'].str.contains(
            'developer|engineer|programmer|coder', case=False).astype(int)
        X_test['is_developer'] = X_test['job_title'].str.contains(
            'developer|engineer|programmer|coder', case=False).astype(int)
        
        X_train['is_data'] = X_train['job_title'].str.contains(
            'data|scientist|analyst|analytics', case=False).astype(int)
        X_test['is_data'] = X_test['job_title'].str.contains(
            'data|scientist|analyst|analytics', case=False).astype(int)
        
        # Target encoding for job title
        if is_training:
            title_encoder = TargetEncoder()
            X_train['job_title_encoded'] = title_encoder.fit_transform(X_train[['job_title']], y)
            # Save encoder for test predictions
            joblib.dump(title_encoder, f'features/job_title_encoder_{timestamp}.joblib')
        else:
            # Find the most recent encoder file
            encoder_files = sorted([f for f in os.listdir('features') if 'job_title_encoder' in f])
            if encoder_files:
                title_encoder = joblib.load(f'features/{encoder_files[-1]}')
                X_test['job_title_encoded'] = title_encoder.transform(X_test[['job_title']])
            else:
                print("Warning: Job title encoder not found. Using dummy values.")
                X_test['job_title_encoded'] = 0.5
        
        # Also use CatBoost encoder for job title
        if is_training:
            cb_encoder = CatBoostEncoder()
            X_train['job_title_cb_encoded'] = cb_encoder.fit_transform(X_train[['job_title']], y)
            joblib.dump(cb_encoder, f'features/job_title_cb_encoder_{timestamp}.joblib')
        else:
            encoder_files = sorted([f for f in os.listdir('features') if 'job_title_cb_encoder' in f])
            if encoder_files:
                cb_encoder = joblib.load(f'features/{encoder_files[-1]}')
                X_test['job_title_cb_encoded'] = cb_encoder.transform(X_test[['job_title']])
            else:
                print("Warning: CatBoost encoder not found. Using dummy values.")
                X_test['job_title_cb_encoded'] = 0.5
    
    # ----- Date Feature Engineering -----
    
    if 'job_posted_date' in X_train.columns:
        # Handle date format with robust error handling
        def parse_date(date_str):
            try:
                if pd.isna(date_str):
                    return pd.Timestamp('2020-01-01')
                return pd.to_datetime(date_str, format='%Y/%m')
            except:
                return pd.Timestamp('2020-01-01')
        
        X_train['parsed_date'] = X_train['job_posted_date'].apply(parse_date)
        X_test['parsed_date'] = X_test['job_posted_date'].apply(parse_date)
        
        # Extract year and month
        X_train['job_posted_year'] = X_train['parsed_date'].dt.year
        X_train['job_posted_month'] = X_train['parsed_date'].dt.month
        X_test['job_posted_year'] = X_test['parsed_date'].dt.year
        X_test['job_posted_month'] = X_test['parsed_date'].dt.month
        
        # Create cyclical features for month
        X_train['month_sin'] = np.sin(2 * np.pi * X_train['job_posted_month']/12)
        X_train['month_cos'] = np.cos(2 * np.pi * X_train['job_posted_month']/12)
        X_test['month_sin'] = np.sin(2 * np.pi * X_test['job_posted_month']/12)
        X_test['month_cos'] = np.cos(2 * np.pi * X_test['job_posted_month']/12)
        
        # Create quarter feature
        X_train['job_quarter'] = ((X_train['job_posted_month'] - 1) // 3) + 1
        X_test['job_quarter'] = ((X_test['job_posted_month'] - 1) // 3) + 1
        
        # Drop intermediate columns
        X_train = X_train.drop(columns=['parsed_date', 'job_posted_date'])
        X_test = X_test.drop(columns=['parsed_date', 'job_posted_date'])
    
    # ----- Geographic Features -----
    
    # Define regions
    us_regions = {
        'Northeast': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA'],
        'Midwest': ['OH', 'MI', 'IN', 'IL', 'WI', 'MN', 'IA', 'MO', 'ND', 'SD', 'NE', 'KS'],
        'South': ['DE', 'MD', 'DC', 'VA', 'WV', 'NC', 'SC', 'GA', 'FL', 'KY', 'TN', 'AL', 'MS', 'AR', 'LA', 'OK', 'TX'],
        'West': ['MT', 'ID', 'WY', 'CO', 'NM', 'AZ', 'UT', 'NV', 'CA', 'OR', 'WA', 'AK', 'HI']
    }
    
    if 'job_state' in X_train.columns:
        # Create region features
        for region, states in us_regions.items():
            X_train[f'region_{region}'] = X_train['job_state'].apply(lambda x: 1 if x in states else 0)
            X_test[f'region_{region}'] = X_test['job_state'].apply(lambda x: 1 if x in states else 0)
        
        # One-hot encode job_state
        if is_training:
            state_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            state_encoded = state_encoder.fit_transform(X_train[['job_state']])
            joblib.dump(state_encoder, f'features/state_encoder_{timestamp}.joblib')
        else:
            encoder_files = sorted([f for f in os.listdir('features') if 'state_encoder' in f])
            if encoder_files:
                state_encoder = joblib.load(f'features/{encoder_files[-1]}')
                state_encoded = state_encoder.transform(X_test[['job_state']])
            else:
                print("Warning: State encoder not found.")
                state_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                state_encoded = state_encoder.fit_transform(X_test[['job_state']])
        
        state_cols = [f'state_{col}' for col in state_encoder.get_feature_names_out(['job_state'])]
        if is_training:
            X_train_state = pd.DataFrame(state_encoded, columns=state_cols, index=X_train.index)
            X_train = pd.concat([X_train, X_train_state], axis=1)
        else:
            X_test_state = pd.DataFrame(state_encoded, columns=state_cols, index=X_test.index)
            X_test = pd.concat([X_test, X_test_state], axis=1)
    
    # ----- Feature_1 Encoding -----
    
    if 'feature_1' in X_train.columns:
        if is_training:
            feat1_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            feat1_encoded = feat1_encoder.fit_transform(X_train[['feature_1']])
            joblib.dump(feat1_encoder, f'features/feat1_encoder_{timestamp}.joblib')
        else:
            encoder_files = sorted([f for f in os.listdir('features') if 'feat1_encoder' in f])
            if encoder_files:
                feat1_encoder = joblib.load(f'features/{encoder_files[-1]}')
                feat1_encoded = feat1_encoder.transform(X_test[['feature_1']])
            else:
                print("Warning: Feature_1 encoder not found.")
                feat1_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                feat1_encoded = feat1_encoder.fit_transform(X_test[['feature_1']])
        
        feat1_cols = [f'feat1_{col}' for col in feat1_encoder.get_feature_names_out(['feature_1'])]
        if is_training:
            X_train_feat1 = pd.DataFrame(feat1_encoded, columns=feat1_cols, index=X_train.index)
            X_train = pd.concat([X_train, X_train_feat1], axis=1)
        else:
            X_test_feat1 = pd.DataFrame(feat1_encoded, columns=feat1_cols, index=X_test.index)
            X_test = pd.concat([X_test, X_test_feat1], axis=1)
    
    # ----- Numerical Feature Transformations -----
    
    # Apply power transformer for skewed numerical features
    if is_training:
        for col in numerical_cols:
            if col in X_train.columns:
                power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)
                X_train[col] = power_transformer.fit_transform(X_train[[col]])
                joblib.dump(power_transformer, f'features/{col}_transformer_{timestamp}.joblib')
    else:
        for col in numerical_cols:
            if col in X_test.columns:
                transformer_files = sorted([f for f in os.listdir('features') if f'{col}_transformer' in f])
                if transformer_files:
                    power_transformer = joblib.load(f'features/{transformer_files[-1]}')
                    X_test[col] = power_transformer.transform(X_test[[col]])
                else:
                    print(f"Warning: Transformer for {col} not found.")
    
    # Create polynomial features
    for col in numerical_cols:
        if col in X_train.columns:
            X_train[f'{col}_squared'] = X_train[col] ** 2
            X_train[f'{col}_cubed'] = X_train[col] ** 3
            X_test[f'{col}_squared'] = X_test[col] ** 2
            X_test[f'{col}_cubed'] = X_test[col] ** 3
    
    # Feature interactions between numerical features
    for i, col1 in enumerate(numerical_cols):
        for j, col2 in enumerate(numerical_cols):
            if i < j and col1 in X_train.columns and col2 in X_train.columns:
                X_train[f'{col1}_{col2}_interaction'] = X_train[col1] * X_train[col2]
                X_test[f'{col1}_{col2}_interaction'] = X_test[col1] * X_test[col2]
    
    # ----- Boolean Feature Engineering -----
    
    # Sum of boolean features
    if all(col in X_train.columns for col in boolean_cols):
        X_train['boolean_sum'] = X_train[boolean_cols].sum(axis=1)
        X_test['boolean_sum'] = X_test[boolean_cols].sum(axis=1)
        
        # Interactions between boolean features
        for i, col1 in enumerate(boolean_cols):
            for j, col2 in enumerate(boolean_cols):
                if i < j:
                    # AND interaction
                    X_train[f'{col1}_{col2}_and'] = X_train[col1] & X_train[col2]
                    X_test[f'{col1}_{col2}_and'] = X_test[col1] & X_test[col2]
                    
                    # OR interaction
                    X_train[f'{col1}_{col2}_or'] = X_train[col1] | X_train[col2]
                    X_test[f'{col1}_{col2}_or'] = X_test[col1] | X_test[col2]
    
    # ----- Job Description Feature Engineering -----
    # Check if job_desc columns exist
    if all(col in X_train.columns for col in job_desc_cols[:10]):  # Check first 10 columns
        # Calculate aggregate statistics for job descriptions
        X_train['job_desc_mean'] = X_train[job_desc_cols].mean(axis=1)
        X_train['job_desc_std'] = X_train[job_desc_cols].std(axis=1)
        X_train['job_desc_sum'] = X_train[job_desc_cols].sum(axis=1)
        X_train['job_desc_min'] = X_train[job_desc_cols].min(axis=1)
        X_train['job_desc_max'] = X_train[job_desc_cols].max(axis=1)
        X_train['job_desc_nonzero'] = (X_train[job_desc_cols] != 0).sum(axis=1)
        
        X_test['job_desc_mean'] = X_test[job_desc_cols].mean(axis=1)
        X_test['job_desc_std'] = X_test[job_desc_cols].std(axis=1)
        X_test['job_desc_sum'] = X_test[job_desc_cols].sum(axis=1)
        X_test['job_desc_min'] = X_test[job_desc_cols].min(axis=1)
        X_test['job_desc_max'] = X_test[job_desc_cols].max(axis=1)
        X_test['job_desc_nonzero'] = (X_test[job_desc_cols] != 0).sum(axis=1)
        
        # Advanced dimensionality reduction
        if is_training:
            # PCA for dense representation
            pca = PCA(n_components=25)
            job_desc_pca_train = pca.fit_transform(X_train[job_desc_cols])
            joblib.dump(pca, f'features/job_desc_pca_{timestamp}.joblib')
            
            # SVD for sparse representation
            svd = TruncatedSVD(n_components=25)
            job_desc_svd_train = svd.fit_transform(X_train[job_desc_cols])
            joblib.dump(svd, f'features/job_desc_svd_{timestamp}.joblib')
            
            # Add dimension reduction features to training data
            for i in range(25):
                X_train[f'pca_{i}'] = job_desc_pca_train[:, i]
                X_train[f'svd_{i}'] = job_desc_svd_train[:, i]
                
            # Clustering on job descriptions
            kmeans = KMeans(n_clusters=15, random_state=42, n_init=10)
            X_train['job_desc_cluster'] = kmeans.fit_predict(job_desc_pca_train)
            joblib.dump(kmeans, f'features/job_desc_kmeans_{timestamp}.joblib')
            
            # Create one-hot encoding for clusters
            cluster_encoder = OneHotEncoder(sparse_output=False)
            cluster_encoded_train = cluster_encoder.fit_transform(X_train[['job_desc_cluster']])
            joblib.dump(cluster_encoder, f'features/cluster_encoder_{timestamp}.joblib')
            
            cluster_cols = [f'cluster_{i}' for i in range(cluster_encoded_train.shape[1])]
            X_train_clusters = pd.DataFrame(cluster_encoded_train, columns=cluster_cols, index=X_train.index)
            X_train = pd.concat([X_train, X_train_clusters], axis=1)
            
        else:
            # Apply transformations to test data separately
            pca_files = sorted([f for f in os.listdir('features') if 'job_desc_pca' in f])
            svd_files = sorted([f for f in os.listdir('features') if 'job_desc_svd' in f])
            kmeans_files = sorted([f for f in os.listdir('features') if 'job_desc_kmeans' in f])
            cluster_files = sorted([f for f in os.listdir('features') if 'cluster_encoder' in f])
            
            if pca_files:
                pca = joblib.load(f'features/{pca_files[-1]}')
                job_desc_pca_test = pca.transform(X_test[job_desc_cols])
            else:
                print("Warning: PCA model not found.")
                job_desc_pca_test = np.zeros((X_test.shape[0], 25))
            
            if svd_files:
                svd = joblib.load(f'features/{svd_files[-1]}')
                job_desc_svd_test = svd.transform(X_test[job_desc_cols])
            else:
                print("Warning: SVD model not found.")
                job_desc_svd_test = np.zeros((X_test.shape[0], 25))
            
            # Add dimension reduction features to test data
            for i in range(25):
                X_test[f'pca_{i}'] = job_desc_pca_test[:, i]
                X_test[f'svd_{i}'] = job_desc_svd_test[:, i]
            
            # Apply clustering to test data
            if kmeans_files:
                kmeans = joblib.load(f'features/{kmeans_files[-1]}')
                X_test['job_desc_cluster'] = kmeans.predict(job_desc_pca_test)
            else:
                print("Warning: KMeans model not found.")
                X_test['job_desc_cluster'] = 0
            
            # Apply one-hot encoding to test clusters
            if cluster_files:
                cluster_encoder = joblib.load(f'features/{cluster_files[-1]}')
                cluster_encoded_test = cluster_encoder.transform(X_test[['job_desc_cluster']])
                
                cluster_cols = [f'cluster_{i}' for i in range(cluster_encoded_test.shape[1])]
                X_test_clusters = pd.DataFrame(cluster_encoded_test, columns=cluster_cols, index=X_test.index)
                X_test = pd.concat([X_test, X_test_clusters], axis=1)
        
        # Drop original job description columns to reduce dimensionality
        X_train = X_train.drop(columns=job_desc_cols)
        X_test = X_test.drop(columns=job_desc_cols)

    
    # Drop original categorical columns now that they're encoded
    cols_to_drop = ['job_title', 'job_state', 'feature_1']
    X_train = X_train.drop(columns=[c for c in cols_to_drop if c in X_train.columns])
    X_test = X_test.drop(columns=[c for c in cols_to_drop if c in X_test.columns])
    
    # Ensure all columns in test exist in train
    for col in X_train.columns:
        if col not in X_test.columns:
            X_test[col] = 0
    
    # Ensure all columns in train exist in test
    for col in X_test.columns:
        if col not in X_train.columns:
            X_train[col] = 0
    
    # Make sure columns are in the same order
    X_test = X_test[X_train.columns]
    
    # Save feature data
    if is_training:
        X_train.to_csv(f'features/X_train_features_{timestamp}.csv', index=False)
        if y is not None:
            pd.Series(y).to_csv(f'features/y_train_{timestamp}.csv', index=False)
    
    # Return prepared datasets
    if is_training:
        return X_train, X_test, y, test_obs
    else:
        return X_train, X_test, None, test_obs

# ----------------- Step 3: Advanced Model Building -----------------

def optimize_models(X, y, timestamp):
    """Optimize multiple models with Optuna and proper cross-validation."""
    
    # Initialize stratified k-fold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    best_models = {}
    
    # Define models to optimize
    models_to_optimize = {
        'XGBoost': optimize_xgboost,
        'CatBoost': optimize_catboost
    }
    
    # Run optimization for each model
    for model_name, optimizer_func in models_to_optimize.items():
        print(f"\nOptimizing {model_name}...")
        best_model = optimizer_func(X, y, cv)
        best_models[model_name] = best_model
        
        # Save the optimized model
        joblib.dump(best_model, f'models/{model_name}_optimized_{timestamp}.joblib')
        
        # Quick validation on full dataset
        cv_scores = cross_val_score(best_model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        print(f"{model_name} CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        
        # Save model performance
        with open(f'results/model_performance_{timestamp}.txt', 'a') as f:
            f.write(f"{model_name} CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}\n")
    
    return best_models

def optimize_xgboost(X, y, cv):
    """Optimize XGBoost hyperparameters with Optuna."""
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'random_state': 42,
            'n_jobs': -1
        }
        
        model = XGBClassifier(**params)
        scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        return scores.mean()
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    print(f"Best XGBoost Params: {study.best_params}")
    print(f"Best XGBoost CV Accuracy: {study.best_value:.4f}")
    
    return XGBClassifier(**study.best_params, n_jobs=-1)

def optimize_catboost(X, y, cv):
    """Optimize CatBoost hyperparameters with Optuna."""
    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
            'border_count': trial.suggest_int('border_count', 32, 255),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10.0),
            'random_seed': 42,
            'thread_count': -1,
            'verbose': False
        }
        
        model = CatBoostClassifier(**params)
        scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        return scores.mean()
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=30)  # Fewer trials since CatBoost is slower
    print(f"Best CatBoost Params: {study.best_params}")
    print(f"Best CatBoost CV Accuracy: {study.best_value:.4f}")
    
    return CatBoostClassifier(**study.best_params, thread_count=-1, verbose=False)

def build_ensemble(best_models, X, y, timestamp):
    """Build advanced ensemble models using optimized base models."""
    
    # Create a copy of best models to avoid modification
    models = {name: model for name, model in best_models.items()}
    
    # Add additional models for diversity
    models['RandomForest'] = RandomForestClassifier(
        n_estimators=500, max_depth=None, min_samples_split=2, max_features='sqrt', 
        n_jobs=-1, random_state=42
    )
    
    models['GradientBoosting'] = GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8,
        random_state=42
    )
    
    # Train additional models
    for name in ['RandomForest', 'GradientBoosting']:
        models[name].fit(X, y)
        joblib.dump(models[name], f'models/{name}_model_{timestamp}.joblib')
    
    # Create voting ensemble - fix naming conflict
    voting_clf = VotingClassifier(
        estimators=[
            ('rf', models['RandomForest']),
            ('gb', models['GradientBoosting']),
            ('xgb', models['XGBoost']),
            ('catb', models['CatBoost'])
        ],
        voting='soft'
    )
    voting_clf.fit(X, y)
    joblib.dump(voting_clf, f'models/voting_ensemble_{timestamp}.joblib')
    
    # Create stacking ensemble
    stacking_estimators = [
        ('rf', models['RandomForest']),
        ('gb', models['GradientBoosting']),
        ('xgb', models['XGBoost']),
        ('catb', models['CatBoost'])
    ]
    
    stacking_clf = StackingClassifier(
        estimators=stacking_estimators,
        final_estimator=RandomForestClassifier(random_state=42),
        cv=5
    )
    stacking_clf.fit(X, y)
    joblib.dump(stacking_clf, f'models/stacking_ensemble_{timestamp}.joblib')
    
    # Create a meta-ensemble combining models - fix naming conflict
    final_ensemble = VotingClassifier(
        estimators=[
            ('vote', voting_clf),
            ('stack', stacking_clf),
            ('xgb', models['XGBoost']),
            ('catb', models['CatBoost'])
        ],
        voting='soft'
    )
    final_ensemble.fit(X, y)
    
    # Save the final ensemble model
    joblib.dump(final_ensemble, f'models/final_ensemble_{timestamp}.joblib')
    
    return final_ensemble

# ----------------- Step 4: Training and Prediction -----------------

def run_training_pipeline():
    """Run the complete training pipeline with proper file management"""
    
    # Create timestamp for unique filenames
    timestamp = get_timestamp()
    print(f"Run timestamp: {timestamp}")
    
    # Create directories if they don't exist
    create_directories()
    
    print("\n=== Starting Training Pipeline ===\n")
    
    # Create a log file for this run
    with open(f'results/training_log_{timestamp}.txt', 'w') as log_file:
        log_file.write(f"Training Run: {timestamp}\n\n")
        log_file.write("=== Starting Training Pipeline ===\n\n")
    
    # Load data
    train_df, test_df = load_data()
    
    # Feature engineering
    print("\n=== Applying Feature Engineering ===\n")
    X_train, X_test, y_train, test_obs = engineer_features(train_df, test_df, timestamp)
    
    print(f"Final training data shape: {X_train.shape}")
    print(f"Final test data shape: {X_test.shape}")
    
    # Save shapes to log
    with open(f'results/training_log_{timestamp}.txt', 'a') as log_file:
        log_file.write(f"Training data shape: {X_train.shape}\n")
        log_file.write(f"Test data shape: {X_test.shape}\n\n")
    
    # Handle class imbalance with advanced resampling
    print("\n=== Handling Class Imbalance ===\n")
    smote_tomek = SMOTETomek(random_state=42)
    X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
    
    print(f"Class distribution after resampling: {np.bincount(y_resampled)}")
    
    # Save resampling info to log
    with open(f'results/training_log_{timestamp}.txt', 'a') as log_file:
        log_file.write(f"Class distribution after resampling: {np.bincount(y_resampled)}\n\n")
    
    # Optimize models
    print("\n=== Optimizing Base Models ===\n")
    best_models = optimize_models(X_resampled, y_resampled, timestamp)
    
    # Build ensemble
    print("\n=== Building Ensemble Model ===\n")
    final_model = build_ensemble(best_models, X_resampled, y_resampled, timestamp)
    
    # Save model for future use
    joblib.dump(final_model, f'models/final_salary_model_{timestamp}.joblib')
    
    # Generate predictions
    print("\n=== Generating Predictions ===\n")
    predictions = final_model.predict(X_test)
    
    # Map predictions back to categories
    reverse_mapping = {2: 'High', 1: 'Medium', 0: 'Low'}
    predictions_labels = np.array([reverse_mapping[p] for p in predictions])
    
    # Create submission
    submission = pd.DataFrame({
        'obs': test_obs,
        'salary_category': predictions_labels
    })
    
    # Save submission
    submission_path = f'submissions/solution_format_{timestamp}.csv'
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")
    
    # Report prediction distribution
    print("\n=== Prediction Distribution ===\n")
    pred_distribution = pd.Series(predictions_labels).value_counts()
    print(pred_distribution)
    
    # Save results to log
    with open(f'results/training_log_{timestamp}.txt', 'a') as log_file:
        log_file.write("=== Prediction Distribution ===\n")
        log_file.write(f"{pred_distribution.to_string()}\n\n")
        log_file.write(f"Submission saved to {submission_path}\n")
    
    # Create a symlink to the latest model
    latest_model_path = 'models/latest_model.joblib'
    if os.path.exists(latest_model_path):
        os.remove(latest_model_path)
    if os.name == 'nt':  # Windows
        # Windows doesn't support symbolic links easily, so we'll copy the file
        import shutil
        shutil.copy2(f'models/final_salary_model_{timestamp}.joblib', latest_model_path)
    else:
        # For Unix-based systems
        os.symlink(f'final_salary_model_{timestamp}.joblib', latest_model_path)
    
    print("\n=== Training Pipeline Complete ===\n")
    return final_model

if __name__ == "__main__":
    run_training_pipeline()

Run timestamp: 20250423_135441

=== Starting Training Pipeline ===

Train data shape: (1280, 317)
Test data shape: (854, 316)

=== Applying Feature Engineering ===



  File "c:\Users\damod\anaconda3\envs\nova\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\damod\anaconda3\envs\nova\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\damod\anaconda3\envs\nova\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\d

Final training data shape: (1280, 207)
Final test data shape: (854, 207)

=== Handling Class Imbalance ===

Class distribution after resampling: [470 464 467]

=== Optimizing Base Models ===


Optimizing XGBoost...


[I 2025-04-23 13:54:50,822] Trial 0 finished with value: 0.7851499745805796 and parameters: {'n_estimators': 740, 'max_depth': 7, 'learning_rate': 0.025618211059620664, 'subsample': 0.5680064509333211, 'colsample_bytree': 0.9559255099275796, 'min_child_weight': 9, 'reg_alpha': 3.0752800848207787e-07, 'reg_lambda': 0.5625395573328916, 'gamma': 0.05430582884404317}. Best is trial 0 with value: 0.7851499745805796.
[I 2025-04-23 13:54:53,528] Trial 1 finished with value: 0.7508973055414335 and parameters: {'n_estimators': 140, 'max_depth': 6, 'learning_rate': 0.011443443891037635, 'subsample': 0.8158059311399543, 'colsample_bytree': 0.7333592521164647, 'min_child_weight': 7, 'reg_alpha': 0.4657323358105676, 'reg_lambda': 0.3253777020594294, 'gamma': 3.257279575643104e-07}. Best is trial 0 with value: 0.7851499745805796.
[I 2025-04-23 13:55:04,163] Trial 2 finished with value: 0.7987214031520081 and parameters: {'n_estimators': 764, 'max_depth': 11, 'learning_rate': 0.02510626975005081, 'su

Best XGBoost Params: {'n_estimators': 501, 'max_depth': 10, 'learning_rate': 0.018749402591536447, 'subsample': 0.7162082327758323, 'colsample_bytree': 0.7151681219871792, 'min_child_weight': 1, 'reg_alpha': 2.711808432344372e-06, 'reg_lambda': 0.0478131139360345, 'gamma': 1.892745571057327e-07}
Best XGBoost CV Accuracy: 0.8051


[I 2025-04-23 14:00:37,252] A new study created in memory with name: no-name-69ccf530-5011-4329-9068-5106fc672b8d


XGBoost CV Accuracy: 0.7973 ± 0.0257

Optimizing CatBoost...


[W 2025-04-23 14:01:01,071] Trial 0 failed with parameters: {'iterations': 428, 'depth': 9, 'learning_rate': 0.010457773760883068, 'l2_leaf_reg': 0.0013078910949593458, 'random_strength': 4.140447401164749e-08, 'border_count': 129, 'bagging_temperature': 3.7361131394880873} because of the following error: The value nan is not acceptable.
[W 2025-04-23 14:01:01,071] Trial 0 failed with value np.float64(nan).
[I 2025-04-23 14:07:11,713] Trial 1 finished with value: 0.7822953736654805 and parameters: {'iterations': 867, 'depth': 9, 'learning_rate': 0.045106093960591495, 'l2_leaf_reg': 0.0017678706992571046, 'random_strength': 1.0837481483261338e-06, 'border_count': 223, 'bagging_temperature': 8.462377171995024}. Best is trial 1 with value: 0.7822953736654805.
[I 2025-04-23 14:08:25,636] Trial 2 finished with value: 0.8030071174377225 and parameters: {'iterations': 496, 'depth': 6, 'learning_rate': 0.03096979388426965, 'l2_leaf_reg': 2.2312798895384105e-06, 'random_strength': 3.03589674716

Best CatBoost Params: {'iterations': 844, 'depth': 7, 'learning_rate': 0.2188504393461133, 'l2_leaf_reg': 0.0007447295295401203, 'random_strength': 0.00027291656783447714, 'border_count': 77, 'bagging_temperature': 1.009698991167879}
Best CatBoost CV Accuracy: 0.8108
CatBoost CV Accuracy: 0.8080 ± 0.0173

=== Building Ensemble Model ===


=== Generating Predictions ===

Submission saved to submissions/solution_format_20250423_135441.csv

=== Prediction Distribution ===

Low       849
High        4
Medium      1
Name: count, dtype: int64

=== Training Pipeline Complete ===

