In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:

def load_and_prepare_data():
    """Load and prepare the data"""
    # Load data
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    
    # Separate features and target
    X = train_df.drop('salary', axis=1)
    y = train_df['salary']
    
    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_val, y_train, y_val, X, y

def create_preprocessor():
    """Create preprocessing pipeline"""
    # Identify numerical and categorical columns
    numerical_cols = ['experience', 'education', 'company_size']  # Add your numerical columns
    categorical_cols = ['job_title', 'location', 'industry']  # Add your categorical columns
    
    # Create preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])
    
    return preprocessor




In [3]:

def train_models(X_train, X_val, y_train, y_val, preprocessor):
    """Train and evaluate multiple models"""
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'XGBoost': xgb.XGBRegressor(random_state=42)
    }
    
    results = {}
    best_model = None
    best_score = float('-inf')
    
    for name, model in models.items():
        # Create pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        
        # Train model
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = pipeline.predict(X_val)
        
        # Calculate metrics
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        
        # Store results
        results[name] = {
            'model': pipeline,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
        
        # Update best model
        if r2 > best_score:
            best_score = r2
            best_model = pipeline
    
    return results, best_model


In [4]:

def tune_xgboost(X, y, preprocessor):
    """Perform hyperparameter tuning for XGBoost"""
    # Create base pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(random_state=42))
    ])
    
    # Define parameter grid
    param_grid = {
        'model__n_estimators': [500, 1000, 1500],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [5, 7, 9],
        'model__min_child_weight': [1, 3, 5],
        'model__subsample': [0.8, 0.9, 1.0],
        'model__colsample_bytree': [0.8, 0.9, 1.0]
    }
    
    # Perform grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=1
    )
    
    # Fit grid search
    grid_search.fit(X, y)
    
    return grid_search.best_estimator_


In [5]:

def evaluate_model(model, X_val, y_val):
    """Evaluate model performance"""
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"R2 Score: {r2:.2f}")
    
    # Plot actual vs predicted
    plt.figure(figsize=(10, 6))
    plt.scatter(y_val, y_pred, alpha=0.5)
    plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--')
    plt.title('Actual vs Predicted Salaries')
    plt.xlabel('Actual Salary')
    plt.ylabel('Predicted Salary')
    plt.show()


In [6]:

def main():
    # Load and prepare data
    X_train, X_val, y_train, y_val, X, y = load_and_prepare_data()
    
    # Create preprocessor
    preprocessor = create_preprocessor()
    
    # Train and evaluate multiple models
    print("Training and evaluating multiple models...")
    results, best_model = train_models(X_train, X_val, y_train, y_val, preprocessor)
    
    # Print results
    print("\nModel Comparison:")
    for name, result in results.items():
        print(f"\n{name}:")
        print(f"R2 Score: {result['r2']:.4f}")
        print(f"RMSE: {result['rmse']:.2f}")
    
    # Tune XGBoost
    print("\nPerforming hyperparameter tuning for XGBoost...")
    tuned_xgb = tune_xgboost(X, y, preprocessor)
    
    # Evaluate tuned model
    print("\nEvaluating tuned XGBoost model:")
    evaluate_model(tuned_xgb, X_val, y_val)
    
    # Save the best model
    joblib.dump(tuned_xgb, 'salary_predictor.joblib')
    print("\nModel saved as 'salary_predictor.joblib'")

if __name__ == "__main__":
    main() 

KeyError: "['salary'] not found in axis"

2025-04-19 21:37:00,402 - INFO - Starting model building process...
2025-04-19 21:37:00,402 - INFO - Loading and preparing data...
2025-04-19 21:37:00,521 - INFO - Training data shape: (1280, 317)
2025-04-19 21:37:00,522 - INFO - Test data shape: (854, 316)
2025-04-19 21:37:00,523 - ERROR - Error loading data: "['salary'] not found in axis"
2025-04-19 21:37:00,523 - ERROR - Error in main process: "['salary'] not found in axis"


KeyError: "['salary'] not found in axis"