# Himalayan Expedition Success Prediction - XGBoost Model

**Author**: 

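This notebook trains an XGBoost classifier to predict whether a Himalayan expedition succeeds: it loads the expedition data (falling back to synthetic sample data when the real dataset is unavailable), prepares and scales the features, trains and evaluates the model, and saves the fitted model.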

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import sys
import os

# Make the project's src/ directory importable.
# The location is resolved to an absolute path once, so the sys.path entry keeps
# pointing at the same directory even if the working directory changes later,
# and the membership check stops re-runs of this cell from appending duplicates.
src_dir = os.path.abspath(os.path.join('..', '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import custom modules (these live in the src/ directory added above)
from data_loader import load_data, create_master_dataset
from utils import prepare_features, save_model, evaluate_model

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("XGBoost Model Implementation for Himalayan Expedition Success Prediction")

In [None]:
# Load and preprocess the data
print("Loading and preprocessing data...")

# Load the real Himalayan expedition dataset
# Make sure you've downloaded the dataset from Kaggle and placed it in data/
expeditions, members, peaks = load_data()

# All three tables are handed to create_master_dataset, so every one of them
# must be present; if any is missing we fall back to the synthetic data instead
# of attempting a join with a None table.
all_tables_loaded = all(
    table is not None for table in (expeditions, members, peaks)
)

if all_tables_loaded:
    print("Data loaded successfully!")

    # Create master dataset by joining all three DataFrames
    df = create_master_dataset(expeditions, members, peaks)
    print(f"Created master dataset: {df.shape}")
else:
    print("Could not load real data. Creating sample data for demonstration.")

    # Create sample data for demonstration.
    # The global seed makes the synthetic frame reproducible across runs; the
    # columns are drawn independently at random, so this data carries no real
    # signal and is only useful for exercising the pipeline end to end.
    np.random.seed(42)
    n_samples = 1000

    sample_data = {
        'age': np.random.randint(20, 65, n_samples),
        'sex': np.random.choice(['M', 'F'], n_samples),
        'season': np.random.choice(['Spring', 'Autumn', 'Winter', 'Summer'], n_samples),
        'members': np.random.randint(1, 20, n_samples),
        'hired_staff': np.random.randint(0, 15, n_samples),
        'heightm': np.random.randint(6000, 8900, n_samples),
        'o2used': np.random.choice([True, False], n_samples),
        'totmembers': np.random.randint(1, 20, n_samples),
        'success1': np.random.choice([True, False], n_samples)
    }

    df = pd.DataFrame(sample_data)
    print(f"Created sample dataset: {df.shape}")

In [None]:
# Prepare features for modeling
print("Preparing features...")

# prepare_features returns the feature matrix, the target vector and the
# encoders, which are kept so they can be saved alongside the model.
X, y, encoders = prepare_features(df)
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Split the data.
# stratify=y keeps the class proportions of the target the same in the train
# and test sets, so the reported accuracy is not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features.
# The scaler is fitted on the training data only and then applied to the test
# data, so no statistics from the test set leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

In [None]:
# XGBoost Model Implementation
print("Training XGBoost model...")

# Hyperparameters for the classifier, gathered in one place for easy tuning
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'random_state': 42,
}

# Build the classifier and fit it on the scaled training features
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

# Predict labels for the held-out (scaled) test features
y_pred = xgb_model.predict(X_test_scaled)

# Compare predictions with the true labels; the result is indexed by 'accuracy'
results = evaluate_model(y_test, y_pred)
print(f"XGBoost Model Accuracy: {results['accuracy']:.4f}")

In [None]:
# Persist the fitted model together with the encoders and scaler it was trained with
print("Saving the model...")
save_model(xgb_model, encoders, scaler, 'xgboost')
print("Model saved successfully!")

# Final summary of the run.
# NOTE(review): the path printed below is hard-coded in this notebook; confirm
# it matches the location save_model actually writes to.
final_accuracy = results['accuracy']
summary_lines = (
    "\nXGBoost model training completed!",
    f"Accuracy: {final_accuracy:.4f}",
    "Model saved to saved_models/xgboost_model.pkl",
)
for summary_line in summary_lines:
    print(summary_line)