In [None]:
# Standard library imports
import sys
import urllib.request
from pathlib import Path

# Third party imports
import joblib
import pandas as pd

### Run configuration

In [None]:
# Flag to control environment-specific paths & configurations.
# Set to True when running on Kaggle (assets are read from /kaggle/input and
# the submission is written to the working directory); leave False for a
# local/GitHub run that uses the repository's relative directories.
KAGGLE = False

### Add ensemble_classifier module to path

In [None]:
# Locate the directory that contains the ensemble_classifier module
if KAGGLE:
    # On Kaggle, the module should be uploaded as part of the dataset
    module_path = Path('/kaggle/input/diabetes-challenge-ensemble-model-assets')
else:
    # For local/GitHub, use the models directory
    module_path = Path('../models').resolve()

# Put the module directory at the front of sys.path. Entries left behind by an
# earlier execution of this cell are dropped first, so re-running the cell
# keeps exactly one copy instead of growing sys.path on every run.
module_dir = str(module_path)
sys.path[:] = [entry for entry in sys.path if entry != module_dir]
sys.path.insert(0, module_dir)

# Import ensemble classifier (needed for model deserialization). The import
# has to come after the sys.path change above, which is why it is not in the
# top-of-notebook imports cell.
from ensemble_classifier import EnsembleClassifier  # noqa: E402,F401

print(f"Module path: {module_path}")
print("EnsembleClassifier imported successfully")

## 1. Asset loading

In [None]:
# Resolve the test-data and model locations for the active environment
if not KAGGLE:
    # Local run: the test set is read straight from its hosted URL
    test_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_test.csv'

    # Gather every run_* directory under ../models, ordered by name descending.
    # NOTE(review): treating the first entry as "most recent" assumes run
    # directory names sort chronologically - confirm against the training
    # notebook's naming scheme.
    models_dir = Path('../models')
    run_dirs = sorted(
        (entry for entry in models_dir.glob('run_*') if entry.is_dir()),
        reverse=True,
    )

    if not run_dirs:
        raise FileNotFoundError("No ensemble model runs found. Train a model first.")

    # Head of the descending list is taken as the latest run
    latest_run = run_dirs[0]
    model_path = latest_run / 'ensemble_model.joblib'

    # Fail early with a pointer to the training notebook if the run has no model
    model_is_missing = not model_path.exists()
    if model_is_missing:
        missing_model_message = (
            f"Model not found at {model_path}. "
            "Make sure to create the wrapped model in the training notebook."
        )
        raise FileNotFoundError(missing_model_message)
else:
    # Kaggle run: both assets live under /kaggle/input/
    test_df_path = '/kaggle/input/playground-series-s5e12/test.csv'
    model_path = '/kaggle/input/diabetes-challenge-ensemble-model-assets/ensemble_model.joblib'

print(f"Loading test data from: {test_df_path}")
print(f"Loading model from: {model_path}")

# Read the test set into a DataFrame and report its dimensions
test_df = pd.read_csv(test_df_path)
print(f"\nTest data shape: {test_df.shape}")

# Deserialize the trained ensemble.
# NOTE: joblib.load unpickles arbitrary Python objects, so only point this at
# model files from a trusted source.
model = joblib.load(model_path)
print(f"\nModel loaded: {model}")

# Preview the first rows as the cell output
test_df.head()

## 2. Inference

In [None]:
# Announce the run: row count of the test set and the ensemble's n_models_
n_samples = len(test_df)
print("Running inference...")
print(f"  Processing {n_samples:,} samples through {model.n_models_} ensemble models")

# Score the full test frame with the loaded ensemble.
# NOTE(review): the submission carries the integer-cast output of predict();
# confirm this is what the competition scores (a probability column may be
# expected instead).
predictions = model.predict(test_df)

# Assemble the two-column submission table, both columns cast to int
submission_columns = {
    'id': test_df['id'].astype(int),
    'diagnosed_diabetes': predictions.astype(int),
}
predictions_df = pd.DataFrame(submission_columns)

# Report the share of each predicted class, ordered by class label
print("\nPredictions complete!")
print("  Class distribution:")
class_shares = predictions_df['diagnosed_diabetes'].value_counts(normalize=True).sort_index()
print(class_shares)

# Show the first ten submission rows as the cell output
predictions_df.head(10)

## 3. Save submission file

In [None]:
# Choose where the submission CSV is written for the active environment
if not KAGGLE:
    # Local run: write under ../data, creating the directory when it is absent
    data_dir = Path('../data')
    data_dir.mkdir(parents=True, exist_ok=True)
    submission_path = data_dir / 'ensemble_submission.csv'
else:
    # Kaggle run: write submission.csv into the current working directory
    submission_path = Path('submission.csv')

# Write the predictions without the DataFrame index column
predictions_df.to_csv(submission_path, index=False)

# Confirm the location and report the on-disk size in kilobytes
submission_size_kb = submission_path.stat().st_size / 1024
print(f'Submission saved to: {submission_path}')
print(f'File size: {submission_size_kb:.1f} KB')

## 4. Summary

In [None]:
# Closing recap: model repr, number of rows predicted, and the output file,
# framed by 80-character rules. One print call emits the lines in order.
rule = "=" * 80
print(
    rule,
    "INFERENCE COMPLETE",
    rule,
    f"Model: {model}",
    f"Samples processed: {len(predictions_df):,}",
    f"Submission file: {submission_path}",
    rule,
    sep="\n",
)