# Test Data Loading

**Author**: Divyanshu Patel - 23BAI1214

This notebook verifies that the data loading works correctly.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import sys
import os

# Add the src directory to the import path.
# The path is resolved to an absolute one so the entry stays valid even if a
# later cell changes the working directory, and it is appended only when it
# is not already present so re-running this cell does not accumulate
# duplicate sys.path entries.
src_dir = os.path.abspath(os.path.join('..', '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import custom modules (resolved through the sys.path entry added above)
from data_loader import load_data, create_master_dataset

print("Testing data loading from notebook...")

In [None]:
# Smoke test: load the raw tables, report their shapes, then build and
# preview the joined master dataset.
print("Loading and preprocessing data...")

# load_data() yields the three raw tables; a None first element is treated
# as "loading failed".
expeditions, members, peaks = load_data()

if expeditions is None:
    print("Could not load real data.")
else:
    print("Data loaded successfully!")
    print(f"Expeditions shape: {expeditions.shape}")
    print(f"Members shape: {members.shape}")
    print(f"Peaks shape: {peaks.shape}")

    # Join the three tables; a None result is reported rather than used.
    df = create_master_dataset(expeditions, members, peaks)
    if df is None:
        print("Failed to create master dataset")
    else:
        print(f"Created master dataset: {df.shape}")
        print("First few rows of master dataset:")
        print(df.head())

In [None]:
# Load the real dataset and build the master DataFrame `df`; if either step
# fails, fall back to a reproducible synthetic dataset so that `df` is always
# defined for the cells that follow.
print("Loading and preprocessing data...")

# Load the real Himalayan expedition dataset
# Make sure you've downloaded the dataset from Kaggle and placed it in data/
expeditions, members, peaks = load_data()

df = None
if expeditions is not None:
    print("Data loaded successfully!")

    # Create master dataset by joining all three DataFrames.
    # The previous cell treats a None result as a possible failure mode, so
    # guard against it here too instead of crashing on `df.shape`.
    df = create_master_dataset(expeditions, members, peaks)
    if df is not None:
        print(f"Created master dataset: {df.shape}")
    else:
        print("Failed to create master dataset. Creating sample data for demonstration.")
else:
    print("Could not load real data. Creating sample data for demonstration.")

if df is None:
    # Create sample data for demonstration.
    # Fixed seed so the synthetic dataset is identical on every run.
    np.random.seed(42)
    n_samples = 1000

    # Each column is drawn independently; randint's upper bound is exclusive.
    sample_data = {
        'age': np.random.randint(20, 65, n_samples),
        'sex': np.random.choice(['M', 'F'], n_samples),
        'season': np.random.choice(['Spring', 'Autumn', 'Winter', 'Summer'], n_samples),
        'members': np.random.randint(1, 20, n_samples),
        'hired_staff': np.random.randint(0, 15, n_samples),
        'heightm': np.random.randint(6000, 8900, n_samples),
        'o2used': np.random.choice([True, False], n_samples),
        'totmembers': np.random.randint(1, 20, n_samples),
        'success1': np.random.choice([True, False], n_samples),
    }

    df = pd.DataFrame(sample_data)
    print(f"Created sample dataset: {df.shape}")