In [1]:
# =======================
# Imports & Configuration
# =======================
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn: Preprocessing, Modeling, Evaluation, Feature Selection
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, VarianceThreshold
from sklearn.impute import SimpleImputer

# Statistical tools
from scipy import stats
from scipy.stats import chi2_contingency

# Matplotlib & seaborn configuration
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9

import warnings
warnings.filterwarnings('ignore')

## Step 0: Data Loading and Inspection

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/stepthom/869_course/refs/heads/main/data/spaceship_titanic_train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
# Let's print some descriptive statistics for all the numeric features.

df.describe().T
# This gives that it is highly right-skewed for all numeric features apart from age
# The age distribution is right-skewed. The majority of passengers are young adults (20–30).
# The median is close to the mean, the skew isn't too extreme.

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,8514.0,28.82793,14.489021,0.0,19.0,27.0,38.0,79.0
RoomService,8512.0,224.687617,666.717663,0.0,0.0,0.0,47.0,14327.0
FoodCourt,8510.0,458.077203,1611.48924,0.0,0.0,0.0,76.0,29813.0
ShoppingMall,8485.0,173.729169,604.696458,0.0,0.0,0.0,27.0,23492.0
Spa,8510.0,311.138778,1136.705535,0.0,0.0,0.0,59.0,22408.0
VRDeck,8505.0,304.854791,1145.717189,0.0,0.0,0.0,46.0,24133.0


In [5]:
# Let's print some descriptive statistics for all the numeric features.

df.describe().T# What is the number of unique values in all the categorical features? And what is
# the value with the highest frequency?

df.describe(include=object).T
# can frop 'Name' feature
# looking at the dataset from cabin, can see whether or not passenger is solo or in a group.

Unnamed: 0,count,unique,top,freq
PassengerId,8693,8693,0001_01,1
HomePlanet,8492,3,Earth,4602
CryoSleep,8476,2,False,5439
Cabin,8494,6560,G/734/S,8
Destination,8511,3,TRAPPIST-1e,5915
VIP,8490,2,False,8291
Name,8493,8473,Gollux Reedall,2


In [6]:
# How much missing data is in each feature?

df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [7]:
# For convienience, let's save the names of all numeric features to a list,
# and the names of all categorical features to another list.

numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

categorical_features = ['HomePlanet', 'VIP', 'CryoSleep', 'Destination', 'Cabin', 'Name']

In [8]:
# ================================================================
# SPACESHIP TITANIC: RESCUE MISSION - ADVANCED EDA & FEATURE ENGINEERING
# ================================================================
# Mission: Extract every signal from damaged ship logs to save more lives
# Objective: Build AI-powered triage engine for passenger rescue prediction
# Target: Every 1% accuracy improvement = hundreds more lives saved

print("🚀 SPACESHIP TITANIC RESCUE MISSION INITIATED")
print("=" * 60)

# ================================================================
# PHASE 1: INTELLIGENCE GATHERING - LOAD & INITIAL INSPECTION
# ================================================================

def load_and_inspect_data():
    """Load data and perform initial intelligence gathering"""
    print("\n📊 PHASE 1: INTELLIGENCE GATHERING")
    print("-" * 40)

    # Load training data
    df = pd.read_csv("https://raw.githubusercontent.com/stepthom/869_course/refs/heads/main/data/spaceship_titanic_train.csv")

    print(f"🔍 Mission Log Analysis:")
    print(f"   - Total passengers in manifest: {len(df):,}")
    print(f"   - Data integrity: {df.shape[1]} features recorded")
    print(f"   - Missing data assessment needed...")

    return df

def comprehensive_data_quality_analysis(df):
    """Deep dive into data quality and patterns"""
    print("\n🔬 COMPREHENSIVE DATA QUALITY ANALYSIS")
    print("-" * 45)

    # Basic info
    print(f"Dataset Shape: {df.shape}")
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Missing data analysis
    missing_analysis = pd.DataFrame({
        'Column': df.columns,
        'Missing_Count': df.isnull().sum(),
        'Missing_Percentage': (df.isnull().sum() / len(df) * 100).round(2),
        'Data_Type': df.dtypes
    }).sort_values('Missing_Percentage', ascending=False)

    print("\n📋 MISSING DATA INTELLIGENCE REPORT:")
    print(missing_analysis[missing_analysis['Missing_Count'] > 0])

    # Class distribution analysis
    print("\n⚖️ TARGET DISTRIBUTION ANALYSIS:")
    target_dist = df['Transported'].value_counts()
    print(f"Transported: {target_dist[True]:,} ({target_dist[True]/len(df)*100:.1f}%)")
    print(f"Not Transported: {target_dist[False]:,} ({target_dist[False]/len(df)*100:.1f}%)")

    # Data types summary
    print(f"\n📊 DATA TYPES SUMMARY:")
    dtype_summary = df.dtypes.value_counts()
    for dtype, count in dtype_summary.items():
        print(f"   {dtype}: {count} features")

    return missing_analysis


try:
    # Execute the complete EDA pipeline
    print("🚀 Starting Spaceship Titanic Rescue Mission Analysis...")

    # Phase 1: Load and inspect data
    df = load_and_inspect_data()

    # Phase 2: Comprehensive data quality analysis
    missing_analysis = comprehensive_data_quality_analysis(df)

    # Mission completion summary
    print("\n" + "=" * 60)
    print("🎉 RESCUE MISSION EDA COMPLETE!")
    print("=" * 60)
    print(f"✅ Analyzed {len(df):,} passenger records")
    print(f"✅ Generated comprehensive visualizations")
    print(f"✅ Identified key rescue patterns")
    print("\n🚨 Ready for model development phase!")

except Exception as e:
    print(f"❌ Mission encountered error: {str(e)}")
    print("🔧 Check data source and dependencies")

🚀 SPACESHIP TITANIC RESCUE MISSION INITIATED
🚀 Starting Spaceship Titanic Rescue Mission Analysis...

📊 PHASE 1: INTELLIGENCE GATHERING
----------------------------------------
🔍 Mission Log Analysis:
   - Total passengers in manifest: 8,693
   - Data integrity: 14 features recorded
   - Missing data assessment needed...

🔬 COMPREHENSIVE DATA QUALITY ANALYSIS
---------------------------------------------
Dataset Shape: (8693, 14)
Memory Usage: 3.65 MB

📋 MISSING DATA INTELLIGENCE REPORT:
                    Column  Missing_Count  Missing_Percentage Data_Type
CryoSleep        CryoSleep            217                2.50    object
ShoppingMall  ShoppingMall            208                2.39   float64
VIP                    VIP            203                2.34    object
HomePlanet      HomePlanet            201                2.31    object
Name                  Name            200                2.30    object
Cabin                Cabin            199                2.29    object
VRD

In [32]:
# ================================================================
# Feature Engineering Helper Functions
# ================================================================

def handle_missing_values(df, categorical_features):
    """Handle missing values for numerical and categorical features"""
    df = df.copy()

    # Impute Age with mean
    age_imputer = SimpleImputer()
    df['Age'] = age_imputer.fit_transform(df[['Age']]).flatten()

    # Impute spending columns with most frequent value
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    spent_imputer = SimpleImputer(strategy='most_frequent')
    df[spending_cols] = spent_imputer.fit_transform(df[spending_cols])

    # Fill categorical missing values with 'Missing'
    for col in categorical_features:
        df[col] = df[col].fillna('Missing')

    return df

def create_features(df):
    """Create engineered features"""
    df = df.copy()

    # Solo traveler feature
    df['SoloTraveler'] = (df['Cabin'].map(df['Cabin'].value_counts()) == 1)

    # Spending features
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalSpend'] = df[spending_cols].sum(axis=1)
    df['LuxurySpend'] = df[['Spa', 'VRDeck']].sum(axis=1)
    df['BasicSpend'] = df[['RoomService', 'FoodCourt']].sum(axis=1)

    # Cabin-based interaction features
    cabin_prefix = df['Cabin'].astype(str).str.split('/').str[0]
    df['Cabin_HomePlanet'] = cabin_prefix + "_" + df['HomePlanet'].astype(str)
    df['Cabin_Destination'] = cabin_prefix + "_" + df['Destination'].astype(str)
    df['Cabin_CryoSleep'] = cabin_prefix + "_" + df['CryoSleep'].astype(str)

    return df

def convert_data_types(df):
    """Convert columns to appropriate data types"""
    df = df.copy()

    # Convert to categorical
    categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep', 'SoloTraveler',
                        'Cabin_HomePlanet', 'Cabin_Destination', 'Cabin_CryoSleep']

    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df

def select_features_and_prepare_target(df, features_to_drop=None, is_train=True):
    """Select features and prepare target variable"""
    df = df.copy()

    # Default features to drop
    default_drop = ['PassengerId', 'Cabin', 'Name']

    if features_to_drop:
        default_drop.extend(features_to_drop)

    # Remove target variable from features if it exists
    if is_train and 'Transported' in df.columns:
        y = df['Transported']
        default_drop.append('Transported')
    else:
        y = None

    # Select features
    X = df.drop(default_drop, axis=1, errors='ignore')

    # Get categorical feature names
    categorical_features = X.select_dtypes(include='category').columns.tolist()

    return X, y, X.columns.tolist(), categorical_features

def encode_categorical_features(X, categorical_features, encoders=None, fit=True):
    """Encode categorical features using LabelEncoder"""
    X_encoded = X.copy()

    if encoders is None:
        encoders = {}

    for col in categorical_features:
        if col in X_encoded.columns:
            X_encoded[col] = X_encoded[col].astype(str)

            if fit:
                # Fit new encoder for training data
                le = LabelEncoder()
                X_encoded[col] = le.fit_transform(X_encoded[col])
                encoders[col] = le
            else:
                # Use existing encoder for test data
                if col in encoders:
                    # Handle unseen categories
                    le = encoders[col]
                    mask = X_encoded[col].isin(le.classes_)
                    X_encoded.loc[mask, col] = le.transform(X_encoded.loc[mask, col])
                    X_encoded.loc[~mask, col] = -1  # Assign -1 to unseen categories

                    # Explicitly convert to int64 to match training data
                    X_encoded[col] = X_encoded[col].astype('int64')
                else:
                    # If no encoder exists, assign -1 to all values
                    X_encoded[col] = -1
                    X_encoded[col] = X_encoded[col].astype('int64')

    # Ensure all categorical features are properly converted to int64
    for col in categorical_features:
        if col in X_encoded.columns:
            X_encoded[col] = pd.to_numeric(X_encoded[col], errors='coerce').fillna(-1).astype('int64')

    return X_encoded, encoders

# ================================================================
# FEATURE ENGINEERING PIPELINE
# ================================================================

def run_feature_engineering_pipeline(df):
    """
    Complete Feature Engineering Pipeline matching original implementation

    Processes raw data through all steps including encoding to output ML-ready dataset.

    Args:
        df: Raw passenger dataframe

    Returns:
        final_df: Fully processed dataframe with encoded features ready for ML
    """
    print("🚀 SPACESHIP TITANIC FEATURE ENGINEERING PIPELINE")
    print("=" * 60)

    # Create working copy and preserve passenger ID
    df = df.copy()
    if 'PassengerId' in df.columns:
        df['PassengerId'] = df['PassengerId'].astype(str)
        df['_PassengerId_Original'] = df['PassengerId']

    original_features = df.shape[1]

    # Step 1: Missing Value Imputation
    print("\n🩹 STEP 1: MISSING VALUE IMPUTATION")
    print("-" * 40)

    categorical_features = ['HomePlanet', 'VIP', 'CryoSleep', 'Destination', 'Cabin', 'Name']
    df = handle_missing_values(df, categorical_features)
    print("   ✅ Age: Missing values filled with mean")
    print("   ✅ Spending columns: Imputed with most frequent values")
    print("   ✅ Categorical columns: Missing values marked as 'Missing'")

    # Step 2: Feature Creation
    print("\n⚡ STEP 2: NEW FEATURE CREATION")
    print("-" * 35)

    df = create_features(df)
    print("   ✅ SoloTraveler: Binary feature based on cabin occupancy")
    print("   ✅ TotalSpend: Sum of all amenity spending")
    print("   ✅ LuxurySpend: Spa + VRDeck spending")
    print("   ✅ BasicSpend: RoomService + FoodCourt spending")
    print("   ✅ Cabin interactions: Cabin prefix + passenger attributes")

    # Step 3: Data Type Optimization
    print("\n🤖 STEP 3: DATA TYPE OPTIMIZATION")
    print("-" * 38)

    df = convert_data_types(df)
    print("   ✅ Categorical columns converted to 'category' dtype")

    # Step 4: Feature Selection and Target Preparation
    print("\n🎯 STEP 4: FEATURE SELECTION")
    print("-" * 30)

    X, y, feature_names, cat_features = select_features_and_prepare_target(
        df, is_train=('Transported' in df.columns)
    )
    print(f"   ✅ Selected {len(feature_names)} features for ML")
    print(f"   ✅ Identified {len(cat_features)} categorical features")

    # Step 5: Categorical Encoding
    print("\n🔢 STEP 5: CATEGORICAL ENCODING")
    print("-" * 35)

    X_encoded, encoders = encode_categorical_features(X, cat_features, fit=True)
    print(f"   ✅ {len(cat_features)} categorical features encoded to numeric")
    print("   ✅ All features now ML-ready")

    # Step 6: Final Dataset Assembly
    print("\n📦 STEP 6: FINAL DATASET ASSEMBLY")
    print("-" * 36)

    # Create final dataset with encoded features
    final_df = X_encoded.copy()

    # Add back essential columns
    if 'PassengerId' in df.columns:
        final_df['PassengerId'] = df['PassengerId'].values
    if '_PassengerId_Original' in df.columns:
        final_df['_PassengerId_Original'] = df['_PassengerId_Original'].values
    if y is not None:
        final_df['Transported'] = y.values

    print("   ✅ Final dataset assembled with all encoded features")
    print("   ✅ Essential identification columns preserved")

    # Summary
    print("\n" + "=" * 60)
    print("🎉 FEATURE ENGINEERING COMPLETE!")
    print(f"   📊 Original features: {original_features}")
    print(f"   📊 ML features: {X_encoded.shape[1]}")
    print(f"   📊 Total columns: {final_df.shape[1]} (includes IDs and target)")
    print("   🚀 Dataset fully processed and ML-ready")
    print("=" * 60)

    # Return final processed dataframe
    final_df = final_df.drop('_PassengerId_Original', axis=1, errors='ignore')

    return final_df

In [33]:
# 1. Load the raw dataset
df_train = pd.read_csv("https://raw.githubusercontent.com/stepthom/869_course/refs/heads/main/data/spaceship_titanic_train.csv")

# 2. Run the full feature engineering pipeline (ensure the function is defined)
df_processed_train = run_feature_engineering_pipeline(df_train)

# 3: Export the complete processed dataframe
df_processed_train.to_csv('train_dataset_spaceship_titanic_processed.csv', index=False)
print(f"   ✅ Complete dataset exported: train_dataset_spaceship_titanic_processed.csv")
print(f"   📊 Shape: {df_processed_train.shape}")

🚀 SPACESHIP TITANIC FEATURE ENGINEERING PIPELINE

🩹 STEP 1: MISSING VALUE IMPUTATION
----------------------------------------
   ✅ Age: Missing values filled with mean
   ✅ Spending columns: Imputed with most frequent values
   ✅ Categorical columns: Missing values marked as 'Missing'

⚡ STEP 2: NEW FEATURE CREATION
-----------------------------------
   ✅ SoloTraveler: Binary feature based on cabin occupancy
   ✅ TotalSpend: Sum of all amenity spending
   ✅ LuxurySpend: Spa + VRDeck spending
   ✅ BasicSpend: RoomService + FoodCourt spending
   ✅ Cabin interactions: Cabin prefix + passenger attributes

🤖 STEP 3: DATA TYPE OPTIMIZATION
--------------------------------------
   ✅ Categorical columns converted to 'category' dtype

🎯 STEP 4: FEATURE SELECTION
------------------------------
   ✅ Selected 18 features for ML
   ✅ Identified 8 categorical features

🔢 STEP 5: CATEGORICAL ENCODING
-----------------------------------
   ✅ 8 categorical features encoded to numeric
   ✅ All feature

In [34]:
# 1. Load the raw dataset for Test set
df_test = pd.read_csv("https://raw.githubusercontent.com/stepthom/869_course/refs/heads/main/data/spaceship_titanic_test.csv")

# 2. Run the full feature engineering pipeline (ensure the function is defined)
df_processed_test = run_feature_engineering_pipeline(df_test)

# 3: Export the complete processed dataframe
df_processed_test.to_csv('test_dataset_spaceship_titanic_processed.csv', index=False)
print(f"   ✅ Complete dataset exported: test_dataset_spaceship_titanic_processed.csv")
print(f"   📊 Shape: {df_processed_test.shape}")

🚀 SPACESHIP TITANIC FEATURE ENGINEERING PIPELINE

🩹 STEP 1: MISSING VALUE IMPUTATION
----------------------------------------
   ✅ Age: Missing values filled with mean
   ✅ Spending columns: Imputed with most frequent values
   ✅ Categorical columns: Missing values marked as 'Missing'

⚡ STEP 2: NEW FEATURE CREATION
-----------------------------------
   ✅ SoloTraveler: Binary feature based on cabin occupancy
   ✅ TotalSpend: Sum of all amenity spending
   ✅ LuxurySpend: Spa + VRDeck spending
   ✅ BasicSpend: RoomService + FoodCourt spending
   ✅ Cabin interactions: Cabin prefix + passenger attributes

🤖 STEP 3: DATA TYPE OPTIMIZATION
--------------------------------------
   ✅ Categorical columns converted to 'category' dtype

🎯 STEP 4: FEATURE SELECTION
------------------------------
   ✅ Selected 18 features for ML
   ✅ Identified 8 categorical features

🔢 STEP 5: CATEGORICAL ENCODING
-----------------------------------
   ✅ 8 categorical features encoded to numeric
   ✅ All feature

In [35]:
df_processed_test.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,SoloTraveler,TotalSpend,LuxurySpend,BasicSpend,Cabin_HomePlanet,Cabin_Destination,Cabin_CryoSleep,PassengerId
0,0,2,3,27.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,16,25,20,0013_01
1,0,0,3,19.0,0,0.0,9.0,0.0,2823.0,0.0,1,2832.0,2823.0,9.0,13,21,15,0018_01
2,1,2,0,31.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,4,7,8,0019_01
3,1,0,3,38.0,0,0.0,6652.0,0.0,181.0,585.0,1,7418.0,766.0,6652.0,4,10,6,0021_01
4,0,0,3,20.0,0,10.0,0.0,635.0,0.0,0.0,1,645.0,0.0,10.0,13,21,15,0023_01
