In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Notebook-wide settings: silence every warning and let pandas show all columns.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

# Input CSV locations (relative to the working directory).
train_path = 'atlantis_citizens_final.csv'
test_path = 'test_atlantis_hidden.csv'

print(f"Loading data from: {train_path} and {test_path}...")
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Quick sanity check on the size of both tables.
print(
    f"Train Shape: {train_df.shape}",
    f"Test Shape:  {test_df.shape}",
    sep="\n",
)

def engineer_features(df_in):
    """
    Applies the specific 'Rules of Thumb' discovered during EDA.

    Works on a copy, so ``df_in`` itself is never modified. Each derived
    column is only added when the raw column(s) it needs are present:

    * ``Log_Wealth``      - ``log1p`` of ``Wealth_Index``.
    * ``Wealth_Per_SqFt`` - ``Wealth_Index / (House_Size_sq_ft + 1)``.
    * ``Is_Commuter``     - 1 when ``District_Name`` differs from
      ``Work_District``, otherwise 0.
    * ``Vehicle_Score``   - ordinal score looked up from ``Vehicle_Owned``;
      vehicles absent from the lookup (including missing values) score 0.
    * ``Hash_Start`` / ``Hash_End`` / ``Hash_Len`` - first character, last
      character and length of ``Bio_Hash`` rendered as text. Rows whose
      ``Bio_Hash`` is missing get NaN in all three columns.

    ``Citizen_ID`` and ``Bio_Hash`` are removed from the result when present.

    Args:
        df_in: pandas DataFrame holding the raw columns.

    Returns:
        A new DataFrame with the derived columns added and the identifier
        columns dropped.
    """
    df = df_in.copy()

    if 'Wealth_Index' in df.columns:
        df['Log_Wealth'] = np.log1p(df['Wealth_Index'])

    if 'Wealth_Index' in df.columns and 'House_Size_sq_ft' in df.columns:
        # The +1 avoids a division by zero when the house size is 0.
        df['Wealth_Per_SqFt'] = df['Wealth_Index'] / (df['House_Size_sq_ft'] + 1)

    if 'District_Name' in df.columns and 'Work_District' in df.columns:
        # NOTE: NaN never compares equal, so a row with a missing district on
        # either side is flagged as a commuter.
        df['Is_Commuter'] = (df['District_Name'] != df['Work_District']).astype(int)

    if 'Vehicle_Owned' in df.columns:
        vehicle_map = {
            'No Vehicle': 0,
            'Fin Bicycle': 1,
            'Sea Scooter': 2,
            'Submarine': 3,
            'Royal Submarine': 10,  # High jump to emphasize status
        }
        df['Vehicle_Score'] = df['Vehicle_Owned'].map(vehicle_map).fillna(0)

    if 'Bio_Hash' in df.columns:
        # astype(str) would turn a missing hash into the literal text
        # "nan"/"None" and fabricate features from it, so remember which rows
        # are missing and blank their derived values back out to NaN.
        hash_missing = df['Bio_Hash'].isna()
        hash_text = df['Bio_Hash'].astype(str)
        df['Hash_Start'] = hash_text.str[0].mask(hash_missing)
        df['Hash_End'] = hash_text.str[-1].mask(hash_missing)
        df['Hash_Len'] = hash_text.str.len().mask(hash_missing)

    # Identifier columns carry no signal of their own once the hash features
    # are extracted; errors='ignore' tolerates frames that lack them.
    drop_cols = ['Citizen_ID', 'Bio_Hash']
    df = df.drop(columns=drop_cols, errors='ignore')

    return df

print("Applying Feature Engineering...")
# Split the target off before engineering so 'Occupation' never reaches the
# feature matrix; the hidden test set goes through the same transformation.
X = engineer_features(train_df.drop('Occupation', axis=1))
y = train_df['Occupation']
X_test = engineer_features(test_df)

# Columns handed to the numeric / categorical preprocessing branches.
# 'Is_Commuter' is listed here because a column that appears in neither list
# is not passed on to the model at all.
numeric_features = [
    'House_Size_sq_ft',
    'Life_Expectancy',
    'Wealth_Per_SqFt',
    'Log_Wealth',
    'Vehicle_Score',
    'Hash_Len',
    'Is_Commuter',
]
categorical_features = ['Diet_Type', 'District_Name', 'Work_District', 'Vehicle_Owned', 'Hash_Start', 'Hash_End']

# Keep only the columns that actually exist after feature engineering.
numeric_features = [c for c in numeric_features if c in X.columns]
categorical_features = [c for c in categorical_features if c in X.columns]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps unseen categories from raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list through its matching branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Base learners, both seeded for reproducible runs.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=20, n_jobs=-1, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42)

# Soft voting combines the two learners through their predicted probabilities.
ensemble = VotingClassifier([('rf', rf_model), ('gb', gb_model)], voting='soft')

# Full model: preprocessing followed by the voting ensemble.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ensemble),
])

# Encode the occupation labels as integers for training; the fitted encoder
# is kept so predictions can be turned back into label strings.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

print("Training Ensemble Model... (This handles the heavy lifting)")
model_pipeline.fit(X, y_encoded)

print("Predicting on Hidden Test Set...")
# Predictions come back as encoded integers; decode them to the original labels.
test_preds_idx = model_pipeline.predict(X_test)
test_preds_str = le.inverse_transform(test_preds_idx)

# Numeric code written to the submission file for each occupation label.
# A predicted label that is missing from this table raises KeyError below.
custom_mapping = {
    'Warrior': 0,
    'Merchant': 1,
    'Fisher': 2,
    'Miner': 3,
    'Scribe': 4,
}

final_values = [custom_mapping[label] for label in test_preds_str]

# One row per test citizen: its ID and the numeric occupation code.
submission = pd.DataFrame({
    'Citizen_ID': test_df['Citizen_ID'],
    'Occupation': final_values
})

# Write to the configured file name; previously this variable was unused and
# the file went to a hard-coded, extension-less path instead.
submission_filename = 'submission_final.csv'
submission.to_csv(submission_filename, index=False)

Loading data from: atlantis_citizens_final.csv and test_atlantis_hidden.csv...
Train Shape: (15751, 10)
Test Shape:  (3938, 9)
Applying Feature Engineering...
Training Ensemble Model... (This handles the heavy lifting)
Predicting on Hidden Test Set...
