# In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder

# Silence every warning for the rest of the run.
warnings.filterwarnings("ignore")


# Locations of the labelled training CSV and the unlabelled test CSV.
TRAIN_FILE = "atlantis_citizens_final.csv"
TEST_FILE = "test_atlantis_hidden.csv"

print("Reading datasets...")
# Read both files in order: training frame first, then the test frame.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE)
)

print("Train size:", df_train.shape)
print("Test size :", df_test.shape)


def build_features(data):
    """Return a copy of *data* with engineered columns added and ID columns removed.

    Each feature group is derived only when its source columns are present,
    so the same function can be applied to frames with different columns.

    Derived columns:
        Wealth_Density: Wealth_Index / (House_Size_sq_ft + 1).
        Wealth_Log: log1p(Wealth_Index).
        Commutes: 1 when District_Name differs from Work_District, else 0.
        Transport_Score: numeric score looked up from Vehicle_Owned;
            unmapped or missing vehicles score 0.
        Hash_First, Hash_Last, Hash_Size: first character, last character
            and length of Bio_Hash rendered as text; all three stay missing
            on rows where Bio_Hash is missing.

    Args:
        data: pandas DataFrame of raw records. It is not modified.

    Returns:
        A new DataFrame with the derived columns and without
        ``Citizen_ID`` and ``Bio_Hash``.
    """
    temp = data.copy()

    if {"Wealth_Index", "House_Size_sq_ft"}.issubset(temp.columns):
        # +1 keeps the ratio finite for a house size of zero.
        temp["Wealth_Density"] = temp["Wealth_Index"] / (temp["House_Size_sq_ft"] + 1)
        temp["Wealth_Log"] = np.log1p(temp["Wealth_Index"])

    if {"District_Name", "Work_District"}.issubset(temp.columns):
        # NOTE: a missing district on either side compares as "different".
        temp["Commutes"] = (temp["District_Name"] != temp["Work_District"]).astype(int)

    if "Vehicle_Owned" in temp.columns:
        score_map = {
            "No Vehicle": 0,
            "Fin Bicycle": 1,
            "Sea Scooter": 2,
            "Submarine": 4,
            "Royal Submarine": 8,
        }
        temp["Transport_Score"] = temp["Vehicle_Owned"].map(score_map).fillna(0)

    if "Bio_Hash" in temp.columns:
        # astype(str) renders a missing value as the literal "nan"/"None".
        # Mask those rows back to missing so they do not receive a made-up
        # first/last character and length; the imputers can then treat them
        # as genuinely missing.
        hash_text = temp["Bio_Hash"].astype(str).where(temp["Bio_Hash"].notna())
        temp["Hash_First"] = hash_text.str[:1]
        temp["Hash_Last"] = hash_text.str[-1:]
        temp["Hash_Size"] = hash_text.str.len()

    # Identifier columns carry no signal once the hash features are extracted.
    return temp.drop(columns=["Citizen_ID", "Bio_Hash"], errors="ignore")


# Pull the target column out, then run the remaining training columns and the
# test frame through the same feature engineering.
y_train = df_train["Occupation"]
X_train = build_features(df_train.drop(columns=["Occupation"]))
X_test = build_features(df_test)


# Columns routed to the numeric branch of the preprocessor.
num_feats = [
    "House_Size_sq_ft",
    "Life_Expectancy",
    "Wealth_Density",
    "Wealth_Log",
    "Transport_Score",
    "Hash_Size",
    # 0/1 flag produced by build_features(). It has to be listed here:
    # ColumnTransformer drops any column that is not named in one of its
    # transformers (remainder defaults to "drop").
    "Commutes",
]

# Columns routed to the categorical branch of the preprocessor.
cat_feats = [
    "Diet_Type",
    "District_Name",
    "Work_District",
    "Vehicle_Owned",
    "Hash_First",
    "Hash_Last",
]

# Keep only the columns that actually exist in the engineered training frame,
# so a missing source column does not break the ColumnTransformer.
num_feats = [f for f in num_feats if f in X_train.columns]
cat_feats = [f for f in cat_feats if f in X_train.columns]


# Numeric branch: fill gaps with the column median, then standardise.
num_pipe = Pipeline(
    steps=[
        ("fill", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline(
    steps=[
        ("fill", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own column list.
processor = ColumnTransformer(
    transformers=[
        ("nums", num_pipe, num_feats),
        ("cats", cat_pipe, cat_feats),
    ]
)


# Two tree ensembles, both seeded for reproducibility.
rf = RandomForestClassifier(
    n_estimators=250, max_depth=18, n_jobs=-1, random_state=42
)
gb = GradientBoostingClassifier(
    n_estimators=180, max_depth=5, learning_rate=0.08, random_state=42
)

# Soft voting combines the two models' predicted class probabilities.
voter = VotingClassifier(
    voting="soft",
    estimators=[("forest", rf), ("boost", gb)],
)

# Full pipeline: preprocessing followed by the voting ensemble.
pipeline = Pipeline(steps=[("features", processor), ("model", voter)])


# Encode the occupation labels as integers for training.
encoder = LabelEncoder().fit(y_train)
y_encoded = encoder.transform(y_train)

print("Training model...")
pipeline.fit(X_train, y_encoded)

print("Generating predictions...")
# Predict integer classes, then map them back to the original label names.
pred_indices = pipeline.predict(X_test)
pred_labels = encoder.inverse_transform(pred_indices)


# Submission codes for each occupation: the position in this tuple is the code.
occupation_to_code = {
    name: code
    for code, name in enumerate(
        ("Warrior", "Merchant", "Fisher", "Miner", "Scribe")
    )
}

# Translate every predicted label into its submission code.
final_predictions = [occupation_to_code[label] for label in pred_labels]

# One row per test citizen: the ID followed by the predicted occupation code.
submission = df_test[["Citizen_ID"]].copy()
submission["Occupation"] = final_predictions

submission.to_csv("final_submission.csv", index=False)

print("final_submission.csv saved successfully")

# Output:
# Reading datasets...
# Train size: (15751, 10)
# Test size : (3938, 9)
# Training model...
# Generating predictions...
# final_submission.csv saved successfully
