In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
def preprocess_df(df):
    df = df.copy()
    room_bucket_size = 200
    age_bucket_size = 5
    # df.Transported = df.Transported.astype(int)
    df["Deck"] = df.Cabin.str.split("/").str[0]
    df["Room"] = pd.to_numeric(df.Cabin.str.split("/").str[1], errors="coerce")
    df["Room-Bucket"] = (df["Room"] // room_bucket_size).astype("Int64")
    df["Side"] = df.Cabin.str.split("/").str[2]
    df["Age-Bucket"] = (df["Age"] // age_bucket_size).astype("Int64")
    df["Under-5"] = df["Age"] < 5
    return df

In [None]:
train_df = preprocess_df(pd.read_csv("./data/train.csv"))
submission_df = preprocess_df(pd.read_csv("./data/test.csv"))

In [4]:
train_df.groupby(["HomePlanet"])["Transported"].describe()

Unnamed: 0_level_0,count,unique,top,freq
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Earth,4602,2,False,2651
Europa,2131,2,True,1404
Mars,1759,2,True,920


In [5]:
train_df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported', 'Deck', 'Room', 'Room-Bucket', 'Side',
       'Age-Bucket', 'Under-5'],
      dtype='object')

In [25]:
# y = train_df["Transported"]

train_df, test_df = train_test_split(train_df, test_size=0.0005, random_state=42)
features = ["CryoSleep", "Destination", "HomePlanet", "Deck", "Side", "Under-5", "VIP", ]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), features),
    ]
)
X_encoded = preprocessor.fit_transform(train_df[features])
X_test_encoded = preprocessor.transform(test_df[features])

# Convert to DataFrame to use .head() method


In [26]:
model = RandomForestClassifier(
    n_estimators=100, max_depth=6, random_state=1, bootstrap=True
)
model.fit(X_encoded, train_df["Transported"])

predictions = model.predict(X_encoded)
accuracy = accuracy_score(train_df["Transported"], predictions)

predictions_test = model.predict(X_test_encoded)
accuracy_test = accuracy_score(test_df["Transported"], predictions_test)
print(accuracy)
print(accuracy_test)

0.7433547204399633
1.0


In [28]:
X_submission_encoded = preprocessor.transform(submission_df[features])
predictions = model.predict(X_submission_encoded)
output = pd.DataFrame(
    {"PassengerId": submission_df.PassengerId, "Transported": predictions}
)
output.to_csv("submission-spacehip-titanic.csv", index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
