In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

import string
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)
all = concat_df(train, test)
sample_submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [3]:
all[['Deck','Num','Side']] = all['Cabin'].str.split("/",expand=True)
all['Name'].fillna('John Doe', inplace=True)
all['Surname'] = all['Name'].str.split().str[-1]
all['FamilySize'] = all.groupby('Surname')['Surname'].transform('size')
all[['Group','Group_No']] = all['PassengerId'].str.split("_",expand=True)
all.drop(columns=['Name','PassengerId','Cabin'],inplace=True)

In [4]:
columns_median = ["ShoppingMall","Spa","VRDeck","FoodCourt","RoomService"]
for col in columns_median:
    all[col].fillna(all[col].median(),inplace=True)
all['TotalSpend'] = all['RoomService'] + all["ShoppingMall"] + all["Spa"] + all["VRDeck"] + all["FoodCourt"]
all['CryoSleep'] = np.where((all['TotalSpend'] > 0) & (all['CryoSleep'].isna()), False, all['CryoSleep'])
all['CryoSleep'] = np.where((all['TotalSpend'] <= 0) & (all['CryoSleep'].isna()), True, all['CryoSleep'])
all['Age'].fillna(all['Age'].median(),inplace=True)
all['VIP'] = all['VIP'].fillna(False).infer_objects(copy=False)
all['VIP'] = all['VIP'].astype(str)
all.dtypes

Age             float64
CryoSleep        object
Destination      object
FoodCourt       float64
HomePlanet       object
RoomService     float64
ShoppingMall    float64
Spa             float64
Transported      object
VIP              object
VRDeck          float64
Deck             object
Num              object
Side             object
Surname          object
FamilySize        int64
Group            object
Group_No         object
TotalSpend      float64
dtype: object

In [5]:
all['Surname'] = all['Surname'].replace('John Doe', np.nan)


mask_familysize_gt_1 = all['FamilySize'] > 1
all.loc[all['Destination'].isna() & mask_familysize_gt_1, 'Destination'] = all.groupby('Surname')['Destination'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
all.loc[all['Destination'].isna() & ~mask_familysize_gt_1, 'Destination'] = 'TRAPPIST-1e'
all['VIP'].value_counts()

VIP
False    12697
True       273
Name: count, dtype: int64

In [6]:
num_mode = all['Num'].mode()[0]
all['Num'] = all['Num'].fillna(num_mode)
all['Num'] = all['Num'].astype('int64')

all.loc[all['Deck'].isna() & mask_familysize_gt_1, 'Deck'] = all.groupby('Surname')['Deck'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
all.loc[all['Deck'].isna() & ~mask_familysize_gt_1, 'Deck'] = all['Deck'].mode()[0]

all.loc[all['Side'].isna() & mask_familysize_gt_1, 'Side'] = all.groupby('Surname')['Side'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
all.loc[all['Side'].isna() & ~mask_familysize_gt_1, 'Side'] = all['Side'].mode()[0]

hp_mode = all['HomePlanet'].mode()[0]
all.loc[all['HomePlanet'].isna() & mask_familysize_gt_1, 'HomePlanet'] = all.groupby('Surname')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
all.loc[all['HomePlanet'].isna() & ~mask_familysize_gt_1, 'HomePlanet'] = all['HomePlanet'].mode()[0]
all['HomePlanet'].fillna('Earth', inplace=True)

In [7]:
train = all.loc[:8692]
test = all.loc[8693:]

In [8]:
train.sample(5)

Unnamed: 0,Age,CryoSleep,Destination,FoodCourt,HomePlanet,RoomService,ShoppingMall,Spa,Transported,VIP,VRDeck,Deck,Num,Side,Surname,FamilySize,Group,Group_No,TotalSpend
6399,30.0,False,PSO J318.5-22,0.0,Mars,51.0,1373.0,0.0,True,False,5.0,F,1404,P,Pri,4,6760,1,1429.0
8198,24.0,True,TRAPPIST-1e,0.0,Mars,0.0,0.0,0.0,True,False,0.0,F,1807,P,Mine,1,8766,1,0.0
6993,45.0,True,55 Cancri e,0.0,Europa,0.0,0.0,0.0,True,False,0.0,B,279,S,Disgul,5,7437,1,0.0
762,27.0,False,55 Cancri e,3462.0,Europa,0.0,5.0,1590.0,False,False,4.0,A,6,P,Presstic,5,799,1,5061.0
2546,16.0,True,TRAPPIST-1e,0.0,Earth,0.0,0.0,0.0,True,False,0.0,G,433,S,Douglasen,8,2730,3,0.0


In [9]:
train['VIP'].value_counts()

VIP
False    8494
True      199
Name: count, dtype: int64

In [10]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
dfs = [train, test]
for df in dfs:
    scaler = StandardScaler()
    df[['Age', 'TotalSpend', 'Num']] = scaler.fit_transform(df[['Age', 'TotalSpend', 'Num']])


    family_size_mode = df.loc[df['FamilySize'] != 294, 'FamilySize'].mode()[0]
    df['FamilySize'] = df['FamilySize'].replace(294, family_size_mode)


    df.drop(columns=['Surname','Group'], inplace=True)


    le = LabelEncoder()
    df['Group_No'] = le.fit_transform(df['Group_No'])
    df['Deck'] = le.fit_transform(df['Deck'])


train_encoded = pd.get_dummies(train, columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side'], drop_first=True)
test_encoded = pd.get_dummies(test, columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side'], drop_first=True)

In [11]:
train_encoded.sample(10)

Unnamed: 0,Age,FoodCourt,RoomService,ShoppingMall,Spa,Transported,VRDeck,Deck,Num,FamilySize,Group_No,TotalSpend,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,Side_S
734,-0.264305,0.0,0.0,0.0,0.0,False,0.0,6,-0.930939,7,0,-0.514066,False,False,True,False,True,False,False
5375,0.921142,16.0,0.0,0.0,144.0,False,7492.0,0,-1.048161,3,0,2.215979,True,False,False,False,True,False,False
6283,-0.055109,0.0,0.0,0.0,0.0,True,0.0,6,0.971962,6,0,-0.514066,False,False,True,False,False,False,True
6273,-0.194573,0.0,0.0,0.0,0.0,True,0.0,2,-0.66719,6,0,-0.514066,True,False,True,False,True,False,True
3079,1.897393,335.0,0.0,532.0,15.0,False,26.0,6,-0.092803,11,0,-0.190114,False,False,False,False,True,False,False
4652,-0.334037,1018.0,3.0,687.0,50.0,True,0.0,6,0.428834,9,0,0.113145,False,False,False,False,True,False,True
8242,0.22382,0.0,81.0,1363.0,0.0,True,0.0,5,2.187162,8,0,0.001118,False,True,False,False,True,False,True
3374,-0.055109,0.0,0.0,0.0,0.0,True,0.0,5,0.321381,6,0,-0.514066,False,True,True,False,True,False,False
6527,-2.00761,0.0,0.0,0.0,0.0,True,0.0,5,1.427174,9,1,-0.514066,False,True,False,False,True,False,True
7351,0.22382,0.0,0.0,0.0,0.0,True,0.0,2,-0.57732,5,1,-0.514066,True,False,True,False,True,False,True


In [12]:
test_encoded.dtypes

Age                          float64
FoodCourt                    float64
RoomService                  float64
ShoppingMall                 float64
Spa                          float64
Transported                   object
VRDeck                       float64
Deck                           int32
Num                          float64
FamilySize                     int64
Group_No                       int32
TotalSpend                   float64
HomePlanet_Europa               bool
HomePlanet_Mars                 bool
CryoSleep_True                  bool
Destination_PSO J318.5-22       bool
Destination_TRAPPIST-1e         bool
VIP_True                        bool
Side_S                          bool
dtype: object

In [13]:
# Prepare features and target
X = train_encoded.drop(['Transported'], axis=1)
y = train_encoded['Transported'].astype(int)
X_test = test_encoded.drop(['Transported'], axis=1)

In [14]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
def model_training(model):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print("Accuracy:", accuracy_score(y_val, preds))
    print("Cross validation score:", cross_val_score(model, X, y, cv=5))
    print("Cross validation score mean:", np.mean(cross_val_score(model, X, y, cv=5)))
    print("\nClassification Report:")
    print(classification_report(y_val, preds))

In [15]:
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=100, learning_rate=0.1)
model_training(model)

Accuracy: 0.8039102932719954
Cross validation score: [0.76883266 0.75790684 0.7906843  0.82853855 0.78250863]
Cross validation score mean: 0.7856941974905884

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       861
           1       0.80      0.82      0.81       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



In [16]:
model = RandomForestClassifier(criterion='gini',
                                           n_estimators=1750,
                                           max_depth=7,
                                           min_samples_split=6,
                                           min_samples_leaf=6,
                                           max_features='auto',
                                           oob_score=True,
                                           random_state=42,
                                           n_jobs=-1,
                                           verbose=1)
model.fit(X, y)

NameError: name 'RandomForestClassifier' is not defined

In [None]:
predictions = model.predict(X_test)
predictions

In [None]:
sample_submission['Transported'] = predictions
sample_submission['Transported'] = sample_submission['Transported'].map({1: True, 0: False})
sample_submission.to_csv("submission.csv", index=False)