In [2]:
import pandas as pd

# Read the training split and the test split from the local data directory.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))


In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appends a stray "None" line after the report.
train_df.info()
# Summary statistics of the numeric columns.
print(train_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
None
               Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  8514.000000   8512.000000   8510.000000   8485.00000

In [4]:
# Per-column share of missing entries, expressed as a percentage of all rows.
missing_percentage = train_df.isna().sum().div(train_df.shape[0]).mul(100)
print(missing_percentage)


PassengerId     0.000000
HomePlanet      2.312205
CryoSleep       2.496261
Cabin           2.289198
Destination     2.093639
Age             2.059128
VIP             2.335212
RoomService     2.082135
FoodCourt       2.105142
ShoppingMall    2.392730
Spa             2.105142
VRDeck          2.162660
Name            2.300702
Transported     0.000000
dtype: float64


In [5]:
# Preview the first five raw rows.
train_df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
def extract_group_from_passenger_id(df):
    """Add an integer ``Group`` column derived from ``PassengerId``, in place.

    The group is the part of ``PassengerId`` before the first underscore,
    converted to ``int``. ``PassengerId`` itself is removed only when the frame
    also has a ``Transported`` column; otherwise it is left in place.

    Args:
        df: DataFrame with a string ``PassengerId`` column; modified in place.

    Returns:
        None.
    """
    def _group_number(passenger_id):
        group_part = passenger_id.split('_')[0]
        return int(group_part)

    df['Group'] = df['PassengerId'].map(_group_number)

    # Frames without the label keep the id column (it is not dropped here).
    has_label = 'Transported' in df.columns
    if has_label:
        df.drop(columns=['PassengerId'], inplace=True)
    
def convert_boolean_to_int(df):
    """Impute and integer-encode the boolean flag columns of ``df`` in place.

    Missing ``CryoSleep`` and ``VIP`` entries are replaced by the most frequent
    value of the respective column, then both columns are cast to 0/1 integers.
    ``Transported`` is cast to integers as well when the column is present.

    Args:
        df: DataFrame with ``CryoSleep`` and ``VIP`` columns; modified in place.

    Returns:
        None.
    """
    for flag_col in ('CryoSleep', 'VIP'):
        observed_modes = df[flag_col].mode()
        # A column with no observed values has no mode; fall back to False
        # instead of failing on the lookup of the first mode.
        fill_value = observed_modes.iloc[0] if not observed_modes.empty else False
        # Assign the filled Series back to the frame. Calling
        # `df[col].fillna(..., inplace=True)` operates on a temporary object
        # under pandas Copy-on-Write and would leave `df` unchanged.
        df[flag_col] = df[flag_col].fillna(fill_value).astype(int)

    # The label only exists in the training data.
    if 'Transported' in df.columns:
        df['Transported'] = df['Transported'].astype(int)

def extract_cabin_information(df):
    """Split ``Cabin`` into ``CabinDeck``, ``CabinNum`` and ``CabinSide`` in place.

    Missing cabins are treated as ``'Unknown/0/Unknown'`` before splitting on
    ``'/'``. The three parts are stored as strings in new columns and the
    original ``Cabin`` column is removed.

    Args:
        df: DataFrame with a ``Cabin`` column; modified in place.

    Returns:
        None.
    """
    cabin_text = df['Cabin'].fillna('Unknown/0/Unknown').astype(str)
    cabin_parts = cabin_text.str.split('/', expand=True)

    part_names = ['CabinDeck', 'CabinNum', 'CabinSide']
    df[part_names] = cabin_parts

    # The combined column is redundant once its parts are stored separately.
    df.drop(columns=['Cabin'], inplace=True)

def handle_missing_values(df):
    """Fill missing numeric and categorical values of ``df`` in place.

    ``Age`` and the five spending columns are filled with the median of the
    respective column, computed from ``df`` itself. ``HomePlanet`` and
    ``Destination`` are filled with the literal ``'Unknown'``.

    Args:
        df: DataFrame containing the columns listed above; modified in place.

    Returns:
        None.
    """
    median_filled_cols = ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
    for col in median_filled_cols:
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place call acts on a temporary object under pandas
        # Copy-on-Write and would leave `df` unchanged.
        df[col] = df[col].fillna(df[col].median())

    for col in ('HomePlanet', 'Destination'):
        df[col] = df[col].fillna('Unknown')

def drop_name_column(df):
    """Remove the ``Name`` column from ``df`` in place; returns None."""
    df.drop(columns=['Name'], inplace=True)

In [7]:
# Work on a copy so the raw training frame stays untouched.
train_df_processed = train_df.copy()

# Every step mutates the frame in place; they are applied in this fixed order.
for preprocessing_step in (
    extract_group_from_passenger_id,
    convert_boolean_to_int,
    extract_cabin_information,
    handle_missing_values,
    drop_name_column,
):
    preprocessing_step(train_df_processed)

In [8]:
# Re-check the share of missing entries per column after preprocessing.
missing_percentage = (
    train_df_processed.isna().sum().div(train_df_processed.shape[0]).mul(100)
)
print(missing_percentage)


HomePlanet      0.0
CryoSleep       0.0
Destination     0.0
Age             0.0
VIP             0.0
RoomService     0.0
FoodCourt       0.0
ShoppingMall    0.0
Spa             0.0
VRDeck          0.0
Transported     0.0
Group           0.0
CabinDeck       0.0
CabinNum        0.0
CabinSide       0.0
dtype: float64


In [9]:
# Preview the first five preprocessed rows.
train_df_processed.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,CabinDeck,CabinNum,CabinSide
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,F,1,S


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# The label is kept as 0/1 integers; every other processed column is a feature.
y = train_df_processed['Transported'].astype(int)
X = train_df_processed.drop('Transported', axis=1)

# Columns handed to the one-hot encoder; all remaining columns pass through unchanged.
categorical_cols = [
    'HomePlanet', 'CryoSleep', 'CabinDeck', 'CabinNum', 'CabinSide', 'Destination', 'VIP',
]

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocesser = ColumnTransformer(
    transformers=[('cat', encoder, categorical_cols)],
    remainder='passthrough',
)

# Fit the encoder on the training split only, then apply it to the validation split.
X_train_encoded = preprocesser.fit_transform(X_train)
X_val_encoded = preprocesser.transform(X_val)

# Standardize using statistics of the encoded training split.
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_val_scaled = scaler.transform(X_val_encoded)

model = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)

y_pred = model.predict(X_val_scaled)
print('Accuracy:', accuracy_score(y_val, y_pred))

Accuracy: 0.7349051178838413


In [11]:
# Work on a copy of the test frame so the raw data stays untouched.
test_df_processed = test_df.copy()

# Same in-place preprocessing steps, in the same order, as for the training frame.
for preprocessing_step in (
    extract_group_from_passenger_id,
    convert_boolean_to_int,
    extract_cabin_information,
    handle_missing_values,
    drop_name_column,
):
    preprocessing_step(test_df_processed)

In [12]:
# Keep the ids aside for the submission file; the model only sees the feature columns.
X_test_passenger_id = test_df_processed['PassengerId'].copy()
X_test = test_df_processed.drop(columns=['PassengerId'])

categorical_cols = [
    'HomePlanet', 'CryoSleep', 'CabinDeck', 'CabinNum', 'CabinSide', 'Destination', 'VIP',
]

# Reuse the already-fitted encoder and scaler; nothing is refit on the test data.
X_test_encoded = preprocesser.transform(X_test)
X_test_scaled = scaler.transform(X_test_encoded)

y_pred = model.predict(X_test_scaled)

# Pair each id with its prediction and turn the 0/1 labels back into booleans.
submission = pd.DataFrame(
    {'PassengerId': X_test_passenger_id, 'Transported': y_pred}
).assign(Transported=lambda frame: frame['Transported'].map({0: False, 1: True}))

# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Fresh one-hot encoder for the tree model; non-categorical columns pass through unchanged.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocesser = ColumnTransformer(
    transformers=[('cat', encoder, categorical_cols)],
    remainder='passthrough',
)

# Fit on the training split only, then apply to the validation split.
X_train_encoded = preprocesser.fit_transform(X_train)
X_val_encoded = preprocesser.transform(X_val)

# Baseline random forest with a fixed seed.
model = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=200)
model.fit(X_train_encoded, y_train)


In [14]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates for the random forest (3 * 5 * 2 * 2 = 60 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, 25, None],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    max_features=['sqrt'],
    bootstrap=[True],
)

# Search over the grid with 3-fold cross-validation on the encoded training split.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    verbose=2,
)
grid_search.fit(X_train_encoded, y_train)

print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total

In [15]:
# Random forest with fixed hyperparameters. An earlier assignment to `model` that was
# immediately overwritten has been removed as dead code.
# NOTE(review): these values are presumably copied from the grid search above;
# confirm against grid_search.best_params_.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
    max_features='sqrt',
    bootstrap=True,
)
model.fit(X_train_encoded, y_train)

In [16]:
# Score the fitted model on the encoded validation split.
y_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation Accuracy:", accuracy)


Validation Accuracy: 0.8016101207590569


In [17]:
# Encode the test features with the `preprocesser` currently in scope instead of
# reusing an array produced earlier by a different, separately fitted encoder
# instance; this keeps the test encoding tied to the transformer in use now.
X_test_encoded = preprocesser.transform(X_test)
y_pred = model.predict(X_test_encoded)

In [18]:
# Pair each id with its prediction and turn the 0/1 labels back into booleans.
submission = pd.DataFrame(
    {'PassengerId': X_test_passenger_id, 'Transported': y_pred}
).assign(Transported=lambda frame: frame['Transported'].map({0: False, 1: True}))

# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)