In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [24]:
# Load the dataset
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [25]:
def custom_transform(train_values, test_values):
    le = LabelEncoder()
    le.fit(train_values)
    train_values = le.transform(train_values)

    test_mapping = {label: index for index, label in enumerate(le.classes_)}
    max_value = max(test_mapping.values())

    test_values = test_values.map(lambda x: test_mapping.get(x, max_value + 1))

    return train_values, test_values

In [26]:
# Perform EDA and preprocessing
train['HomePlanet'], test['HomePlanet'] = custom_transform(train['HomePlanet'], test['HomePlanet'])
train['Cabin'], test['Cabin'] = custom_transform(train['Cabin'], test['Cabin'])
train['Destination'], test['Destination'] = custom_transform(train['Destination'], test['Destination'])

In [27]:
# Drop unnecessary columns
train = train.drop(['PassengerId', 'Name'], axis=1)
test = test.drop(['Name'], axis=1)


In [29]:
# Split the dataset into training and validation sets
X = train.drop('Transported', axis=1)
y = train['Transported']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
print(train.isnull().sum())
print(test.isnull().sum())


HomePlanet        0
CryoSleep       217
Cabin             0
Destination       0
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64
PassengerId       0
HomePlanet        0
CryoSleep        93
Cabin             0
Destination       0
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64


In [31]:
# Fill missing values with the mean of the column
train.fillna(train.mean(), inplace=True)
test.fillna(train.mean(), inplace=True)  # Use the mean values from the train dataset to avoid data leakage

In [32]:
print(train.isnull().sum())
print(test.isnull().sum())


HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64
PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64


In [33]:
# Split the dataset into training and validation sets
X = train.drop('Transported', axis=1)
y = train['Transported']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Choose a suitable machine learning algorithm
clf = RandomForestClassifier(random_state=42)

In [35]:
# Train the model
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [36]:
# Evaluate the model
y_pred = clf.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.7745830937320299
              precision    recall  f1-score   support

       False       0.76      0.79      0.78       861
        True       0.79      0.76      0.77       878

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739



In [37]:
# Make predictions on the test set
test_ids = test['PassengerId']
test = test.drop(['PassengerId'], axis=1)
predictions = clf.predict(test)

In [39]:
# Create a submission file
submission = pd.DataFrame({'PassengerId': test_ids, 'Transported': predictions})
submission.to_csv('submission.csv', index=False)