In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
train_data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [3]:
y = train_data.Transported
X = train_data.drop(['Transported'],axis = 1)


In [4]:
from sklearn.model_selection import train_test_split
X_train_full, X_valid_full, y_train, y_valid = train_test_split( X, y, train_size=0.8, test_size= 0.2, random_state = 0)

In [5]:
X_train_full.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
4278,4558_01,Europa,False,C/167/S,55 Cancri e,54.0,False,0.0,559.0,0.0,15238.0,2799.0,Wezna Baleful
5971,6326_01,Earth,False,F/1307/P,TRAPPIST-1e,20.0,False,0.0,20.0,1.0,696.0,0.0,Therek Hinetthews
464,0503_02,Mars,False,F/90/S,TRAPPIST-1e,43.0,False,1821.0,0.0,47.0,29.0,0.0,Torms Fone
4475,4757_01,Earth,False,F/896/S,TRAPPIST-1e,24.0,False,185.0,0.0,476.0,1810.0,53.0,Tanley Mirandry
8469,9046_01,Europa,True,C/335/S,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Alphah Cratrave


In [6]:
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [col for col in X_train_full.columns if X_train_full[col].nunique()<10 and X_train_full[col].dtype == 'object']

In [7]:
print(categorical_cols)

['HomePlanet', 'CryoSleep', 'Destination', 'VIP']


In [8]:
numerical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64','float64']]
numerical_cols

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [9]:
# Keep selected columns only
my_cols = categorical_cols + numerical_cols
len(my_cols)

10

In [10]:
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [12]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators = 100, random_state=0, n_jobs=4)

In [13]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)




In [14]:
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)


MAE: 0.28618376798718137


In [15]:
predictions = my_pipeline.predict(X_test)

In [16]:
predictions

array([0.5765455 , 0.01      , 1.        , ..., 0.99176876, 0.83      ,
       0.05083333])

In [17]:
l_transport = []
for i in predictions:
    if i >= 1:
        l_transport.append(True)
    else:
        l_transport.append(False)

In [18]:
output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Transported': l_transport})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [19]:
output

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False
