In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Plotting libraries
import seaborn as sns

# ML libraries
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# Disable warnings
# NOTE(review): called with only the "ignore" action, this filter silences every
# warning category for the rest of the notebook, including library deprecation
# notices -- consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

## Import data

In [None]:
# Folder holding the competition CSV files
input_folder = '/kaggle/input/spaceship-titanic/'

# Load the training and test sets
train_data = pd.read_csv(input_folder + 'train.csv')
test_data = pd.read_csv(input_folder + 'test.csv')

# Sort the training columns into the two feature groups used by the preprocessor:
#   - categorical: object-typed columns with fewer than 4 distinct values
#   - numerical:   columns stored as int64 or float64
categorical_cols = []
numerical_cols = []
for cname in train_data.columns:
    column = train_data[cname]
    if column.dtype == "object" and column.nunique() < 4:
        categorical_cols.append(cname)
    if column.dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

## Preprocessing

In [None]:
# Preprocessing
# -----------------------------------------------------------------------------

# Helper that removes the columns the models do not use
def data_cleaner(data):
    """Return a copy of ``data`` without the PassengerId, Cabin and Name columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains all three of the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns removed; ``data`` itself is not
        modified.

    Raises
    ------
    KeyError
        If any of the three columns is absent from ``data``.
    """
    unused_columns = ['PassengerId', 'Cabin', 'Name']
    return data.drop(columns=unused_columns)

# Fill missing values in numerical columns with a constant.
# fill_value=0 is what SimpleImputer already uses for numerical data when
# strategy='constant'; it is spelled out so the choice is visible to the reader.
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)

# One-hot encoder producing a dense array; categories unseen during fit are
# encoded as all zeros (handle_unknown='ignore').
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Convert categorical data: fill gaps with the most frequent value, then encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## Craft a Random Forest model

In [None]:
# Random forest classifier: at least 6 samples per leaf, fixed seed for repeatable runs
model_rf = RandomForestClassifier(random_state=1, min_samples_leaf=6)

# Chain the preprocessing step and the classifier into a single estimator
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model_rf),
])

## Train and evaluate the model

In [None]:
# Clean the data
clean_data = data_cleaner(train_data)

# Select X and y.
# The target is taken with single brackets so that y is a 1-D Series:
# scikit-learn estimators expect y of shape (n_samples,), and a one-column
# DataFrame makes them emit a DataConversionWarning before flattening it.
X = clean_data.drop(['Transported'], axis=1)
y = clean_data['Transported']

# Preprocessing of training data, fit model on the full training set
my_pipeline.fit(X, y)

# Evaluate the model with 5-fold cross-validated accuracy.
# cross_val_score fits clones of the pipeline on each fold, so these scores
# do not depend on the full-data fit above.
cv_score = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')
print(cv_score.mean(), cv_score.std())

## Use XGBoost

In [None]:
# XGBoost classifier: 200 estimators, learning rate 0.05, n_jobs=4, fixed seed
model_xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    n_jobs=4,
    random_state=1,
)

# Same preprocessing step as before, with XGBoost as the final estimator
xg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model_xgb),
])

# Train on the full training set
xg_pipeline.fit(X, y)

# 5-fold cross-validated accuracy: print mean and standard deviation
cv_score_xg = cross_val_score(xg_pipeline, X, y, scoring='accuracy', cv=5)
print(cv_score_xg.mean(), cv_score_xg.std())

## Make and store predictions

In [None]:
# Apply the same column clean-up to the test set
X_test = data_cleaner(test_data)

# Predict with the fitted XGBoost pipeline and cast the 1/0 labels to True/False
y_test = xg_pipeline.predict(X_test).astype(bool)

# Submission table: passenger ids next to their predicted Transported flag
submission = test_data[['PassengerId']].assign(Transported=y_test)

# Write the submission file without the index column
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the submission back from the same relative path it was written to, so
# this check works from any working directory (on Kaggle the working directory
# is /kaggle/working, so the file is the same one as before).
pd.read_csv('submission.csv')