![](https://i.imgur.com/C66ez28.png)

# Diving into the Data 🤿 

> **train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
> - PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
> - HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
> - CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
> - Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
> - Destination - The planet the passenger will be debarking to.
> - Age - The age of the passenger.
> - VIP - Whether the passenger has paid for special VIP service during the voyage.
> - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
> - Name - The first and last names of the passenger.
> - Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

> **test.csv** - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. 

> 📝 To predict: the value of Transported for the passengers in this set

# Importing Libraries 🚀

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
import missingno as msno
import shap
import gc

import wandb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report
from tqdm.notebook import tqdm

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.metrics import accuracy_score, roc_curve,auc, confusion_matrix,precision_recall_curve,precision_recall_curve,plot_precision_recall_curve

import warnings
warnings.simplefilter('ignore')

<center><img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67"></center>

I will be integrating ```W&B``` for ```visualizations``` and ```logging artifacts```!

[Spaceship Titanic Project on W&B Dashboard](https://wandb.ai/ruchi798/spaceship?workspace=user-ruchi798) 🏋️‍♀️

* To get an API key, first create an account on the W&B website.
* Next, store the key as a Kaggle secret so that it never appears in the notebook itself 🤫

In [None]:
# Read the W&B API key from the Kaggle secret stored under the label "api_key".
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("api_key")

# Settings later handed to wandb.init(config=...) for the training run.
CONFIG = {'competition': 'spaceship', '_wandb_kernel': 'ruch'}

# Ask W&B to keep its console output quiet.
os.environ["WANDB_SILENT"] = "true"

In [None]:
# Authenticate through the Python API instead of a shell command, so the secret
# is not interpolated into a command line.
wandb.login(key=api_key)

In [None]:
# Some utility functions
def wandb_log(**kwargs):
    """
    Log the given keyword arguments to W&B as metrics.

    All key-value pairs are sent in one ``wandb.log`` call so they are recorded
    together; logging each key separately would place them on different steps.
    Nothing is sent when no arguments are given.
    """
    if kwargs:
        wandb.log(kwargs)
        
def custom_palette(custom_colors):
    """
    Make ``custom_colors`` the active seaborn palette and draw a swatch preview.

    custom_colors: sequence of colours accepted by ``sns.color_palette``.
    """
    palette = sns.color_palette(custom_colors)
    sns.set_palette(palette)
    sns.palplot(palette, size=0.8)
    # Hide tick labels and tick marks on the swatch axes.
    plt.tick_params(axis='both', labelsize=0, length=0)


# Colour palette reused by the plots below; custom_palette() also previews it
space = ["#440f5c", "#570990", "#8b22ba", "#8a3cf6", "#cac4ff", "#e4b6fe", "#fa79ff", "#e728dc"]
custom_palette(space)

# Set the seaborn plotting context (font scale and grid line width)
sns.set_context("poster", font_scale = 0.6, rc={"grid.linewidth": 0.4})

# Use a serif font family for all plots
sns.set_style({'font.family':'serif'})

# What does the data look like? 🔎 

In [None]:
# Load the training split, then show its shape and first rows.
train_data = pd.read_csv("../input/spaceship-titanic/train.csv")
print(train_data.shape)
train_data.head()

In [None]:
# Load the test split, then show its shape and first rows.
test_data = pd.read_csv("../input/spaceship-titanic/test.csv")
print(test_data.shape)
test_data.head()

In [None]:
# Summary statistics of the training columns.
train_data.describe()

In [None]:
# Column dtypes as inferred by read_csv.
train_data.dtypes

In [None]:
# Number of distinct values per column.
train_data.nunique()

# Missing values🔮

In [None]:
# Bar chart of per-column completeness for the training data.
msno.bar(train_data,color=space[2], sort="ascending", figsize=(10,5), fontsize=12)
plt.show()

In [None]:
# Bar chart of per-column completeness for the test data.
msno.bar(test_data,color=space[5], sort="ascending", figsize=(10,5), fontsize=12)
plt.show()

# EDA 🚀 

In [None]:
#====== Function to plot wandb bar chart ======
def plot_wb_bar(df, col1, col2, title):
    """
    Log a W&B bar chart of ``df[col2]`` against ``df[col1]``.

    A dedicated run named after ``col1`` is opened in the 'spaceship' project
    for the chart and closed again afterwards.

    df: DataFrame holding both columns.
    col1: column used for the bar labels (also the run name and log key).
    col2: column used for the bar values.
    title: chart title.
    """
    name = col1
    rows = [[label, val] for label, val in zip(df[col1], df[col2])]
    # The run is a context manager, so it is finished even if logging raises.
    with wandb.init(project='spaceship', job_type='image-visualization', name=name):
        table = wandb.Table(data=rows, columns=[col1, col2])
        wandb.log({name: wandb.plot.bar(table, col1, col2, title=title)})
    
#====== Function to create a dataframe of value counts ======
def count_values(df, col, top=False):
    """
    Build a two-column frame (``col``, "counts") from the value counts of ``df[col]``.

    When ``top`` is True only the first ten rows are kept.
    """
    tally = df[col].value_counts().reset_index()
    result = pd.DataFrame(tally.values, columns=[col, "counts"])
    return result[:10] if top == True else result

In [None]:
# Donut chart of the target (Transported) balance
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Transported Distribution', size=20, color=space[0])

sizes = train_data['Transported'].value_counts()
labels = list(sizes.index)
explode = (0.05, 0.05)
ax.pie(
    sizes,
    explode=explode,
    colors=space[6:],
    startangle=60,
    labels=labels,
    autopct='%1.0f%%',
    pctdistance=0.6,
)
# A white disc drawn over the centre turns the pie into a donut
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))

plt.show()

In [None]:
# Side-by-side donut charts for CryoSleep and VIP
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle('CryoSleep and VIP Distribution', size=20, color=space[0])
axs = [ax1, ax2]

# (column, first palette index, percentage format, percentage label distance)
pie_specs = [
    ('CryoSleep', 2, '%1.0f%%', 0.6),
    ('VIP', 5, '%1.1f%%', 0.9),
]
for axis, (column, first_color, pct_format, pct_distance) in zip(axs, pie_specs):
    explode = (0.05, 0.05)
    sizes = train_data[column].value_counts()
    labels = list(sizes.index)
    axis.pie(
        sizes,
        explode=explode,
        colors=space[first_color:],
        startangle=60,
        labels=labels,
        autopct=pct_format,
        pctdistance=pct_distance,
    )
    # White centre disc turns the pie into a donut
    axis.add_artist(plt.Circle((0, 0), 0.4, fc='white'))

plt.show()

In [None]:
# Side-by-side donut charts for HomePlanet and Destination
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle('HomePlanet and Destination Distribution', size=20, color=space[0])
axs = [ax1, ax2]

# (column, wedge offsets, first palette index, percentage format, label distance)
pie_specs = [
    ('HomePlanet', (0.05, 0.05, 0.05), 2, '%1.0f%%', 0.6),
    ('Destination', (0.05, 0.05, 0.3), 5, '%1.1f%%', 0.9),
]
for axis, (column, explode, first_color, pct_format, pct_distance) in zip(axs, pie_specs):
    sizes = train_data[column].value_counts()
    labels = list(sizes.index)
    axis.pie(
        sizes,
        explode=explode,
        colors=space[first_color:],
        startangle=60,
        labels=labels,
        autopct=pct_format,
        pctdistance=pct_distance,
    )
    # White centre disc turns the pie into a donut
    axis.add_artist(plt.Circle((0, 0), 0.4, fc='white'))

plt.show()

In [None]:
# Log one W&B bar chart of value counts per categorical column
for column in ("Transported", "CryoSleep", "VIP", "HomePlanet", "Destination"):
    plot_wb_bar(count_values(train_data, column), column, "counts", f"{column} distribution")

In [None]:
def hist(col, title):
    """
    Draw a histogram of ``col`` whose bars are coloured according to their height.

    col: the values to plot (passed straight to ``sns.histplot``).
    title: text used as the plot title.
    """
    plt.figure(figsize=(10, 8))
    axis = sns.histplot(col, kde=False)

    bars = axis.patches
    heights = np.array([bar.get_height() for bar in bars])

    # Normalise the bar heights, then look each one up in the rainbow colormap
    scale = plt.Normalize(heights.min(), heights.max())
    for bar, shade in zip(bars, plt.cm.rainbow(scale(heights))):
        bar.set_color(shade)

    plt.title(title, size=20)


hist(train_data['Age'], 'Distribution of Age')

In [None]:
# Bucket ages into named groups on a copy, leaving train_data untouched
bins = [0, 2, 4, 13, 20, 110]
labels = ['Infant', 'Toddler', 'Kid', 'Teen', 'Adult']
# right=False makes every interval left-closed: [0, 2), [2, 4), ...
train_data_cpy = train_data.assign(
    AgeGroup=pd.cut(train_data['Age'], bins=bins, labels=labels, right=False)
)

plot_wb_bar(count_values(train_data_cpy, "AgeGroup"), "AgeGroup", "counts", "Age distribution")

In [None]:
plt.figure(figsize=(30, 35))
# Correlate numeric/boolean columns only: recent pandas versions no longer drop
# string columns silently and raise instead.
corr = train_data.select_dtypes(include=['number', 'bool']).corr()
# Hide the upper triangle (including the diagonal); it mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', vmax=.3, center=0,
            square=True, linewidths=.5, annot=True)
plt.show()

# Data pre-processing 🧑‍🍳

In [None]:
# Tag each split so the combined frame can be separated again after pre-processing.
train_data['is_train'] = True
test_data['is_train'] = False

# Stack both splits; the original row labels of each split are kept as-is.
df = pd.concat([train_data, test_data])

In [None]:
def fill_missing_vals(df, fill_missing):
    """
    Replace missing values in the given columns with each column's median.

    df: DataFrame to update; it is modified in place.
    fill_missing: iterable of column names to fill.

    Returns the same DataFrame object.
    """
    for col in fill_missing:
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection, which does not reliably write through to df.
        df[col] = df[col].fillna(df[col].median(skipna=True))
    return df

# Numeric columns: fill gaps with the column median (computed over train + test).
df = fill_missing_vals(df, ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

# Unknown home planets get their own placeholder category. The result is
# assigned back because fillna(inplace=True) on a column selection is unreliable.
df["HomePlanet"] = df["HomePlanet"].fillna('Z')

In [None]:
def label_encode(df, col):
    """
    Replace ``df[col]`` with integer codes and return the encoded column.

    Values are converted to strings first, so a missing value is encoded as
    its own category. ``df`` is modified in place.
    """
    as_text = df[col].astype(str)
    df[col] = LabelEncoder().fit_transform(as_text)
    return df[col]

# Integer-encode every categorical feature
for column in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
    df[column] = label_encode(df, column)

In [None]:
# Separate the combined frame back into its two splits and discard the marker column
mask = df['is_train'] == True
train_data = df[mask].drop(columns=['is_train'])
test_data = df[~mask].drop(columns=['is_train'])

# Model Training ⚙️

![](https://upload.wikimedia.org/wikipedia/commons/9/96/Pytorch_logo.png)

In [None]:
# Start W&B logging
# Open the W&B run that receives the training and validation losses;
# it is closed by run.finish() after training.
run = wandb.init(
        project="spaceship",
        config=CONFIG,
        job_type="train",
        anonymous="must"
    )

In [None]:
# Drop the identifier/text columns BEFORE dropna(), so training rows are not
# discarded merely because Cabin or Name is missing.
unused_cols = ["PassengerId", "Cabin", "Name"]
train_data = train_data.drop(columns=unused_cols).dropna()
test_data = test_data.drop(columns=unused_cols)
# Encode the boolean target as 1/0 for the loss function.
train_data['Transported'] = train_data['Transported'].map({True: 1, False: 0})

In [None]:
class Config:
    """Hyper-parameters and shared settings for training and inference."""
    # Learning rate for the optimizer
    lr = 1e-4
    # Number of training epochs
    nb_epochs = 5
    # Batch sizes for the training / validation loaders
    train_bs = 32
    valid_bs = 64
    train_split = 0.8
    # Number of cross-validation folds
    k_folds = 5
    # Device the model and batches are moved to
    device = 'cpu'
    # SpaceshipTitanicModel.forward already ends in a sigmoid, so the loss has
    # to consume probabilities. BCEWithLogitsLoss would apply a second sigmoid.
    train_loss_fn = nn.BCELoss()
    valid_loss_fn = nn.BCELoss()
    # Model input columns and the target column
    feature_names = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    target_name = 'Transported'

In [None]:
class SpaceshipTitanicModel(nn.Module):
    """
    Fully connected classifier: three hidden ReLU layers and a sigmoid output.

    ``forward`` returns values that have already passed through the sigmoid,
    so callers must not apply a second sigmoid to its output.
    """

    def __init__(self, input_size=None, output_size=None):
        super().__init__()
        # A falsy size (None or 0) falls back to the default
        self.input_size = input_size or 10
        self.output_size = output_size or 1

        # Layers (attribute names are part of the saved state_dict keys)
        self.fc1 = nn.Linear(self.input_size, 1024)
        self.fc2 = nn.Linear(1024, 768)
        self.fc3 = nn.Linear(768, 128)
        self.fc4 = nn.Linear(128, self.output_size)
        self.relu = nn.ReLU()
        self.sig = nn.Sigmoid()

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        return self.sig(self.fc4(hidden))
    
def binary_acc(y_pred, y_test, *, from_logits=False):
    """
    Return the accuracy of binary predictions as a whole-number percentage.

    y_pred: model outputs. By default these are treated as probabilities,
        which is what SpaceshipTitanicModel.forward returns; a sigmoid is
        applied first only when ``from_logits`` is True.
    y_test: ground-truth labels (0/1) with the same shape as ``y_pred``.
    from_logits: set to True when ``y_pred`` holds raw logits.

    Returns a 0-dim tensor holding the rounded percentage of correct predictions.
    """
    probs = torch.sigmoid(y_pred) if from_logits else y_pred
    # Threshold at 0.5. Applying a sigmoid to values that are already
    # probabilities would push every prediction to the positive class.
    predicted = (probs > 0.5).to(y_test.dtype)

    correct_results_sum = (predicted == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    return torch.round(acc * 100)

In [None]:
class SpaceshipTitanicData(Dataset):
    """
    Dataset wrapping a feature table and, for labelled data, a target table.

    features: pandas object exposing ``.values`` (one row per sample).
    target: pandas object exposing ``.values``; may be None when ``is_test`` is True.
    is_test: when True, items are feature tensors only; otherwise items are
        ``(features, target)`` tensor pairs. All tensors are float32.
    """

    def __init__(self, features, target, is_test=False):
        if target is None and not is_test:
            raise ValueError("target is required unless is_test=True")
        self.features = features
        self.target = target
        self.is_test = is_test
        # Convert to numpy once. Calling ``.values`` inside __getitem__ can
        # rebuild the whole array on every single item access.
        self._feature_array = np.asarray(features.values, dtype=np.float32)
        self._target_array = (
            None if target is None else np.asarray(target.values, dtype=np.float32)
        )

    def __getitem__(self, idx):
        data = torch.tensor(self._feature_array[idx], dtype=torch.float32)
        if self.is_test:
            return data
        target = torch.tensor(self._target_array[idx], dtype=torch.float32)
        return data, target

    def __len__(self):
        return len(self.features)

In [None]:
def train_fn(model, train_loader, optimizer, loss_fn, device):
    """
    Train ``model`` for one pass over ``train_loader``.

    model: network to optimise; switched to training mode.
    train_loader: iterable of ``(features, target)`` batches.
    optimizer: optimizer stepping the model's parameters.
    loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
    device: device the batches are moved to.

    The loss of every batch is logged to W&B under ``train_loss``.

    Returns ``(all_targets, all_preds)``: per-batch lists of numpy arrays with
    the targets and the model outputs.
    """
    print("Training")
    model.train()
    all_targets = []
    all_preds = []

    prog_bar = tqdm(train_loader, total=len(train_loader))
    for x, y in prog_bar:
        x = x.to(device, torch.float32)
        y = y.to(device, torch.float32)

        # Clear gradients before the backward pass of this batch
        optimizer.zero_grad()
        z = model(x)
        train_loss = loss_fn(z, y)
        train_loss.backward()
        optimizer.step()

        # Work with a plain float from here on, so neither the progress bar
        # nor the logger holds on to the autograd graph.
        batch_loss = train_loss.item()
        prog_bar.set_description(f'loss: {batch_loss:.2f}')

        all_targets.append(y.detach().cpu().numpy())
        all_preds.append(z.detach().cpu().numpy())

        wandb_log(train_loss=batch_loss)

    return all_targets, all_preds

def valid_fn(model, valid_loader, loss_fn, device):
    """
    Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    model: network to evaluate; switched to evaluation mode.
    valid_loader: iterable of ``(features, target)`` batches.
    loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
    device: device the batches are moved to.

    The loss of every batch is logged to W&B under ``val_loss``. The printed
    loss is the sum of the batch losses. The printed accuracy is the
    sample-weighted mean of the per-batch ``binary_acc`` values.

    Returns ``(all_targets, all_preds)``: per-batch lists of numpy arrays with
    the targets and the model outputs.
    """
    print("Validating!")
    model.eval()
    running_loss = 0.0
    weighted_acc = 0.0
    n_samples = 0
    all_targets = []
    all_preds = []
    prog_bar = tqdm(valid_loader, total=len(valid_loader))
    # No gradients are needed for evaluation
    with torch.no_grad():
        for x, y in prog_bar:
            x = x.to(device, torch.float32)
            y = y.to(device, torch.float32)

            z = model(x)
            batch_loss = loss_fn(z, y).item()
            batch_size = y.shape[0]

            # Weight each batch by its size so a short final batch does not
            # distort the overall accuracy.
            weighted_acc += float(binary_acc(z, y)) * batch_size
            n_samples += batch_size

            running_loss += batch_loss
            prog_bar.set_description(f'loss: {batch_loss:.2f}')

            all_targets.append(y.cpu().numpy())
            all_preds.append(z.cpu().numpy())

            wandb_log(val_loss=batch_loss)

    # An empty loader yields NaN instead of failing on an unbound variable
    acc = weighted_acc / n_samples if n_samples else float('nan')
    print(f"Validation Loss: {running_loss:.4f}")
    print(f"Acc: {acc:.3f}")
    return all_targets, all_preds

In [None]:
if __name__ == "__main__":
    # Shuffle once and reset the index so positional fold indices line up with rows
    data = train_data.sample(frac=1).reset_index(drop=True)
    features = data.drop(['Transported'], axis=1)
    targets = data[['Transported']]

    kfold = StratifiedKFold(n_splits=Config.k_folds, shuffle=True)
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(features, targets['Transported'])):
        print(f'FOLD {fold}')
        print('-'*20)

        # split() yields positional indices, hence iloc
        train_dataset = SpaceshipTitanicData(
            features=features.iloc[train_ids],
            target=targets.iloc[train_ids],
        )

        valid_dataset = SpaceshipTitanicData(
            features=features.iloc[valid_ids],
            target=targets.iloc[valid_ids],
        )

        # Batch sizes come from Config rather than being hard-coded
        train_loader = DataLoader(
            train_dataset,
            batch_size=Config.train_bs,
            shuffle=True
        )

        valid_loader = DataLoader(
            valid_dataset,
            batch_size=Config.valid_bs,
            shuffle=False
        )

        # A fresh model and optimizer for every fold
        model = SpaceshipTitanicModel(None, None)
        model.to(Config.device)

        optimizer = torch.optim.Adam(model.parameters(), lr=Config.lr)

        print("[INFO]: Starting training!\n")
        # Train for exactly Config.nb_epochs epochs
        for epoch in range(1, Config.nb_epochs + 1):
            print(f"{'='*20} Epoch: {epoch}/{Config.nb_epochs} {'='*20}")
            _, _ = train_fn(model, train_loader, optimizer, Config.train_loss_fn, device=Config.device)
            val_targets, val_preds = valid_fn(model, valid_loader, Config.valid_loss_fn, device=Config.device)

        # One checkpoint per fold, written to the current working directory
        filepath = f"fold_{fold}_model.pth"
        torch.save(model.state_dict(), filepath)

In [None]:
# Finish the logging run
# Close the W&B training run opened before training.
run.finish()

In [None]:
@torch.no_grad()
def inference(model, states_list, test_dataloader, device):
    """
    Run the test set through ``model`` once per saved checkpoint.

    model: network whose weights are overwritten by each checkpoint in turn.
    states_list: paths of the saved ``state_dict`` files.
    test_dataloader: iterable of feature batches (no targets).
    device: device used for the model and the batches.

    Returns a list with one 1-D numpy array of model outputs per checkpoint,
    in the order of ``states_list``.
    """
    model.eval()
    all_preds = []
    for state in states_list:
        print(f"State: {state}")
        # map_location lets checkpoints saved on another device load here
        state_dict = torch.load(state, map_location=device)
        model.load_state_dict(state_dict)
        model = model.to(device)

        # Clean
        del state_dict
        gc.collect()

        preds = []
        prog = tqdm(test_dataloader, total=len(test_dataloader))
        for x in prog:
            x = x.to(device, dtype=torch.float32)
            outputs = model(x)
            # Drop the trailing output dimension so each batch is a flat vector
            preds.append(outputs.squeeze(-1).cpu().numpy())

        all_preds.append(np.concatenate(preds))

        # Clean
        gc.collect()
        torch.cuda.empty_cache()

    return all_preds

In [None]:
model_dir = "/kaggle/working/"
# Collect every saved fold checkpoint; sorted so the fold order is stable
states_list = sorted(
    os.path.join(model_dir, x) for x in os.listdir(model_dir) if x.endswith(".pth")
)

# The test frame still carries an (empty) Transported column from the concat
test_dataset = SpaceshipTitanicData(features=test_data.drop(['Transported'], axis = 1),
                                    target=None,
                                    is_test = True)
test_loader = DataLoader(
            test_dataset,
            batch_size=32,
            shuffle=False
        )

print("Predictions for all folds")
predictions =  inference(model, states_list, test_loader, Config.device)
# Average the per-fold outputs for every test row
preds = pd.DataFrame(predictions).T.mean(axis=1).tolist()

In [None]:
# Round the averaged float32 outputs (ties go to the even value) and turn them into booleans
preds = np.round(np.asarray(preds, dtype=np.float32)) > 0
preds

# Submission time! 🕒

In [None]:
# Take the passenger ids from the sample submission file.
# NOTE(review): this assumes its row order matches test.csv — confirm.
passenger_IDs = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")[["PassengerId"]].values

# ravel() flattens the (n, 1) id array; note that `df` is rebound to a dict here.
df = {'PassengerId': passenger_IDs.ravel(), 'Transported': preds}
df_predictions = pd.DataFrame(df).set_index(['PassengerId'])
df_predictions.head(10)

In [None]:
# Write the submission file; PassengerId is the index, so it becomes the first column.
df_predictions.to_csv('/kaggle/working/Predictions.csv')

This is what my [project](https://wandb.ai/ruchi798/spaceship?workspace=user-ruchi798) looks like on the W&B dashboard ⬇️

![](https://i.imgur.com/WZ51NKI.png)