In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!unzip '/content/drive/MyDrive/spaceship-titanic.zip'

Archive:  /content/drive/MyDrive/spaceship-titanic.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train.info()
# relevant: CryoSleep, Age, VIP, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
train = train.drop(['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name'], axis=1)
train['CryoSleep'] = train['CryoSleep'].fillna(False)
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['VIP'] = train['VIP'].fillna(False)
train['RoomService'] = train['RoomService'].fillna(train['RoomService'].mean())
train['FoodCourt'] = train['FoodCourt'].fillna(train['FoodCourt'].mean())
train['ShoppingMall'] = train['ShoppingMall'].fillna(train['ShoppingMall'].mean())
train['Spa'] = train['Spa'].fillna(train['Spa'].mean())
train['VRDeck'] = train['VRDeck'].fillna(train['VRDeck'].mean())
train['VIP'] = train['VIP'].map({False: 0, True: 1})
train['CryoSleep'] = train['CryoSleep'].map({False: 0, True: 1})
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CryoSleep     8693 non-null   int64  
 1   Age           8693 non-null   float64
 2   VIP           8693 non-null   int64  
 3   RoomService   8693 non-null   float64
 4   FoodCourt     8693 non-null   float64
 5   ShoppingMall  8693 non-null   float64
 6   Spa           8693 non-null   float64
 7   VRDeck        8693 non-null   float64
 8   Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), int64(2)
memory usage: 551.9 KB


In [8]:
test = test.drop(['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name'], axis=1)
test['CryoSleep'] = test['CryoSleep'].fillna(False)
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['VIP'] = test['VIP'].fillna(False)
test['RoomService'] = test['RoomService'].fillna(test['RoomService'].mean())
test['FoodCourt'] = test['FoodCourt'].fillna(test['FoodCourt'].mean())
test['ShoppingMall'] = test['ShoppingMall'].fillna(test['ShoppingMall'].mean())
test['Spa'] = test['Spa'].fillna(test['Spa'].mean())
test['VRDeck'] = test['VRDeck'].fillna(test['VRDeck'].mean())
test['VIP'] = test['VIP'].map({False: 0, True: 1})
test['CryoSleep'] = test['CryoSleep'].map({False: 0, True: 1})
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CryoSleep     4277 non-null   int64  
 1   Age           4277 non-null   float64
 2   VIP           4277 non-null   int64  
 3   RoomService   4277 non-null   float64
 4   FoodCourt     4277 non-null   float64
 5   ShoppingMall  4277 non-null   float64
 6   Spa           4277 non-null   float64
 7   VRDeck        4277 non-null   float64
dtypes: float64(6), int64(2)
memory usage: 267.4 KB


In [9]:
Scaler1 = StandardScaler()
Scaler2 = StandardScaler()

train_columns = train.columns
test_columns  = test.columns

df_train = pd.DataFrame(Scaler1.fit_transform(train))
df_test  = pd.DataFrame(Scaler2.fit_transform(test))

df_train.columns = train_columns
df_test.columns  = test_columns


In [10]:
features = df_train.iloc[:,:-1].columns.tolist()
target   = df_train.loc[:, 'Transported'].name

X_train = df_train.iloc[:,:-1].values
y_train = df_train.loc[:, 'Transported'].values

y_train = np.where(y_train > 0, 1, 0)

y_train

array([0, 1, 0, ..., 1, 0, 1])

In [11]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(8, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=8, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [12]:
# criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [13]:
batch_size = 64
n_epochs = 500
batch_no = len(X_train) // batch_size

train_loss = 0
train_loss_min = np.Inf
for epoch in range(n_epochs):
    for i in range(batch_no):
        start = i * batch_size
        end   = start + batch_size
        x_var = Variable(torch.FloatTensor(X_train[start:end]))
        y_var = Variable(torch.LongTensor(y_train[start:end]))

        optimizer.zero_grad()
        output = model(x_var)
        loss  = criterion(output,y_var)
        loss.backward()
        optimizer.step()

        values, labels = torch.max(output, 1)
        num_right   = np.sum(labels.data.numpy() == y_train[start:end])
        train_loss += loss.item()*batch_size

    train_loss = train_loss / len(X_train)
    if train_loss <= train_loss_min:
        print("Validation loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min,train_loss))
        torch.save(model.state_dict(), "model.pt")
        train_loss_min = train_loss

    if epoch % 200 == 0:
        print('')
        print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}".format(epoch+1, train_loss,num_right / len(y_train[start:end]) ))
print('Training Ended! ')

Validation loss decreased (   inf ===> 0.503817). Saving the model...

Epoch: 1 	Train Loss: 0.5038171875919775 	Train Accuracy: 0.734375
Validation loss decreased (0.503817 ===> 0.476952). Saving the model...
Validation loss decreased (0.476952 ===> 0.474662). Saving the model...
Validation loss decreased (0.474662 ===> 0.471336). Saving the model...
Validation loss decreased (0.471336 ===> 0.469658). Saving the model...
Validation loss decreased (0.469658 ===> 0.467761). Saving the model...
Validation loss decreased (0.467761 ===> 0.466744). Saving the model...
Validation loss decreased (0.466744 ===> 0.465747). Saving the model...
Validation loss decreased (0.465747 ===> 0.464457). Saving the model...
Validation loss decreased (0.464457 ===> 0.462588). Saving the model...
Validation loss decreased (0.462588 ===> 0.461684). Saving the model...
Validation loss decreased (0.461684 ===> 0.460739). Saving the model...
Validation loss decreased (0.460739 ===> 0.458682). Saving the model..

In [14]:
X_test     = df_test.iloc[:,:].values
X_test_var = Variable(torch.FloatTensor(X_test), requires_grad=False)
with torch.no_grad():
    test_result = model(X_test_var)
values, labels = torch.max(test_result, 1)
transported = labels.data.numpy()

In [15]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["Transported"] = transported
sample_submission["Transported"] = sample_submission["Transported"].map({1: True, 0: False})
sample_submission.to_csv("submission.csv", index=False)