# Titanic Classification 

- Description: Classifies Titanic passengers by encoding each passenger's preprocessed features as a QR-code image and training a convolutional neural network on those images
- Author: Guilherme Righetto
- Sense: None

In [1]:
import qrcode
import pandas as pd
import os, glob
import PIL.Image
from collections import OrderedDict
from sklearn.model_selection import train_test_split

import numpy as np
import torch
from torch import nn
import torch.utils.data
import torch.optim.lr_scheduler
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torchvision
from torchvision.models.resnet import BasicBlock

from lib import pytorch_trainer as ptt

# Switch for the QR-generation cell: set to True to (re)write the PNG files.
CREATE_QR = False

# Query CUDA availability once; the model-setup cell reads this flag to decide
# whether to move the model to the GPU.
use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)

GPU available: True


### Data Munging 

In [2]:
# Read both CSV splits and tag every row with its origin, so the two frames
# can be preprocessed together and separated again later via the "_data" column.
df_train = pd.read_csv("Data/train.csv", sep=",").assign(_data="train")
df_test = pd.read_csv("Data/test.csv", sep=",").assign(_data="test")

# PassengerId values of the test split, as a plain Python list.
list_test = df_test["PassengerId"].tolist()

# Single combined frame with a fresh 0..n-1 index.
df_data = pd.concat([df_train, df_test], ignore_index=True)

In [3]:
# Encode Sex as an integer: male -> 0, female -> 1.
for sex_label, sex_code in (('male', 0), ('female', 1)):
    df_data.loc[df_data["Sex"] == sex_label, "Sex"] = sex_code
df_data["Sex"] = df_data["Sex"].astype(int)

In [4]:
# Missing embarkation ports are filled with "S", then ports are mapped to
# integer codes (S -> 0, C -> 1, Q -> 2).
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection acts on a temporary object under pandas Copy-on-Write
# and would leave df_data unchanged.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
df_data["Embarked"] = df_data["Embarked"].fillna("S").map(embarked_codes).astype(int)

# One-hot encode the codes (columns Embarked_0, Embarked_1, Embarked_2) and
# replace the original column with the indicator columns.
df_tmp = pd.get_dummies(df_data["Embarked"], prefix="Embarked")
del df_data["Embarked"]
df_data = pd.concat([df_data, df_tmp], axis=1)

In [163]:
# Reduce each Cabin value to a single deck letter:
#   1. missing cabins become "N",
#   2. all digits are removed ("C85" -> "C"),
#   3. everything from the first space on is dropped ("B B B" -> "B").
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the digits in place.
# The result is assigned back instead of using fillna(inplace=True) on a column
# selection, which does not update df_data under pandas Copy-on-Write.
cabin_deck = (
    df_data["Cabin"]
    .fillna("N")
    .str.replace(r'\d+', "", regex=True)
    .str.replace(r' .*', "", regex=True)
)

# Integer code per deck letter; the codes only determine the names of the
# one-hot columns created below (Cabin_0, Cabin_1, Cabin_3, ...).
cabin_codes = {"A": 0, "B": 1, "C": 3, "D": 9, "E": 15, "F": 31, "G": 63, "T": 127, "N": 255}
df_data["Cabin"] = cabin_deck.map(cabin_codes).astype(int)

# Replace the coded column with its one-hot indicator columns.
df_tmp = pd.get_dummies(df_data["Cabin"], prefix="Cabin")
del df_data["Cabin"]
df_data = pd.concat([df_data, df_tmp], axis=1)

In [5]:
# Keep only the alphabetic prefix of each ticket: strip non-word characters,
# then digits. regex=True is required because pandas >= 2.0 treats the pattern
# as a literal string by default, which would leave the tickets untouched.
ticket_prefix = (
    df_data["Ticket"]
    .str.replace(r'\W', "", regex=True)
    .str.replace(r'\d', "", regex=True)
)

# Tickets that were purely numeric end up empty; label them "N".
ticket_prefix = ticket_prefix.mask(ticket_prefix == "", "N")

# Turn each distinct prefix into an integer category code.
df_data["Ticket"] = ticket_prefix.astype('category').cat.codes

# Replace the coded column with its one-hot indicator columns.
df_tmp = pd.get_dummies(df_data["Ticket"], prefix="Ticket")
del df_data["Ticket"]
df_data = pd.concat([df_data, df_tmp], axis=1)

In [6]:
# Collapse each full name into a title group. Patterns are applied in order and
# each one replaces the WHOLE name with the group label, so earlier patterns win.
# Two fixes relative to the previous version of this cell:
#   * regex=True is passed explicitly (pandas >= 2.0 defaults to literal
#     replacement, which would leave every name unchanged);
#   * the "lady" group is matched before the "Sir" group, because the pattern
#     '.*Don.*' also matches "Dona" and previously labelled it "Sir".
title_patterns = [
    (r'.*Mr.*', "Mrs"),            # matches both "Mr" and "Mrs"
    (r'.*Miss.*', "Miss"),
    (r'.*Master.*', "Master"),
    (r'.*Dr.*', "Dr"),
    (r'.*Mlle.*|.*Mme.*', "Mlle"),
    (r'.*Dona.*|.*Lady.*|.*the Countess.*', "lady"),
    (r'.*Capt.*|.*Don.*|.*Major.*|.*Sir.*|.*Jonkheer.*', "Sir"),
    (r'.* .*', "Others"),          # anything still containing a space is unmatched
]

titles = df_data["Name"]
for pattern, label in title_patterns:
    titles = titles.str.replace(pattern, label, regex=True)

# Integer code per title group; the codes only determine the names of the
# one-hot columns created below (Name_0, Name_1, Name_3, ...).
title_codes = {
    "Miss": 127,
    "Mlle": 63,
    "lady": 31,
    "Master": 15,
    "Sir": 9,
    "Dr": 3,
    "Mrs": 1,
    "Others": 0,
}
df_data["Name"] = titles.map(title_codes).astype(int)

# Replace the coded column with its one-hot indicator columns.
df_tmp = pd.get_dummies(df_data["Name"], prefix="Name")
del df_data["Name"]
df_data = pd.concat([df_data, df_tmp], axis=1)

In [7]:
# Family size = siblings/spouses + parents/children + the passenger.
df_data["FamilySize"] = df_data["SibSp"] + df_data["Parch"] + 1

# Fill missing ages with the truncated mean age. The result is assigned back:
# fillna(inplace=True) on a column selection acts on a temporary object under
# pandas Copy-on-Write and would leave the NaNs in df_data.
df_data["Age"] = df_data["Age"].fillna(int(df_data["Age"].mean()))

# Indicator column: 1 for passengers younger than 16, 0 otherwise.
df_data["Children"] = (df_data["Age"] < 16).astype(int)

In [8]:
# Display the column index of df_data to inspect which columns exist at this point.
df_data.columns

Index(['Age', 'Cabin', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'Sex',
       'SibSp', 'Survived', '_data', 'Embarked_0', 'Embarked_1', 'Embarked_2',
       'Ticket_0', 'Ticket_1', 'Ticket_2', 'Ticket_3', 'Ticket_4', 'Ticket_5',
       'Ticket_6', 'Ticket_7', 'Ticket_8', 'Ticket_9', 'Ticket_10',
       'Ticket_11', 'Ticket_12', 'Ticket_13', 'Ticket_14', 'Ticket_15',
       'Ticket_16', 'Ticket_17', 'Ticket_18', 'Ticket_19', 'Ticket_20',
       'Ticket_21', 'Ticket_22', 'Ticket_23', 'Ticket_24', 'Ticket_25',
       'Ticket_26', 'Ticket_27', 'Ticket_28', 'Ticket_29', 'Ticket_30',
       'Ticket_31', 'Ticket_32', 'Name_0', 'Name_1', 'Name_3', 'Name_9',
       'Name_15', 'Name_31', 'Name_63', 'Name_127', 'FamilySize', 'Children'],
      dtype='object')

In [9]:
# Separate the combined frame back into its two origins using the "_data" tag,
# dropping the tag column from each resulting (independent) frame.
df_train = df_data.loc[df_data["_data"] == "train"].drop("_data", axis=1)

df_test = df_data.loc[df_data["_data"] == "test"].drop("_data", axis=1)

In [10]:
# Hold out 33% of the rows in df_train as df_validation (fixed random_state for a repeatable split).
# NOTE(review): df_train is overwritten with its own split, so re-running this cell
# without first rebuilding df_train splits the already-reduced frame again.
df_train, df_validation = train_test_split(df_train, test_size=0.33, random_state=42)

### Converting data to QR code

In [11]:
def _make_qr_image(features):
    """Encode one row of feature values as a QR-code image.

    Parameters
    ----------
    features : pandas.Series
        The row's values with the "Survived" label already removed. The list
        of values is handed to ``QRCode.add_data`` as a single payload.

    Returns
    -------
    The image object returned by ``QRCode.make_image()``.
    """
    qr = qrcode.QRCode(version=2, error_correction=qrcode.constants.ERROR_CORRECT_L, box_size=2, border=0)
    qr.add_data(list(features.values))
    qr.make(fit=True)
    return qr.make_image()


def _label_prefix(survived):
    """Return the file-name prefix for a Survived value: "1_", "0_", or None if it is neither 1 nor 0."""
    if survived == 1.0:
        return "1_"
    if survived == 0.0:
        return "0_"
    return None


if CREATE_QR:
    # Image.save does not create missing directories, so make sure they exist.
    for out_dir in ("Data/Train", "Data/Validation", "Data/Test"):
        os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): every remaining column of the row is encoded, which (per the
    # df_data.columns listing) includes PassengerId — confirm this is intended.

    # A single running counter numbers the files across all three splits.
    i = 0
    for out_prefix, frame in (("Data/Train/", df_train), ("Data/Validation/", df_validation)):
        for _, row in frame.iterrows():
            label = _label_prefix(row["Survived"])
            # Rows whose label is neither 1 nor 0 are skipped, but still consume a number.
            if label is not None:
                img = _make_qr_image(row.drop("Survived"))
                img.save(out_prefix + label + str(i) + ".png")
            i += 1

    # Test rows carry no usable label, so the file name is just the counter.
    for _, row in df_test.iterrows():
        img = _make_qr_image(row.drop("Survived"))
        img.save("Data/Test/" + str(i) + ".png")
        i += 1



In [171]:
# Show the size of the most recently generated QR image.
# NOTE(review): `img` is only assigned inside the CREATE_QR-guarded block of the
# QR-generation cell, so with CREATE_QR = False on a fresh kernel this name is undefined.
img.size

(154, 154)

### Dataset

In [12]:
class TitanicDataset(torch.utils.data.Dataset):
    """Dataset of PNG images whose file name starts with the class label.

    Files are collected recursively from ``<rootdir>/Train/`` (``train=True``)
    or ``<rootdir>/Validation/`` (``train=False``) and kept in sorted order.

    Parameters
    ----------
    rootdir : str
        Directory containing the ``Train/`` and ``Validation/`` sub-directories.
    train : bool
        Selects the training (True) or validation (False) sub-directory.
    transform : callable or None
        Optional callable applied to the greyscale PIL image before it is returned.
    """

    def __init__(self, rootdir, train=True, transform=None):
        self.train = train
        self.transform = transform
        subdir = 'Train/' if self.train else 'Validation/'
        self.dirpath = os.path.join(rootdir, subdir)

        # '**' with recursive=True walks the whole sub-tree; keep only PNG files.
        candidates = sorted(glob.glob(os.path.join(self.dirpath, '**'), recursive=True))
        self.l_filepaths = [fp for fp in candidates if fp.lower().endswith('.png')]

    @staticmethod
    def _label_from_path(fp):
        """Return 1 when the file name starts with '1', otherwise 0.

        The QR-generation cell names files "1_<n>.png" for Survived == 1 and
        "0_<n>.png" for Survived == 0, so the label equals the Survived value.
        (The previous expression compared against '0', which inverted it.)
        """
        return int(os.path.basename(fp)[:1] == '1')

    def __len__(self):
        """Number of PNG files found."""
        return len(self.l_filepaths)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the file at position ``index``."""
        fp = self.l_filepaths[index]
        out = self._label_from_path(fp)
        # convert() returns a new, fully loaded image, so the underlying file
        # can be closed right away instead of staying open until garbage collection.
        with PIL.Image.open(fp) as img_file:
            inp = img_file.convert('L')
        if self.transform is not None:
            inp = self.transform(inp)
        return inp, out

In [13]:
rootdir = 'Data/'

# Target image size; the example input used to size MyModel's first dense
# layer is also 150x150, so the two must stay in sync.
size_final = (150, 150)

# Resize replaces the deprecated transforms.Scale (removed from recent
# torchvision releases); both resize the image to the requested size.
transf_comp_train = torchvision.transforms.Compose([torchvision.transforms.Resize(size=size_final),
                                                    torchvision.transforms.ToTensor()])

transf_comp_valid = torchvision.transforms.Compose([torchvision.transforms.Resize(size=size_final),
                                                    torchvision.transforms.ToTensor()])

dataset_train = TitanicDataset(rootdir, train=True, transform=transf_comp_train)
dataset_valid = TitanicDataset(rootdir, train=False, transform=transf_comp_valid)


( 0 ,.,.) = 
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0039  0.0627  ...   0.0627  0.0039  0.0000
  0.0000  0.0627  1.0000  ...   1.0000  0.0627  0.0000
           ...             ⋱             ...          
  0.0000  0.0627  1.0000  ...   0.0000  0.9373  1.0000
  0.0000  0.0039  0.0627  ...   0.0000  0.9373  1.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.9373  1.0000
[torch.FloatTensor of size 1x150x150]

In [None]:
# Mini-batch loaders: the training data is reshuffled every epoch, the
# validation data is read in a fixed order.
loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=40, shuffle=True, num_workers=4)
loader_valid = torch.utils.data.DataLoader(dataset_valid, batch_size=40, shuffle=False, num_workers=4)

dataloaders = {'train': loader_train,
               'val': loader_valid
              }

# Row counts of the two dataframes. The second line reports the validation
# split (it was previously mislabelled as "Test").
print('Size - Train:', len(df_train))
print('Size - Validation:', len(df_validation))

### Classification

In [38]:
class MyModel(nn.Module):
    """Small CNN: three conv/ReLU/max-pool stages followed by two dense layers.

    Expects single-channel 150x150 inputs and returns two raw scores (logits)
    per sample.
    """

    # Flattened size of the conv stack output for a 1x150x150 input:
    # 30 channels x 18 x 18.
    FLAT_FEATURES = 30 * 18 * 18

    def __init__(self):
        super(MyModel, self).__init__()

        # Convolutional layers
        self.conv_layer = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv2d(in_channels=1, out_channels=20, kernel_size=3, padding=1)),
            ('relu1', nn.ReLU()),
            ('max_pool1', nn.MaxPool2d(2)),

            ('conv2', nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, padding=1)),
            ('relu2', nn.ReLU()),
            ('max_pool2', nn.MaxPool2d(2)),

            ('conv3', nn.Conv2d(in_channels=50, out_channels=30, kernel_size=3, padding=1)),
            ('relu3', nn.ReLU()),
            ('max_pool3', nn.MaxPool2d(2)),
            ('drop3', nn.Dropout(p=0.5))
        ]))

        # Dense layers
        self.dense_layer = nn.Sequential(OrderedDict([
            ('dense1', nn.Linear(in_features=self.FLAT_FEATURES, out_features=450)),
            ('relu1', nn.ReLU()),
            ('dense3', nn.Linear(in_features=450, out_features=2)),
        ]))

    def forward(self, x):
        """Run a batch of shape (N, 1, 150, 150) through the network; returns (N, 2)."""
        x = self.conv_layer(x)
        # Flatten per sample. Keeping the batch dimension fixed means a wrongly
        # sized input fails in the dense layer instead of being silently
        # regrouped into a different number of rows, as view(-1, 9720) could do.
        x = x.view(x.size(0), -1)
        x = self.dense_layer(x)

        return x

In [39]:
# Sanity check: push one dummy image through the convolutional stack to see
# how many features reach the dense layers.
model = MyModel()

# input with the same size as the images used
example_input = Variable(torch.zeros(1, 1, 150, 150))

# output of the convolutional layers, and the same tensor flattened to one row
example_output = model.conv_layer(example_input)
_flat_output = example_output.view(1, -1)

print('Dimensões da saída convolucional:', example_output.size())
print('Dimensão após a vetorização:', _flat_output.size())

Dimensões da saída convolucional: torch.Size([1, 30, 18, 18])
Dimensão após a vetorização: torch.Size([1, 9720])


In [40]:
# Fresh model for training, moved to the GPU only when one is available.
model = MyModel()
if use_gpu:
    model = model.cuda()

# Two-class classification loss on the raw scores returned by the model.
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adadelta(model.parameters(), lr=0.1)

trainer = ptt.DeepNetTrainer(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    callbacks = [ptt.AccuracyMetric(),
                 ptt.PrintCallback(),
                 ptt.ModelCheckpoint('Models/titanic-ori', reset=True, verbose=1)],
    # Follow the detected flag instead of hard-coding True, so the trainer
    # agrees with where the model was placed above.
    # NOTE(review): presumably this controls whether the trainer moves batches
    # to the GPU — confirm against lib.pytorch_trainer.
    use_gpu=use_gpu
)

In [42]:
# Train for 30 epochs, passing the training loader as train_data and the validation loader as valid_data.
trainer.fit_loader(n_epochs=30, train_data=dataloaders['train'], valid_data=dataloaders['val'])

Start training for 30 epochs
 11:   2.1s   T: 0.63554 0.62919   V: 0.65145 0.59322 
 12:   2.1s   T: 0.62565 0.63926   V: 0.69386 0.59322 
 13:   2.1s   T: 0.62658 0.62919   V: 0.64053 0.60678 best
 14:   2.1s   T: 0.63333 0.64094   V: 0.63486 0.61356 best
 15:   2.1s   T: 0.63362 0.64430   V: 0.65754 0.59322 
 16:   2.1s   T: 0.62419 0.64765   V: 0.62907 0.60678 best
 17:   2.2s   T: 0.62004 0.65940   V: 0.62929 0.61356 
 18:   2.1s   T: 0.61587 0.66275   V: 0.63434 0.61695 
 19:   2.1s   T: 0.61022 0.67785   V: 0.62693 0.61356 best
 20:   2.1s   T: 0.60907 0.70134   V: 0.64468 0.61695 
 21:   2.1s   T: 0.60761 0.67617   V: 0.62449 0.62034 best
 22:   2.1s   T: 0.60812 0.67450   V: 0.64472 0.61695 
 23:   2.1s   T: 0.59602 0.66611   V: 0.66486 0.61695 
 24:   2.1s   T: 0.59080 0.66946   V: 0.61987 0.63051 best
 25:   2.1s   T: 0.59166 0.68960   V: 0.68772 0.61695 
 26:   2.1s   T: 0.60129 0.68456   V: 0.61666 0.62373 best
 27:   2.1s   T: 0.58454 0.69128   V: 0.61883 0.63390 
 28:   2