In [1]:
import os

import kaggle
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.optim as optim
import pkbar



In [2]:
def feature_engineer(df):
    """Derive title, deck and per-person fare features from raw columns.

    The frame is modified in place and also returned:

    * ``Name`` is replaced by the text between the last comma and the first
      period after it (``"Braund, Mr. Owen Harris"`` -> ``"Mr"``) and the
      column is renamed to ``Title``.
    * ``Cabin`` is replaced by its first character and renamed to ``Deck``;
      missing cabins stay missing.
    * ``FarePerPerson`` is added as ``Fare / (SibSp + Parch + 1)``.

    Args:
        df: DataFrame with ``Name``, ``Cabin``, ``SibSp``, ``Parch`` and
            ``Fare`` columns. Any index is accepted.

    Returns:
        The same DataFrame object with the changes above applied.
    """
    # strip() rather than dropping the first character: the title stays
    # intact when there is no space after the comma (or no comma at all).
    df['Name'] = df['Name'].map(
        lambda name: name.split(',')[-1].split('.')[0].strip()
    )
    df.rename(columns={'Name': 'Title'}, inplace=True)

    df['Cabin'] = df['Cabin'].map(
        lambda cabin: cabin[0] if pd.notna(cabin) else cabin
    )
    df.rename(columns={'Cabin': 'Deck'}, inplace=True)

    # Column arithmetic works row by row whatever the index looks like; a
    # lookup such as df['Fare'][i] would treat i as an index label.
    family_size = df['SibSp'] + df['Parch']
    df['FarePerPerson'] = df['Fare'] / (family_size + 1)

    return df

In [3]:
def categorify(df, columns):
    """Replace category values with 1-based integer codes, in place.

    For every listed column the distinct non-missing values are numbered
    1, 2, 3, ... in order of first appearance. Missing values (NaN/None) stay
    NaN, so a column that contains them ends up with a float dtype.

    Args:
        df: DataFrame to modify. Any index is accepted.
        columns: Names of the columns to encode.

    Returns:
        The same DataFrame object with the listed columns encoded.
    """
    for column in columns:
        categories = df[column].dropna().unique()
        codes = {
            category: code
            for code, category in enumerate(categories, start=1)
        }
        # Series.map turns values that are absent from the mapping (the
        # missing ones) into NaN. This avoids storing NaN as a dictionary
        # key, where a lookup only succeeds for the very same NaN object.
        df[column] = df[column].map(codes)
    return df

In [4]:
def fill_missing(df, ignore_columns):
    """Fill missing values with the column median and flag the filled rows.

    For every column outside ``ignore_columns`` that contains missing values,
    the gaps are replaced by that column's median and a boolean companion
    column named ``<column>Missing`` records which rows were filled. The
    frame is modified in place and also returned.

    Args:
        df: DataFrame to modify. Columns with gaps must be numeric.
        ignore_columns: Column names to leave untouched.

    Returns:
        The same DataFrame object.
    """
    # Snapshot the names first: indicator columns are appended in the loop.
    for column in list(df.columns):
        if column in ignore_columns:
            continue
        was_missing = df[column].isna()
        if not was_missing.any():
            continue
        # median() ignores NaN and yields the 50% quantile directly, without
        # building the whole describe() summary.
        df[column] = df[column].fillna(df[column].median())
        df[f'{column}Missing'] = was_missing.to_numpy()
    return df

In [5]:
def integerize(df, columns):
    """Return ``df`` with the listed columns cast to the ``'int'`` dtype.

    The input frame is not modified; a converted copy is returned. When
    ``columns`` is empty the original frame itself is handed back.
    """
    int_dtypes = {name: 'int' for name in columns}
    return df.astype(int_dtypes) if int_dtypes else df

In [6]:
def normalize(df, columns):
    """Divide each listed column by its own maximum, in place.

    Args:
        df: DataFrame to modify.
        columns: Names of the numeric columns to scale.

    Returns:
        The same DataFrame object.
    """
    for column in columns:
        # Read from the frame that was passed in rather than from a notebook
        # global, so the function works on any DataFrame.
        values = df[column].to_numpy()
        df[column] = values / values.max()
    return df

In [7]:
class DatasetTabular(Dataset):
    """Map-style dataset turning DataFrame rows into flat feature vectors.

    Each item is the concatenation of one one-hot block per categorical
    column followed by the values of the continuous columns, as a float32
    tensor. When ``target_key`` is given, items are ``(features, target)``
    pairs where ``target`` is a one-element tensor.

    Categorical values are used as 1-based codes: code ``k`` sets slot
    ``k - 1`` of its block. Through negative indexing a code of 0 (for
    example a boolean ``False``) sets the last slot.

    Args:
        df: DataFrame holding the rows. Rows are addressed by position.
        cont: Names of the continuous columns.
        cat: Names of the categorical columns.
        target_key: Optional name of the target column.
        cardinalities: Optional one-hot width for each column in ``cat``.
            When omitted, the widths are the number of distinct values of
            each column in the notebook-level ``complete_df``.
    """

    def __init__(self, df, cont, cat, target_key = None, cardinalities = None):
        self.df = df
        self.cat = cat
        self.cont = cont
        if cardinalities is None:
            # Default kept from the original notebook: widths come from the
            # global frame, not from the split that was passed in.
            cardinalities = [complete_df[column].nunique() for column in cat]
        self.cardinalities = list(cardinalities)
        self.target_key = target_key

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        pieces = []
        for width, column in zip(self.cardinalities, self.cat):
            one_hot = torch.zeros(width, dtype=torch.float32)
            # .iloc picks the i-th row whatever the index labels are. The
            # int() cast is needed because median-filled columns hold
            # floats, which are not valid tensor indices.
            one_hot[int(self.df[column].iloc[i]) - 1] = 1
            pieces.append(one_hot)

        for column in self.cont:
            pieces.append(
                torch.tensor([self.df[column].iloc[i]], dtype=torch.float32)
            )

        input_vector = torch.cat(pieces)

        if self.target_key is not None:
            target = torch.tensor([self.df[self.target_key].iloc[i]])
            return input_vector, target

        return input_vector

In [8]:
class LinearNet(nn.Module):
    """Stack of linear layers, each followed by batch normalisation.

    ``sizes`` lists the layer widths from input to output, so
    ``len(sizes) - 1`` linear layers are created. Every block applies
    ``Linear`` then ``BatchNorm1d``; all blocks except the last one also
    apply a sigmoid.
    """

    def __init__(self, sizes):
        super().__init__()
        widths = list(zip(sizes[:-1], sizes[1:]))
        self.n_layers = len(sizes) - 1
        self.linear = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in widths]
        )
        self.batchnorms = nn.ModuleList(
            [nn.BatchNorm1d(n_out) for _, n_out in widths]
        )

    def linear_block(self, x, idx):
        """Apply block ``idx``: linear, batch norm and (unless last) sigmoid."""
        normalised = self.batchnorms[idx](self.linear[idx](x))
        is_last = idx == self.n_layers - 1
        return normalised if is_last else torch.sigmoid(normalised)

    def forward(self, x):
        """Run ``x`` through every block in order."""
        for idx in range(self.n_layers):
            x = self.linear_block(x, idx)
        return x

In [9]:
def loss_func(predictions, targets):
    """Mean error of the sigmoid probabilities against the targets.

    Raw predictions are squashed with a sigmoid. Where the target equals 1
    the error is ``1 - p``; everywhere else it is ``p``. The mean over all
    elements is returned.
    """
    probabilities = predictions.sigmoid()
    is_positive = targets.eq(1)
    per_element_error = torch.where(is_positive, 1 - probabilities, probabilities)
    return per_element_error.mean()

In [10]:
def metric(predictions, targets):
    """Accuracy in percent for raw predictions thresholded at zero.

    A prediction is treated as positive when its value is above zero. The
    number of elements matching ``targets`` is divided by
    ``len(predictions)`` and multiplied by 100; the result is a tensor.
    """
    predicted_positive = predictions > 0
    n_correct = torch.count_nonzero(predicted_positive == targets)
    return 100 * (n_correct / len(predictions))

In [11]:
class TrainAndEvaluate():
    """Minimal training loop with per-epoch validation and a pkbar progress bar.

    Args:
        model: Network to optimise; called as ``model(inputs)``.
        optimizer: Optimizer already bound to the model's parameters.
        loss_func: Callable ``(predictions, targets) -> scalar tensor``.
        metric: Callable ``(predictions, targets) -> scalar``, reported as
            "Accuracy" during validation.
        train_dataloader: Sized iterable of training batches; element 0 of a
            batch is the input, element 1 the target.
        valid_dataloader: Sized iterable of validation batches, same layout.
    """

    def __init__(self, model, optimizer, loss_func, metric, train_dataloader, valid_dataloader):
        self.model = model
        self.optimizer = optimizer
        self.loss_func = loss_func
        self.train_dl = train_dataloader
        self.valid_dl = valid_dataloader
        self.metric = metric

    def train_and_evaluate(self, n_epoch):
        """Train for ``n_epoch`` epochs, validating after each one.

        Returns:
            The model. After at least one epoch it is left in evaluation
            mode, because validation is the last thing that runs.
        """
        n_train = len(self.train_dl)
        n_steps = n_train + len(self.valid_dl)

        for epoch in range(n_epoch):
            kbar = pkbar.Kbar(target=n_steps, epoch=epoch, num_epochs=n_epoch, width=16, stateful_metrics=['Validation Loss','Accuracy'])

            # Training mode: batch norm normalises with batch statistics and
            # updates its running averages.
            self.model.train()
            for step, batch in enumerate(self.train_dl, start=1):
                input_vector = batch[0]
                target = batch[1]

                self.optimizer.zero_grad()
                preds = self.model(input_vector)
                loss = self.loss_func(preds, target)
                loss.backward()
                self.optimizer.step()
                kbar.update(step, values=[("Train Loss", loss.item())])

            # Evaluation mode: batch norm uses its running averages and is
            # not updated by the validation batches.
            self.model.eval()
            with torch.no_grad():
                # Steps continue after the training ones, so the bar reaches
                # its target exactly on the last validation batch.
                for step, batch in enumerate(self.valid_dl, start=n_train + 1):
                    input_vector = batch[0]
                    target = batch[1]

                    preds = self.model(input_vector)
                    loss = self.loss_func(preds, target)
                    accuracy = self.metric(preds, target)
                    # Plain floats keep tensors out of the progress bar.
                    kbar.update(step, values=[("Validation Loss", float(loss)), ("Accuracy", float(accuracy))])
        print('Finished Training')
        return self.model

In [12]:
# Directory holding the Kaggle Titanic CSV files. Set TITANIC_DATA_DIR to run
# the notebook on another machine; the default is the original location.
path = os.environ.get('TITANIC_DATA_DIR', '/home/francisco/workspace/titanic_kaggle/titanic')

In [13]:
# Read the two CSV files found under `path`.
train_df, test_df = (
    pd.read_csv(csv_file, low_memory=False)
    for csv_file in (f'{path}/train.csv', f'{path}/test.csv')
)

In [14]:
# Stack train and test rows into one frame with a fresh 0..n-1 index, so the
# preprocessing below sees both.
complete_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)

In [15]:
# Builds the Title, Deck and FarePerPerson columns that the following cells
# refer to. The guard keeps the cell re-runnable: once 'Name' has been
# renamed, the frame is already engineered.
if 'Name' in complete_df.columns:
    complete_df = feature_engineer(complete_df)
    complete_df = complete_df.drop(['Ticket', 'Fare'], axis=1)

In [16]:
# Continuous input columns.
cont = [
    'Age',
    'FarePerPerson',
    'SibSp',
    'Parch',
]
# Categorical columns. 'Survived' comes first so that cat[1:] selects the
# input features only.
cat = [
    'Survived',
    'Pclass',
    'Title',
    'Sex',
    'Deck',
    'Embarked',
    'AgeMissing',
    'FarePerPersonMissing',
    'DeckMissing',
    'EmbarkedMissing',
]

In [17]:
# Encode the string categories, fill gaps in every column except 'Survived',
# then scale the continuous columns.
complete_df = (
    complete_df
    .pipe(categorify, ['Title', 'Sex', 'Deck', 'Embarked'])
    .pipe(fill_missing, ['Survived'])
    .pipe(normalize, cont)
)

In [18]:
# Positional split of the combined frame: rows 0-711 for training, rows
# 712-890 for validation, everything from row 891 on as the test split.
# NOTE(review): the split points are hard-coded; 891 is presumably the number
# of rows in train.csv — confirm.
train_df = complete_df.iloc[:712]
# reset_index() without drop=True keeps the previous index as an extra
# 'index' column in valid_df and test_df.
valid_df = complete_df.iloc[712:891].reset_index()
test_df = complete_df.iloc[891:].reset_index()

In [19]:
# NOTE(review): disabled integer casts. DatasetTabular uses the categorical
# values as tensor indices, so float-typed categorical columns presumably
# need this cast — confirm before deleting or re-enabling.
# train_df = integerize(train_df, cat)
# valid_df = integerize(valid_df, cat)
# test_df = integerize(test_df, cat[1:])

In [20]:
# The labelled splits yield (features, target) pairs; the test split yields
# features only. cat[1:] leaves out 'Survived', which is the target.
train_dset, valid_dset = (
    DatasetTabular(frame, cont, cat[1:], 'Survived')
    for frame in (train_df, valid_df)
)
test_dset = DatasetTabular(test_df, cont, cat[1:])

In [21]:
# Training batches are shuffled and the final partial batch is discarded;
# validation uses a batch size of 179 in the original order.
train_dl = DataLoader(
    dataset=train_dset,
    batch_size=128,
    shuffle=True,
    drop_last=True,
)
valid_dl = DataLoader(dataset=valid_dset, batch_size=179, shuffle=False)

In [22]:
# Earlier layouts that were tried:
# net = LinearNet([46,128,128,128,128,64,64,32,32,16,16,8,8,2,2,1])
# net = LinearNet([46,46,32,32,32,16,16,16,8,8,8,4,4,4,2,2,1])
# 46 inputs, then six layers each of width 128, 64, 32 and 16, then 1 output.
net = LinearNet([46] + [128] * 6 + [64] * 6 + [32] * 6 + [16] * 6 + [1])

In [25]:
# optimizer = optim.SGD(net.parameters(), lr=0.00001)
# AdamW with learning rate 1e-5 and weight decay 1e-3.
optimizer = optim.AdamW(params=net.parameters(), lr=1e-5, weight_decay=1e-3)
trainer = TrainAndEvaluate(
    model=net,
    optimizer=optimizer,
    loss_func=loss_func,
    metric=metric,
    train_dataloader=train_dl,
    valid_dataloader=valid_dl,
)

In [26]:
# Train for 100 epochs; the trainer hands back the model it was given.
net = trainer.train_and_evaluate(n_epoch=100)

Epoch: 1/100


***

In [27]:
# Predict the whole test split in a single batch.
test_dl = DataLoader(test_dset, batch_size=len(test_dset))

# Inference: in eval mode batch norm uses its running statistics instead of
# the statistics of the test batch, and no autograd graph is needed.
net.eval()
with torch.no_grad():
    preds_test = net(next(iter(test_dl)))

# A positive raw output is a positive prediction, as in metric().
preds_test = (preds_test > 0).int()

# Give pandas flat NumPy arrays rather than an (n, 1) tensor.
preds_nn_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Survived': preds_test.squeeze(1).numpy(),
})

preds_nn_df

In [32]:
# Write the predictions next to the input data, without the index column.
preds_nn_df.to_csv(path_or_buf=f'{path}/submission_nn.csv', index=False)

***

In [54]:
# NOTE(review): `rf` is not defined in any cell shown here — presumably a
# fitted model from cells that are not part of this view; confirm it exists
# before relying on this cell.
# [:, 2:] passes every column of train_df except the first two.
preds_train = rf.predict(train_df.values[:,2:])

In [55]:
# Reference labels: the 'Survived' column of train_df as a NumPy array.
gt = train_df['Survived'].values

In [56]:
# Fraction of rows where the prediction equals the label. Both come from
# train_df, so this is accuracy on the rows that were predicted, not a
# held-out score.
np.count_nonzero(preds_train == gt)/len(gt)

0.9876543209876543