## Library imports

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Load data

## Training set

In [6]:
# Read the training set from a path relative to the working directory.
train_data = pd.read_csv("../data/train.csv")
# Print the column names and the (rows, columns) shape as a quick sanity check.
print(f"train_data columns: {train_data.columns}")
print(f"train_data shape: {train_data.shape}")
# Display the first rows.
train_data.head()

train_data columns: Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')
train_data shape: (26729, 10)


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [7]:
# Per-column summary of the training frame (count / unique / top / freq).
train_data.describe()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
count,26729,19038,26729,26729,13117,26729,26728,26711,26729,26729
unique,26729,6374,22918,5,16,2,5,44,1380,366
top,A671945,Max,2015-08-11 00:00:00,Adoption,Partner,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,1,136,19,10769,7816,15595,9779,3969,8810,2824


## Test set

In [8]:
# Read the test set from a path relative to the working directory.
test_data = pd.read_csv("../data/test.csv")
# Print the column names and the (rows, columns) shape as a quick sanity check.
print(f"test_data columns: {test_data.columns}")
print(f"test_data shape: {test_data.shape}")
# Display the first rows.
test_data.head()

test_data columns: Index(['ID', 'Name', 'DateTime', 'AnimalType', 'SexuponOutcome',
       'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')
test_data shape: (11456, 8)


Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [9]:
# Summary statistics of the test frame; only the numeric ID column is summarised.
test_data.describe()

Unnamed: 0,ID
count,11456.0
mean,5728.5
std,3307.206676
min,1.0
25%,2864.75
50%,5728.5
75%,8592.25
max,11456.0


# Data exploration

In [10]:
# How balanced is the dataset? Count the training rows per outcome class.
Counter(train_data['OutcomeType'])

Counter({'Return_to_owner': 4786,
         'Euthanasia': 1555,
         'Adoption': 10769,
         'Transfer': 9422,
         'Died': 197})

In [11]:
# Five most frequent values of Name; missing names (NaN) are counted as a value too.
Counter(train_data['Name']).most_common(5)

[(nan, 7691), ('Max', 136), ('Bella', 135), ('Charlie', 107), ('Daisy', 106)]

# Data processing

In [12]:
# Features: every training column except the identifier and the two outcome columns.
X = train_data.drop(columns=['AnimalID', 'OutcomeType', 'OutcomeSubtype'])
# Target: the outcome class to predict.
Y = train_data['OutcomeType']
# NOTE: this binds a second name to test_data; it is not a copy.
test_X = test_data

In [13]:
# Stack train and test features so both are pre-processed and encoded together.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the original row labels of each frame.
stacked_df = pd.concat([X, test_X.drop(columns=['ID'])])
print(f'stacked_df shape: {stacked_df.shape}')

stacked_df shape: (38185, 7)


In [14]:
# Replace the raw timestamp with two calendar features (year and month).
stacked_df = (
    stacked_df
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
    .assign(
        year=lambda frame: frame['DateTime'].dt.year,
        month=lambda frame: frame['DateTime'].dt.month,
    )
    .drop(columns=['DateTime'])
)
stacked_df.head()

Unnamed: 0,Name,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,year,month
0,Hambone,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,2014,2
1,Emily,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,2013,10
2,Pearce,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,2015,1
3,,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,2014,7
4,,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,2013,11


In [15]:
# Count the missing values of every column once, then discard each column
# that has more than 10000 of them.
null_counts = stacked_df.isnull().sum()
for col, n_nulls in null_counts.items():
    if n_nulls <= 10000:
        continue
    print(f"Drop column {col} with {n_nulls} nulls")
    stacked_df = stacked_df.drop(columns=[col])

Drop column Name with 10916 nulls


# NaN filling and label encoding

In [16]:
# Fill missing values, then map every column to consecutive integer codes.
for col in stacked_df.columns:
    # Branch on "is numeric" instead of comparing against the 'object' dtype:
    # text columns stored with a dedicated string dtype would otherwise fall
    # into the numeric branch and be filled with 0.
    if pd.api.types.is_numeric_dtype(stacked_df[col]):
        stacked_df[col] = stacked_df[col].fillna(0)
    else:
        stacked_df[col] = stacked_df[col].fillna('NA')
    # A fresh encoder per column: codes are only meaningful within a column.
    stacked_df[col] = LabelEncoder().fit_transform(stacked_df[col])

In [17]:
# Inspect the integer-encoded features.
stacked_df.head()

Unnamed: 0,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,year,month
0,1,3,5,1482,146,1,1
1,0,4,5,775,184,0,9
2,1,3,21,1293,97,2,0
3,0,1,26,775,47,1,6
4,1,3,21,1101,311,0,10


In [18]:
# Treat every (already integer-encoded) column as a categorical feature.
stacked_df = stacked_df.astype('category')

In [19]:
# Undo the stacking by position: the first len(train_data) rows are the
# training features, the remainder are the test features.
n_train_rows = len(train_data)
X = stacked_df.iloc[:n_train_rows]
test_X = stacked_df.iloc[n_train_rows:]
print(f'X shape: {X.shape}')
print(f'test_X shape: {test_X.shape}')

X shape: (26729, 7)
test_X shape: (11456, 7)


In [20]:
# Encode the outcome labels as integer class ids.
Y = LabelEncoder().fit_transform(Y)

# Compare the class counts before and after encoding to read off the mapping.
print(Counter(train_data["OutcomeType"]))
print(Counter(Y))

Counter({'Adoption': 10769, 'Transfer': 9422, 'Return_to_owner': 4786, 'Euthanasia': 1555, 'Died': 197})
Counter({0: 10769, 4: 9422, 3: 4786, 2: 1555, 1: 197})


In [21]:
# Outcome class name -> integer class id (ids follow the listed order).
target_dict = {
    outcome: class_id
    for class_id, outcome in enumerate(
        ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
    )
}

In [22]:
# Hold out 10% of the training rows for validation; random_state fixes the split.
# NOTE(review): no stratify argument is passed, so class proportions may differ between the parts.
train_X, valid_X, train_Y, valid_Y = train_test_split(X, Y, test_size=0.1, random_state=0)
# Row counts of features and labels must agree within each part.
print(train_X.shape[0], valid_X.shape[0])
print(train_Y.shape[0], valid_Y.shape[0])

24056 2673
24056 2673


# Choosing columns for embedding layers

In [23]:
# Column name -> number of categories, kept only for columns with more than
# two categories; these are the columns that receive an embedding layer.
embedded_cols = {
    name: len(col.cat.categories)
    for name, col in X.items()
    if len(col.cat.categories) > 2
}

embedded_cols

{'SexuponOutcome': 6,
 'AgeuponOutcome': 46,
 'Breed': 1678,
 'Color': 411,
 'year': 4,
 'month': 12}

In [24]:
# Names of the columns that get an embedding layer.
# NOTE: .keys() returns a live view of embedded_cols rather than a list; the
# dataset class later uses it as a column selector.
embedded_cols_names = embedded_cols.keys()
embedded_cols_names

dict_keys(['SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'year', 'month'])

In [25]:
# One (n_categories, embedding_dim) pair per embedded column; the embedding
# width is half the category count (rounded up), capped at 50.
embedding_sizes = [
    (n_cat, min(50, (n_cat + 1) // 2))
    for n_cat in embedded_cols.values()
]
embedding_sizes

[(6, 3), (46, 23), (1678, 50), (411, 50), (4, 2), (12, 6)]

# PyTorch dataset

In [26]:
class ShelterOutcomeDataset(Dataset):
    """Dataset yielding (embedded features, remaining features, label) triples.

    Args:
        X: feature DataFrame.
        Y: labels, indexable by integer position and with a length.
        embedded_cols_names: names of the columns of X that are fed to
            embedding layers; all other columns are treated as numeric inputs.
    """

    def __init__(self, X, Y, embedded_cols_names):
        embedded_frame = X.loc[:, embedded_cols_names].copy()
        remaining_frame = X.drop(columns=embedded_cols_names).copy()
        # Embedding lookups need int64 indices; the other inputs are float32.
        self.X_cat = embedded_frame.values.astype(np.int64)
        self.X_num = remaining_frame.values.astype(np.float32)
        self.Y = Y

    def __len__(self):
        """Return the number of samples (taken from the labels)."""
        return len(self.Y)

    def __getitem__(self, index):
        """Return the (X_cat, X_num, Y) entries stored at ``index``."""
        sample = (self.X_cat[index], self.X_num[index], self.Y[index])
        return sample

In [27]:
# Wrap the training and validation splits; both use the same embedded columns.
train_dataset = ShelterOutcomeDataset(train_X, train_Y, embedded_cols_names)
valid_dataset = ShelterOutcomeDataset(valid_X, valid_Y, embedded_cols_names)

# Making the code device-agnostic (GPU/CPU)

In [29]:
def get_default_device():
    '''Return the CUDA device if one is available, otherwise the CPU device'''
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

def to_device(data, device):
    '''Move a tensor, or a (nested) list/tuple of tensors, to the chosen device.

    Lists and tuples are processed element by element and always come back as
    a list; anything else is moved with its own ``.to`` method.
    '''
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for element in data:
        moved.append(to_device(element, device))
    return moved

In [30]:
# Pick the device once for the whole notebook and display which one was chosen.
device = get_default_device()
device

device(type='cpu')

# PyTorch dataloader

In [31]:
class DeviceDataloader():
    '''Thin wrapper that places every batch of a dataloader on a device'''

    def __init__(self, dataloader, device):
        self.dataloader, self.device = dataloader, device

    def __len__(self):
        '''Number of batches in the wrapped dataloader'''
        return len(self.dataloader)

    def __iter__(self):
        '''Iterate over the wrapped dataloader, yielding each batch once it is on the device'''
        yield from (to_device(batch, self.device) for batch in self.dataloader)

# Model

In [32]:
class ShelterOutcomeModel(nn.Module):
    """Feed-forward classifier over embedded categorical and continuous inputs.

    Every categorical column has its own embedding table. The embedding
    vectors are concatenated with the batch-normalised continuous inputs and
    passed through two hidden layers (200 and 70 units) to one logit per class.

    Args:
        embedding_sizes: iterable of (n_categories, embedding_dim) pairs, one
            per categorical column, in the column order of ``x_cat``.
        n_cont: number of continuous input columns.
        n_classes: number of output classes. Defaults to 5, the value that
            used to be hard-coded, so existing two-argument calls are unchanged.
    """

    def __init__(self, embedding_sizes, n_cont, n_classes=5):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(n_categories, embedding_size) for n_categories, embedding_size in embedding_sizes]
        )
        # Width of the concatenated embedding vectors.
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_emb, self.n_cont = n_emb, n_cont
        self.n_classes = n_classes
        self.lin1 = nn.Linear(self.n_emb + n_cont, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, self.n_classes)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)

    def forward(self, x_cat, x_cont):
        """Return the class logits for a batch.

        Args:
            x_cat: integer tensor indexed as ``x_cat[:, i]``; column ``i`` holds
                the category codes looked up in the i-th embedding.
            x_cont: float tensor with ``n_cont`` columns.

        Returns:
            Tensor with one row per sample and ``n_classes`` columns (raw
            logits; no softmax is applied here).
        """
        # Look up each categorical column in its own embedding table.
        x_cat = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, 1)
        x_cat = self.emb_drop(x_cat)
        x_cont = self.bn1(x_cont)
        x = torch.cat([x_cat, x_cont], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)
        return x

In [33]:
# Columns without an embedding are fed to the model as continuous inputs.
n_cont = len(X.columns) - len(embedded_cols_names)
model = ShelterOutcomeModel(embedding_sizes, n_cont)
# Move the model to the selected device; the returned module is displayed below.
to_device(model, device)

ShelterOutcomeModel(
  (embeddings): ModuleList(
    (0): Embedding(6, 3)
    (1): Embedding(46, 23)
    (2): Embedding(1678, 50)
    (3): Embedding(411, 50)
    (4): Embedding(4, 2)
    (5): Embedding(12, 6)
  )
  (lin1): Linear(in_features=135, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=5, bias=True)
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

# Optimizer

In [34]:
def get_optimizer(model, lr=0.001, wd=0.0):
    """Build an Adam optimizer over the trainable parameters of ``model``.

    Args:
        model: module whose parameters with ``requires_grad`` set are optimised.
        lr: learning rate.
        wd: weight decay.

    Returns:
        The configured ``torch.optim.Adam`` instance.
    """
    trainable = (param for param in model.parameters() if param.requires_grad)
    return torch.optim.Adam(trainable, lr=lr, weight_decay=wd)

# Training function

In [35]:
def train_model(model, optim, train_dl):
    """Run one training epoch and return the mean cross-entropy loss per sample.

    Args:
        model: module called as ``model(x_cat, x_cont)`` that returns class logits.
        optim: optimizer stepping the model's parameters.
        train_dl: iterable of ``(x_cat, x_cont, y)`` tensor batches.

    Returns:
        The batch losses weighted by batch size, divided by the number of
        samples seen.

    Raises:
        ZeroDivisionError: if ``train_dl`` yields no batches.
    """
    model.train()
    total = 0
    sum_loss = 0
    for x_cat, x_cont, y in train_dl:
        current_batch_size = y.shape[0]
        # cross_entropy expects int64 class indices. Cast the existing tensor
        # instead of calling torch.tensor(y, ...), which copy-constructs from a
        # tensor and emits a UserWarning on every batch.
        y = y.long()
        output = model(x_cat, x_cont)
        loss = F.cross_entropy(output, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += current_batch_size
        # loss is a batch mean, so weight it by the batch size before summing.
        sum_loss += current_batch_size * loss.item()
    return sum_loss / total

# Evaluating function

In [51]:
def val_loss(model, valid_dl):
    """Evaluate the model and return ``(mean loss, accuracy)``.

    The mean cross-entropy loss and the accuracy are also printed.

    Args:
        model: module called as ``model(x_cat, x_cont)`` that returns class logits.
        valid_dl: iterable of ``(x_cat, x_cont, y)`` tensor batches.

    Returns:
        Tuple of the per-sample mean cross-entropy loss and the fraction of
        samples whose highest-scoring class equals the label.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no batches.
    """
    model.eval()
    with torch.no_grad():
        total = 0
        sum_loss = 0
        correct = 0
        for x_cat, x_cont, y in valid_dl:
            current_batch_size = y.shape[0]
            # Cast the existing tensor to int64 class indices; torch.tensor(y, ...)
            # would copy-construct from a tensor and emit a UserWarning.
            y = y.long()
            output = model(x_cat, x_cont)
            loss = F.cross_entropy(output, y)
            total += current_batch_size
            # loss is a batch mean, so weight it by the batch size before summing.
            sum_loss += current_batch_size * loss.item()
            pred = torch.argmax(output, dim=1)
            correct += torch.sum(pred == y).item()
        print(f"Valid loss {sum_loss / total:.3} and accuracy {correct / total:.3}")
    return sum_loss / total, correct / total

In [52]:
def train_loop(model, train_dataset, valid_dataset, epochs, lr=0.01, wd=0.0, batch_size=1000):
    """Train ``model`` for ``epochs`` epochs, printing train and validation metrics.

    Args:
        model: module to train; it is moved to the default device.
        train_dataset: dataset of ``(x_cat, x_cont, y)`` samples used for training.
        valid_dataset: dataset of the same form used for validation.
        epochs: number of passes over the training set.
        lr: learning rate for the optimizer.
        wd: weight decay for the optimizer.
        batch_size: samples per batch for both loaders (1000, the previously
            hard-coded value, by default).
    """
    device = get_default_device()
    # Keep model and batches on the same device. The optimizer is created
    # afterwards so it references the moved parameters.
    to_device(model, device)

    train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
    # Validation metrics do not depend on sample order, so no shuffling is needed.
    valid_dataloader = DataLoader(valid_dataset, batch_size, shuffle=False)
    train_dl = DeviceDataloader(train_dataloader, device)
    valid_dl = DeviceDataloader(valid_dataloader, device)

    optim = get_optimizer(model, lr=lr, wd=wd)
    for _epoch in range(epochs):
        # Iterate the device-wrapped loaders: the raw DataLoaders yield CPU
        # batches, which fail against a model that lives on the GPU.
        loss = train_model(model, optim, train_dl)
        print("Training loss: ", loss)
        val_loss(model, valid_dl)


In [53]:
# Train for 20 epochs with learning rate 0.03 and weight decay 1e-5.
train_loop(model, train_dataset, valid_dataset, epochs=20, lr=0.03, wd=0.00001)

  


Training loss:  0.9170265281006154


  # Remove the CWD from sys.path while we load stuff.


Valid loss 0.857 and accuracy 0.642
Training loss:  0.9080755177985642
Valid loss 0.863 and accuracy 0.634
Training loss:  0.8964879735387518
Valid loss 0.868 and accuracy 0.642
Training loss:  0.8947392202192738
Valid loss 0.869 and accuracy 0.644
Training loss:  0.891844088903566
Valid loss 0.878 and accuracy 0.631
Training loss:  0.8831603722117214
Valid loss 0.868 and accuracy 0.64
Training loss:  0.8819056694437033
Valid loss 0.883 and accuracy 0.633
Training loss:  0.8891605659501672
Valid loss 0.881 and accuracy 0.636
Training loss:  0.8871113227814743
Valid loss 0.884 and accuracy 0.632
Training loss:  0.8916077693118422
Valid loss 0.874 and accuracy 0.64
Training loss:  0.8852981061169504
Valid loss 0.884 and accuracy 0.634
Training loss:  0.8858361944511637
Valid loss 0.88 and accuracy 0.64
Training loss:  0.8801191471564164
Valid loss 0.887 and accuracy 0.623
Training loss:  0.8928388564500216
Valid loss 0.89 and accuracy 0.637
Training loss:  0.8873886001201258
Valid loss 0

# Test output

In [None]:
batch_size = 1000
# The dataset class needs labels; the test set has none, so zeros are passed as placeholders.
test_dataset = ShelterOutcomeDataset(test_X, np.zeros(len(test_X)), embedded_cols_names)
# No shuffle argument is given, so batches follow the row order of test_X.
test_dataloader = DataLoader(test_dataset, batch_size)

In [None]:
preds = list()
# Dropout and batch norm behave differently in training mode, so switch to
# inference mode explicitly instead of relying on whichever call ran last.
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        # Put the batch on the model's device; y holds placeholder labels and is unused.
        x_cat, x_cont, y = to_device(batch, device)
        output = model(x_cat, x_cont)
        # Turn the logits into per-class probabilities.
        prob = F.softmax(output, dim=1)
        # Collect on the CPU so the rows can be read out as Python floats later.
        preds.append(prob.cpu())

# Flatten the per-batch tensors into one list with a probability row per test sample.
final_probs = [item for sublist in preds for item in sublist]
len(final_probs)


11456

In [None]:
# Build the submission table: a 1-based ID plus one probability column per
# outcome class, in the order of the model's output columns.
outcome_columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
submission = pd.DataFrame(columns=['ID'] + outcome_columns)
submission['ID'] = list(range(1, len(final_probs) + 1))
for class_idx, outcome in enumerate(outcome_columns):
    submission[outcome] = [float(row[class_idx]) for row in final_probs]
submission.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.04608,0.014109,0.069781,0.077125,0.792905
1,2,0.602428,0.001628,0.026075,0.248035,0.121835
2,3,0.358552,0.008087,0.051758,0.149518,0.432084
3,4,0.032967,0.010579,0.025094,0.014811,0.916549
4,5,0.479122,0.003006,0.026954,0.289079,0.201839


In [None]:
# Write the submission file to the working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)