In [1]:
from exp.nb_01 import *

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [6]:
path_cat=Path(r'D:\datasets\cat-in-the-dat\train.csv')

In [62]:
#export
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [9]:
df=pd.read_csv(path_cat, low_memory=False)
df.head().T

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
bin_0,0,0,0,0,0
bin_1,0,1,0,1,0
bin_2,0,0,0,0,0
bin_3,T,T,F,F,F
bin_4,Y,Y,Y,Y,N
nom_0,Green,Green,Blue,Red,Red
nom_1,Triangle,Trapezoid,Trapezoid,Trapezoid,Trapezoid
nom_2,Snake,Hamster,Lion,Snake,Lion
nom_3,Finland,Russia,Russia,Canada,Canada


In [38]:
df.shape

(300000, 25)

In [11]:
cols=set(df.columns)
train_cols=cols-set('target')

In [25]:
Y = df["target"].values.astype(np.float32)
X = df.drop(columns=["target", 'id'])

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
for col in X.columns:
    if X.dtypes[col] == "object":
        X[col] = X[col].fillna("NA")
    else:
        X[col] = X[col].fillna(0)
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

In [27]:
for col in X.columns:
    X[col] = X[col].astype('category')

In [28]:
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=3)
X_train.head().T

Unnamed: 0,247601,225468,276268,148600,221915
bin_0,0,0,0,0,0
bin_1,0,1,0,0,0
bin_2,1,0,0,0,0
bin_3,0,1,0,1,1
bin_4,1,1,0,1,1
nom_0,0,0,0,0,0
nom_1,5,2,5,0,2
nom_2,5,3,4,5,3
nom_3,5,0,1,5,4
nom_4,1,0,1,1,2


In [29]:
emb_c = {n: len(col.cat.categories) for n,col in X.items() if len(col.cat.categories) > 2}
emb_szs = [(c, min(50, (c+1)//2)) for _,c in emb_c.items()]
emb_cols = emb_c.keys()

In [30]:
class Cat_Dataset(Dataset):
    def __init__(self, X, Y, emb_cols):
        X = X.copy()
        self.X1 = X.loc[:,emb_cols].copy().values.astype(np.int64)
        self.X2 = X.drop(columns=emb_cols).copy().values.astype(np.float32)
        self.emb_szs = emb_szs
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.X1[idx], self.X2[idx], self.y[idx]]

In [31]:
train_ds = Cat_Dataset(X_train, y_train, emb_cols)
valid_ds = Cat_Dataset(X_val, y_val, emb_cols)

In [33]:
train_ds[0]

[array([   0,    5,    5,    5,    1,  186,   53,  703,  625, 8971,    0,
           2,    4,   11,   22,   59,    1,    0], dtype=int64),
 array([0., 0., 1., 0., 1.], dtype=float32),
 1.0]

In [34]:
from torch.nn.init import kaiming_uniform, kaiming_normal

In [35]:
class MixedInputModel(nn.Module):
    def __init__(self, emb_szs, n_cont):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        n_emb = sum(e.embedding_dim for e in self.embs) 
        self.n_emb, self.n_cont = n_emb, n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 100)
        self.lin2 = nn.Linear(100, 1)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(100)
        self.emb_drop = nn.Dropout(0.5)
        self.drops = nn.Dropout(0.2)
        

    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = self.lin2(x)
        return x

In [36]:
model = MixedInputModel(emb_szs, 172)

In [48]:
def accuracy(out, yb): return F.binary_cross_entropy_with_logits(out, yb)

In [54]:
def get_optimizer(model, lr = 0.01, wd = 0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [64]:
def train_model(model, optim, train_dl=train_dl, verbose=False):
    model.train()
    total = 0
    sum_loss = 0
    for i, (x1, x2, y) in enumerate(train_dl):
        batch = y.shape[0]
        x1 = x1.cuda()
        x2 = x2.cuda()
        y = y.cuda().unsqueeze(1)
        
        out = model(x1, x2)
        loss = F.binary_cross_entropy_with_logits(out, y)
        
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.data[0])
        if verbose: print(sum_loss/total)
    return sum_loss/total

In [63]:
def val_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for i, (x1, x2, y) in enumerate(valid_dl):
        batch = y.shape[0]
        x1 = x1.cuda()
        x2 = x2.cuda()
        y = y.cuda().unsqueeze(1)
        
        out = model(x1, x2)
        loss = F.binary_cross_entropy_with_logits(out, y)
        sum_loss += batch*(loss.data[0])
        total += batch
        pred = (out > 0).float()
        correct += (pred == y).float().sum().data[0]
    print("val loss", sum_loss/total, correct/total)
    return sum_loss/total, correct/total

In [57]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs): 
        loss = train_model(model, optim, train_dl)
        print("loss ", loss)
        val_loss(model, valid_dl)

In [58]:
batch_size = 500
train_dl = DataLoader(train_ds, batch_size=batch_size)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [66]:
model = MixedInputModel(emb_szs, 5).cuda()

In [67]:
train_loop(model, epochs=10, lr=0.05, wd=0.00001)

IndexError: invalid index of a 0-dim tensor. Use tensor.item() to convert a 0-dim tensor to a Python number