In [1]:
from fastai.tabular.all import *
from fastcore.utils import *
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

df_nn = pd.read_csv('train.csv', low_memory=False)
df_nn_final = df_nn.drop('id', axis=1)

"""
Categorical embedding
"""

cont,cat = cont_cat_split(df_nn_final, max_card=9000, dep_var='target')
procs_nn = [Categorify, Normalize]
splits = RandomSplitter(seed=23)(df_nn_final)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

to_nn = TabularPandas(df_nn, procs_nn, cat, cont,
                      splits=splits, y_names='target')
dls = to_nn.dataloaders(1024, device = device)

learn = tabular_learner(dls, layers=[500,250], n_out=1)
learn.fit_one_cycle(8, 5e-4)

preds,targs = learn.get_preds()
roc_auc_score(targs, preds)

learn.save('learn8')

# Machine Learning Models
df = pd.read_csv('train.csv', low_memory=False)

# using the neural net's `cat`, `cont`, and `splits`
procs = [Categorify]
to = TabularPandas(df, procs, cat, cont, 'target', splits = splits)

def rf(xs, y, n_estimators=40, max_samples=130_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf).fit(xs, y)

def auc(m, xs, y):
    preds = m.predict(xs)
    return round(roc_auc_score(y, preds), 3)

# Replacing Nominal variables with Embeddings
learn = learn.load('learn8')

def embed_features(learner, xs):
    """
    learner: fastai Learner used to train the neural net
    xs: DataFrame containing input variables with nominal values defined by their rank.
    ::returns:: a copy of `xs` with embeddings replacing each categorical variable
    """
    xs = xs.copy()
    for i,col in enumerate(learn.dls.cat_names):
        emb = learn.model.embeds[i]
        emb_data = emb(tensor(xs[col], dtype=torch.int64).to(device))
        emb_names = [f'{col}_{j}' for j in range(emb_data.shape[1])]
        feat_df = pd.DataFrame(data=emb_data, index=xs.index, columns=emb_names)
        xs = xs.drop(col, axis=1)
        xs = xs.join(feat_df)
        return xs
    
emb_xs = embed_features(learn, to.train.xs)
emb_valid_xs = embed_features(learn, to.valid.xs)
emb_valid_xs

epoch,train_loss,valid_loss,time
0,0.145474,0.131971,00:17
1,0.119979,0.117684,00:17
2,0.115985,0.115925,00:18
3,0.112822,0.113276,00:17
4,0.111047,0.112324,00:17
5,0.108471,0.111757,00:18
6,0.10697,0.110622,00:16
7,0.105566,0.11041,00:17


Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cat0_0,cat0_1,cat0_2
277491,6,4,2,7,34,1,31,60,1,148,...,0.292775,0.210421,0.335680,0.508110,0.375804,0.485737,0.388535,0.056746,0.042776,0.048583
59826,11,1,1,13,34,3,20,4,13,258,...,0.717059,0.555945,0.253756,0.810781,0.558822,0.821682,0.950494,0.056746,0.042776,0.048583
100532,6,10,1,5,34,3,1,36,1,76,...,0.300573,0.807069,0.559432,0.310892,0.414348,0.328559,0.198210,-0.054680,-0.078122,-0.041497
127444,8,13,1,6,3,1,9,60,1,148,...,0.260020,0.762579,0.929469,0.368868,0.400274,0.582639,0.436348,0.056746,0.042776,0.048583
268466,13,1,2,5,34,1,9,52,5,160,...,0.292645,0.487171,0.581556,0.332349,0.366024,0.374952,0.382322,-0.054680,-0.078122,-0.041497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292950,8,13,1,6,3,5,47,51,1,175,...,0.310330,0.354413,0.335381,0.433126,0.758386,0.438863,0.488459,0.056746,0.042776,0.048583
235306,6,3,1,5,34,3,20,11,1,55,...,0.731341,0.734535,0.548252,0.304629,0.338420,0.321991,0.339444,-0.054680,-0.078122,-0.041497
187628,15,3,1,6,34,1,37,34,1,148,...,0.278301,0.735457,0.787859,0.313780,0.700493,0.799430,0.464530,0.056746,0.042776,0.048583
200243,6,1,2,5,3,1,10,47,5,175,...,0.247176,0.052474,0.172384,0.552959,0.405473,0.266106,0.531018,0.056746,0.042776,0.048583


In [5]:
from torch.utils.data import Dataset
from sklearn.cluster import KMeans

import torch
import torch.nn
import torch.nn.functional as F
import torch.optim as opt

from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from utils import *

train_loader, valid_loader = pandas_to_tensor(df, emb_xs, emb_valid_xs)

nb_clust = 1

# -1 cause we remove the target
net = Net(df.shape[1], nb_clust)
print(net)

max_epochs = 100
#loss_fct = nn.CrossEntropyLoss()
loss_fct = nn.BCEWithLogitsLoss()
l_loss = list()
l_loss_test = list()
l_roc_train = list()
l_roc_test = list()

optim = opt.Adam(net.parameters(), lr=0.01)


for epoch in tqdm(range(max_epochs)):
    t0 = datetime.now()
    net.train()
    for batch, (x, y) in enumerate(train_loader):
        
        optim.zero_grad()
        
        # Predict soft-targets and embeddings
        proba, output = net(x)
        
        loss = loss_fct(proba, y)
        loss.backward()
        
        optim.step()
        
        l_loss.append(loss.item())
        l_roc_train.append(roc_auc_score(y.detach().numpy(), proba.detach().numpy()))
        
with torch.no_grad():
    for batch, (x, y) in enumerate(valid_loader):
        proba, output = net(x)

        loss = loss_fct(proba, y)
        l_loss_test.append(loss)
        
        l_roc_test.append(roc_auc_score(y.detach().numpy(), proba.detach().numpy()))
        
print_scores(l_loss, l_roc_train, l_roc_test, l_loss_test)

Net(
  (fc1): Linear(in_features=32, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=1, bias=True)
  (bn1): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


100%|█████████████████████████████████████████| 100/100 [05:38<00:00,  3.39s/it]


TypeError: print_scores() missing 2 required positional arguments: 'l_roc_test' and 'l_loss_test'