<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/RankingTitanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
! nvidia-smi

Tue Mar  9 03:31:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    42W / 300W |   1325MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [30]:
# Basics
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch
import torch 
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import quantile_transform

# Scipy
from scipy import stats

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
manual_seed = 2357

def deterministic(rep=True):
    if rep:
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Deterministic experiment, seed: {manual_seed}')
    else:
        print('Random experiment')

deterministic()

Deterministic experiment, seed: 2357


In [32]:
dataset_path = "/content/drive/MyDrive/Colab Notebooks/titanic3.csv"
df = pd.read_csv(dataset_path)
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.00,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.00,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.00,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.00,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.50,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.50,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.00,0,0,2670,7.2250,,C,,,


In [33]:
df['pclass'] = df['pclass'] - 1
df['pclass'].value_counts()

2    709
0    323
1    277
Name: pclass, dtype: int64

In [34]:
df.iloc[:, 3:]

Unnamed: 0,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,female,29.00,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,male,0.92,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,female,2.00,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,male,30.00,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,female,25.00,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...
1304,female,14.50,1,0,2665,14.4542,,C,,328.0,
1305,female,,1,0,2665,14.4542,,C,,,
1306,male,26.50,0,0,2656,7.2250,,C,,304.0,
1307,male,27.00,0,0,2670,7.2250,,C,,,


In [35]:
df_train, df_test, y_train, y_test = train_test_split(df.iloc[:, 3:],df['pclass'], test_size=0.3, stratify=df['pclass'])

# volta para dataframe
df_train = pd.DataFrame(df_train)
df_test = pd.DataFrame(df_test)
df_train.shape, y_train.shape, df_test.shape, y_test.shape

((916, 11), (916,), (393, 11), (393,))

In [36]:
columns = df_train.columns.to_list()
cat_feats = []

for col in columns:
    if df_train[col].nunique() > 1 and df_train[col].nunique() < 100:
        cat_feats.append(col)
        print(col)
    else:
        print(f'Column: {col} is a num_feat, nunique: {df_train[col].nunique()}')

num_feats = list(set(columns) - set(cat_feats))

assert set(columns) == set(cat_feats + num_feats)

print(f'len(num_feats): {len(num_feats)}, len(cat_feats): {len(cat_feats)}')

sex
age
sibsp
parch
Column: ticket is a num_feat, nunique: 696
Column: fare is a num_feat, nunique: 246
Column: cabin is a num_feat, nunique: 144
embarked
boat
body
Column: home.dest is a num_feat, nunique: 285
len(num_feats): 4, len(cat_feats): 7


In [37]:
columns_num = df_train[num_feats].columns.to_list() 
col_types = df_train[num_feats].dtypes.to_list()

num_feats_number, num_feats_obj = [], []
for e in zip(columns_num, col_types):
    if e[1] == 'object':
        num_feats_obj.append(e[0])
    else:
        num_feats_number.append(e[0])

print(num_feats_obj)
print(num_feats_number)

df_train[num_feats] = df_train[num_feats].fillna(-10)
df_train[cat_feats] = df_train[cat_feats].fillna('?')

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

for num_obj in num_feats_obj:
    df_train[num_obj] = df_train[num_obj].astype(str)
    df_train[num_obj] = le.fit_transform(df_train[num_obj])

# =====================================================

df_test[num_feats] = df_test[num_feats].fillna(-10)
df_test[cat_feats] = df_test[cat_feats].fillna('?')

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
for num_obj in num_feats_obj:
    df_test[num_obj] = df_test[num_obj].astype(str)
    df_test[num_obj] = le.fit_transform(df_test[num_obj])

df_train[num_feats + cat_feats]   

['cabin', 'ticket', 'home.dest']
['fare']


Unnamed: 0,cabin,ticket,home.dest,fare,sex,age,sibsp,parch,embarked,boat,body
1179,0,578,0,69.5500,male,?,1,9,S,?,?
1150,0,433,0,14.5000,male,?,0,0,S,?,?
1201,0,201,0,7.2292,male,22,0,0,C,?,?
307,96,422,41,77.2875,male,21,0,1,S,?,169
102,68,57,178,83.1583,female,23,0,1,C,7,?
...,...,...,...,...,...,...,...,...,...,...,...
135,11,618,182,34.6542,male,71,0,0,C,?,?
551,0,687,159,10.5000,female,50,0,0,S,13,?
1161,0,528,0,8.0500,male,16,0,0,S,?,?
850,0,404,0,7.8542,male,21,0,0,S,?,69


In [38]:
def map_categorical_train(df, cat_cols):
    cat_dims, list_of_dicts = [], []
    
    for col in cat_cols:
        j = 1
        my_dict = {}
        my_dict.update({' ?':0})
        my_dict.update({'UNK':1})

        for e in df[col].value_counts().index.to_list():
            if e not in my_dict.keys():
                j +=1
                my_dict.update({e:j})
        list_of_dicts.append(my_dict)
        cat_dims.append(len(my_dict))

        df[col] = df[col].apply(lambda x: my_dict[x] if x in 
                                my_dict.keys() else my_dict['UNK'])
        
        emb_szs = [(c, min(100, (c+10)//2)) for c in cat_dims]
        cat_emb_dims = [e[0] for e in emb_szs]

    return df, list_of_dicts, emb_szs

#----------------------------------------------------------------------------------
def map_categorical_test(df_test, list_of_dicts, cat_cols):
    df = df_test.copy()
    for i, col in enumerate(cat_cols):
        df[col] = df[col].apply(lambda x: list_of_dicts[i][x] if x in 
                                list_of_dicts[i].keys() else list_of_dicts[i]['UNK'])
    return df

In [39]:
df_train_, list_of_dicts, emb_szs = map_categorical_train(df_train, cat_feats)   
df_test_ = map_categorical_test(df_test, list_of_dicts, cat_feats)   
print(df_train_.shape, df_test_.shape)
df_train_

(916, 11) (393, 11)


Unnamed: 0,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1179,2,2,3,8,578,69.5500,0,2,2,2,0
1150,2,2,2,2,433,14.5000,0,2,2,2,0
1201,2,4,2,2,201,7.2292,0,3,2,2,0
307,2,5,2,3,422,77.2875,96,2,2,12,41
102,3,12,2,3,57,83.1583,68,3,11,2,178
...,...,...,...,...,...,...,...,...,...,...,...
135,2,71,2,2,618,34.6542,11,3,2,2,182
551,3,32,2,2,687,10.5000,0,2,3,2,159
1161,2,22,2,2,528,8.0500,0,2,2,2,0
850,2,5,2,2,404,7.8542,0,2,2,4,0


In [40]:
def normalize(df, num_cols, cat_cols):
    data = df.copy()
    df_num = df[num_cols]
    
    quantile = quantile_transform(df_num.values, 
                                  random_state=2357, 
                                  copy=True,
                                  n_quantiles=300,  
                                  output_distribution='uniform')
    df_norm  = pd.DataFrame(data=quantile, 
                            columns=df_num.columns, 
                            index=df_num.index)

    df= pd.concat((
        df_norm, 
        df[cat_cols]), 
        axis=1)
    return df

In [41]:
df_train = normalize(df_train_, num_feats, cat_feats)
df_test  = normalize(df_test_, num_feats, cat_feats)
df_train 

Unnamed: 0,cabin,ticket,home.dest,fare,sex,age,sibsp,parch,embarked,boat,body
1179,0.000000,0.812709,0.000000,0.869565,2,2,3,8,2,2,2
1150,0.000000,0.618579,0.000000,0.510033,2,2,2,2,2,2,2
1201,0.000000,0.312295,0.000000,0.065217,2,4,2,2,3,2,2
307,0.928866,0.605464,0.513851,0.891151,2,5,2,3,2,2,12
102,0.884655,0.086957,0.751328,0.916388,3,12,2,3,3,11,2
...,...,...,...,...,...,...,...,...,...,...,...
135,0.793443,0.881967,0.782609,0.766107,2,71,2,2,3,2,2
551,0.000000,0.984699,0.710702,0.396321,3,32,2,2,2,3,2
1161,0.000000,0.738350,0.000000,0.301003,2,22,2,2,2,2,2
850,0.000000,0.583607,0.000000,0.209030,2,5,2,2,2,2,4


In [42]:
df_train['TARGET'] = y_train
df_test['TARGET'] = y_test

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train

Unnamed: 0,cabin,ticket,home.dest,fare,sex,age,sibsp,parch,embarked,boat,body,TARGET
0,0.000000,0.812709,0.000000,0.869565,2,2,3,8,2,2,2,2
1,0.000000,0.618579,0.000000,0.510033,2,2,2,2,2,2,2,2
2,0.000000,0.312295,0.000000,0.065217,2,4,2,2,3,2,2,2
3,0.928866,0.605464,0.513851,0.891151,2,5,2,3,2,2,12,0
4,0.884655,0.086957,0.751328,0.916388,3,12,2,3,3,11,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
911,0.793443,0.881967,0.782609,0.766107,2,71,2,2,3,2,2,0
912,0.000000,0.984699,0.710702,0.396321,3,32,2,2,2,3,2,1
913,0.000000,0.738350,0.000000,0.301003,2,22,2,2,2,2,2,2
914,0.000000,0.583607,0.000000,0.209030,2,5,2,2,2,2,4,2


In [43]:
qid_train = []
for i, label in enumerate(df_train['TARGET'].to_list()):
    qid_train.append('QID_'+str(i)+'_'+str(label))
df_train['QID'] = qid_train

# ======================================================= #
qid_test = []
for i, label in enumerate(df_test['TARGET'].to_list()):
    qid_test.append('QID_'+str(i)+'_'+str(label))
df_test['QID'] = qid_test

df_train

Unnamed: 0,cabin,ticket,home.dest,fare,sex,age,sibsp,parch,embarked,boat,body,TARGET,QID
0,0.000000,0.812709,0.000000,0.869565,2,2,3,8,2,2,2,2,QID_0_2
1,0.000000,0.618579,0.000000,0.510033,2,2,2,2,2,2,2,2,QID_1_2
2,0.000000,0.312295,0.000000,0.065217,2,4,2,2,3,2,2,2,QID_2_2
3,0.928866,0.605464,0.513851,0.891151,2,5,2,3,2,2,12,0,QID_3_0
4,0.884655,0.086957,0.751328,0.916388,3,12,2,3,3,11,2,0,QID_4_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,0.793443,0.881967,0.782609,0.766107,2,71,2,2,3,2,2,0,QID_911_0
912,0.000000,0.984699,0.710702,0.396321,3,32,2,2,2,3,2,1,QID_912_1
913,0.000000,0.738350,0.000000,0.301003,2,22,2,2,2,2,2,2,QID_913_2
914,0.000000,0.583607,0.000000,0.209030,2,5,2,2,2,2,4,2,QID_914_2


In [44]:
def make_dataset_list(df):
    df_class_0 = df[df.TARGET == 0]
    df_class_1 = df[df.TARGET == 1]
    df_class_2 = df[df.TARGET == 2]

    dataset_list = []
    for i, row_data in enumerate(df.iterrows()):
        if row_data[1].TARGET == 0:
            cand_1 = list(np.random.choice(df_class_1.QID, 1, replace=False))
            cand_2 = list(np.random.choice(df_class_2.QID, 1, replace=False))
            dataset_list.append([row_data[1].QID, cand_1+cand_2])
        if row_data[1].TARGET == 1:
            cand_1 = list(np.random.choice(df_class_0.QID, 1, replace=False))
            cand_2 = list(np.random.choice(df_class_2.QID, 1, replace=False))
            dataset_list.append([row_data[1].QID, cand_1+cand_2])

        if row_data[1].TARGET == 2:
            cand_1 = list(np.random.choice(df_class_0.QID, 1, replace=False))
            cand_2 = list(np.random.choice(df_class_1.QID, 1, replace=False))
            dataset_list.append([row_data[1].QID, cand_1+cand_2])

    return dataset_list

dataset_list_train = make_dataset_list(df_train)
dataset_list_test = make_dataset_list(df_test)

len(dataset_list_train), len(dataset_list_test)

(916, 393)

In [62]:
class RankingDataset(Dataset):
    def __init__(self, df, dataset_list, num_cols, cat_cols, target):
        super().__init__()

        self.df = df
        self.dataset_list = dataset_list
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        
        self.y = target.values.reshape(-1).copy().astype(np.int64)
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):

        x_cont = self.df[self.num_cols][
                    self.df.QID == self.dataset_list[index][0]].values.copy().astype(np.float32)
        x_cat  = self.df[self.cat_cols][
                    self.df.QID == self.dataset_list[index][0]].values.copy().astype(np.int64)

        x_cont_cands = self.df[self.num_cols][
                          self.df.QID.isin(self.dataset_list[index][1])].values.copy().astype(np.float32)
        x_cat_cands  = self.df[self.cat_cols][
                          self.df.QID.isin(self.dataset_list[index][1])].values.copy().astype(np.int64)

        y = self.y[index]
        
        return np.squeeze(x_cont, axis=0), np.squeeze(x_cat, axis=0), x_cont_cands, x_cat_cands, y

# testando
ds = RankingDataset(
    df_train, 
    dataset_list_train,
    num_feats, 
    cat_feats, 
    df_train['TARGET']
    )
x_cont, x_cat, x_cont_cands, x_cat_cands, y = ds[0]
x_cont.shape, x_cat.shape, x_cont_cands.shape, x_cat_cands.shape, y

((4,), (7,), (2, 4), (2, 7), 2)

In [63]:
BATCH_SZ = 32

# datasets
ds_train = RankingDataset(
    df_train, 
    dataset_list_train,
    num_feats, 
    cat_feats, 
    df_train['TARGET']
    )

ds_test = RankingDataset(
    df_test, 
    dataset_list_test,
    num_feats, 
    cat_feats, 
    df_test['TARGET']
    )

# dataloaders
dataloaders = {
     'train': DataLoader(ds_train,
                         batch_size=BATCH_SZ,
                         shuffle = True, 
                         num_workers=2,
                         pin_memory=True),
     'test' : DataLoader(ds_test,
                         batch_size=BATCH_SZ,
                         shuffle=False,
                         num_workers=2,
                         pin_memory=True),
     }

# sanity check
dl_sizes = {x: len(dataloaders[x]) for x in dataloaders.keys()}
dl_sizes 

{'test': 13, 'train': 29}

In [64]:
x_cont, x_cat, x_cont_cands, x_cat_cands, y = next(iter(dataloaders['train']))
x_cont.shape, x_cat.shape, x_cont_cands.shape, x_cat_cands.shape, y.shape

(torch.Size([32, 4]),
 torch.Size([32, 7]),
 torch.Size([32, 2, 4]),
 torch.Size([32, 2, 7]),
 torch.Size([32]))

In [65]:
len(cat_feats), len(emb_szs)
emb_szs

[(4, 7), (96, 53), (9, 9), (10, 10), (6, 8), (29, 19), (89, 49)]

In [93]:
class RankingModel(nn.Module):
    def __init__(self, embedding_sizes=emb_szs, n_cont=len(num_feats), out=3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size) 
                    for categories,size in embedding_sizes])
        
        n_emb_sum = sum(e.embedding_dim for e in self.embeddings) 
        d_model = n_emb_sum + n_cont

        self.actv = nn.ReLU()

        self.fc0 = nn.Linear(d_model, n_emb_sum)
        self.fc1 = nn.Linear(n_emb_sum, out)

    def forward(self, x_cont, x_cat, x_cont_cand, x_cat_cand):
        xcat = [e(x_cat[:,i]) for i, e in enumerate(self.embeddings)]
        xcat = torch.cat(xcat, dim=1) 
        
        pos = torch.cat((x_cont, xcat), dim=1)
        pos = self.fc0(pos)
        pos = self.actv(pos)
        pos = self.fc1(pos)

        neg_samples = []
        for j in range(x_cont_cands.shape[1]):
            x_cat_cand_ = x_cat_cand[:, j, :]
            x_cont_cand_ = x_cont_cand[:, j, :]

            xcat_cand_ = [e(x_cat_cand_[:,i]) for i, e in enumerate(self.embeddings)]
            xcat_cand_ = torch.cat(xcat_cand_, dim=1) 
            
            neg = torch.cat((x_cont_cand_, xcat_cand_), dim=1)
            neg = self.fc0(neg)
            neg = self.actv(neg)
            neg = self.fc1(neg)
            neg_samples.append(neg)

        return pos, neg_samples
#--------------------------------------------------------------------------------
model = RankingModel(
    embedding_sizes=emb_szs, 
    n_cont=len(num_feats), 
    )
with torch.no_grad():
    outs = model(x_cont, x_cat, x_cont_cands, x_cat_cands)   

outs[0].shape, outs[1][0].shape

(torch.Size([32, 3]), torch.Size([32, 3]))

In [95]:
outs[1][1].shape

torch.Size([32, 3])