<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/anchorRanking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Tue Aug  3 02:30:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    41W / 300W |   1681MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture
!pip install magic_timer

# basics imports
import gc
import os
import pickle
import random
import numpy as np
import pandas as pd
from magic_timer import MagicTimer
 
# sklearn imports
from sklearn.metrics import classification_report, f1_score
 
# torch imports
import torch 
from torch.utils.data import DataLoader

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
manual_seed = 341


def deterministic(rep=True):
    """Seed every random number generator this notebook imports.

    Args:
        rep: when True, seed Python's ``random``, NumPy and PyTorch (CPU and,
            if available, CUDA) with ``manual_seed`` and configure cuDNN for
            deterministic behaviour. When False nothing is seeded.

    Prints a one-line message describing which mode was selected.
    """
    if rep:
        # The notebook imports the stdlib `random` module, so it has to be
        # seeded together with NumPy and PyTorch for runs to be repeatable.
        random.seed(manual_seed)
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        # cuDNN is switched off entirely and its auto-tuner disabled.
        torch.backends.cudnn.enabled = False
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Deterministic experiment, seed: {manual_seed}')
    else:
        print('Random experiment')


deterministic()

Deterministic experiment, seed: 341


# Load DataPrep and objects

In [None]:
def pickle_file(path, data=None):
    """Read or write a pickle file, depending on whether ``data`` is given.

    Args:
        path: location of the pickle file.
        data: object to store. When left as None the file is read instead.

    Returns:
        The unpickled object when reading; None when writing.

    NOTE: ``pickle.load`` can execute arbitrary code, so only use this on
    files you trust.
    """
    if data is not None:
        # Write mode: serialise with the highest available protocol.
        with open(path, 'wb') as sink:
            pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
        return None

    # Read mode.
    with open(path, 'rb') as source:
        return pickle.load(source)
 
# Folder holding the pickled objects loaded below
path_load = '/content/drive/MyDrive/Colab/AVI-PROATIVO/avi_proativo_modelo/EXPERIMENTOS/'

# Loading, in the same order as the file names are listed
similar_ids, X_train, y_train, X_test, y_test = (
    pickle_file(path_load + artefact)
    for artefact in (
        'similar_ids',
        'X_train_transform',
        'y_train',
        'X_test_transform',
        'y_test',
    )
)

# Alternative inputs kept for reference (presumably the unscaled variants — confirm):
# X_train = pickle_file(path_load+'X_train_transform_no_scale')
# y_train = pickle_file(path_load+'y_train_no_scale')
# X_test  = pickle_file(path_load+'X_test_transform_no_scale')
# y_test  = pickle_file(path_load+'y_test_no_scale')
data_feats = pickle_file(path_load + 'data_feats')

# Features and labels must line up row by row
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# Invert the name -> id mapping so integer labels can be turned back into names
int_to_class = {class_id: class_name for class_name, class_id in data_feats['assuntos'].items()}
print(int_to_class)

X_train

{0: 'cancelamento do cartão de crédito', 1: 'avaliação emergencial de crédito', 2: 'contestar transações', 3: 'desbloquear o cartão', 4: 'rotativo - pagamento mínimo', 5: 'alterar limite do cartão de crédito', 6: 'renegociação - palavra curta'}


Unnamed: 0,aut_val_orgl_tran_tr,aut_transacao_min,aut_transacao_max,aut_limite_restante_min_perc,aut_limite_restante_max_perc,aut_limite_restante_min,aut_limite_restante_max,aut_dat_vld_car_flag,aut_dat_vld_car_vencer,aut_dt_venc_fat_vencer,dmcc_qtd_cartao_adicional,dmcc_valor_anuidade,dmcc_qtd_compras_nacional,dmcc_qtd_compras_internacional,dmcc_flag_cred_pessoal,dmcc_flag_overlimit,dmcc_indicador_parcelamento_fatura,dmcc_regiao_cobranca_max,dmcc_indicador_acordo_fatura,dmcc_tp_alteracao_limite_0,dmcc_tp_alteracao_limite_2,dmcc_tp_alteracao_limite_A,dmcc_tp_alteracao_limite_B,dmcc_tp_alteracao_limite_C,dmcc_tp_alteracao_limite_D,clie_corn_qtd_anos,clie_corn_dias_ulti_alter_conta,clie_corn_alter_aber_conta,avi_qtd_transbordo,avi_intencao_mais_retornada,avi_ultima_intencao_retornada
0,2.408347,-0.546608,2.397090,-0.921129,-2.106858,-0.057246,-0.164726,0.0,0.095356,0.717443,-0.677002,-0.649573,0.659546,7.757572,0.0,4,0.0,21,0.0,1,0,0,0,0,0,0.939009,0.098186,-0.531483,-0.524185,3,3
1,-0.803916,-0.017570,-0.893318,-0.177138,0.595664,-0.055954,-0.045087,0.0,-1.763641,-1.263103,-0.677002,-0.649573,1.741463,-0.128906,0.0,4,0.0,3,0.0,0,0,1,0,0,0,1.208192,0.925118,-0.200323,-0.524185,3,3
2,-0.840751,-0.546608,-1.001082,-0.921129,-1.339536,-0.057246,3.072864,0.0,0.166043,0.717443,-0.677002,-0.649573,0.027858,-0.128906,1.0,4,0.0,3,0.0,1,0,0,0,0,0,-0.972791,-0.842987,-0.585670,-0.524185,3,3
3,1.540535,2.024564,0.745900,0.724564,-0.206244,0.015782,-0.103079,0.0,-0.120239,0.717443,-0.677002,1.379158,0.451179,-0.128906,0.0,4,0.0,5,0.0,0,0,0,0,1,0,-0.972791,-0.842987,-0.585670,-0.524185,0,0
4,-0.374214,1.304779,-0.593385,0.397964,0.244924,-0.003548,-0.051588,0.0,0.963426,-0.936822,-0.677002,-0.649573,0.112031,-0.128906,0.0,4,0.0,5,0.0,1,0,1,0,0,0,1.031959,1.748690,2.346989,-0.524185,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223730,1.704635,-0.546608,0.924678,-0.921129,0.649680,-0.071993,-0.164726,0.0,0.902862,0.717443,-0.677002,1.164691,-0.559740,-0.128906,0.0,4,0.0,5,0.0,1,0,1,0,0,0,-0.972791,-0.842987,-0.585670,-0.524185,2,2
223731,-0.669826,-0.502330,1.960771,0.311732,0.334267,0.064174,0.167165,0.0,1.217278,0.717443,-0.677002,-0.649573,-0.165966,-0.128906,0.0,3,0.0,3,7.0,0,0,1,0,0,0,-0.972791,-0.842987,-0.585670,-0.524185,2,2
223732,-0.203593,0.150162,-0.618814,0.941090,-0.252072,0.034595,-0.106987,0.0,-1.641203,0.717443,-0.677002,-0.649573,1.259777,-0.128906,0.0,4,0.0,5,0.0,0,0,1,0,0,0,-0.972791,-0.842987,-0.585670,-0.524185,0,0
223733,-0.088817,1.819850,-0.495204,1.380572,0.656013,-0.123034,-0.151012,0.0,0.772423,0.872008,-0.677002,-0.649573,-1.274038,-0.128906,0.0,3,0.0,6,0.0,0,1,0,0,0,0,-0.972791,-0.842987,-0.585670,-0.524185,2,2


## Remove duplicates and sample a chunk

In [None]:
# Attach the labels as the last column; later cells slice features with
# `.iloc[:, :-1]` and the label with `.iloc[:, -1:]`.
X_train = X_train.assign(TARGET=y_train)
X_test = X_test.assign(TARGET=y_test)

print(f'X_train.shape com noise: {X_train.shape}')

# Remove similar/noise samples. `similar_ids` is matched against the row
# *position* (0..len-1), so a positional range is used instead of a temporary
# ID column. Building new frames instead of mutating a filtered slice in place
# avoids pandas' SettingWithCopyWarning.
row_positions = pd.RangeIndex(len(X_train))
X_train = X_train.loc[~row_positions.isin(similar_ids)]

print(f'X_train.shape com duplicates: {X_train.shape}')
print(f'X_test.shape  com duplicates: {X_test.shape}')

# Exact duplicate rows (features + label) are dropped from both splits.
X_train = X_train.drop_duplicates()
X_test = X_test.drop_duplicates()

print(f'X_train.shape sem duplicates: {X_train.shape}')
print(f'X_test.shape  sem duplicates: {X_test.shape}')

# Optional stratified down-sampling of the training frame (for quick experiments)
SAMPLING = False
if SAMPLING:
    # Labels are assumed to be the integers 0..n_classes-1.
    data_list = []
    for target_class in range(X_train.TARGET.nunique()):
        data = X_train[X_train.TARGET == target_class].sample(frac=0.001)
        data_list.append(data)

    # Shuffle the concatenated per-class samples and renumber the index.
    X_train = pd.concat(data_list).sample(frac=1).reset_index(drop=True)
    print(f'\nX_train.shape with sampling: {X_train.shape}')

X_train.shape com noise: (223735, 32)
X_train.shape com duplicates: (222723, 32)
X_test.shape  com duplicates: (20050, 32)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


X_train.shape sem duplicates: (214424, 32)
X_test.shape  sem duplicates: (19691, 32)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Build triplet dataset

In [None]:
# TRAIN: positional id (int) -> one-row Series of features (every column but the last)
featid_TRAIN = dict(enumerate(row for _, row in X_train.iloc[:, :-1].iterrows()))
assert len(featid_TRAIN) == X_train.shape[0]
# TRAIN: positional id (int) -> one-element array holding the label (last column)
targetid_TRAIN = dict(enumerate(row.values for _, row in X_train.iloc[:, -1:].iterrows()))
assert len(targetid_TRAIN) == X_train.shape[0]

# TEST: same two mappings for the test frame
featid_TEST = dict(enumerate(row for _, row in X_test.iloc[:, :-1].iterrows()))
assert len(featid_TEST) == X_test.shape[0]
targetid_TEST = dict(enumerate(row.values for _, row in X_test.iloc[:, -1:].iterrows()))
assert len(targetid_TEST) == X_test.shape[0]

# make df triplets
def make_avip_triplets(featid:dict, targetid:dict, all_labels:list, n_negatives=3):
    """Build a (feat_id, pos_target, neg_target) table for triplet training.

    For every key of ``featid`` the true label is read from ``targetid`` and
    ``n_negatives`` distinct other labels are drawn (without replacement) with
    ``np.random.choice``; one output row is produced per drawn negative.

    Args:
        featid: mapping sample id -> features. Only its keys are used here.
        targetid: mapping sample id -> true label, given either as a scalar or
            as a one-element array.
        all_labels: every label that may be used as a negative.
        n_negatives: negatives per sample; capped at ``len(all_labels) - 1``.

    Returns:
        pandas.DataFrame with columns ``feat_id``, ``pos_target``, ``neg_target``.
    """
    if n_negatives >= len(all_labels):
        n_negatives = len(all_labels)-1 # -1 is the true label

    # Loop-invariant: build the label set once instead of once per sample.
    label_set = set(all_labels)

    feat_list, pos_target, neg_target = [], [], []
    for idx in featid.keys():
        # `.item()` unwraps Python scalars, NumPy scalars and one-element
        # arrays alike; calling int() directly on a 1-d array is deprecated
        # in recent NumPy releases.
        pos_label = int(np.asarray(targetid[idx]).item())

        neg_label = np.random.choice(
            list(label_set - {pos_label}), size=(n_negatives,), replace=False)

        # One row per negative: the sample id and its true label are repeated.
        feat_list.extend([idx] * n_negatives)
        pos_target.extend([pos_label] * n_negatives)
        neg_target.extend(neg_label)

    assert len(pos_target) == len(neg_target) == len(feat_list)

    data_TRIPLET = pd.DataFrame(
        {
            'feat_id': feat_list,
            'pos_target': pos_target,
            'neg_target': neg_target,
        })

    return data_TRIPLET
 
# ✨ - - - - - running triplets
# Candidate labels are taken from the training frame only.
all_labels = list(X_train.TARGET.unique())

# Training triplets: up to 6 negatives per sample (capped inside the function
# at len(all_labels) - 1).
df_train_TRIPLET = make_avip_triplets(featid_TRAIN, targetid_TRAIN, all_labels, n_negatives=6)

# Test triplets: a single negative, so there is exactly one row per test sample.
df_test_TRIPLET = make_avip_triplets(featid_TEST, targetid_TEST, all_labels, n_negatives=1)

df_train_TRIPLET

Unnamed: 0,feat_id,pos_target,neg_target
0,0,5,1
1,0,5,0
2,0,5,2
3,0,5,4
4,0,5,6
...,...,...,...
1286539,214423,1,4
1286540,214423,1,3
1286541,214423,1,5
1286542,214423,1,2


# Build Dataset and Dataloader

In [None]:
BSIZE = 1024


class AVIGenerator(torch.utils.data.Dataset):
    """Map-style dataset yielding one training triplet per index.

    Args:
        data_triplet: DataFrame with ``feat_id``, ``pos_target`` and
            ``neg_target`` columns.
        featid_data: mapping ``feat_id`` -> row of features that can be
            indexed by a list of column names (e.g. a pandas Series).
        num_cols: names of the numerical feature columns.
        cat_cols: names of the categorical feature columns.

    Each item is ``(cat_feats, num_feats, pos_target, neg_target)`` where
    ``cat_feats`` is an int64 tensor, ``num_feats`` a float32 tensor and the
    two targets are 0-d tensors.
    """

    def __init__(self, data_triplet, featid_data, num_cols, cat_cols):
        super().__init__()

        self.featid_data = featid_data
        # Keep plain NumPy arrays so that per-item access is a cheap index.
        self.feat_id = data_triplet.feat_id.values
        self.pos_target = data_triplet.pos_target.values
        self.neg_target = data_triplet.neg_target.values

        self.num_cols = num_cols
        self.cat_cols = cat_cols

    def __len__(self):
        return self.feat_id.shape[0]

    def __getitem__(self, idx):
        # Look the feature row up once and slice it twice.
        row = self.featid_data[self.feat_id[idx]]
        # Explicit dtypes replace the legacy LongTensor/FloatTensor constructors.
        cat_feats = torch.tensor(row[self.cat_cols].values, dtype=torch.long)
        num_feats = torch.tensor(row[self.num_cols].values, dtype=torch.float32)
        pos_target = torch.tensor(self.pos_target[idx])
        neg_target = torch.tensor(self.neg_target[idx])

        return cat_feats, num_feats, pos_target, neg_target
 
# Datasets: same column lists for both splits, taken from `data_feats`
ds_train = AVIGenerator(
    data_triplet=df_train_TRIPLET,
    featid_data=featid_TRAIN,
    num_cols=data_feats['num_feats'],
    cat_cols=data_feats['cat_feats'],
)
ds_test = AVIGenerator(
    data_triplet=df_test_TRIPLET,
    featid_data=featid_TEST,
    num_cols=data_feats['num_feats'],
    cat_cols=data_feats['cat_feats'],
)

# Only the training loader shuffles; both use one worker per CPU core.
dataloaders = {
    'train': DataLoader(ds_train, batch_size=BSIZE, shuffle=True, num_workers=os.cpu_count()),
    'test': DataLoader(ds_test, batch_size=BSIZE, shuffle=False, num_workers=os.cpu_count()),
}

# ✨ - - - - - running dataset and Dataloader
# Smoke test: fetch a single item and a single batch, then show the batch shapes.
ds = AVIGenerator(
    data_triplet=df_train_TRIPLET,
    featid_data=featid_TRAIN,
    num_cols=data_feats['num_feats'],
    cat_cols=data_feats['cat_feats'],
)
cat_feats, num_feats, pos_target, neg_target = ds[0]
cat_feats_b, num_feats_b, pos_target_b, neg_target_b = next(iter(dataloaders['train']))
tuple(batch_part.shape for batch_part in (cat_feats_b, num_feats_b, pos_target_b, neg_target_b))

(torch.Size([1024, 3]),
 torch.Size([1024, 28]),
 torch.Size([1024]),
 torch.Size([1024]))

# Model

In [None]:
class NNRanking(torch.nn.Module):
    """Embeds feature rows and class ids into one normalised vector space.

    The feature tower concatenates the numerical features with one embedding
    per categorical column and passes them through three linear layers
    (ReLU + dropout after the first two). Class ids go through a single
    embedding table. Both outputs have ``dmodel // 2`` dimensions and are
    normalised with ``torch.nn.functional.normalize(..., p=p)``.

    Args:
        embedding_sizes: iterable of ``(num_categories, embedding_dim)``, one
            pair per categorical column, in column order.
        n_cont: number of numerical features.
        n_classes: number of target classes.
        dmodel: width parameter; hidden layers use ``8 * dmodel`` units.
        p: order of the norm used to normalise the outputs.
    """

    def __init__(self, embedding_sizes=data_feats['emb_szs'], n_cont=len(data_feats['num_feats']), 
                 n_classes=len(all_labels), dmodel=128, p=2):
        super().__init__()

        self.p = p

        self.embds = torch.nn.ModuleList(
            [torch.nn.Embedding(categories, size) for categories, size in embedding_sizes])

        n_emb_sum = sum(e.embedding_dim for e in self.embds)
        self.actv = torch.nn.ReLU()
        self.drop = torch.nn.Dropout()
        self.linear_1 = torch.nn.Linear(n_emb_sum + n_cont, 8 * dmodel)
        self.linear_2 = torch.nn.Linear(8 * dmodel, 8 * dmodel)
        self.linear_3 = torch.nn.Linear(8 * dmodel, dmodel//2)

        self.embds_class = torch.nn.Embedding(n_classes, dmodel//2)

    def forward(self, cat_feats, num_feats, pos_targets, neg_targets):
        """Return ``(anchor, positive, negative)`` embeddings for a batch."""
        # processing anchor -> feats
        feats = self.process_feats(cat_feats, num_feats)

        # processing neg targets
        neg_targets = self.process_target(neg_targets)

        # processing pos targets
        pos_targets = self.process_target(pos_targets)

        return feats, pos_targets, neg_targets

    def process_feats(self, cat_feats, num_feats):
        """Embed a batch of feature rows.

        ``cat_feats`` is indexed as ``cat_feats[:, i]`` for the i-th embedding
        table; ``num_feats`` is concatenated along dim 1 with the embeddings.
        """
        cats = [e(cat_feats[:,i]) for i, e in enumerate(self.embds)]
        # The embeddings already live on the module's device, so no transfer
        # to a module-level `device` is performed here; that hidden global
        # would break a model that is placed on any other device.
        cats = torch.cat(cats, dim=1)
        feats = torch.cat((num_feats, cats), dim=1)
        feats = self.linear_1(feats)
        feats = self.drop(self.actv(feats))
        feats = self.linear_2(feats)
        feats = self.drop(self.actv(feats))
        feats = self.linear_3(feats)
        return torch.nn.functional.normalize(feats, p=self.p, dim=-1)

    def process_target(self, target):
        """Embed class ids and normalise the result along the last dim."""
        embd_target = self.embds_class(target)
        return torch.nn.functional.normalize(embd_target, p=self.p, dim=-1)

def init_weights(m):
    """Xavier-uniform initialise the weight of Linear and Embedding modules.

    Meant to be passed to ``Module.apply``. Only modules whose exact type is
    ``torch.nn.Linear`` or ``torch.nn.Embedding`` are touched; biases and all
    other module types (including subclasses) are left unchanged.
    """
    if type(m) in (torch.nn.Linear, torch.nn.Embedding):
        torch.nn.init.xavier_uniform_(m.weight)

# ✨ - - - - - running NNRanking
# Smoke test: build a network (p=1 here), initialise it and push one batch through.
Net = NNRanking(dmodel=256, p=1)
Net = Net.to(device)

# init weights
Net.apply(init_weights)

# Forward pass on the batch fetched earlier; arguments go in the order
# (cat_feats, num_feats, pos_targets, neg_targets).
anchor, pos, neg = Net(
    *(batch_part.to(device)
      for batch_part in (cat_feats_b, num_feats_b, pos_target_b, neg_target_b))
)
anchor.shape, pos.shape, neg.shape

(torch.Size([1024, 128]), torch.Size([1024, 128]), torch.Size([1024, 128]))

In [None]:
def count_parameters(model):
    """Return the number of scalar values held in ``model``'s trainable parameters.

    Parameters whose ``requires_grad`` is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
 
# Width of the '=' rules around the message. A descriptive name is used instead
# of `_`, which IPython reserves for the previous cell's output.
BANNER_WIDTH = 48
print('\n', '=' * BANNER_WIDTH, f'\n # The model has {count_parameters(Net):,}' \
       ' trainable parameters #\n', '=' * BANNER_WIDTH, '\n')
Net


 # The model has 5,171,142 trainable parameters #



NNRanking(
  (embds): ModuleList(
    (0): Embedding(45, 30)
    (1): Embedding(251, 128)
    (2): Embedding(258, 128)
  )
  (actv): ReLU()
  (drop): Dropout(p=0.5, inplace=False)
  (linear_1): Linear(in_features=314, out_features=2048, bias=True)
  (linear_2): Linear(in_features=2048, out_features=2048, bias=True)
  (linear_3): Linear(in_features=2048, out_features=128, bias=True)
  (embds_class): Embedding(7, 128)
)

In [None]:
# anchor->feats, pos->embd_classe_pos, neg->embd_classe_neg 
def triple_loss(anchor, pos, neg, margin=0.3, p=2):
    """Hinge loss on the gap between anchor-positive and anchor-negative distance.

    Computes ``mean(max(0, margin - (d(anchor, pos) - d(anchor, neg))))`` where
    ``d`` is ``torch.nn.PairwiseDistance`` of order ``p``.

    NOTE: with this sign the loss shrinks as the anchor moves *away* from the
    positive embedding and towards the negative one. That is the reverse of the
    textbook triplet loss, but it matches ``predict_one_sample`` in this
    notebook, which ranks classes by *descending* distance.

    Args:
        anchor, pos, neg: batches of embeddings with matching shapes.
        margin: required gap between the two distances.
        p: order of the norm used for the distance. Previously this argument
            was accepted but ignored (the distance was always Euclidean).

    Returns:
        Scalar tensor with the mean loss over the batch.
    """
    d = torch.nn.PairwiseDistance(p=p)
    distance = margin - (d(anchor, pos) - d(anchor, neg))
    loss = torch.mean(torch.max(distance, torch.zeros_like(distance)))
    return loss

def train(model, dataloader, device, loss_fct=triple_loss, margin=0.3, p=2):
    """Run one optimisation pass over ``dataloader``.

    NOTE(review): the optimiser is not a parameter; this function uses the
    module-level ``optimizer``, which must exist before it is called.

    Args:
        model: network called as ``model(cat_feats, num_feats, pos, neg)`` and
            returning ``(anchor, pos, neg)`` embeddings.
        dataloader: yields ``(cat_feats, num_feats, pos_target, neg_target)``.
        device: device every batch tensor is moved to.
        loss_fct: called as ``loss_fct(anchor, pos, neg, margin, p)``.
        margin, p: forwarded to ``loss_fct``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for cat_feats, num_feats, pos_target, neg_target in dataloader:
        optimizer.zero_grad()

        # Move the whole batch to the target device before the forward pass.
        on_device = [part.to(device) for part in (cat_feats, num_feats, pos_target, neg_target)]
        anchor, pos, neg = model(*on_device)

        loss = loss_fct(anchor, pos, neg, margin, p)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    return running_loss / len(dataloader)

class Metrics:
    """Accumulates MRR, Recall and Success sums at several ranking depths.

    ``results`` maps each query key to the ranking recorded for it. The three
    ``*_sums`` dicts map a depth to the running sum of that metric; averages
    are only computed when printing.
    """

    def __init__(self, mrr_depths:set, recall_depths:set, success_depths:set):
        self.results = {}
        self.mrr_sums = dict.fromkeys(mrr_depths, 0.0)
        self.recall_sums = dict.fromkeys(recall_depths, 0.0)
        self.success_sums = dict.fromkeys(success_depths, 0.0)

    def get_result(self, query_idx, query_key, ranking, gold_positives):
        """Record one query's ranking and update every running sum.

        Args:
            query_idx: number of queries recorded before this one (at least).
            query_key: unique key under which ``ranking`` is stored.
            ranking: ranked candidates, best first, without repetitions.
            gold_positives: the correct candidates, without repetitions.
        """
        assert query_key not in self.results
        assert len(self.results) <= query_idx
        assert len(set(gold_positives)) == len(gold_positives)
        assert len(set(ranking)) == len(ranking)

        self.results[query_key] = ranking

        # 0-based ranks at which a gold candidate shows up.
        hit_ranks = [rank for rank, candidate in enumerate(ranking) if candidate in gold_positives]
        if not hit_ranks:
            # Nothing relevant was retrieved: every sum stays as it is.
            return

        best_rank = hit_ranks[0]

        for depth in self.mrr_sums:
            if best_rank < depth:
                self.mrr_sums[depth] += 1.0 / (best_rank + 1.0)

        for depth in self.success_sums:
            if best_rank < depth:
                self.success_sums[depth] += 1.0

        for depth in self.recall_sums:
            found = sum(1 for rank in hit_ranks if rank < depth)
            self.recall_sums[depth] += found / len(gold_positives)

    def print_metrics(self, query_idx):
        """Print every metric averaged over ``query_idx + 1`` queries."""
        rule = '- '*10
        n_queries = query_idx + 1.0

        print(rule)
        sections = (
            ('MRR', self.mrr_sums),
            ('Recall', self.recall_sums),
            ('Success', self.success_sums),
        )
        for label, sums in sections:
            for depth in sorted(sums):
                average = sums[depth] / n_queries
                print(f"{label}@{str(depth):<2} = {average:.3}")
            print(rule)

def predict_one_sample(model, cat_feats, num_feats, target, num_classes, device='cpu', assuntos=None):
    """Rank every class for a single sample.

    The sample is embedded with ``model.process_feats`` and each class with
    ``model.process_target``; classes are then ordered by *descending*
    Euclidean distance to the sample, so the first entry is the class whose
    embedding is farthest away. This matches the sign used by ``triple_loss``
    in this notebook.

    Args:
        model: object exposing ``eval``, ``process_feats`` and ``process_target``.
        cat_feats, num_feats: un-batched feature tensors of one sample.
        target: tensor holding the true class id of the sample.
        num_classes: number of classes; only used when ``assuntos`` is None,
            in which case the classes ``0..num_classes-1`` are scored.
        device: device on which the computation runs.
        assuntos: optional mapping class id -> class name. Its keys define the
            classes that are scored.

    Returns:
        ``(ranked, ranked_str, true, true_str, ranked_score)``: ranked class
        ids, their names, the true id, its name and the distances in ranked
        order. The two name entries are None when ``assuntos`` is None
        (previously that default crashed on ``assuntos.keys()``).
    """
    if assuntos is not None:
        class_ids = list(assuntos.keys())
    else:
        class_ids = list(range(num_classes))

    model.eval()
    with torch.no_grad():
        classes = model.process_target(torch.tensor(class_ids).to(device))

        feat = model.process_feats(
            cat_feats.unsqueeze(0).to(device),
            num_feats.unsqueeze(0).to(device),
            )

        d = torch.nn.PairwiseDistance(p=2)
        score = d(feat, classes)

    true = target.cpu().numpy().tolist()

    order = score.detach().cpu().sort(descending=True)
    # Translate positions in `class_ids` back to the class ids themselves, so
    # the result stays correct even if the keys are not exactly 0..n-1 in order.
    ranked = [class_ids[position] for position in order.indices.tolist()]
    ranked_score = order.values.tolist()

    if assuntos is not None:
        ranked_str = [assuntos[e] for e in ranked]
        true_str = assuntos[true]
    else:
        ranked_str, true_str = None, None

    return ranked, ranked_str, true, true_str, ranked_score

def eval_ranking(model, dataset, featid, device=device, assuntos=int_to_class, metrics=None, verbose=False, run_all=None):
    """Rank the classes for every sample and report the macro F1 of the top pick.

    Sample ``i`` is read from ``dataset[i]``, with ``i`` running over
    ``range(len(featid))``; the i-th sorted key of ``featid`` is used as its
    key in ``metrics``.
    NOTE(review): this presumably expects ``dataset`` to hold exactly one item
    per entry of ``featid``, in key order — confirm against the caller.

    The previous version shuffled the keys before the loop. Since samples are
    fetched by position, that only attached each result to an unrelated key;
    the shuffle (and an unused lookup) were removed. Predictions, targets and
    metric sums are unaffected.

    Args:
        model: network passed on to ``predict_one_sample``.
        dataset: indexable; each item is ``(cat_feats, num_feats, target, _)``.
        featid: mapping whose keys identify the samples to evaluate.
        device: device used for inference.
        assuntos: mapping class id -> class name.
        metrics: optional ``Metrics`` instance updated with every ranking.
        verbose: with ``metrics`` set, print running metrics every 1,000 samples.
        run_all: if given, stop after the sample with this index.

    Returns:
        ``(trues, preds, f1)``: true ids, predicted ids and the macro F1.
        Also prints the F1 without a trailing newline.
    """
    keys = sorted(featid.keys())

    preds, trues = [], []
    for query_idx, qid in enumerate(keys):
        cat_feats, num_feats, target, _ = dataset[query_idx]

        ranked, ranked_str, true, true_str, ranked_score = predict_one_sample(
            model=model,
            cat_feats=cat_feats,
            num_feats=num_feats,
            target=target,
            num_classes=len(assuntos),
            device=device,
            assuntos=assuntos,
            )
        # Names are mapped back to ids through the module-level `data_feats`.
        preds.append(data_feats['assuntos'][ranked_str[0]])
        trues.append(data_feats['assuntos'][true_str])

        if metrics is not None:
            metrics.get_result(
                query_idx=query_idx,
                query_key=qid,
                ranking=ranked,
                gold_positives=[true],
                )

        if metrics is not None and verbose:
            if query_idx % 1_000==0 and query_idx !=0:
                print(f'\nRanking metrics at {query_idx} examples')
                metrics.print_metrics(query_idx)
        if query_idx == run_all:
            break
    f1 = f1_score(trues, preds, average="macro")
    print(f'F1: {f1:<5.3}', end=' -- time:')

    return trues, preds, f1

# Baseline: score the network on the test set before the training cell runs.
_, _, f1 = eval_ranking(model=Net, dataset=ds_test, featid=featid_TEST)

F1: 0.0723 -- time:

# ✨ Run training

In [None]:
# Drop any network left over from a previous cell so its memory can be freed.
# Only the "Net does not exist yet" case is tolerated; other errors surface.
try:
    del Net
except NameError:
    pass
gc.collect()
torch.cuda.empty_cache()
deterministic()

#norm type
p_norm = 2

# Network
Net = NNRanking(dmodel=256, p=p_norm).to(device)

# init Network weights 
Net.apply(init_weights)

# Keep the class embedding table frozen at its initial values?
FREEZE = True
if FREEZE:
    Net.embds_class.weight.requires_grad = False

# Only parameters that still require gradients are handed to the optimiser.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, Net.parameters()))

N_EPOCHS = 30
best_f1 = 0
timer = MagicTimer()
print(f'\n>>> Training for {N_EPOCHS} epochs <<<\n')

# ---------------------------------------------------------------------------------------
for i in range(1, N_EPOCHS+1):
    train_loss = train(
        model=Net, 
        dataloader=dataloaders['train'], 
        device=device, 
        margin=0.3,
        p=p_norm,
        )
    
    print(f'Epoch [{i:>2}/{N_EPOCHS} ] -- train loss: {(train_loss):.4f}', end=' ✨ ')
    _, _, f1 = eval_ranking(
        model=Net, 
        dataset=ds_test, 
        featid=featid_TEST, 
        )    
    print(f' {timer}')

    # Checkpoint whenever the test macro F1 improves on the best seen so far.
    if f1 > best_f1:
        best_f1 = f1
        torch.save(Net.state_dict(), 'Epoch.pth')
        print(f'Saving epoch {i}')

Deterministic experiment, seed: 341

>>> Training for 30 epochs <<<

Epoch [ 1/30 ] -- train loss: 0.1516 ✨ F1: 0.394 -- time: 8.8 minutes
Saving epoch 1
Epoch [ 2/30 ] -- train loss: 0.1435 ✨ F1: 0.389 -- time: 18 minutes
Epoch [ 3/30 ] -- train loss: 0.1392 ✨ F1: 0.404 -- time: 27 minutes
Saving epoch 3
Epoch [ 4/30 ] -- train loss: 0.1352 ✨ F1: 0.404 -- time: 35 minutes
Saving epoch 4
Epoch [ 5/30 ] -- train loss: 0.1313 ✨ F1: 0.4   -- time: 44 minutes
Epoch [ 6/30 ] -- train loss: 0.1278 ✨ F1: 0.401 -- time: 53 minutes
Epoch [ 7/30 ] -- train loss: 0.1243 ✨ F1: 0.399 -- time: 1.1 hours
Epoch [ 8/30 ] -- train loss: 0.1212 ✨ F1: 0.398 -- time: 1.2 hours
Epoch [ 9/30 ] -- train loss: 0.1184 ✨ F1: 0.394 -- time: 1.4 hours
Epoch [10/30 ] -- train loss: 0.1159 ✨ F1: 0.396 -- time: 1.5 hours
Epoch [11/30 ] -- train loss: 0.1135 ✨ F1: 0.396 -- time: 1.6 hours
Epoch [12/30 ] -- train loss: 0.1113 ✨ F1: 0.395 -- time: 1.8 hours
Epoch [13/30 ] -- train loss: 0.1093 ✨ F1: 0.396 -- time: 1.9 h

# ✨ Run best F$_1$ epoch

In [None]:
# Reload the best checkpoint from the same relative path the training loop
# saved it to, mapping the tensors onto the device currently in use.
Net.load_state_dict(torch.load('Epoch.pth', map_location=device))

def show_rank_vector(model, dataset, assuntos=int_to_class, data_feats=data_feats, k=50):
    """Print the ranked class names, their scores and the true label per sample.

    Samples are read from ``dataset`` in order and printing stops after the
    sample with index ``k`` (so up to ``k + 1`` samples are shown).

    Args:
        model: network passed on to ``predict_one_sample``.
        dataset: indexable; each item is ``(cat_feats, num_feats, pos_target, _)``.
        assuntos: mapping class id -> class name.
        data_feats: kept for backward compatibility; no longer read here.
        k: index of the last sample to print.
    """
    # Iterate over the dataset that was passed in, not the global `ds_test`.
    for idx in range(len(dataset)):
        cat_feats, num_feats, pos_target, _ = dataset[idx]
        ranked, ranked_str, true, true_str, ranked_score = predict_one_sample(
                model=model, 
                cat_feats=cat_feats, 
                num_feats=num_feats, 
                target=pos_target, 
                num_classes=len(assuntos), 
                device=device, 
                assuntos=assuntos,
                )

        print('Ranking Preds', ranked_str)
        print('Scores', ranked_score)
        print('True Label:', true_str)
        print()
        if idx == k: break

# ✨ - - - - - running ranked vector
# Print the per-sample rankings for the first test samples.
show_rank_vector(Net, ds_test, assuntos=int_to_class, data_feats=data_feats, k=50)
print('\n\n')

# ✨ - - - - - running eval in best epoch with ranking metrics
# MRR, Recall and Success are all tracked at depths 1, 2 and 3.
metrics = Metrics({1, 2, 3}, {1, 2, 3}, {1, 2, 3})
trues, preds, _ = eval_ranking(
    model=Net,
    dataset=ds_test,
    featid=featid_TEST,
    metrics=metrics,
    verbose=True,
)

Ranking Preds ['avaliação emergencial de crédito', 'desbloquear o cartão', 'contestar transações', 'rotativo - pagamento mínimo', 'renegociação - palavra curta', 'cancelamento do cartão de crédito', 'alterar limite do cartão de crédito']
Scores [1.7062146663665771, 1.6441941261291504, 1.599846601486206, 1.244102120399475, 1.2013646364212036, 1.1558399200439453, 1.1428910493850708]
True Label: rotativo - pagamento mínimo

Ranking Preds ['avaliação emergencial de crédito', 'desbloquear o cartão', 'contestar transações', 'alterar limite do cartão de crédito', 'cancelamento do cartão de crédito', 'renegociação - palavra curta', 'rotativo - pagamento mínimo']
Scores [1.8387058973312378, 1.611733317375183, 1.511478304862976, 1.367668628692627, 1.2393641471862793, 1.2189874649047852, 1.201995849609375]
True Label: rotativo - pagamento mínimo

Ranking Preds ['rotativo - pagamento mínimo', 'avaliação emergencial de crédito', 'desbloquear o cartão', 'renegociação - palavra curta', 'contestar tra

## Eval Metrics

In [None]:
# Predicted classes: translate ids to names, then count per name (sorted by name)
pred_names = pd.DataFrame(preds)[0].map(int_to_class.__getitem__)
preds_dist = pred_names.value_counts().sort_index()
preds_dist

alterar limite do cartão de crédito    5420
avaliação emergencial de crédito       4649
cancelamento do cartão de crédito      1531
contestar transações                   3618
desbloquear o cartão                   2873
renegociação - palavra curta            436
rotativo - pagamento mínimo            1164
Name: 0, dtype: int64

In [None]:
# True classes: translate ids to names, then count per name (sorted by name)
true_names = pd.DataFrame(trues)[0].map(int_to_class.__getitem__)
trues_dist = true_names.value_counts().sort_index()
trues_dist

alterar limite do cartão de crédito    6347
avaliação emergencial de crédito       3855
cancelamento do cartão de crédito      1740
contestar transações                   2781
desbloquear o cartão                   2792
renegociação - palavra curta           1114
rotativo - pagamento mínimo            1062
Name: 0, dtype: int64

## DML (Deep Metric Learning) - Ranking - Result test dataset

In [None]:
# Order the display names by their integer label so that each report row gets
# the name of its own class, whatever the insertion order of the dict is.
print(classification_report(
    trues,
    preds,
    target_names=sorted(data_feats['assuntos'], key=data_feats['assuntos'].get),
))

                                     precision    recall  f1-score   support

  cancelamento do cartão de crédito       0.37      0.32      0.35      1740
   avaliação emergencial de crédito       0.61      0.73      0.66      3855
               contestar transações       0.35      0.45      0.40      2781
               desbloquear o cartão       0.35      0.36      0.35      2792
        rotativo - pagamento mínimo       0.30      0.33      0.32      1062
alterar limite do cartão de crédito       0.61      0.52      0.57      6347
       renegociação - palavra curta       0.34      0.13      0.19      1114

                           accuracy                           0.48     19691
                          macro avg       0.42      0.41      0.40     19691
                       weighted avg       0.48      0.48      0.48     19691



# Classificação
                                         precision   recall   f1-score  support    
    cancelamento do cartão de crédito       0.36      0.33      0.34      1764 
    avaliação emergencial de crédito        0.61      0.73      0.66      3976 
    contestar transações                    0.31      0.54      0.39      2815 
    desbloquear o cartão                    0.35      0.35      0.35      2836 
    rotativo - pagamento mínimo             0.33      0.29      0.31      1081 
    alterar limite do cartão de crédito     0.66      0.47      0.55      6463 
    renegociação - palavra curta            0.39      0.12      0.18      1115 

    accuracy                                                    0.47     20050
    macro avg                               0.43      0.40      0.40     20050
    weighted avg                            0.50      0.47      0.47     20050