In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ['PYTHONPATH'] = '/Users/g.sarapulov/Projects/draft/archtorch'
import sys
sys.path.append('/Users/g.sarapulov/Projects/draft/archtorch')

In [781]:
from data.input import FileParser, DataSampler, SiameseData, EncodedData
from loss.margin import ContrastiveLoss
from network.siamese import Siamese, Encoder, Discriminator
from train.fit import NetworkTrainer

In [4]:
path_prefix = '/Users/g.sarapulov/MLProjects/uv/'
train_path = path_prefix+'train_db/'
test_path = path_prefix+'test_db/'
parser = FileParser('.txt', ' ')

In [5]:
train_dataset = SiameseData(parser, train_path)
test_dataset = SiameseData(parser, test_path)

In [782]:
model = Siamese(Encoder())
loss = ContrastiveLoss(margin=1.)
trainer = NetworkTrainer(train_dataset, test_dataset, log_interval=500)

In [783]:
model = trainer.fit_model(model, loss, lr=1e-3, n_epochs=20)

Epoch: 1/20. Train set: Average loss: 0.0311
Epoch: 1/20. Validation set: Average loss: 0.0291
Epoch: 2/20. Train set: Average loss: 0.0265
Epoch: 2/20. Validation set: Average loss: 0.0269
Epoch: 3/20. Train set: Average loss: 0.0250
Epoch: 3/20. Validation set: Average loss: 0.0282
Epoch: 4/20. Train set: Average loss: 0.0246
Epoch: 4/20. Validation set: Average loss: 0.0300
Epoch: 5/20. Train set: Average loss: 0.0238
Epoch: 5/20. Validation set: Average loss: 0.0283
Epoch: 6/20. Train set: Average loss: 0.0231
Epoch: 6/20. Validation set: Average loss: 0.0273
Epoch: 7/20. Train set: Average loss: 0.0227
Epoch: 7/20. Validation set: Average loss: 0.0286
Epoch: 8/20. Train set: Average loss: 0.0218
Epoch: 8/20. Validation set: Average loss: 0.0262
Epoch: 9/20. Train set: Average loss: 0.0213
Epoch: 9/20. Validation set: Average loss: 0.0275
Epoch: 10/20. Train set: Average loss: 0.0206
Epoch: 10/20. Validation set: Average loss: 0.0287
Epoch: 11/20. Train set: Average loss: 0.0205
Ep

In [784]:
from torch import exp
import numpy as np
def calc_labels(data, model):
    scores = []
    labels = []
    for i in range(len(data)):
        ((x0, x1), t) = data.__getitem__(i)
        x0, x1 = model(x0, x1)
        # score = exp(-(x0 - x1).abs().sum())  # L1
        score = exp(-(x0 - x1).norm(2))  # eucledian
        # score = (x0 * x1).sum() / x0.norm(2) / x1.norm(2)  # cosine
        scores.append(float(score))
        labels.append(t)
    return np.array(scores), labels

In [785]:
scores, labels = calc_labels(test_dataset, model)

In [786]:
scores[:5]

array([0.6491127 , 0.46002039, 0.69302166, 0.61242247, 0.69981092])

In [787]:
from sklearn import metrics as m
def meter(probs, yval, thr):
    threshold = thr
    print('roc', m.roc_auc_score(yval, probs))
    print('f1', m.f1_score(yval, probs > threshold))
    print('accuracy', m.accuracy_score(yval, probs > threshold))
    print('precision', m.precision_score(yval, probs > threshold))
    print('recall', m.recall_score(yval, probs > threshold))
    cm = m.confusion_matrix(yval, probs > threshold)
    print('false acceptance', cm[0, 1] / cm[0, :].sum())
    print('false rejection', cm[1, 0] / cm[1, :].sum())
    print(cm)

In [788]:
meter(scores, labels, 0.5)

roc 0.9572200051096369
f1 0.8891213389121339
accuracy 0.8887722980062959
precision 0.8966244725738397
recall 0.8817427385892116
false acceptance 0.1040339702760085
false rejection 0.11825726141078838
[[422  49]
 [ 57 425]]


In [679]:
meter(scores, labels, 0.5)

roc 0.9669554455445545
f1 0.8952590959206174
accuracy 0.9003147953830011
precision 0.8845315904139434
recall 0.90625
false acceptance 0.10495049504950495
false rejection 0.09375
[[452  53]
 [ 42 406]]


In [681]:
scores, labels = calc_labels(train_dataset, model)

In [682]:
meter(scores, labels, 0.45)

roc 0.982190850117159
f1 0.9333703600851616
accuracy 0.9311644097620687
precision 0.9017026756331378
recall 0.9673433362753752
false acceptance 0.10478551000953289
false rejection 0.03265666372462489
[[23477  2748]
 [  851 25208]]


In [683]:
from torch import exp
import numpy as np
def calc_all_labels(data, model):
    scores = []
    labels = []
    for i in range(len(data)):
        ((x0, x1), t) = data.get_all_pairs(i)
        for j, tt in enumerate(t):
            xx0, xx1 = model(x0[j], x1[j])
            # score = exp((xx0 - xx1).pow(2).sum())
            score = exp(-(xx0 - xx1).norm(2))
            scores.append(float(score))
            labels.append(tt)
    return np.array(scores), labels

In [684]:
scores_all, labels_all = calc_all_labels(test_dataset, model)

In [689]:
meter(scores_all, labels_all, 0.52)

roc 0.960803275749552
f1 0.14861720852131338
accuracy 0.9073265800484475
precision 0.08135285388380695
recall 0.858177570093458
false acceptance 0.09220577119736784
false rejection 0.14182242990654206
[[816688  82952]
 [  1214   7346]]


### Fit discriminator

In [518]:
enc_train = EncodedData(train_dataset, model)
enc_test = EncodedData(test_dataset, model)

In [530]:
import torch.nn as nn
class LogisticNet(nn.Module):
    def __init__(self):
        super(LogisticNet, self).__init__()
        self.discrimination_net = nn.Sequential(nn.Linear(20, 2), nn.Softmax())

    def forward(self, x1, x2):
        output = self.discrimination_net(x1 - x2)
        return output

In [533]:
discr = LogisticNet()
loss = CrossEntropyLoss()
trainer_discr = NetworkTrainer(enc_train, enc_test, log_interval=500)
discr = trainer_discr.fit_model(discr, loss, lr=1e-3, n_epochs=1)

  input = module(input)


Epoch: 1/1. Train set: Average loss: 0.6942
Epoch: 1/1. Validation set: Average loss: 0.6942


In [536]:
from torch import exp
import numpy as np
def calc_labels_discr(data, model):
    scores = []
    labels = []
    for i in range(len(data)):
        ((x0, x1), t) = data.__getitem__(i)
        score = model(x0, x1)[0]
        print(score)
        # score = exp(-(x0 - x1).abs().sum())  # L1
        # score = exp(-(x0 - x1).norm(2))  # eucledian
        # score = (x0 * x1).sum() / x0.norm(2) / x1.norm(2)  # cosine
        scores.append(float(score))
        labels.append(t)
    return np.array(scores), labels

In [537]:
scores, labels = calc_labels_discr(enc_test, discr)

  input = module(input)


tensor(0.4837)
tensor(0.4945)
tensor(0.4848)
tensor(0.5095)
tensor(0.5065)
tensor(0.5141)
tensor(0.4977)
tensor(0.4827)
tensor(0.4740)
tensor(0.4687)
tensor(0.4893)
tensor(0.5014)
tensor(0.4966)
tensor(0.4959)
tensor(0.5151)
tensor(0.5157)
tensor(0.4965)
tensor(0.5007)
tensor(0.4999)
tensor(0.5194)
tensor(0.4788)
tensor(0.5481)
tensor(0.4711)
tensor(0.4704)
tensor(0.4932)
tensor(0.5398)
tensor(0.5224)
tensor(0.5195)
tensor(0.4495)
tensor(0.5010)
tensor(0.5174)
tensor(0.4903)
tensor(0.4028)
tensor(0.4571)
tensor(0.4648)
tensor(0.4642)
tensor(0.5301)
tensor(0.5086)
tensor(0.5331)
tensor(0.5178)
tensor(0.5306)
tensor(0.5166)
tensor(0.4973)
tensor(0.4895)
tensor(0.4904)
tensor(0.5250)
tensor(0.4790)
tensor(0.4637)
tensor(0.4779)
tensor(0.4971)
tensor(0.4733)
tensor(0.4895)
tensor(0.4861)
tensor(0.4530)
tensor(0.4861)
tensor(0.4829)
tensor(0.4901)
tensor(0.5223)
tensor(0.5493)
tensor(0.4661)
tensor(0.5059)
tensor(0.4895)
tensor(0.4516)
tensor(0.4978)
tensor(0.5120)
tensor(0.5039)
tensor(0.5

In [400]:
is_valid_bracket_seq('}{()[]}')

False

1. locations visited (visa centers, currency exchanges, tour agencies)
2. currency operations
3. booking activity (transport, hotels, tours)
4. travel-specific search activity (sightseeings, country history)
5. demographic data (age, gender, marital status)
6. employment status (unemployed/ on vacation)
7. seasonal data
8. last travels data (last trips recency, durations, destinations)

overfitting is a situation when the model if tuned to explain training data well but fails to generalize on examples not involved in training phase. main reason for this is high complexity of a model. there are several ways to avoid overfitting: reduce model complexity (a tradeoff between bias and variance), apply regularization techniques (constrain model parameters, add dropout), add more data to training dataset