In [1]:
import re
import multiprocessing
import os


from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np


import procDataSet

In [2]:
class SimpleModel(nn.Module):
    """Feed-forward scorer for a (query, text) pair of feature vectors.

    ``forward`` concatenates the two inputs along dim 1 and feeds the result
    through two hidden blocks (Linear -> BatchNorm1d -> Dropout -> SELU)
    followed by a single-unit Linear layer and a Sigmoid, so every output
    lies in [0, 1] and has shape ``(batch, 1)``.
    """

    def __init__(self):
        super().__init__()
        in_features = 768 + 768  # query width + text width after concatenation
        hidden = 1000
        half_hidden = hidden // 2

        # The Linear layers are built first (in this order) so that a reference
        # to their weights can be kept for the penalty term below.
        fc_in = nn.Linear(in_features, hidden)
        fc_mid = nn.Linear(hidden, half_hidden)
        fc_out = nn.Linear(half_hidden, 1)

        layers = [
            fc_in, nn.BatchNorm1d(hidden), nn.Dropout(.4), nn.SELU(),
            fc_mid, nn.BatchNorm1d(half_hidden), nn.Dropout(.4), nn.SELU(),
            fc_out, nn.Sigmoid(),
        ]
        self.DNN = nn.Sequential(*layers)

        # Hidden-layer weight matrices exposed for the weight penalty that the
        # training loop adds to the loss (the output layer is not penalised).
        self.regularizations = [fc_in.weight, fc_mid.weight]

    def forward(self, query, text):
        """Score each (query, text) row; both inputs are joined on dim 1."""
        joined = torch.cat((query, text), dim=1)
        return self.DNN(joined)

def regularization(weights, alpha):
    """Return ``alpha`` times the summed absolute values (L1 norm) of ``weights``.

    ``weights`` is an iterable of tensors. The fold starts from the integer 0,
    so an empty iterable yields ``alpha * 0``.
    """
    l1_total = sum(torch.abs(w).sum() for w in weights)
    return alpha * l1_total

def train(
    model, optimizer, scheduler, train_loader, valid_loader, num_epochs,
    early_stop=True, no_improve_epochs=15, threshold=1e-3,
    alpha=0, grad_clip=1000
):
    """Train ``model`` and return the per-epoch loss history.

    Relies on two module-level names: ``loss_func`` (the criterion) and
    ``regularization`` (the weight penalty applied to
    ``model.regularizations``).

    Args:
        model: module called as ``model(q, c)``; must expose ``regularizations``.
        optimizer: optimizer over the model parameters.
        scheduler: object whose ``step(metric)`` receives the epoch's mean
            total training loss (data loss + penalty).
        train_loader: iterable of ``(q, c, y)`` batches supporting ``len()``.
        valid_loader: same layout as ``train_loader``, or ``None`` to skip
            validation.
        num_epochs: maximum number of epochs.
        early_stop: enable patience-based early stopping.
        no_improve_epochs: patience, in epochs.
        threshold: minimum decrease of the monitored loss that counts as an
            improvement.
        alpha: penalty coefficient forwarded to ``regularization``.
        grad_clip: max gradient norm for ``clip_grad_norm_``.

    Returns:
        A list with one ``(train_loss, regular, total_loss, valid_loss)``
        tuple of floats per completed epoch; ``valid_loss`` is ``None`` when
        ``valid_loader`` is ``None``.

    Early stopping monitors the validation loss when a validation loader is
    given, otherwise the total training loss. Training stops once
    ``no_improve_epochs`` epochs have passed since both the last improvement
    larger than ``threshold`` and the last learning-rate change.
    """
    # Run on whatever device the model lives on instead of hard-coding CUDA.
    device = next(model.parameters()).device
    has_valid = valid_loader is not None

    def batch_loss(q, c, y):
        outputs = model(q.to(device), c.to(device))
        target = y.to(device).float()
        # A (batch, 1) output against a (batch,) target would be broadcast to
        # (batch, batch) by an element-wise criterion such as MSELoss; align
        # the shapes whenever the element counts agree.
        if outputs.shape != target.shape and outputs.numel() == target.numel():
            outputs = outputs.view_as(target)
        return loss_func(outputs, target)

    def current_lrs():
        return [group['lr'] for group in optimizer.param_groups]

    train_hist = []
    best_loss = float('inf')
    best_epoch = 0
    last_epoch_tune_lr = 0

    for epoch in range(num_epochs):
        model.train()
        train_loss = regular = 0.
        for q, c, y in train_loader:
            loss = batch_loss(q, c, y)
            train_loss += loss.item()
            regular_batch = regularization(model.regularizations, alpha)
            # Accumulate a plain float: summing the tensor itself would keep
            # every batch's autograd graph alive until the end of the epoch.
            regular += float(regular_batch)
            loss = loss + regular_batch

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

        n_train_batches = len(train_loader)
        train_loss /= n_train_batches
        regular /= n_train_batches
        total_loss = regular + train_loss
        report = 'Epoch [{}/{}], loss: {:.4f}, l1_regur: {:.4f}, total: {:.4f}'.format(
            epoch+1, num_epochs, train_loss, regular, total_loss
        )

        lrs_before = current_lrs()
        scheduler.step(total_loss)
        if current_lrs() != lrs_before:
            # Give a freshly changed learning rate a full patience window.
            last_epoch_tune_lr = epoch

        valid_loss = None
        if has_valid:
            model.eval()
            valid_loss = 0.
            with torch.no_grad():  # no graphs needed for evaluation
                for q, c, y in valid_loader:
                    valid_loss += batch_loss(q, c, y).item()
            valid_loss /= len(valid_loader)
            report += ', valid loss: {:.4f}'.format(valid_loss)
        print(report)
        train_hist.append((train_loss, regular, total_loss, valid_loss))

        monitored = valid_loss if has_valid else total_loss
        if monitored < best_loss - threshold:
            best_loss, best_epoch = monitored, epoch
        stale_epochs = epoch - max(best_epoch, last_epoch_tune_lr)
        if early_stop and stale_epochs >= no_improve_epochs:
            print('Trigger early stop.')
            break

    return train_hist

In [5]:
# Mini-batch size and DataLoader worker count shared by both loaders below.
batch_size = 18000
NumberCPU = multiprocessing.cpu_count()

# Training split (shuffled each epoch).
# NOTE(review): `validation_queries=[0]` presumably holds query 0 out of the
# training data - confirm against procDataSet.TrainingQueryAll.
train_loader = DataLoader(
    dataset=procDataSet.TrainingQueryAll(validation_queries=[0], normalize01=True),
    batch_size=batch_size,
    shuffle=True,
    num_workers=NumberCPU,
)

# Validation split: same dataset class with `is_valid=True`, iterated in order.
valid_loader = DataLoader(
    dataset=procDataSet.TrainingQueryAll(
        validation_queries=[0], normalize01=True, is_valid=True
    ),
    batch_size=batch_size,
    shuffle=False,
    num_workers=NumberCPU,
)

model = SimpleModel().cuda()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Multiply the learning rate by sqrt(0.1) when the monitored loss stops
# decreasing, never going below 1e-5.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=.1 ** .5,
    patience=2,
    threshold=0,
    cooldown=15,
    min_lr=1e-5,
    verbose=True,
)

# Criterion looked up as a global by `train`.
loss_func = nn.MSELoss()

In [6]:
# Up to 40 epochs with a light weight penalty and gradient-norm clipping at 10.
train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    train_loader=train_loader,
    valid_loader=valid_loader,
    num_epochs=40,
    threshold=3e-4,
    alpha=1e-5,
    grad_clip=10,
)

Epoch [1/40], loss: 0.0818, l1_regur: 0.1196, total: 0.2014, valid loss: 0.0011
Epoch [2/40], loss: 0.0587, l1_regur: 0.0479, total: 0.1066, valid loss: 0.0011
Epoch [3/40], loss: 0.0532, l1_regur: 0.0118, total: 0.0650, valid loss: 0.0011
Epoch [4/40], loss: 0.0594, l1_regur: 0.0607, total: 0.1201, valid loss: 0.0011
Epoch [5/40], loss: 0.0532, l1_regur: 0.0148, total: 0.0680, valid loss: 0.0011
Epoch [6/40], loss: 0.0532, l1_regur: 0.0052, total: 0.0584, valid loss: 0.0011
Epoch [7/40], loss: 0.0725, l1_regur: 0.0840, total: 0.1566, valid loss: 0.0011
Epoch [8/40], loss: 0.0007, l1_regur: 0.0793, total: 0.0800, valid loss: 0.0011
Epoch [9/40], loss: 0.0007, l1_regur: 0.0267, total: 0.0274, valid loss: 0.0011
Epoch [10/40], loss: 0.0007, l1_regur: 0.0101, total: 0.0108, valid loss: 0.0011
Epoch [11/40], loss: 0.0008, l1_regur: 0.0132, total: 0.0139, valid loss: 0.0011
Epoch [12/40], loss: 0.0007, l1_regur: 0.0106, total: 0.0113, valid loss: 0.0011
Epoch    12: reducing learning rate o

KeyboardInterrupt: 

In [7]:
# Inference uses a much larger batch; note that this rebinds the `batch_size`
# used when the training loaders were built.
batch_size = 100000
test_loader = DataLoader(
    dataset=procDataSet.TestQuery(),
    batch_size=batch_size,
    shuffle=False,
    num_workers=NumberCPU,
)

In [8]:
# Score every test batch in evaluation mode (no gradients), printing the
# fraction of batches done after each one, then stack the per-batch outputs
# along axis 0 into a single array.
model.eval()
batch_scores = []
with torch.no_grad():
    for i, (q, c) in enumerate(test_loader, start=1):
        outputs = model(q.cuda(), c.cuda())
        batch_scores.append(outputs.detach().cpu().numpy())
        print(i / len(test_loader))
results = np.concatenate(batch_scores, axis=0)


0.05
0.1
0.15
0.2
0.25
0.3
0.35
0.4
0.45
0.5
0.55
0.6
0.65
0.7
0.75
0.8
0.85
0.9
0.95
1.0


In [7]:
# Build the ranked submission table and write it to output/simple.csv.
#
# The flat score array is viewed as one row per query (20 rows); each row is
# ranked by descending score and the top candidates are written out as
# zero-padded ids.
# NOTE(review): document ids are the 0-based argsort positions while query ids
# are 1-based - confirm that the expected news ids really start at 000000.
results = results.reshape(20, -1)
search_result = np.flip(np.argsort(results, axis=1), axis=1)

# Never ask for more ranks than there are candidates per query.
top_k = min(300, search_result.shape[1])
rank_columns = ['Rank_{:03d}'.format(rank + 1) for rank in range(top_k)]

# Format all ids up front and build the frame in one go; writing strings into
# integer columns row by row relies on implicit dtype upcasting and is slow.
news_ids = [
    ['news_{:06d}'.format(doc) for doc in row]
    for row in search_result[:, :top_k].tolist()
]
df = pd.DataFrame(news_ids, columns=rank_columns)
df.insert(0, 'Query_Index', ['q_{:02d}'.format(i + 1) for i in range(len(df))])

fname = 'simple.csv'
os.makedirs('output', exist_ok=True)  # to_csv does not create missing folders
df.to_csv(os.path.join('output', fname), index=False)

In [10]:
# View the scores as one row per query (20 rows) and list the (row, column)
# positions whose score exceeds 0.2.
results = results.reshape(20, -1)
np.nonzero(results > .2)

(array([], dtype=int64), array([], dtype=int64))

In [10]:
# Display the per-query score matrix.
# NOTE(review): the output saved below contains negative scores, which a model
# ending in nn.Sigmoid cannot produce - the saved output presumably comes from
# a different model state than the SimpleModel defined above; re-run the
# notebook top to bottom before trusting it.
results

array([[-0.00126912, -0.00113342, -0.00126483, ..., -0.00119252,
        -0.00125728, -0.0012664 ],
       [-0.00142819, -0.00129511, -0.00142516, ..., -0.00135495,
        -0.00141793, -0.0014262 ],
       [-0.0016579 , -0.00153257, -0.00165507, ..., -0.0015901 ,
        -0.00164867, -0.00165643],
       ...,
       [-0.00138995, -0.00125749, -0.00138701, ..., -0.00131617,
        -0.00137884, -0.00138806],
       [-0.00095255, -0.00080123, -0.000945  , ..., -0.00087102,
        -0.00093861, -0.00094846],
       [-0.00134824, -0.0012053 , -0.00134352, ..., -0.0012709 ,
        -0.00133587, -0.00134562]], dtype=float32)