In [1]:
import re
import multiprocessing
import os


from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np


import procDataSet

In [2]:
class SimpleModel(nn.Module):
    """Feed-forward regressor over a concatenated (query, text) vector pair.

    Each input is expected to be a 768-wide vector per sample (presumably
    BERT-base embeddings, given this file's imports -- TODO confirm). The two
    vectors are concatenated and pushed through three dense layers; the first
    two are followed by BatchNorm1d -> Dropout(0.4) -> SELU, the last one by
    SELU only, giving one output value per sample.

    Attributes:
        DNN: the ``nn.Sequential`` stack described above.
        regularizations: weight matrices of the first two dense layers,
            exposed so a training loop can add a weight penalty on them.
            The ``regularization`` helper in this file sums absolute values,
            i.e. it is an L1 penalty.
    """

    def __init__(self):
        super().__init__()
        embed_dim = 768
        hidden = 1000
        half_hidden = hidden // 2

        # Dense layers are created first, in order, so parameter
        # initialisation happens in a fixed sequence.
        fc_in = nn.Linear(embed_dim + embed_dim, hidden)
        fc_mid = nn.Linear(hidden, half_hidden)
        fc_out = nn.Linear(half_hidden, 1)

        layers = [
            fc_in, nn.BatchNorm1d(hidden), nn.Dropout(p=0.4), nn.SELU(),
            fc_mid, nn.BatchNorm1d(half_hidden), nn.Dropout(p=0.4), nn.SELU(),
            fc_out, nn.SELU(),
        ]
        self.DNN = nn.Sequential(*layers)

        # Only the two hidden-layer weight matrices are penalised;
        # biases and the output layer are left out.
        self.regularizations = [fc_in.weight, fc_mid.weight]

    def forward(self, query, text):
        """Return the network output for a batch of query/text vectors.

        Args:
            query: tensor concatenated with ``text`` along dim 1.
            text: tensor with the same batch dimension as ``query``.

        Returns:
            Output of ``self.DNN`` on the concatenated features
            (one column per sample, since the last layer has one unit).
        """
        features = torch.cat((query, text), dim=1)
        return self.DNN(features)

def regularization(weights, alpha):
    """Return the L1 penalty ``alpha * sum(|w|)`` over all given tensors.

    Args:
        weights: iterable of tensors to penalise.
        alpha: scalar multiplier applied to the summed absolute values.

    Returns:
        ``alpha`` times the sum of absolute values of every element in every
        tensor. For an empty ``weights`` this is ``alpha * 0``.
    """
    # sum() starts from the integer 0, exactly like a manual accumulator.
    l1_total = sum(w.abs().sum() for w in weights)
    return alpha * l1_total

def train(
    model, optimizer, scheduler, train_loader, num_epochs,
    early_stop=True, no_improve_epochs=15, threshold=1e-3,
    alpha=0, grad_clip=1000
):
    """Train ``model`` for up to ``num_epochs`` epochs.

    Each batch minimises ``loss_func(model(q, c), y) + L1 penalty``, where
    ``loss_func`` and ``regularization`` are module-level names defined
    elsewhere in this notebook. Gradients are clipped to ``grad_clip`` (by
    norm) before every optimizer step. After each epoch the mean data loss,
    mean penalty and their sum are printed, and the sum is fed to
    ``scheduler.step``.

    Args:
        model: network called as ``model(q, c)``; must expose
            ``model.regularizations`` (tensors to penalise).
        optimizer: optimizer over the model's parameters.
        scheduler: scheduler whose ``step(metric)`` takes the epoch's total
            loss (e.g. ``ReduceLROnPlateau``).
        train_loader: sized iterable yielding ``(q, c, y)`` batches.
        num_epochs: maximum number of epochs.
        early_stop: enable early stopping.
        no_improve_epochs: size of the "recent" window used by early stopping.
        threshold: early stopping fires when the best total loss inside the
            recent window is worse than the best earlier total loss by more
            than this margin.
        alpha: L1 penalty coefficient.
        grad_clip: max gradient norm passed to ``clip_grad_norm_``.

    Returns:
        None.
    """
    # Follow the model's device instead of hard-coding CUDA, so the same
    # loop also runs on CPU.
    device = next(model.parameters()).device

    train_hist = []          # total (data + penalty) loss of every finished epoch
    last_epoch_tune_lr = 0   # epoch index at which the scheduler last changed the LR

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.
        regular = 0.

        for q, c, y in train_loader:
            q, c, y = q.to(device), c.to(device), y.to(device)
            outputs = model(q, c)
            loss = loss_func(outputs, y.float())
            train_loss += loss.item()
            regular_batch = regularization(model.regularizations, alpha)
            loss = loss + regular_batch
            # Accumulate a plain float: adding the tensor itself would keep
            # every batch's autograd history alive for the whole epoch.
            regular += float(regular_batch)

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

        train_loss /= len(train_loader)
        regular /= len(train_loader)
        total_loss = regular + train_loss
        print('Epoch [{}/{}], loss: {:.4f}, l1_regur: {:.4f}, total: {:.4f}'
              .format(epoch+1, num_epochs, train_loss, regular, total_loss))
        train_hist.append(total_loss)

        # Remember when the scheduler last touched the learning rate so early
        # stopping gives the new rate a full window before judging it.
        # (scheduler.last_epoch cannot be used for this: it advances on every
        # step() call, which made the original check unreachable.)
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(total_loss)
        if [group['lr'] for group in optimizer.param_groups] != lrs_before:
            last_epoch_tune_lr = epoch

        if (
            early_stop
            and no_improve_epochs > 0  # a zero-length window has no minimum
            and epoch > no_improve_epochs + last_epoch_tune_lr
            and min(train_hist[:-no_improve_epochs]) + threshold
                < min(train_hist[-no_improve_epochs:])
        ):
            print('Trigger early stop.')
            break

In [3]:
# Mini-batch size used for training.
batch_size = 2731

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    dataset=procDataSet.TrainingQuery(),
    batch_size=batch_size,
    shuffle=True,
)

# The model lives on the GPU; Adam optimises all of its parameters.
model = SimpleModel().cuda()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Multiply the LR by sqrt(0.1) (two reductions == one decade) after more than
# `patience` epochs without improvement, never going below `min_lr`.
# `cooldown` pauses the plateau check for 15 epochs after each reduction.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1 ** 0.5,
    patience=2,
    threshold=0,
    cooldown=15,
    min_lr=1e-5,
    verbose=True,
)

# Read as a global by train().
loss_func = nn.MSELoss()

In [4]:
# Run at most 200 epochs with a light L1 penalty and tight gradient clipping.
train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    train_loader=train_loader,
    num_epochs=200,
    threshold=3e-4,
    alpha=1e-5,
    grad_clip=10,
)

Epoch [1/200], loss: 3.9028, l1_regur: 0.2748, total: 4.1776
Epoch [2/200], loss: 3.3842, l1_regur: 0.2738, total: 3.6579
Epoch [3/200], loss: 2.6769, l1_regur: 0.2729, total: 2.9498
Epoch [4/200], loss: 2.3532, l1_regur: 0.2720, total: 2.6252
Epoch [5/200], loss: 2.1003, l1_regur: 0.2710, total: 2.3713
Epoch [6/200], loss: 1.9247, l1_regur: 0.2699, total: 2.1946
Epoch [7/200], loss: 1.7482, l1_regur: 0.2688, total: 2.0170
Epoch [8/200], loss: 1.5839, l1_regur: 0.2675, total: 1.8514
Epoch [9/200], loss: 1.4546, l1_regur: 0.2662, total: 1.7208
Epoch [10/200], loss: 1.3539, l1_regur: 0.2648, total: 1.6187
Epoch [11/200], loss: 1.2514, l1_regur: 0.2634, total: 1.5148
Epoch [12/200], loss: 1.1493, l1_regur: 0.2621, total: 1.4113
Epoch [13/200], loss: 1.0644, l1_regur: 0.2607, total: 1.3251
Epoch [14/200], loss: 0.9821, l1_regur: 0.2593, total: 1.2414
Epoch [15/200], loss: 0.9236, l1_regur: 0.2579, total: 1.1815
Epoch [16/200], loss: 0.8767, l1_regur: 0.2565, total: 1.1332
Epoch [17/200], l

Epoch [129/200], loss: 0.7751, l1_regur: 0.2238, total: 0.9989
Epoch [130/200], loss: 0.7712, l1_regur: 0.2238, total: 0.9949
Epoch [131/200], loss: 0.7733, l1_regur: 0.2238, total: 0.9971
Epoch [132/200], loss: 0.7751, l1_regur: 0.2237, total: 0.9988
Epoch [133/200], loss: 0.7741, l1_regur: 0.2237, total: 0.9978
Epoch [134/200], loss: 0.7700, l1_regur: 0.2237, total: 0.9937
Epoch [135/200], loss: 0.7710, l1_regur: 0.2236, total: 0.9947
Epoch [136/200], loss: 0.7729, l1_regur: 0.2236, total: 0.9965
Epoch [137/200], loss: 0.7719, l1_regur: 0.2236, total: 0.9955
Epoch [138/200], loss: 0.7682, l1_regur: 0.2236, total: 0.9918
Epoch [139/200], loss: 0.7683, l1_regur: 0.2235, total: 0.9918
Epoch [140/200], loss: 0.7717, l1_regur: 0.2235, total: 0.9952
Epoch [141/200], loss: 0.7739, l1_regur: 0.2235, total: 0.9974
Epoch [142/200], loss: 0.7726, l1_regur: 0.2234, total: 0.9960
Epoch [143/200], loss: 0.7710, l1_regur: 0.2234, total: 0.9944
Epoch [144/200], loss: 0.7744, l1_regur: 0.2234, total:

In [10]:
# Inference batch size. Note: this rebinds the global `batch_size` that the
# training-setup cell also assigns.
batch_size = 100000

# Order is kept (no shuffling); the later reshape into per-query rows
# presumably relies on it -- TODO confirm against procDataSet.TestQuery.
test_loader = DataLoader(
    dataset=procDataSet.TestQuery(),
    batch_size=batch_size,
    shuffle=False,
)

In [11]:
# Score every (query, text) pair in the test set and collect the outputs
# as one numpy array, printing the fraction of batches completed.
model.eval()
results = []
with torch.no_grad():
    for i, (q, c) in enumerate(test_loader, start=1):
        outputs = model(q.cuda(), c.cuda())
        results.append(outputs.detach().cpu().numpy())
        print(i / len(test_loader))
results = np.concatenate(results, axis=0)


0.05
0.1
0.15
0.2
0.25
0.3
0.35
0.4
0.45
0.5
0.55
0.6
0.65
0.7
0.75
0.8
0.85
0.9
0.95
1.0


In [62]:
# Turn the flat score array into a per-query ranking table and write it out.
n_queries = 20   # rows of the table; scores are laid out one query after another
top_k = 300      # number of ranked documents written per query

results = results.reshape(n_queries, -1)

# NOTE(review): argsort is ascending, so Rank_001 is the LOWEST-scored
# document for each query. Confirm that a lower model output means "more
# relevant"; otherwise this should sort descending.
search_result = np.argsort(results, axis=1)

if search_result.shape[1] < top_k:
    raise ValueError(
        'need at least {} documents per query, got {}'
        .format(top_k, search_result.shape[1])
    )

# Build the whole table in one go instead of inserting 300 columns one by one
# and then overwriting integer cells with strings row by row (slow, and
# dtype-changing in-place assignment is deprecated in recent pandas).
# NOTE(review): document ids are the 0-based argsort indices, while query ids
# are 1-based -- confirm the news id numbering.
rank_columns = ['Rank_{:03d}'.format(rank + 1) for rank in range(top_k)]
news_ids = [
    ['news_{:06d}'.format(doc) for doc in row]
    for row in search_result[:, :top_k].tolist()
]
df = pd.DataFrame(news_ids, columns=rank_columns)
df.insert(0, 'Query_Index', ['q_{:02d}'.format(q + 1) for q in range(n_queries)])

fname = 'simple.csv'
os.makedirs('output', exist_ok=True)   # don't fail on a fresh checkout
df.to_csv(os.path.join('output', fname), index=False)