In [1]:
import re
import multiprocessing
import os


from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np


import procDataSet

pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
np.set_printoptions(linewidth=140)
np.set_printoptions(formatter={'float': lambda x: "{:>6.2f}".format(x)})


In [9]:
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        n_hidden = 4000
        dense1 = nn.Linear(768 + 768, n_hidden)
        dense2 = nn.Linear(n_hidden, n_hidden)
        dense3 = nn.Linear(n_hidden, n_hidden // 2)
        dense4 = nn.Linear(n_hidden // 2, 1)

        self.DNN = nn.Sequential(
            dense1,
            nn.BatchNorm1d(n_hidden),
            nn.Dropout(.4),
            nn.PReLU(),

            dense2,
            nn.BatchNorm1d(n_hidden),
            nn.Dropout(.4),
            nn.PReLU(),

            dense3,
            nn.BatchNorm1d(n_hidden // 2),
            nn.Dropout(.4),
            nn.PReLU(),
            
            dense4,
            nn.Sigmoid()
        )

        # for l2 regularization
        self.regularizations = [dense1.weight, dense2.weight]

    
    def forward(self, query, text): 
        x = torch.cat([query, text], dim=1)
        return self.DNN(x)

def regularization(weights, alpha):
    rt = 0
    for w in weights:
        rt += torch.sum(torch.pow(w, 2))
    return alpha * rt


def weighted_mse_loss(y_pred, target, weight):
    return torch.sum(weight * (y_pred - target) ** 2) / torch.sum(weight)


def train(
    model, optimizer, train_loader, valid_loader, num_epochs, 
#     job_name, 
    scheduler=None,
    early_stop=True, no_improve_epochs=15, threshold=1e-3,
    alpha=0, grad_clip=1000
):
#     train_hist = []
    last_epoch_tune_lr = 0
    valid_tag = valid_loader is not None
    best_acc = .7  # it is a threshold
    for epoch in range(num_epochs):
        model.train()
        train_loss = valid_loss = 0.
        regular = 0.
        for q, c, y, w in train_loader:
            q, c = q.cuda(), c.cuda()
            y, w = y.cuda(), w.cuda()
            outputs = model(q, c)
#             loss = Criticizer(outputs, y.float())
            loss = weighted_mse_loss(outputs, y.float(), w)
            train_loss += loss.item()
            regular_batch = regularization(model.regularizations, alpha)
            loss += regular_batch
            regular += regular_batch

            optimizer.zero_grad()
            loss.backward()
            if grad_clip:
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
#             print('   Batch, loss: {:.4f}, l1_regur: {:.4f}, total: {:.4f}'.format(
#                 loss.item(), regular_batch, loss.item() + regular_batch
#             ))

        train_loss /= len(train_loader)     
        regular /= len(train_loader)    
        total_loss = regular+train_loss
        report = 'Epoch [{}/{}], loss: {:.4f}, l1_regur: {:.4f}, total: {:.4f}'.format(
            epoch+1, num_epochs, train_loss, regular, total_loss
        )
#         train_hist.append((train_loss, regular, total_loss))
        if scheduler is not None:
            scheduler.step(total_loss)
        
        if valid_tag:
            model.eval()
            with torch.no_grad():
                for q, c, y, w in valid_loader:
                    q, c = q.cuda(), c.cuda()
                    y, w = y.cuda(), w.cuda()
                    outputs = model(q, c)
    #                 loss = Criticizer(outputs, y.float())
                    loss = weighted_mse_loss(outputs, y.float(), w)
                    valid_loss += loss.item()

            valid_loss /= len(valid_loader)
            report += ', valid loss: {:.4f}'.format(valid_loss)
#             train_hist[2:, epoch] = valid_loss, valid_acc
        print(report)      
#         if early_stop and epoch > no_improve_epochs + scheduler.last_epoch and np.amin(train_hist[:-no_improve_epochs]) + threshold < np.amin(train_hist[-no_improve_epochs:]):
#             print('Trigger early stop.')
#             break

#     train_hist_fname = os.path.join('output', job_name + '_hist')
#     print('save training history to {}.npy'.format(train_hist_fname))
#     np.save(train_hist_fname, train_hist)

In [3]:
batch_size = 30000
NumberCPU = multiprocessing.cpu_count()

train_loader = DataLoader(
    procDataSet.TrainingQueryAll(validation_queries=[0], normalize01=True, weight=400), 
    batch_size=batch_size, shuffle=True, num_workers=NumberCPU
)
valid_loader = DataLoader(
    procDataSet.TrainingQueryAll(validation_queries=[0], normalize01=True, weight=400, is_valid=True), 
    batch_size=batch_size, shuffle=False, num_workers=NumberCPU
)

model = SimpleModel().cuda()

In [13]:
# optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, amsgrad=True)

# scheduler = ReduceLROnPlateau(
#     optimizer, 'min', verbose=True,
#     patience=2, factor=.1**.5, min_lr=1e-5, threshold=0, cooldown=15
# )

train(
    model, optimizer, train_loader, valid_loader, 20,
    threshold=3e-4, alpha=0, grad_clip=None
)

Epoch [1/20], loss: 0.1068, l1_regur: 0.0000, total: 0.1068, valid loss: 0.2268
Epoch [2/20], loss: 0.1073, l1_regur: 0.0000, total: 0.1073, valid loss: 0.2268
Epoch [3/20], loss: 0.1068, l1_regur: 0.0000, total: 0.1068, valid loss: 0.2272
Epoch [4/20], loss: 0.1068, l1_regur: 0.0000, total: 0.1068, valid loss: 0.2275
Epoch [5/20], loss: 0.1069, l1_regur: 0.0000, total: 0.1069, valid loss: 0.2274
Epoch [6/20], loss: 0.1065, l1_regur: 0.0000, total: 0.1065, valid loss: 0.2278
Epoch [7/20], loss: 0.1066, l1_regur: 0.0000, total: 0.1066, valid loss: 0.2278
Epoch [8/20], loss: 0.1069, l1_regur: 0.0000, total: 0.1069, valid loss: 0.2279
Epoch [9/20], loss: 0.1076, l1_regur: 0.0000, total: 0.1076, valid loss: 0.2290
Epoch [10/20], loss: 0.1067, l1_regur: 0.0000, total: 0.1067, valid loss: 0.2285
Epoch [11/20], loss: 0.1063, l1_regur: 0.0000, total: 0.1063, valid loss: 0.2290
Epoch [12/20], loss: 0.1069, l1_regur: 0.0000, total: 0.1069, valid loss: 0.2296
Epoch [13/20], loss: 0.1068, l1_regur

In [15]:
train(
    model, optimizer, train_loader, valid_loader, 20,
    threshold=3e-4, alpha=1e-5, grad_clip=None
)

Epoch [1/20], loss: 0.1059, l1_regur: 0.0286, total: 0.1345, valid loss: 0.2339
Epoch [2/20], loss: 0.1061, l1_regur: 0.0281, total: 0.1342, valid loss: 0.2345
Epoch [3/20], loss: 0.1061, l1_regur: 0.0276, total: 0.1337, valid loss: 0.2340
Epoch [4/20], loss: 0.1059, l1_regur: 0.0272, total: 0.1330, valid loss: 0.2337
Epoch [5/20], loss: 0.1059, l1_regur: 0.0268, total: 0.1326, valid loss: 0.2352
Epoch [6/20], loss: 0.1056, l1_regur: 0.0264, total: 0.1320, valid loss: 0.2349
Epoch [7/20], loss: 0.1053, l1_regur: 0.0260, total: 0.1313, valid loss: 0.2350
Epoch [8/20], loss: 0.1058, l1_regur: 0.0256, total: 0.1314, valid loss: 0.2359
Epoch [9/20], loss: 0.1054, l1_regur: 0.0253, total: 0.1306, valid loss: 0.2351
Epoch [10/20], loss: 0.1062, l1_regur: 0.0249, total: 0.1311, valid loss: 0.2355
Epoch [11/20], loss: 0.1052, l1_regur: 0.0246, total: 0.1298, valid loss: 0.2351
Epoch [12/20], loss: 0.1057, l1_regur: 0.0243, total: 0.1300, valid loss: 0.2353
Epoch [13/20], loss: 0.1059, l1_regur

In [14]:
# train w/o validation
torch.save(model.state_dict(), 'simple_re.pth')

train_loader = DataLoader(
    procDataSet.TrainingQueryAll(normalize01=True, weight=400, validation_queries=[]), 
    batch_size=batch_size, shuffle=True, num_workers=NumberCPU
)
train(
    model, optimizer, train_loader, None, 50,
    threshold=3e-4, alpha=1e-5, grad_clip=None
)

Epoch [1/50], loss: 0.1119, l1_regur: 0.0138, total: 0.1257
Epoch [2/50], loss: 0.1115, l1_regur: 0.0138, total: 0.1252
Epoch [3/50], loss: 0.1108, l1_regur: 0.0137, total: 0.1246
Epoch [4/50], loss: 0.1095, l1_regur: 0.0137, total: 0.1232
Epoch [5/50], loss: 0.1097, l1_regur: 0.0137, total: 0.1233
Epoch [6/50], loss: 0.1092, l1_regur: 0.0136, total: 0.1228
Epoch [7/50], loss: 0.1090, l1_regur: 0.0136, total: 0.1226
Epoch [8/50], loss: 0.1095, l1_regur: 0.0136, total: 0.1231
Epoch [9/50], loss: 0.1088, l1_regur: 0.0135, total: 0.1223
Epoch [10/50], loss: 0.1089, l1_regur: 0.0135, total: 0.1224
Epoch [11/50], loss: 0.1085, l1_regur: 0.0135, total: 0.1220
Epoch [12/50], loss: 0.1076, l1_regur: 0.0135, total: 0.1211
Epoch [13/50], loss: 0.1074, l1_regur: 0.0134, total: 0.1209
Epoch [14/50], loss: 0.1074, l1_regur: 0.0134, total: 0.1208
Epoch [15/50], loss: 0.1066, l1_regur: 0.0134, total: 0.1200
Epoch [16/50], loss: 0.1067, l1_regur: 0.0134, total: 0.1200
Epoch [17/50], loss: 0.1063, l1_r

In [15]:
batch_size = 100000
test_loader = DataLoader(procDataSet.TestQuery(), batch_size=batch_size, shuffle=False, num_workers=NumberCPU)

In [16]:
results = []
model.eval()
i = 0
with torch.no_grad():
    for q, c in test_loader:
        q, c = q.cuda(), c.cuda()
        outputs = model(q, c)
        results.append(outputs.cpu().data.numpy())
        i += 1
results = np.concatenate(results, axis=0)


In [17]:
results = results.reshape(20, -1)
search_result = np.flip(np.argsort(results, axis=1), axis=1)

df = pd.DataFrame()
df['Query_Index'] = ['q_{:02d}'.format(i+1) for i in range(20)]

for i in range(300):
    df['Rank_{:03d}'.format(i+1)] = search_result[:, i]

for i, row in df.iterrows():
    df.iloc[i, 1:] = df.iloc[i, 1:].apply(lambda x: 'news_{:06d}'.format(x))
fname = 'simple.csv'
df.to_csv('output/' + fname,index=False)

In [18]:
results_show = results.reshape(20, -1) * 3


In [24]:
results_show

array([[  1.17,   0.29,   0.42, ...,   0.56,   1.03,   1.14],
       [  1.19,   0.84,   0.46, ...,   1.38,   1.19,   1.38],
       [  0.78,   0.77,   0.14, ...,   1.52,   0.55,   1.04],
       ...,
       [  0.43,   0.04,   0.09, ...,   0.11,   0.38,   0.43],
       [  0.23,   0.04,   0.06, ...,   0.27,   0.17,   0.20],
       [  0.38,   0.17,   0.09, ...,   0.82,   1.04,   0.51]], dtype=float32)

In [20]:
len(np.where(results_show > 1.2)[0])

651141

In [21]:
a = set(df.iloc[:, 1:].values.flat)
print('selected news : {}'.format(len(a)))
td = pd.read_csv('data/TD.csv')
# td = td.iloc[230:]
b = set(td[td['Relevance'] != 0]['News_Index'])
print('不重複：{}'.format(len(a - set(td['Relevance']))))
print('和非0不重複：{}'.format(len(a - b)))

selected news : 2538
不重複：2538
和非0不重複：2362


In [25]:
np.flip(np.sort(results, axis=1), axis=1) * 3

array([[  2.22,   2.12,   2.10, ...,   0.00,   0.00,   0.00],
       [  2.13,   2.13,   2.12, ...,   0.01,   0.01,   0.00],
       [  2.27,   2.26,   2.21, ...,   0.01,   0.01,   0.00],
       ...,
       [  2.17,   2.10,   2.09, ...,   0.00,   0.00,   0.00],
       [  0.94,   0.94,   0.91, ...,   0.00,   0.00,   0.00],
       [  2.56,   2.53,   2.53, ...,   0.00,   0.00,   0.00]], dtype=float32)