In [1]:
import pandas as pd
import os
import json
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrain import BertModel, BertTokenizer
from pytorch_pretrain import BertAdam
from bert_encoder import TokenEncode
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, accuracy_score
from collections import defaultdict
import string

In [3]:
path = os.listdir('./data/jy/jy/')
path_li = ['./data/jy/jy/'+i for i in path if 'json' in i]
def get_other_text(word):
    total_cnt = len(word.strip())
    cn_cnt = 0
    en_cnt = 0
    num_cnt = 0
    symbols_cnt = 0
    other_cnt = 0
    for ch in word:
        if '\u4e00' <= ch <= '\u9fff':
            cn_cnt = cn_cnt + 1
        elif ('a'<=ch<='z' or 'A'<=ch<='Z'):
            en_cnt = en_cnt + 1
        elif ch.isdigit():
            num_cnt = num_cnt + 1
        elif ch in string.punctuation:
            symbols_cnt = symbols_cnt + 1
        else:
            other_cnt = other_cnt + 1
    cn_radio = cn_cnt/total_cnt
    en_radio = en_cnt/total_cnt
    num_radio = num_cnt/total_cnt
    symbols_radio = symbols_cnt/total_cnt
    other_radio = other_cnt/total_cnt
    have_ = 1 if '-' in word else 0
    have_mao = 1 if ':' in word else 0
    have_yuan = 1 if '元' in word else 0
    have_dot = 1 if '.' in word else 0
    len_word = len(word)/30
    return [cn_radio, en_radio, num_radio, symbols_radio, other_radio, have_, have_mao, have_yuan, have_dot, len_word]

def get_one_label(path):
    with open(path, encoding='utf-8') as f:
        content = f.read()
        di = json.loads(content)
        shapes_li = di['shapes']
        h = di['imageHeight']
        w = di['imageWidth']
        li = list()
        for i in shapes_li:
            points = i['points']
            group_id = i['group_id'] if i['group_id'] else 0
            text = i['label']
            if len(points) == 2:
                x1 = points[0][0]/w
                y1 = points[0][1]/h
                x2 = points[1][0]/w
                y2 = points[1][1]/h
            else:
                x1 = points[0][0]/w
                y1 = points[0][1]/h
                x2 = points[2][0]/w
                y2 = points[2][1]/h
            ret_li = get_other_text(text)
            li.append([text, x1, y1, x2, y2, group_id, path] + ret_li)
        
        mx,my = 0,0
        total_li = list()
        for inner_li1 in li:
            x_li, y_li = list(), list()
            for inner_li2 in li:
                x_ = inner_li1[1] - inner_li2[1]
                y_ = inner_li1[2] - inner_li2[2]
                x_li.append(x_)
                y_li.append(y_)
            x_li.sort()
            y_li.sort()

            zerox = x_li.index(0.0)
            zeroy = y_li.index(0.0)

            left_x = x_li[zerox-1] if 0 <= zerox-1 <len(x_li) else 0
            right_x = x_li[zerox+1] if 0 <= zerox+1 <len(x_li) else 0
            left_y = y_li[zeroy-1] if 0 <= zeroy-1 <len(y_li) else 0
            right_y = y_li[zeroy+1] if 0 <= zeroy+1 <len(y_li) else 0
            x_li = x_li[:18]
            y_li = y_li[:18]
            total_li.append(inner_li1 + x_li + y_li + [left_x, right_x, left_y, right_y])
        return total_li

def get_df():
    li_total = list()
    for p in path_li:
        li = get_one_label(p)
        li_total.extend(li)
    print(len(li_total))
    df = pd.DataFrame(li_total)
    df.loc[df[5]==4]=3
    df.loc[df[5]==14]=4
    # print(df[5].value_counts())
    return df

df = get_df()

255


In [5]:
pad_size = 512
BERT_PATH = '../Bert-Chinese-Text-Classification-Pytorch/bert_pretrain/'
BATCH_SIZE = 5
INPUT_SIZE, OUTPUT_SIZE= 768, 14
# 1：物流，2：服务，3：家电
LABEL_INDEX = 2
LR = 1e-5
EPOCH = 100
ROUND = 3
F1 = 'macro'

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
token_encoder = TokenEncode(BERT_PATH, pad_size)
data = [[token_encoder.get_token_mask(str(i[0])), torch.tensor([i[1:5].tolist() + i[7:57].tolist()]).to(device), i[5]] for i in tqdm(df.values)] 

100%|██████████| 255/255 [00:00<00:00, 4148.60it/s]


In [7]:
random_order = range(len(data))
np.random.shuffle(list(random_order))
num = 8
train = [data[j] for i, j in enumerate(random_order) if i % num != 0]
valid = [data[j] for i, j in enumerate(random_order) if i % num == 0]

In [8]:
class MyDataset(Dataset):
    def __init__(self, data):
        super(MyDataset, self).__init__()
        self.data = data
    def __getitem__(self, index):
        return self.data[index]
    
    def __len__(self):
        return len(self.data)

In [9]:
train_dataset = MyDataset(train)
train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=False)
valid_dataset = MyDataset(valid)
valid_iter = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=False)

In [10]:
next(iter(valid_iter))[1].shape

torch.Size([5, 1, 54])

In [11]:
len(train_iter)

45

In [12]:
class MyModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(MyModel, self).__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH).to(device)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.linear1 = torch.nn.Linear(512, 256)
        self.linear2 = torch.nn.Linear(54, 128)
        self.linear3 = torch.nn.Linear(768+128, 512)
        self.linear4 = torch.nn.Linear(256, output_size)
    def forward(self, x):
        _1, pool1 = self.bert(x[0][0].squeeze(1), None, x[0][1].squeeze(1))
        ret = self.linear2(x[1])
        # pool2 = self.linear1(pool1)
        return self.linear4(self.linear1(self.linear3(torch.cat([pool1, ret.squeeze(1)], dim=1))))

In [13]:
model = MyModel(INPUT_SIZE, OUTPUT_SIZE).to(device)
loss = torch.nn.CrossEntropyLoss()
loss.to(device)
param_optimizer = list(model.named_parameters())
# print(param_optimizer)
# 以下的层不进行参数的衰减
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# 其他的层进行参数的衰减
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

optimizer = BertAdam(optimizer_grouped_parameters, lr=LR, warmup=0.05, t_total=len(train_iter) * EPOCH)
# optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [14]:
def get_recall(y_li, y_hat_li, which):
    di = defaultdict(list)
    for ind, i in enumerate(y_li):
        if y_hat_li[ind] == i:
            di[i].append(1)
        else:
            di[i].append(0)
    for a,b in di.items():
        if len(b) > 0:
            di[a] = [round(sum(b)/len(b), ROUND), sum(b), len(b)]
    dis = sorted(di.items(), key=lambda x:x[0], reverse=False)
    print(which, dis)
    
def my_evaluate(model, val_loader):
    model.eval()
    y_li, y_hat_li, loss_li = list(), list(), list()
    for ind, x in enumerate(val_loader):
        y_hat = model(x)
        los = loss(y_hat.to(device), x[LABEL_INDEX].to(device))
        loss_li.append(los.item())
        y_hat_li.extend(np.argmax(y_hat.cpu().detach().numpy(), 1).tolist())
        y_li.extend(x[LABEL_INDEX].tolist())
    f1_eval = f1_score(y_li, y_hat_li, average=F1)
    acc = accuracy_score(y_li, y_hat_li)
    get_recall(y_li, y_hat_li, 'valid:▮ ')
    return round(f1_eval, ROUND), round(np.mean(loss_li), ROUND), round(acc, ROUND)

def train():
    f1_max = 0
    for e in range(EPOCH):
        model.train()
        y_hat_li, y_li, loss_li = list(), list(), list()
        for ind, x in enumerate(train_iter):
            model.train()
            y_hat = model(x)
            y_hat_li.extend(np.argmax(y_hat.cpu().detach().numpy(),1).tolist())
            y_li.extend(x[LABEL_INDEX].tolist())
            los = loss(y_hat.to(device),  x[LABEL_INDEX].to(device))
            optimizer.zero_grad()
            los.backward()
            optimizer.step()
            loss_li.append(los.item())
            if ind % 30 == 0 and ind != 0:
                f1_train = round(f1_score(y_li, y_hat_li, average=F1), ROUND)
                acc_train = round(accuracy_score(y_li, y_hat_li), ROUND)
                get_recall(y_li, y_hat_li, 'train: ')
                f1_valid, loss_valid, acc_valid = my_evaluate(model, valid_iter)
                line_str = 'epoch:{} | loss_train:{} | loss_valid:{} | acc_train: {} | acc_valid: {} | f1_train: {} | f1_valid: {}'
                print(line_str.format(e, round(np.mean(loss_li), ROUND), loss_valid, acc_train, acc_valid, f1_train, f1_valid))
                y_hat_li, y_li, loss_li = list(), list(), list()
                if f1_valid > f1_max:
                    torch.save(model, './model/model.pkl')
                    f1_max = f1_valid
                    print('-'*150, '模型保存--f1:{}'.format(f1_max))
                else:
                    print('-'*130)
train()

train:  [(0, [0.37, 10, 27]), (1, [0.0, 0, 5]), (2, [0.0, 0, 9]), (3, [0.889, 16, 18]), (4, [0.0, 0, 8]), (5, [0.0, 0, 12]), (6, [0.0, 0, 7]), (7, [0.0, 0, 14]), (8, [0.0, 0, 8]), (9, [0.111, 2, 18]), (10, [0.0, 0, 7]), (11, [0.0, 0, 7]), (12, [0.0, 0, 8]), (13, [0.0, 0, 7])]
valid:▮  [(0, [0.429, 3, 7]), (1, [0.0, 0, 3]), (3, [0.333, 1, 3]), (4, [0.0, 0, 1]), (5, [0.0, 0, 1]), (6, [0.0, 0, 3]), (7, [0.0, 0, 2]), (8, [0.0, 0, 1]), (9, [0.667, 2, 3]), (10, [0.0, 0, 4]), (11, [0.0, 0, 1]), (12, [0.0, 0, 1]), (13, [0.0, 0, 2])]
epoch:0 | loss_train:2.574 | loss_valid:2.458 | acc_train: 0.181 | acc_valid: 0.188 | f1_train: 0.057 | f1_valid: 0.094


  "type " + obj.__name__ + ". It won't be checked "


------------------------------------------------------------------------------------------------------------------------------------------------------ 模型保存--f1:0.094
train:  [(0, [1.0, 30, 30]), (1, [0.0, 0, 6]), (2, [0.0, 0, 7]), (3, [0.556, 10, 18]), (4, [0.0, 0, 7]), (5, [0.0, 0, 11]), (6, [0.0, 0, 7]), (7, [0.0, 0, 11]), (8, [0.0, 0, 10]), (9, [0.765, 13, 17]), (10, [0.0, 0, 7]), (11, [0.0, 0, 7]), (12, [0.0, 0, 10]), (13, [0.0, 0, 7])]
valid:▮  [(0, [1.0, 7, 7]), (1, [0.0, 0, 3]), (3, [0.0, 0, 3]), (4, [0.0, 0, 1]), (5, [0.0, 0, 1]), (6, [0.0, 0, 3]), (7, [0.0, 0, 2]), (8, [0.0, 0, 1]), (9, [1.0, 3, 3]), (10, [0.0, 0, 4]), (11, [0.0, 0, 1]), (12, [0.0, 0, 1]), (13, [0.5, 1, 2])]
epoch:1 | loss_train:2.284 | loss_valid:2.067 | acc_train: 0.342 | acc_valid: 0.344 | f1_train: 0.115 | f1_valid: 0.128
------------------------------------------------------------------------------------------------------------------------------------------------------ 模型保存--f1:0.128
train:  [(0, [1.0, 27

KeyboardInterrupt: 