In [115]:
import pandas as pd
import numpy as np
import glob
import json
import cv2

import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageColor
from tqdm import tqdm_notebook
from scipy.stats import rankdata

In [116]:
train_json = glob.glob('./学术文档篇章级结构恢复挑战赛公开数据/train-anno/*.json')
train_img = glob.glob('./学术文档篇章级结构恢复挑战赛公开数据/train-image/*/*.png')

test_json = glob.glob('./学术文档篇章级结构恢复挑战赛公开数据/test-anno/*.json')
test_img = glob.glob('./学术文档篇章级结构恢复挑战赛公开数据/test-image/*/*.png')

train_json.sort()
train_img.sort()

test_json.sort()
test_img.sort()

In [149]:
len(test_json), len(test_img)

(50, 735)

In [150]:
len(train_json), len(train_img)

(500, 7043)

In [151]:
train_json[:3]

['./学术文档篇章级结构恢复挑战赛公开数据/train-anno/0.json',
 './学术文档篇章级结构恢复挑战赛公开数据/train-anno/1.json',
 './学术文档篇章级结构恢复挑战赛公开数据/train-anno/10.json']

In [152]:
train_img[:3]

['./学术文档篇章级结构恢复挑战赛公开数据/train-image/0/0.png',
 './学术文档篇章级结构恢复挑战赛公开数据/train-image/0/1.png',
 './学术文档篇章级结构恢复挑战赛公开数据/train-image/0/2.png']

In [200]:
anns = json.load(open(train_json[1]))
anns[0]

{'text': 'Computationally Efficient Nonlinear Bell Inequalities for Quantum Networks',
 'box': [78, 72, 531, 84],
 'page': 0,
 'is_meta': True,
 'parent_id': -1,
 'relation': 'contain'}

In [173]:
train_img_shape = {x.split('/')[-2] + '/' + x.split('/')[-1][:-4] : Image.open(x).size[:2] for x in tqdm_notebook(train_img)}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  train_img_shape = {x.split('/')[-2] + '/' + x.split('/')[-1][:-4] : Image.open(x).size[:2] for x in tqdm_notebook(train_img)}


  0%|          | 0/7043 [00:00<?, ?it/s]

In [174]:
test_img_shape = {x.split('/')[-2] + '/' + x.split('/')[-1][:-4] : Image.open(x).size[:2] for x in tqdm_notebook(test_img)}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  test_img_shape = {x.split('/')[-2] + '/' + x.split('/')[-1][:-4] : Image.open(x).size[:2] for x in tqdm_notebook(test_img)}


  0%|          | 0/735 [00:00<?, ?it/s]

In [176]:
class Rect:
    def __init__(self,x,y,w,h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

    def dist(self,other):
        if abs(self.x - other.x) <= (self.w + other.w):
            dx = 0;
        else:
            dx = abs(self.x - other.x) - (self.w + other.w)

        if abs(self.y - other.y) <= (self.h + other.h):
            dy = 0;
        else:
            dy = abs(self.y - other.y) - (self.h + other.h)
        
        return dx + dy
    
A = Rect(0,0,2,1)

In [186]:
features = []
node_lables = []

for ann_path in tqdm_notebook(train_json, total=len(train_json)):
    anns = json.load(open(ann_path))
    boxs = np.array([x['box'] for x in anns])
    
    rank_x = rankdata(boxs[:, 0], method='min')
    rank_y = rankdata(boxs[:, 1], method='min')
    
    rank_w = rankdata(boxs[:, 2] - boxs[:, 0], method='min')
    rank_h = rankdata(boxs[:, 3] - boxs[:, 1], method='min')
    
    ann_idx = ann_path.split('/')[-1][:-5]
    for idx, ann in enumerate(anns[:]):
        img_size = train_img_shape[ann_idx + '/' + str(ann['page'])]
        
        feat = [
            idx, 
            
            # 按照位置排序
            rank_x[idx], rank_y[idx], rank_w[idx], rank_h[idx],
            
            ann['box'][2] - ann['box'][0], # 宽度
            ann['box'][3] - ann['box'][1], # 高度
            (ann['box'][2] - ann['box'][0]) / (1 + ann['box'][3] - ann['box'][1]), # 长宽比
            
            ann['box'][0] / img_size[0], ann['box'][2] / img_size[0], # 位置百分比
            ann['box'][1] / img_size[1], ann['box'][2] / img_size[1],
            
            (ann['box'][0] + ann['box'][2]) / 2 / img_size[0], # 中心位置
            (ann['box'][1] + ann['box'][2]) / 2 / img_size[1],
            
            # 统计位置重合的
            sum(boxs[idx][0] == boxs[:, 0]),
            sum(boxs[idx][1] == boxs[:, 1]),
            sum(boxs[idx][2] == boxs[:, 2]),
            sum(boxs[idx][3] == boxs[:, 3]),
            
            len(ann['text']), ann['text'].count(' '), ann['text'].count('.'), # 字符统计
            ann['text'].islower(), ann['text'].isupper(), ann['text'].istitle(),
            ann['text'].endswith('.'), ann['text'].endswith('?'),
            ann['text'].startswith('['),
            ann['text'][1:].islower(), ann['text'][1:].isupper(), ann['text'][1:].istitle(),
            
            (ann['box'][2] - ann['box'][0]) / (len(ann['text']) + 1) # 字符宽度
        ]
        features.append(feat)
        
        if ann['parent_id'] == -1:
            node_lables.append(ann['relation'] + '-1')
        else:
            node_lables.append(ann['relation'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for ann_path in tqdm_notebook(train_json, total=len(train_json)):


  0%|          | 0/500 [00:00<?, ?it/s]

In [187]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [188]:
pred = cross_val_predict(
    LGBMClassifier(n_estimators=20),
    np.array(features),
    np.array(node_lables)
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4919
[LightGBM] [Info] Number of data points in the train set: 253801, number of used features: 31
[LightGBM] [Info] Start training from score -0.281981
[LightGBM] [Info] Start training from score -3.281686
[LightGBM] [Info] Start training from score -2.892079
[LightGBM] [Info] Start training from score -1.879368
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4920
[LightGBM] [Info] Number of data points in the train set: 253801, number of used features: 31
[LightGBM] [Info] Start training from score -0.281986
[LightGBM] [Info] Start training from score -3.281686
[LightGBM] [Info] Start training from score -2.892008
[LightGBM] [Info] Start training from score -1.879368
You can set `force_row_wise=true` to remove the overhead.
And if mem

In [189]:
print(classification_report(np.array(node_lables), pred))

              precision    recall  f1-score   support

     connect       0.90      0.98      0.94    239298
     contain       0.62      0.06      0.10     11918
   contain-1       0.91      0.84      0.88     17596
    equality       0.81      0.66      0.73     48440

    accuracy                           0.89    317252
   macro avg       0.81      0.64      0.66    317252
weighted avg       0.88      0.89      0.87    317252



In [190]:
model = LGBMClassifier(n_estimators=200)
model.fit(np.array(features), np.array(node_lables))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4945
[LightGBM] [Info] Number of data points in the train set: 317252, number of used features: 31
[LightGBM] [Info] Start training from score -0.281987
[LightGBM] [Info] Start training from score -3.281647
[LightGBM] [Info] Start training from score -2.892025
[LightGBM] [Info] Start training from score -1.879370


In [193]:
test_features = []

for ann_path in tqdm_notebook(test_json, total=len(test_json)):
    anns = json.load(open(ann_path))
    boxs = np.array([x['box'] for x in anns])
    
    rank_x = rankdata(boxs[:, 0], method='min')
    rank_y = rankdata(boxs[:, 1], method='min')
    
    rank_w = rankdata(boxs[:, 2] - boxs[:, 0], method='min')
    rank_h = rankdata(boxs[:, 3] - boxs[:, 1], method='min')
    
    ann_idx = ann_path.split('/')[-1][:-5]
    for idx, ann in enumerate(anns[:]):
        img_size = test_img_shape[ann_idx + '/' + str(ann['page'])]
        
        feat = [
            idx, 
            
            # 按照位置排序
            rank_x[idx], rank_y[idx], rank_w[idx], rank_h[idx],
            
            ann['box'][2] - ann['box'][0], # 宽度
            ann['box'][3] - ann['box'][1], # 高度
            (ann['box'][2] - ann['box'][0]) / (1 + ann['box'][3] - ann['box'][1]), # 长宽比
            
            ann['box'][0] / img_size[0], ann['box'][2] / img_size[0], # 位置百分比
            ann['box'][1] / img_size[1], ann['box'][2] / img_size[1],
            
            (ann['box'][0] + ann['box'][2]) / 2 / img_size[0], # 中心位置
            (ann['box'][1] + ann['box'][2]) / 2 / img_size[1],
            
            # 统计位置重合的
            sum(boxs[idx][0] == boxs[:, 0]),
            sum(boxs[idx][1] == boxs[:, 1]),
            sum(boxs[idx][2] == boxs[:, 2]),
            sum(boxs[idx][3] == boxs[:, 3]),
            
            len(ann['text']), ann['text'].count(' '), ann['text'].count('.'), # 字符统计
            ann['text'].islower(), ann['text'].isupper(), ann['text'].istitle(),
            ann['text'].endswith('.'), ann['text'].endswith('?'),
            ann['text'].startswith('['),
            ann['text'][1:].islower(), ann['text'][1:].isupper(), ann['text'][1:].istitle(),
            
            (ann['box'][2] - ann['box'][0]) / (len(ann['text']) + 1) # 字符宽度
        ]
        test_features.append(feat)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for ann_path in tqdm_notebook(test_json, total=len(test_json)):


  0%|          | 0/50 [00:00<?, ?it/s]

In [195]:
preds = model.predict(np.array(test_features))

In [206]:
test_ann_index = 0
for ann_path in tqdm_notebook(test_json, total=len(test_json)):
    anns = json.load(open(ann_path))
    
    for idx, ann in enumerate(anns):        
        if '-1' in preds[test_ann_index]:
            anns[idx]['relation'] = 'contain'
            anns[idx]['parent_id'] = -1
        else:
            anns[idx]['relation'] = preds[test_ann_index]
            anns[idx]['parent_id'] = idx - 1

        
        test_ann_index += 1
    
    with open('./submit/' + ann_path.split('/')[-1], 'w') as up:
        json.dump(anns, up)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for ann_path in tqdm_notebook(test_json, total=len(test_json)):


  0%|          | 0/50 [00:00<?, ?it/s]

In [207]:
!\rm submit.zip
!zip -r submit.zip submit

rm: 无法删除'submit.zip': 没有那个文件或目录
  adding: submit/ (stored 0%)
  adding: submit/27.json (deflated 78%)
  adding: submit/4.json (deflated 74%)
  adding: submit/10.json (deflated 77%)
  adding: submit/29.json (deflated 76%)
  adding: submit/47.json (deflated 76%)
  adding: submit/5.json (deflated 77%)
  adding: submit/9.json (deflated 75%)
  adding: submit/6.json (deflated 81%)
  adding: submit/34.json (deflated 76%)
  adding: submit/3.json (deflated 78%)
  adding: submit/1.json (deflated 78%)
  adding: submit/0.json (deflated 77%)
  adding: submit/25.json (deflated 78%)
  adding: submit/23.json (deflated 76%)
  adding: submit/11.json (deflated 75%)
  adding: submit/22.json (deflated 77%)
  adding: submit/49.json (deflated 77%)
  adding: submit/28.json (deflated 73%)
  adding: submit/13.json (deflated 78%)
  adding: submit/38.json (deflated 79%)
  adding: submit/36.json (deflated 74%)
  adding: submit/35.json (deflated 78%)
  adding: submit/16.json (deflated 75%)
  adding: submit/21.json 