## Dataset Construction

In [39]:
# Directory that get_df() scans for the source XML files.
# NOTE(review): this name shadows the built-in dir() for the whole notebook.
dir = 'data/full data/'

import json
import os
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

from utils import elem2sent

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [4]:
# Read the sheet that pairs each article title with its "Category 2" label.
meta_data_df = pd.read_excel(
    'data/category_configuration_marked.xlsx',
    sheet_name='article_names_matching',
)

# Title -> category lookup; when a title occurs more than once the last row wins.
# NOTE(review): get_df() lower-cases titles before looking them up, so the
# "Article Title" column is presumably stored in lower case already -- verify.
title2category = {
    article_title: category_name
    for article_title, category_name in zip(
        meta_data_df["Article Title"], meta_data_df["Category 2"]
    )
}

In [6]:
from unicodedata import category


def get_df(dir, title_to_category=None, to_sentences=None):
    """Collect every titled article found in the XML files of a directory.

    Each file in ``dir`` is parsed as XML and the element at ``root[1][2]`` is
    taken as the document body. Every ``<div type="article">`` below that body
    which carries a ``title`` attribute yields one row.

    Parameters
    ----------
    dir : str
        Directory containing the XML files. A trailing path separator is
        optional.
    title_to_category : mapping, optional
        Maps a lower-cased article title to its category. Defaults to the
        notebook-level ``title2category`` dictionary.
    to_sentences : callable, optional
        Receives an article element and returns an iterable of strings.
        Defaults to ``utils.elem2sent``.

    Returns
    -------
    pandas.DataFrame
        Columns ``article`` (the strings returned by ``to_sentences``
        concatenated without a separator), ``category`` (``'unknown'`` when
        the title is not in the mapping) and ``title`` (lower-cased). The
        columns are present even when no article is found.

    Notes
    -----
    Files that cannot be parsed, or whose tree lacks ``root[1][2]``, are
    skipped silently: the directory is treated as a best-effort source.
    """
    # Resolve the defaults at call time so the notebook-level objects are used
    # unless the caller supplies replacements.
    if title_to_category is None:
        title_to_category = title2category
    if to_sentences is None:
        to_sentences = elem2sent

    # Rows are gathered in a plain list and turned into a frame once at the
    # end; DataFrame.append copied the frame on every call and no longer
    # exists in pandas >= 2.0.
    records = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it) identical between runs.
    for file_name in sorted(os.listdir(dir)):
        try:
            root = ET.parse(os.path.join(dir, file_name)).getroot()
            doc_body = root[1][2]
        except Exception:
            # Not an XML document with the expected layout: skip it.
            continue

        for article in doc_body.findall(".//div[@type='article']"):
            if 'title' not in article.attrib:
                continue
            title = article.attrib['title'].lower()
            records.append({
                'article': ''.join(to_sentences(article)),
                'category': title_to_category.get(title, 'unknown'),
                'title': title,
            })

    return pd.DataFrame(records, columns=['article', 'category', 'title'])

# Build the article table from every file under `dir`.
df = get_df(dir)

# to_csv() does not create missing parent directories, so make sure the output
# folder exists before writing (needed on a fresh checkout).
os.makedirs('generated_data', exist_ok=True)
df.to_csv('generated_data/category_assignment_full.csv', index=False)


In [7]:
# Show the first rows of the article table built by get_df().
df.head()

Unnamed: 0,article,category,title
0,If either Contracting Party makes payment to a...,unknown,s
1,Where a matter is governed both by this Conven...,unknown,article §
2,"For the purposes of this Agreement(a) ""investm...",Definition,definitions
3,(1) Each Contracting Party shall encourage and...,Promotion and Admission,promotion and protection of investment
4,(1) Neither Contracting Party shall in its ter...,Most-Favoured-Nation Treatment,most-favoured-nation provisions


### Data preprocessing

In [42]:
# Categories with fewer samples than this are merged into a single 'others' class.
MIN_SAMPLES_PER_CATEGORY = 5

# Keep only the articles whose title was matched to a category and drop the
# title column. drop() returns a new frame, so the column assignments below
# never write into a slice of `df`.
df_with_category = df[df['category'] != 'unknown'].drop(columns=['title'])

# Replace every category that has fewer than MIN_SAMPLES_PER_CATEGORY samples
# with 'others'. The sizes are counted once with value_counts() instead of
# re-filtering the whole frame for every row. value_counts() ignores missing
# categories, so those rows get a count of 0 and also end up in 'others'.
category_counts = df_with_category['category'].value_counts()
samples_per_row = df_with_category['category'].map(category_counts).fillna(0)
df_with_category['category'] = df_with_category['category'].where(
    samples_per_row >= MIN_SAMPLES_PER_CATEGORY, 'others'
)

# Map each category name to an integer label, numbered in order of first appearance.
category2label = {
    name: label
    for label, name in enumerate(df_with_category['category'].unique())
}

# Save the map; create the output folder first because neither open() nor
# to_csv() creates missing directories.
os.makedirs('generated_data', exist_ok=True)
with open('generated_data/category2label.json', 'w') as f:
    json.dump(category2label, f)

# Replace the category names with their integer labels.
df_with_category['category'] = df_with_category['category'].map(category2label)

# Train/test split (80/20, fixed seed).
# NOTE(review): the split is not stratified, so small categories may be
# unevenly represented in the test set -- consider stratify= if that matters.
train_df, test_df = train_test_split(df_with_category, test_size=0.2, random_state=42)

# Save the train and test data.
train_df.to_csv('generated_data/category_train.csv', index=False)
test_df.to_csv('generated_data/category_test.csv', index=False)

## Test with an AutoML model as a baseline

#### Load the data

In [43]:
# Read the two saved splits back from disk, train first and test second.
train_df, test_df = map(
    pd.read_csv,
    ('generated_data/category_train.csv', 'generated_data/category_test.csv'),
)

In [46]:
from autogluon.text import TextPredictor

# Constructor settings kept together: the target column, the task type, the
# metric passed as eval_metric, and the folder given as the predictor's path.
predictor_settings = dict(
    label='category',
    problem_type='multiclass',
    eval_metric='f1_micro',
    path='./models',
)
predictor = TextPredictor(**predictor_settings)

# Fit on the training split with no time limit. The call stays the last
# expression of the cell so its return value is still displayed.
predictor.fit(
    train_df,
    presets='medium_quality_faster_train',
    time_limit=None,
)

Global seed set to 123
Downloading: 100%|██████████| 29.0/29.0 [00:00<?, ?B/s]
Downloading: 100%|██████████| 665/665 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 1.65MB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 1.99MB/s]
Downloading: 100%|██████████| 51.7M/51.7M [00:01<00:00, 48.9MB/s]
Auto select gpus: [0]
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type                         | Params
-------------------------------------------------------------------
0 | model             | HFAutoModelForTextPrediction | 13.5 M
1 | validation_metric | Accuracy                     | 0     
2 | loss_func         | CrossEntropyLoss             | 0     
-------------------------------------------------------------------
13.5 M    Trainable params


Epoch 0:  50%|████▉     | 1877/3755 [01:46<01:46, 17.69it/s, loss=0.613, v_num=]

Epoch 0, global step 97: 'val_accuracy' reached 0.86640 (best 0.86640), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=0-step=97.ckpt' as top 3


Epoch 0: 100%|█████████▉| 3754/3755 [03:25<00:00, 18.23it/s, loss=0.445, v_num=]

Epoch 0, global step 195: 'val_accuracy' reached 0.91640 (best 0.91640), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=0-step=195.ckpt' as top 3


Epoch 1:  50%|████▉     | 1877/3755 [01:47<01:47, 17.52it/s, loss=0.351, v_num=] 

Epoch 1, global step 293: 'val_accuracy' reached 0.92680 (best 0.92680), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=1-step=293.ckpt' as top 3


Epoch 1: 100%|█████████▉| 3754/3755 [03:26<00:00, 18.18it/s, loss=0.333, v_num=] 

Epoch 1, global step 391: 'val_accuracy' reached 0.93280 (best 0.93280), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=1-step=391.ckpt' as top 3


Epoch 2:  50%|████▉     | 1877/3755 [01:46<01:46, 17.69it/s, loss=0.219, v_num=] 

Epoch 2, global step 489: 'val_accuracy' reached 0.93840 (best 0.93840), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=2-step=489.ckpt' as top 3


Epoch 2: 100%|█████████▉| 3754/3755 [03:23<00:00, 18.42it/s, loss=0.292, v_num=] 

Epoch 2, global step 587: 'val_accuracy' reached 0.94080 (best 0.94080), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=2-step=587.ckpt' as top 3


Epoch 3:  50%|████▉     | 1877/3755 [01:47<01:47, 17.51it/s, loss=0.184, v_num=] 

Epoch 3, global step 685: 'val_accuracy' reached 0.94280 (best 0.94280), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=3-step=685.ckpt' as top 3


Epoch 3: 100%|█████████▉| 3754/3755 [03:25<00:00, 18.28it/s, loss=0.243, v_num=] 

Epoch 3, global step 783: 'val_accuracy' reached 0.94120 (best 0.94280), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=3-step=783.ckpt' as top 3


Epoch 4:  50%|████▉     | 1877/3755 [01:47<01:47, 17.52it/s, loss=0.137, v_num=] 

Epoch 4, global step 881: 'val_accuracy' reached 0.94520 (best 0.94520), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=4-step=881.ckpt' as top 3


Epoch 4: 100%|█████████▉| 3754/3755 [03:27<00:00, 18.13it/s, loss=0.184, v_num=] 

Epoch 4, global step 979: 'val_accuracy' reached 0.94400 (best 0.94520), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=4-step=979.ckpt' as top 3


Epoch 5:  50%|████▉     | 1877/3755 [01:47<01:47, 17.51it/s, loss=0.163, v_num=] 

Epoch 5, global step 1077: 'val_accuracy' reached 0.94600 (best 0.94600), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=5-step=1077.ckpt' as top 3


Epoch 5: 100%|█████████▉| 3754/3755 [03:28<00:00, 18.04it/s, loss=0.2, v_num=]   

Epoch 5, global step 1175: 'val_accuracy' reached 0.94480 (best 0.94600), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=5-step=1175.ckpt' as top 3


Epoch 6:  50%|████▉     | 1877/3755 [01:47<01:47, 17.40it/s, loss=0.0863, v_num=]

Epoch 6, global step 1273: 'val_accuracy' reached 0.94800 (best 0.94800), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=6-step=1273.ckpt' as top 3


Epoch 6: 100%|█████████▉| 3754/3755 [03:28<00:00, 18.04it/s, loss=0.189, v_num=] 

Epoch 6, global step 1371: 'val_accuracy' reached 0.94600 (best 0.94800), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=6-step=1371.ckpt' as top 3


Epoch 7:  50%|████▉     | 1877/3755 [01:45<01:45, 17.86it/s, loss=0.0783, v_num=] 

Epoch 7, global step 1469: 'val_accuracy' reached 0.95160 (best 0.95160), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=7-step=1469.ckpt' as top 3


Epoch 7: 100%|█████████▉| 3754/3755 [03:24<00:00, 18.35it/s, loss=0.173, v_num=]  

Epoch 7, global step 1567: 'val_accuracy' reached 0.95000 (best 0.95160), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=7-step=1567.ckpt' as top 3


Epoch 8:  50%|████▉     | 1877/3755 [01:46<01:46, 17.56it/s, loss=0.0956, v_num=] 

Epoch 8, global step 1665: 'val_accuracy' reached 0.95080 (best 0.95160), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=8-step=1665.ckpt' as top 3


Epoch 8: 100%|█████████▉| 3754/3755 [03:26<00:00, 18.17it/s, loss=0.169, v_num=]  

Epoch 8, global step 1763: 'val_accuracy' was not in top 3


Epoch 9:  50%|████▉     | 1877/3755 [01:46<01:46, 17.71it/s, loss=0.0627, v_num=] 

Epoch 9, global step 1861: 'val_accuracy' reached 0.95160 (best 0.95160), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=9-step=1861.ckpt' as top 3


Epoch 9: 100%|█████████▉| 3754/3755 [03:25<00:00, 18.24it/s, loss=0.165, v_num=]  

Epoch 9, global step 1959: 'val_accuracy' reached 0.95160 (best 0.95160), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\models\\epoch=9-step=1959.ckpt' as top 3


Epoch 9: 100%|██████████| 3755/3755 [03:26<00:00, 18.16it/s, loss=0.16, v_num=] 


Start to fuse 3 checkpoints via the greedy soup algorithm.


Predicting DataLoader 0: 100%|██████████| 79/79 [00:03<00:00, 25.16it/s]
Predicting DataLoader 0: 100%|██████████| 79/79 [00:03<00:00, 24.05it/s]
Predicting DataLoader 0: 100%|██████████| 79/79 [00:03<00:00, 24.34it/s]


<autogluon.text.text_prediction.predictor.TextPredictor at 0x1640d55e610>

In [48]:
# Score the fitted predictor on the test split with the two metrics below.
reported_metrics = ['acc', 'f1_macro']
test_score = predictor.evaluate(test_df, metrics=reported_metrics)
print(test_score)

Predicting DataLoader 0: 100%|██████████| 216/216 [00:07<00:00, 28.44it/s]
{'acc': 0.9564206856478792, 'f1_macro': 0.8612385243420871}
