## Dataset Construction

In [1]:
import sys

# Add the parent directory to the import search path
# (presumably so that the `utils` import below resolves - confirm the project layout)
sys.path.append('../')

# Directory that is passed to get_df() further down.
# NOTE(review): this name shadows the builtin `dir`.
dir = '../data/full data/'

import json
import os
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

from utils import elem2sent

import warnings
# Suppress all warnings for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')

In [2]:
# Load the title/category sheet and turn it into a title -> category lookup.
# NOTE(review): get_df() lower-cases titles before looking them up here, so the
# "Article Title" column is presumably already lower-case - confirm against the sheet.
meta_data_df = pd.read_excel(
    '../data/category_configuration_marked.xlsx',
    sheet_name='article_names_matching',
)
title2category = {
    article_title: category
    for article_title, category in zip(meta_data_df["Article Title"], meta_data_df["Category 2"])
}

In [4]:
def get_df(dir):
    """Collect every titled article found in the XML files under ``dir``.

    Each entry of ``dir`` is parsed as XML. Files that cannot be parsed, or
    whose tree has no ``root[1][2]`` element, are skipped. Inside that element
    every ``div[@type='article']`` that carries a ``title`` attribute becomes
    one row of the result.

    Parameters
    ----------
    dir : str
        Directory containing the XML files, with or without a trailing path
        separator. (The name shadows the builtin ``dir``; it is kept so that
        existing callers keep working.)

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``article`` (the strings produced
        by ``elem2sent`` joined without a separator), ``category`` (looked up
        in ``title2category`` by the lower-cased title, ``'unknown'`` when the
        title is not present) and ``title`` (the lower-cased title). The
        columns exist even when no article was found.
    """
    # Rows are gathered in a plain list and converted once at the end:
    # growing a DataFrame row by row copies the whole frame on every insert,
    # and DataFrame.append no longer exists in pandas >= 2.0.
    records = []

    for file_name in os.listdir(dir):
        try:
            root = ET.parse(os.path.join(dir, file_name)).getroot()
            doc_body = root[1][2]
        except Exception:
            # Best effort: anything that is not a readable XML file with the
            # expected structure is skipped instead of aborting the whole run.
            continue

        for article in doc_body.findall(".//div[@type='article']"):
            if 'title' not in article.attrib:
                continue

            title = article.attrib['title'].lower()
            records.append({
                'article': ''.join(elem2sent(article)),
                'category': title2category.get(title, 'unknown'),
                'title': title,
            })

    return pd.DataFrame(records, columns=['article', 'category', 'title'])

# Parse the whole corpus once and persist the result.
df = get_df(dir)
# to_csv does not create missing directories; make sure the target exists so
# the parsing work above is not lost to a failed write.
os.makedirs('generated_data', exist_ok=True)
df.to_csv('generated_data/category_assignment_full.csv', index=False)


KeyboardInterrupt: 

In [5]:
# Preview the first rows of the article table built by get_df()
df.head()

NameError: name 'df' is not defined

## Data preprocessing

The data preprocessing is done in the following steps:
1. Drop all the articles that have no title or whose title has no matching category (category `unknown`), then drop the `title` column.
2. Replace every category that has fewer than 5 articles with "others". This is done to reduce the number of categories and to avoid overfitting. Moreover, categories with fewer than 5 articles are not very informative. The AutoGluon framework is also unable to handle categories with a single instance: if we did not replace the categories with fewer than 5 articles, the model would not be able to predict the category of the test data.
3. replace the category names with the category id. This is done to make the model training easier.
4. split the data into train and test data. The test data is used to evaluate the model performance.

In [42]:
# Keep only the articles whose title matched a known category
df_with_category = df[df['category'] != 'unknown']

# Drop the title column
df_with_category = df_with_category.drop(columns=['title'])

# Replace the categories with fewer than 5 samples with 'others'.
# The sizes are counted once and mapped back onto the rows instead of
# re-filtering the whole frame for every single row (which is quadratic).
# Rows with a missing category get a size of 0 and therefore become 'others' too.
category_sizes = df_with_category['category'].value_counts()
samples_per_row = df_with_category['category'].map(category_sizes).fillna(0)
df_with_category['category'] = df_with_category['category'].mask(samples_per_row < 5, 'others')

# Extract the category labels as a map: category name -> integer id,
# numbered in order of first appearance
category2label = {
    category: label
    for label, category in enumerate(df_with_category['category'].unique())
}
# Save the map so that the ids can be translated back to names later
with open('generated_data/category2label.json', 'w') as f:
    json.dump(category2label, f)

# Replace the category name with its integer label
df_with_category['category'] = df_with_category['category'].map(category2label)

# Train/test split (80/20, fixed seed for reproducibility)
train_df, test_df = train_test_split(df_with_category, test_size=0.2, random_state=42)

# Save the train and test data
train_df.to_csv('generated_data/category_train.csv', index=False)
test_df.to_csv('generated_data/category_test.csv', index=False)

## Test with an AutoML model as baseline

#### Load the data

In [3]:
import pandas as pd

# Reload the splits from the directory the preprocessing cell writes them to
# ('generated_data/', the same place category2label.json is read from below).
# The previous '../generated_data/' prefix pointed at a different directory,
# so a fresh run would have loaded stale or missing files.
train_df = pd.read_csv('generated_data/category_train.csv')
test_df = pd.read_csv('generated_data/category_test.csv')

In [4]:
# Number of training samples per category label, most frequent first
train_df['category'].value_counts()

11    2521
5     2106
7     2087
0     2013
9     2004
      ... 
56       6
43       5
60       5
46       5
38       3
Name: category, Length: 61, dtype: int64

#### Use autogluon as baseline

In [5]:
from autogluon.text import TextPredictor

# Predictor configuration: multiclass classification on the 'category'
# column, scored with weighted F1, artifacts stored under ./models
predictor_settings = dict(
    label='category',
    problem_type='multiclass',
    eval_metric='f1_weighted',
    path='./models',
)
baseline_predictor = TextPredictor(**predictor_settings)

# Use bert-base-uncased as the text backbone
backbone_overrides = {"model.hf_text.checkpoint_name": "bert-base-uncased"}

# No time limit; fixed seed
baseline_predictor.fit(
    train_df,
    time_limit=None,
    hyperparameters=backbone_overrides,
    seed=42,
)

Global seed set to 42
Auto select gpus: [0]
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type                         | Params
-------------------------------------------------------------------
0 | model             | HFAutoModelForTextPrediction | 109 M 
1 | validation_metric | Accuracy                     | 0     
2 | loss_func         | CrossEntropyLoss             | 0     
-------------------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
219.058   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 97: 'val_accuracy' reached 0.88840 (best 0.88840), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=0-step=97.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 0, global step 195: 'val_accuracy' reached 0.91880 (best 0.91880), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=0-step=195.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 293: 'val_accuracy' reached 0.94800 (best 0.94800), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=1-step=293.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 391: 'val_accuracy' reached 0.95120 (best 0.95120), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=1-step=391.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 489: 'val_accuracy' reached 0.95120 (best 0.95120), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=2-step=489.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 587: 'val_accuracy' reached 0.95040 (best 0.95120), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=2-step=587.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 685: 'val_accuracy' reached 0.95840 (best 0.95840), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=3-step=685-v1.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 783: 'val_accuracy' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 881: 'val_accuracy' reached 0.95800 (best 0.95840), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=4-step=881-v1.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 979: 'val_accuracy' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 1077: 'val_accuracy' reached 0.95760 (best 0.95840), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=5-step=1077-v1.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 1175: 'val_accuracy' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 1273: 'val_accuracy' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 1371: 'val_accuracy' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1469: 'val_accuracy' reached 0.96040 (best 0.96040), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=7-step=1469.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1567: 'val_accuracy' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 1665: 'val_accuracy' reached 0.95880 (best 0.96040), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=8-step=1665.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 1763: 'val_accuracy' reached 0.96040 (best 0.96040), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=8-step=1763.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 1861: 'val_accuracy' reached 0.96160 (best 0.96160), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=9-step=1861.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 1959: 'val_accuracy' reached 0.96120 (best 0.96160), saving model to 'D:\\x\\cs\\project\\AI4LEGAL\\topic_classification\\models\\epoch=9-step=1959.ckpt' as top 3
Start to fuse 3 checkpoints via the greedy soup algorithm.


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

<autogluon.text.text_prediction.predictor.TextPredictor at 0x1fc0d2dac40>

#### Now we can test the model

In [6]:
# Score the fitted baseline on the held-out split
reported_metrics = ['acc', 'f1_weighted']
test_score = baseline_predictor.evaluate(test_df, metrics=reported_metrics)
print(test_score)

Predicting: 0it [00:00, ?it/s]

{'acc': 0.9596165020337013, 'f1_weighted': 0.9587883232546399}


In [14]:
# Reload the category-name -> label map saved during preprocessing
with open('generated_data/category2label.json', 'r') as mapping_file:
    category2label = json.loads(mapping_file.read())

# Number of training samples in the 'Fair and Equitable Treatment' category
fet_label = category2label['Fair and Equitable Treatment']
sum(train_df['category'] == fet_label)

76

In [15]:
# Number of test samples in the 'Fair and Equitable Treatment' category
sum(test_df['category'] == category2label['Fair and Equitable Treatment'])

15