In [1]:
import pandas as pd
import numpy as np
from util import *

In [2]:
# Load both splits; each file is read as line-delimited JSON (one record per line).
training_set = pd.read_json("training_set.json.gz", orient="records", lines=True)
testing_set = pd.read_json("testing_set.json.gz", orient="records", lines=True)

In [56]:
def convert_to_subtags(tag):
    """Lazily yield the pieces of `tag` obtained by splitting it on '_'.

    Args:
        tag: A string tag name.

    Yields:
        Each '_'-separated component of `tag`, in order. A tag without any
        underscore yields itself once.
    """
    yield from tag.split('_')

In [60]:
def array_sublabels(row):
    """Return the sorted, de-duplicated subtags of every tier set in `row`.

    A tier from `all_tiers` counts as set when it is present in `row` and its
    value is truthy. Each such tier name is broken into subtags with
    `convert_to_subtags`.

    Args:
        row: A mapping-like record (e.g. one DataFrame row) keyed by tier name.

    Returns:
        A sorted list of the distinct subtags.
    """
    active_tiers = (tier for tier in all_tiers if tier in row and row[tier])
    unique_subtags = {
        piece
        for tier in active_tiers
        for piece in convert_to_subtags(tier)
    }
    return sorted(unique_subtags)

In [62]:
# Attach the per-row subtag list to both splits, computed from the tier columns only.
for split_frame in (training_set, testing_set):
    split_frame['sublabels'] = split_frame[all_tiers].apply(array_sublabels, axis=1)

In [84]:
@f.collecting
def get_children(tag):
    """Yield every tier in `all_tiers` whose name extends `tag`.

    A candidate qualifies when it differs from `tag` and starts with it, so
    all longer names sharing the prefix are produced, not just the next level.

    Args:
        tag: Tier name to look below. `None` is treated as the empty string,
            in which case every non-empty tier name qualifies.

    Yields:
        Matching tier names, in `all_tiers` order. The `f.collecting`
        decorator is defined outside this cell; presumably it gathers the
        yielded values into a list - confirm against `util`.
    """
    # Identity test: `== None` would dispatch to a custom __eq__ on the argument.
    if tag is None:
        tag = ""
    # NOTE(review): this is a plain string-prefix test with no '_' boundary
    # check, so a tier that merely begins with the same characters is also
    # reported as a child - confirm tier names cannot collide this way.
    for candidate in all_tiers:
        if candidate != tag and candidate.startswith(tag):
            yield candidate

In [87]:
# Precompute, for every tier, the tiers found below it so row-wise lookups stay cheap.
children_dict = dict((tier, get_children(tier)) for tier in all_tiers)

def has_child_tag(row, tag):
    """Report whether any child tier of `tag` is set in `row`.

    Args:
        row: A mapping-like record (e.g. one DataFrame row) keyed by tier name.
        tag: Tier name; must be a key of `children_dict`.

    Returns:
        True if at least one child listed in `children_dict[tag]` is present
        in `row` with a truthy value, otherwise False.

    Raises:
        KeyError: If `tag` is not a key of `children_dict`.
    """
    # Child columns absent from the row are skipped rather than raising,
    # mirroring the `c in row` guard used by the other row helpers.
    return any(c in row and row[c] for c in children_dict[tag])

In [89]:
@f.collecting
def final_tag(row):
    """Yield the tiers set in `row` that have no set child tier.

    Args:
        row: A mapping-like record (e.g. one DataFrame row) keyed by tier name.

    Yields:
        Each tier from `all_tiers`, in order, that is present in `row`, whose
        value equals True, and for which `has_child_tag` reports no set child.
    """
    for tier in all_tiers:
        if tier not in row:
            continue
        # Explicit equality rather than truthiness: truthy values that do not
        # compare equal to True are not counted as set.
        is_set = row[tier] == True
        if is_set and not has_child_tag(row, tier):
            yield tier
            

In [105]:
# One list of final tags per training row (final_tag is called once per row).
training_set['final_tags'] = training_set.apply(final_tag, axis='columns')

In [110]:
# One list of final tags per testing row (final_tag is called once per row).
testing_set['final_tags'] = testing_set.apply(final_tag, axis='columns')

In [109]:
# Export one (abstract, tag) row per final tag. The two exported columns are
# selected before exploding so the other columns are not replicated for every
# tag; the written rows are the same as when exploding the whole frame first.
training_set[['abstract', 'final_tags']].explode('final_tags').to_parquet("cte_tagged.parquet")

In [111]:
# Export one (abstract, tag) row per final tag. The two exported columns are
# selected before exploding so the other columns are not replicated for every
# tag; the written rows are the same as when exploding the whole frame first.
testing_set[['abstract', 'final_tags']].explode('final_tags').to_parquet("cte_tagged_testing.parquet")

In [100]:
from sklearn.metrics import pairwise_distances

In [101]:
# Pairwise Hamming distance between the tier indicator vectors of all training rows.
# Y is omitted on purpose: with a single input, distances are computed between the
# rows of X itself, so the feature frame is built once instead of twice.
distances = pairwise_distances(training_set[all_tiers], metric="hamming")

In [102]:
# Display the pairwise distance matrix (the notebook prints a truncated repr).
distances

array([[0.        , 0.27941176, 0.19117647, ..., 0.26470588, 0.19117647,
        0.27941176],
       [0.27941176, 0.        , 0.14705882, ..., 0.04411765, 0.08823529,
        0.14705882],
       [0.19117647, 0.14705882, 0.        , ..., 0.16176471, 0.17647059,
        0.20588235],
       ...,
       [0.26470588, 0.04411765, 0.16176471, ..., 0.        , 0.10294118,
        0.13235294],
       [0.19117647, 0.08823529, 0.17647059, ..., 0.10294118, 0.        ,
        0.17647059],
       [0.27941176, 0.14705882, 0.20588235, ..., 0.13235294, 0.17647059,
        0.        ]])

In [95]:
from sklearn.preprocessing import MultiLabelBinarizer

In [96]:
# Binarizer for the subtag lists; the arguments are the defaults, spelled out.
mlb = MultiLabelBinarizer(classes=None, sparse_output=False)

In [97]:
# Per-row sorted list of distinct subtags, derived from the tier columns only.
training_set['subtags'] = training_set[all_tiers].apply(array_sublabels, axis='columns')

In [98]:
# Fit the binarizer on the training subtag lists; fit() returns the estimator,
# which the notebook echoes as the cell output.
mlb.fit(y=training_set['subtags'])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [99]:
# Display the fitted binarizer's repr.
mlb

MultiLabelBinarizer(classes=None, sparse_output=False)