In [10]:
# Load both splits. lines=True reads one JSON record per line; the .gz suffix
# lets pandas infer the compression from the file name.
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")
# testing_set is consumed by build_data_set(testing_set) later in the notebook,
# so it has to be loaded here for a fresh-kernel "Run All" to succeed.
testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

In [5]:
# Every label in all_tiers_100 except "PersonalizedProduct", in sorted order.
subset = sorted(set(all_tiers_100).difference({"PersonalizedProduct"}))

# Display names for the same labels, looked up in tier_translations.
nice_subset = [tier_translations[label] for label in subset]
nice_subset

['Analysis and Modeling',
 'Analysis and Modeling: 3D Modeling',
 'Anatomical Target',
 'Anatomical Target: Lower Extremity',
 'Anatomical Target: Lower Extremity - Hip',
 'Anatomical Target: Lower Extremity - Knee',
 'Anatomical Target: Torso',
 'Anatomical Target: Torso - Spine',
 'Anatomical Target: Upper Extremity',
 'Anatomical Target: Upper Extremity - Shoulder',
 'Imaging',
 'Imaging: CT',
 'Imaging: MRI',
 'Imaging: Ultrasound',
 'Manufacturing',
 'Manufacturing: Additive Manufacturing',
 'Personalized Product: Guide or Jig',
 'Personalized Product: Implant',
 'Specification of Use',
 'Specification of Use: Disease',
 'Specification of Use: Joint Replacement',
 'Surgical Method']

In [8]:
import funcy as f
@f.collecting
def create_examples(row):
    """Turn one data-set row into a list of (text_a, text_b, target) examples.

    The first example pairs the row's abstract with its claims and has
    target 1. After that, for the abstract and then for the claims, one
    example is produced per label in ``subset``: the text, the string
    ``"[Tag]"`` followed by the label's entry in ``tier_translations``, and
    the row's value for that label.

    ``f.collecting`` gathers everything yielded here into a list.
    """
    sections = (row.abstract, row.claims)
    yield (*sections, 1)
    yield from (
        (section, f"[Tag]{tier_translations[tag_key]}", row[tag_key])
        for section in sections
        for tag_key in subset
    )

In [17]:
# Build the example list for each training row (axis=1 passes rows), then
# explode so that every example becomes its own entry.
triplets = (
    training_set
    .apply(create_examples, axis=1)
    .explode()
)

In [18]:
# Preview the exploded examples; the index repeats the source row's label.
triplets

0      (A biocompatible prosthetic device comprising ...
0      (A biocompatible prosthetic device comprising ...
0      (A biocompatible prosthetic device comprising ...
0      (A biocompatible prosthetic device comprising ...
0      (A biocompatible prosthetic device comprising ...
                             ...                        
971    (What is claimed is: \n     \n         1 . A t...
971    (What is claimed is: \n     \n         1 . A t...
971    (What is claimed is: \n     \n         1 . A t...
971    (What is claimed is: \n     \n         1 . A t...
971    (What is claimed is: \n     \n         1 . A t...
Length: 43740, dtype: object

In [24]:
from tqdm.auto import tqdm
def build_data_set(data_set):
    """Build (label, positive, negative) text triplets for each translated label.

    For every label that appears in one of ``tier1`` .. ``tier4`` and in
    ``all_tiers_100`` and has an entry in ``tier_translations``, each abstract
    whose label column is true is paired with each abstract whose label
    column is false.

    Labels with no positive or no negative abstracts are skipped: they cannot
    form a triplet, and an outer join on the constant key would otherwise
    emit rows with missing ``positive``/``negative``/``label`` values.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Needs an ``abstract`` column plus one column per label. Label columns
        are used directly as boolean masks (assumed to be bool dtype --
        TODO confirm against the loaded JSON).

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``positive``, ``negative``. The per-label frames
        are concatenated without resetting the index, so index values repeat
        across labels. An empty frame with these columns is returned when no
        label yields any triplet.
    """
    columns = ['label', 'positive', 'negative']
    frames = []
    for tier in [tier1, tier2, tier3, tier4]:
        # Set iteration order is arbitrary between runs; sorting keeps the
        # output row order reproducible.
        for label in tqdm(sorted(set(tier) & set(all_tiers_100))):
            nice_label = tier_translations.get(label)
            if nice_label is None:
                continue
            is_positive = data_set[label]
            positives = data_set[is_positive].abstract.to_frame("positive")
            negatives = data_set[~is_positive].abstract.to_frame("negative")
            if positives.empty or negatives.empty:
                # No triplet can be formed without both sides.
                continue
            # A constant join key turns the merge into a full cross product.
            positives['fk'] = 0
            positives['label'] = f"This is labelled: {nice_label}."
            negatives['fk'] = 0
            triplets = pd.merge(positives, negatives, how="inner", on="fk")
            frames.append(triplets[columns])
    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)

In [25]:
# Build the (label, positive, negative) triplet frame from the training split.
training_examples = build_data_set(training_set)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [26]:
# Build the (label, positive, negative) triplet frame from the testing split.
testing_examples = build_data_set(testing_set)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [27]:
# Persist the training triplets as a Parquet file in the working directory.
training_examples.to_parquet("training_triplets.parquet")

In [28]:
# Persist the testing triplets as a Parquet file in the working directory.
testing_examples.to_parquet("testing_triplets.parquet")

In [29]:
# (rows, columns) of the training triplet frame.
training_examples.shape

(3701622, 3)

In [30]:
# (rows, columns) of the testing triplet frame.
testing_examples.shape

(228012, 3)