In [1]:
from transformers import pipeline

# Build a text2text-generation pipeline backed by the REBEL checkpoint.
# Its tokenizer is used manually later on, since the special tokens are
# needed in the decoded text.
triplet_extractor = pipeline(
    'text2text-generation',
    model='Babelscape/rebel-large',
    tokenizer='Babelscape/rebel-large',
)

# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    """Parse REBEL's linearised output into a list of relation triplets.

    The layout handled here is::

        <triplet> head <subj> tail <obj> relation [<subj> tail <obj> relation]... [<triplet> ...]

    Args:
        text: Decoded generation that still contains the ``<triplet>``,
            ``<subj>`` and ``<obj>`` markers. ``<s>``, ``<pad>`` and ``</s>``
            are removed before parsing.

    Returns:
        A list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts in the
        order they occur in ``text``. Only complete triplets (non-empty head,
        relation and tail) are returned; fragments left behind by a truncated
        or malformed generation are dropped instead of being paired with
        stale parts of an earlier triplet.
    """
    for special in ("<s>", "<pad>", "</s>"):
        text = text.replace(special, "")
    # Pad the markers so they are split off even when glued to a word.
    for marker in ("<triplet>", "<subj>", "<obj>"):
        text = text.replace(marker, f" {marker} ")

    triplets = []
    head, tail, relation = [], [], []
    target = None  # list that receives plain tokens; None before the first marker

    def flush():
        # Emit the triplet collected so far, but only if every part is present.
        if head and tail and relation:
            triplets.append({'head': ' '.join(head), 'type': ' '.join(relation), 'tail': ' '.join(tail)})

    for token in text.split():
        if token == "<triplet>":
            # A new head starts: nothing from the previous group may leak over.
            flush()
            head.clear()
            tail.clear()
            relation.clear()
            target = head
        elif token == "<subj>":
            # Another tail for the same head: the old relation must not be reused.
            flush()
            tail.clear()
            relation.clear()
            target = tail
        elif token == "<obj>":
            relation.clear()
            target = relation
        elif target is not None:
            target.append(token)
    flush()
    return triplets



2023-06-04 04:29:02.129648: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-04 04:29:02.300613: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-04 04:29:02.342534: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-04 04:29:03.081408: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

In [14]:
# Input document to extract relations from (kept verbatim; it is runtime data).
raw_text = "Coronavirus (COVID-19) information for job seekers\nExisiting job seekers\nIf you are a current job seeker or participant, this fact sheet provides\nimportant information about mutual obligation requirements, appointments with\nyour provider, and what to do if you are self-isolating:\n\nInformation for job seekers and participants\n\nIf you are participating in the ParentsNext program, this fact sheet provides\nimportant information about your activities and appointments.\n\n\nInformation for ParentsNext participants\n\n\nParentsNext participants Frequently Asked Questions\n\n\nIf you are a New Business Assistance with NEIS participant, these Frequently\nAsked Questions (FAQ) provides information about accessing the Coronavirus\nSupplement and what support is available during this time:\n\nNew Business Assistance with NEIS participants - Frequently Asked Questions\n\nIf you are a New Business Assistance with NEIS provider, these Frequently\nAsked Questions (FAQ) provides information about supporting NEIS participants\nduring the Coronavirus situation.\n\nNew Business Assistance with NEIS providers – Frequently Asked Questions\n\n*[NEIS]: New Enterprise Incentive Scheme"
# Ask the pipeline for token ids instead of text (return_tensors=True,
# return_text=False) and decode them with the pipeline's own tokenizer, so the
# printed text still carries the <triplet>/<subj>/<obj> markers.
# NOTE(review): the ["generated_token_ids"]["output_ids"][0] indexing relies on
# the result structure of the installed transformers version - confirm on upgrade.
extracted_text = triplet_extractor.tokenizer.batch_decode(triplet_extractor(raw_text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"][0])
# Show the first decoded sequence.
print(extracted_text[0])

<s><triplet> Coronavirus (COVID-19) information for job seekers <subj> fact sheet <obj> instance of</s>


In [15]:
# Parse the first decoded sequence into a list of head/type/tail dicts and show it.
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)

[{'head': 'Coronavirus (COVID-19) information for job seekers', 'type': 'instance of', 'tail': 'fact sheet'}]


# Longer Sequences

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def extract_triplets(text):
    """Parse REBEL's linearised output into a list of relation triplets.

    The layout handled here is::

        <triplet> head <subj> tail <obj> relation [<subj> tail <obj> relation]... [<triplet> ...]

    Args:
        text: Decoded generation that still contains the ``<triplet>``,
            ``<subj>`` and ``<obj>`` markers. ``<s>``, ``<pad>`` and ``</s>``
            are removed before parsing.

    Returns:
        A list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts in the
        order they occur in ``text``. Only complete triplets (non-empty head,
        relation and tail) are returned; fragments left behind by a truncated
        or malformed generation are dropped instead of being paired with
        stale parts of an earlier triplet.
    """
    for special in ("<s>", "<pad>", "</s>"):
        text = text.replace(special, "")
    # Pad the markers so they are split off even when glued to a word.
    for marker in ("<triplet>", "<subj>", "<obj>"):
        text = text.replace(marker, f" {marker} ")

    triplets = []
    head, tail, relation = [], [], []
    target = None  # list that receives plain tokens; None before the first marker

    def flush():
        # Emit the triplet collected so far, but only if every part is present.
        if head and tail and relation:
            triplets.append({'head': ' '.join(head), 'type': ' '.join(relation), 'tail': ' '.join(tail)})

    for token in text.split():
        if token == "<triplet>":
            # A new head starts: nothing from the previous group may leak over.
            flush()
            head.clear()
            tail.clear()
            relation.clear()
            target = head
        elif token == "<subj>":
            # Another tail for the same head: the old relation must not be reused.
            flush()
            tail.clear()
            relation.clear()
            target = tail
        elif token == "<obj>":
            relation.clear()
            target = relation
        elif target is not None:
            target.append(token)
    flush()
    return triplets

# Load the REBEL checkpoint: tokenizer first, then the seq2seq model
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

# Keyword arguments forwarded to model.generate(); with 3 beams and
# 3 returned sequences, each input yields three candidate generations.
gen_kwargs = dict(
    max_length=512,
    length_penalty=0,
    num_beams=3,
    num_return_sequences=3,
)

# Text to extract triplets from (kept verbatim; it is runtime data)
text = 'Coronavirus (COVID-19) information for job seekers\nExisiting job seekers\nIf you are a current job seeker or participant, this fact sheet provides\nimportant information about mutual obligation requirements, appointments with\nyour provider, and what to do if you are self-isolating:\n\nInformation for job seekers and participants\n\nIf you are participating in the ParentsNext program, this fact sheet provides\nimportant information about your activities and appointments.\n\n\nInformation for ParentsNext participants\n\n\nParentsNext participants Frequently Asked Questions\n\n\nIf you are a New Business Assistance with NEIS participant, these Frequently\nAsked Questions (FAQ) provides information about accessing the Coronavirus\nSupplement and what support is available during this time:\n\nNew Business Assistance with NEIS participants - Frequently Asked Questions\n\nIf you are a New Business Assistance with NEIS provider, these Frequently\nAsked Questions (FAQ) provides information about supporting NEIS participants\nduring the Coronavirus situation.\n\nNew Business Assistance with NEIS providers – Frequently Asked Questions\n\n*[NEIS]: New Enterprise Incentive Scheme'

# Tokenize the text: truncate to at most 512 tokens and return PyTorch tensors
model_inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors = 'pt')

# Generate: inputs are moved to the model's device, and the generation
# settings come from gen_kwargs
generated_tokens = model.generate(
    model_inputs["input_ids"].to(model.device),
    attention_mask=model_inputs["attention_mask"].to(model.device),
    **gen_kwargs,
)

# Turn the generated ids back into text; special tokens are kept because
# extract_triplets() relies on the <triplet>/<subj>/<obj> markers
decoded_preds = tokenizer.batch_decode(
    generated_tokens,
    skip_special_tokens=False,
)

# Parse each returned sequence and show its triplets under a numbered header
for idx, sentence in enumerate(decoded_preds):
    print(f'Prediction triplets sentence {idx}')
    print(extract_triplets(sentence))


Prediction triplets sentence 0
[{'head': 'Coronavirus (COVID-19) information for job seekers', 'type': 'instance of', 'tail': 'fact sheet'}]
Prediction triplets sentence 1
[{'head': 'Coronavirus (COVID-19) information for job seekers', 'type': 'part of', 'tail': 'ParentsNext program'}]
Prediction triplets sentence 2
[{'head': 'Coronavirus (COVID-19) information for job seekers', 'type': 'instance of', 'tail': 'fact sheet'}, {'head': 'Coronavirus (COVID-19) information for job seekers', 'type': 'instance of', 'tail': 'fact sheet'}]
