In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import csv
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statistics

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from sklearn.model_selection import train_test_split

import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)
# **Preprocessing data**
# Reading data
import json


#open the file, and format correctly
f =open('all.jsonl', 'r')
json_object = json.dumps(f.readlines(), indent=4)
f.close()

#save better format into file
p = open('sample.txt', 'w')
for i in json_object:
    p.write(i)
p.close()

#open new file, and save each 
j = open('sample.txt', 'r')
text = json.loads(j.read())
j.close()

#compile all json dicts into a list
info = []
for i in text:
    info.append(json.loads(str(i)))
    
def degreekify(char):
    #char will be a character
    greek = {'α': '[alpha]', 'β':'[beta]', 'γ': '[gamma]', 'δ':'[delta]', 'ε': '[epsilon]', 'ζ':'[zeta]', 'η': '[eta]',
            'θ':'[theta]', 'ι': '[iota]', 'κ':'[kappa]', 'λ':'[lambda]', 'μ': '[mu]', 'ν':'[nu]', 'ξ':'[xi]', 'ο':'[omicron]', 'π':'[pi]', 'ρ':'[rho]',
            'σ': '[sigma]', 'τ': '[tau]', 'υ':'[upsilon]', 'φ':'[phi]', 'χ':'[chi]', 'ψ':'[psi]', 'ω':'[omega]' }
    if char in greek:
        return greek[char]
    else:
        return char
def pre_process(text, annotations):
    #text will be the straight sentence, info[i]['text']
    #annotations will be the list of labels, must be info[i]['annotations']
    
    text_dict = []
    
    for i in range(len(text)):
        text_dict.append(degreekify(text[i]))
    
    
    
    ann_indices = []
    def fun(x):
        return x['start_offset']
    annotations.sort(key=fun)
    
    
    
    if len(annotations)==0:
        ann_indices.append([[0, len(text)],0])
    else:
        ann_indices.append([[0, annotations[0]['start_offset']], 0])
        for i in range(len(annotations)-1):
            ann_indices.append([[annotations[i]['start_offset'], annotations[i]['end_offset']], data_tags.index(annotations[i]['label'])])
            ann_indices.append([[annotations[i]['end_offset'], annotations[i+1]['start_offset']], 0])
            
        ann_indices.append([[annotations[-1]['start_offset'], annotations[-1]['end_offset']], data_tags.index(annotations[-1]['label'])])
        ann_indices.append([[annotations[-1]['end_offset'], len(text)], 0])
         
    
    labels = []
    sentences = []
    for a in ann_indices:
        
        if a[0][1]-a[0][0] !=0:
            together = ''
            for i in range(a[0][0], a[0][1]):
                together += text_dict[i]
                
            toke = together.split()
            sentences.extend(toke)
            t = len(toke)
            if t != 0:
                temp = [data_tags[a[1]+1]] * t
                if a[1] != 0:
                    temp[0] = data_tags[a[1]]
                labels.extend(temp)
 
    return labels, sentences
def reduce(sent, label, slist, llist):
    lens = len(sent)
    if lens < 256:
        slist.append(sent)
        llist.append(label)
    else:
        t = lens//2
        return reduce(sent[:t], label[:t], slist, llist), reduce(sent[t:], label[t:], slist, llist)
#create labels
data_tags = ['ahhhhhhhhhhhhhhhhhhhh','0','Metal', 'M-cont' , 'Element', 'E-cont', 'Acid', 'A-cont', 'Yield' , 'Y-cont', 'Separation Method' , 'S-cont', 'Resin', 'R-cont', 'Method of Analysis', 'T-cont', 'pH', 'P-cont', 'Chemical Compound', 'H-cont', 'Organic solvent', 'O-cont', 'Element Group', 'G-cont', 'Inorganic Solvent', 'I-cont', 'Flowrate', 'F-cont', 'Acid Concentration', 'C-cont', 'Reagent', 'X-cont']

sent_test, label_test = [], []
sentences, labels = [], []

for i in range(len(info)):
    l, s = pre_process(info[i]['text'], info[i]['entities'])
  
    if i % 5 == 0:
        reduce(s,l,sent_test, label_test)

    else:
        reduce(s,l,sentences, labels)


data_tags = data_tags[1:]


# Determine the list of tags
tag_values = data_tags
print(tag_values)

tag_values.append("PAD")
print(tag_values)

tag2idx = {t: i for i, t in enumerate(tag_values)}
print(tag2idx)
    
    
idx2tag = {value: key for key, value in tag2idx.items()}
bio_labels = [
    'O',
    'B-Metal',
    'I-Metal',
    'B-Element',
    'I-Element',
    'B-Acid',
    'I-Acid',
    'B-Yield',
    'I-Yield',
    'B-SeparationMethod',
    'I-SeparationMethod',
    'B-Resin',
    'I-Resin',
    'B-MethodOfAnalysis',
    'I-MethodOfAnalysis',
    'B-pH',
    'I-pH',
    'B-ChemicalCompound',
    'I-ChemicalCompound',
    'B-OrganicSolvent',
    'I-OrganicSolvent',
    'B-ElementGroup',
    'I-ElementGroup',
    'B-InorganicSolvent',
    'I-InorganicSolvent',
    'B-Flowrate',
    'I-Flowrate',
    'B-AcidConcentration',
    'I-AcidConcentration',
    'B-Reagent',
    'I-Reagent',
    'O'
]


['0', 'Metal', 'M-cont', 'Element', 'E-cont', 'Acid', 'A-cont', 'Yield', 'Y-cont', 'Separation Method', 'S-cont', 'Resin', 'R-cont', 'Method of Analysis', 'T-cont', 'pH', 'P-cont', 'Chemical Compound', 'H-cont', 'Organic solvent', 'O-cont', 'Element Group', 'G-cont', 'Inorganic Solvent', 'I-cont', 'Flowrate', 'F-cont', 'Acid Concentration', 'C-cont', 'Reagent', 'X-cont']
['0', 'Metal', 'M-cont', 'Element', 'E-cont', 'Acid', 'A-cont', 'Yield', 'Y-cont', 'Separation Method', 'S-cont', 'Resin', 'R-cont', 'Method of Analysis', 'T-cont', 'pH', 'P-cont', 'Chemical Compound', 'H-cont', 'Organic solvent', 'O-cont', 'Element Group', 'G-cont', 'Inorganic Solvent', 'I-cont', 'Flowrate', 'F-cont', 'Acid Concentration', 'C-cont', 'Reagent', 'X-cont', 'PAD']
{'0': 0, 'Metal': 1, 'M-cont': 2, 'Element': 3, 'E-cont': 4, 'Acid': 5, 'A-cont': 6, 'Yield': 7, 'Y-cont': 8, 'Separation Method': 9, 'S-cont': 10, 'Resin': 11, 'R-cont': 12, 'Method of Analysis': 13, 'T-cont': 14, 'pH': 15, 'P-cont': 16, 'Chemi

In [2]:
label_list = data_tags[1:-1:2]
label_list

['Metal',
 'Element',
 'Acid',
 'Yield',
 'Separation Method',
 'Resin',
 'Method of Analysis',
 'pH',
 'Chemical Compound',
 'Organic solvent',
 'Element Group',
 'Inorganic Solvent',
 'Flowrate',
 'Acid Concentration',
 'Reagent']

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Llama-2-7B-GPTQ"
# To use a different branch, change revision
# For example: revision="main"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             revision="gptq-4bit-32g-actorder_True")
#model = exllama_set_max_input_length(model, 4096)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)




In [4]:
info[0]['entities']

[{'id': 10627, 'label': 'Acid', 'start_offset': 41, 'end_offset': 63},
 {'id': 10626, 'label': 'Acid', 'start_offset': 65, 'end_offset': 69},
 {'id': 15947,
  'label': 'Separation Method',
  'start_offset': 76,
  'end_offset': 94},
 {'id': 15948,
  'label': 'Separation Method',
  'start_offset': 96,
  'end_offset': 99},
 {'id': 15949,
  'label': 'Separation Method',
  'start_offset': 104,
  'end_offset': 127},
 {'id': 15950,
  'label': 'Separation Method',
  'start_offset': 129,
  'end_offset': 133},
 {'id': 10632, 'label': 'Metal', 'start_offset': 139, 'end_offset': 144},
 {'id': 10633, 'label': 'Acid', 'start_offset': 178, 'end_offset': 199},
 {'id': 10625, 'label': 'Acid', 'start_offset': 231, 'end_offset': 241},
 {'id': 10624, 'label': 'Acid', 'start_offset': 242, 'end_offset': 248},
 {'id': 10634, 'label': 'Acid', 'start_offset': 302, 'end_offset': 307},
 {'id': 10635, 'label': 'Acid', 'start_offset': 312, 'end_offset': 334},
 {'id': 15951,
  'label': 'Separation Method',
  'start

In [20]:
def remove_id(entities):
    s = ""
    for e in entities:
        try:
            e.pop('id')
        except:
            pass
        s += f"{e}\n"

    return s

prompt = f"""Context: Please perform Named Entity Recognition (NER) on the following text with label entities [{"] [".join(label_list)}]: 
    Q: {info[0]['text']}
    A: {remove_id(info[0]['entities'])}
    Q {info[1]['text']}
    A: {remove_id(info[1]['entities'])}
    Q: {info[4]['text']}
    A: 
    """

In [21]:
prompt

"Context: Please perform Named Entity Recognition (NER) on the following text with label entities [Metal] [Element] [Acid] [Yield] [Separation Method] [Resin] [Method of Analysis] [pH] [Chemical Compound] [Organic solvent] [Element Group] [Inorganic Solvent] [Flowrate] [Acid Concentration] [Reagent]: \n    Q: Extraction systems based on extraction of tetrafluoroboric acid (HBF4) with tributyl phosphate (TBP) and triamylphosphine oxide (TAPO) for boron isotope separation were studied. Tetrafluoroboric acid was studied in comparison with boric acid (H3BO3). The extraction isotherms at 20°C for four systems (boric and tetrafluoroboric acids extracted with TBP and TAPO in o-xylene) were obtained, and single-stage isotope separation factors for these four systems were determined.\n    A: {'label': 'Acid', 'start_offset': 41, 'end_offset': 63}\n{'label': 'Acid', 'start_offset': 65, 'end_offset': 69}\n{'label': 'Separation Method', 'start_offset': 76, 'end_offset': 94}\n{'label': 'Separation 

### Direct use start and off position as prompt

In [None]:
print("\n\n*** Generate:")

input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

*** Generate:
Context: Please perform Named Entity Recognition (NER) on the following text with label entities [Metal] [Element] [Acid] [Yield] [Separation Method] [Resin] [Method of Analysis] [pH] [Chemical Compound] [Organic solvent] [Element Group] [Inorganic Solvent] [Flowrate] [Acid Concentration] [Reagent]: 
    Q: Extraction systems based on extraction of tetrafluoroboric acid (HBF4) with tributyl phosphate (TBP) and triamylphosphine oxide (TAPO) for boron isotope separation were studied. Tetrafluoroboric acid was studied in comparison with boric acid (H3BO3). The extraction isotherms at 20°C for four systems (boric and tetrafluoroboric acids extracted with TBP and TAPO in o-xylene) were obtained, and single-stage isotope separation factors for these four systems were determined.
    A: {'label': 'Acid', 'start_offset': 41, 'end_offset': 63}
{'label': 'Acid', 'start_offset': 65, 'end_offset': 69}
{'label': 'Separation Method', 'start_offset': 76, 'end_offset': 94}
{'label': 'Separation Method', 'start_offset': 96, 'end_offset': 99}
{'label': 'Separation Method', 'start_offset': 104, 'end_offset': 127}
{'label': 'Separation Method', 'start_offset': 129, 'end_offset': 133}
{'label': 'Metal', 'start_offset': 139, 'end_offset': 144}
{'label': 'Acid', 'start_offset': 178, 'end_offset': 199}
{'label': 'Acid', 'start_offset': 231, 'end_offset': 241}
{'label': 'Acid', 'start_offset': 242, 'end_offset': 248}
{'label': 'Acid', 'start_offset': 302, 'end_offset': 307}
{'label': 'Acid', 'start_offset': 312, 'end_offset': 334}
{'label': 'Separation Method', 'start_offset': 350, 'end_offset': 354}
{'label': 'Separation Method', 'start_offset': 358, 'end_offset': 363}
{'label': 'Organic solvent', 'start_offset': 366, 'end_offset': 374}

    Q For example, materials based on 10B are demanded in nuclear power engineering, namely, in reactor control and protection systems and in production of steel for transportation racks and containers for spent nuclear fuel (SNF). Boron-10 is used in neutron capture therapy for cancer treatment.
    A: {'label': 'Element', 'start_offset': 32, 'end_offset': 35}
{'label': 'Element', 'start_offset': 226, 'end_offset': 234}

    Q: The mixture of nuclides, remaining after the separation of thorium isotopes contains 228Ra (in which 228Ac accumulates) and also 224Ra (in equilibrium with other decay products, including gaseous 220Rn) and the stable 208Pb. A single-stage isolation of 228Ac from a mixture of this kind is a complicated task, and deep purification to remove 224Ra is necessary for obtaining the radiochemically pure 228Ac. For this reason, the mixture was kept for one month till the complete decay of 224Ra. In the final stage, 228Ac was separated from 228Ra, 208Pb, and newly formed 228Th. For this purpose, a solution of 228Ra in HNO3 was evaporated to dryness in a glass flask on an electric hot plate, dissolved in 4 M HNO3; and the solution was placed in a column with RE extraction-chromatographic sorbent (Triskem Int., France, based on carbamoylmethylphosphine oxide and tributyl phosphate) with volume of 2 mL. 228Ra and 228Ac were separated by selective elution from the column: first, 228Ra was eluted with a 4 M HNO3 solution, and then 228Ac, in 0.05 M HNO3, with fractions collected each having a volume of 1 mL The purity of 228Ac was determined by registering its gamma-spectrum during three days. To perform experiments on sorption of actinium on CNMs, the 228Ac eluate in 0.05 M HNO3 was evaporated to dryness and dissolved in several mL of a phosphate buffered saline (PBS) with pH 7.
    A: 
    	{'label': 'Flowrate', 'start_offset': 37, 'end_offset': 39}
    	{'label': 'Acid Concentration', 'start_offset': 48, 'end_offset': 50}
    	{'label': 'Reagent', 'start_offset': 51, 'end_offset': 53}
    	{'label': 'pH', 'start_offset': 57, 'end_offset': 60}
    	{'label': 'Chemical Compound', 'start_offset': 61, 'end_offset': 63}
    	{'label': 'Organic solvent', 'start_offset': 64, 'end_offset': 67}
    	{'label': 'Element Group', 'start_offset': 68, 'end_offset': 70}
    	{'label': 'Inorganic solvent', 'start_offset': 71, 'end_offset': 73}
    	{'label': 'Resin', 'start_offset': 74, 'end_offset': 76}
    	{'label': 'Method of Analysis', 'start_offset': 77, 'end_offset': 79}
    	{'label': 'Element', 'start_offset': 80, 'end_offset': 81}
    	{'label': 'Acid', 'start_offset': 82, 'end_offset': 83}
    	{'label': 'Yield', 'start_offset': 84, 'end_offset': 86}
    	{'label': 'Element', 'start_offset': 87, 'end_offset': 89}
    	{'label': 'Acid', 'start_offset': 90, 'end_offset': 92}
    	{'label': 'Acid', 'start_offset': 93, 'end_offset': 96}
    	{'label': 'Resin', 'start_offset': 97, 'end_offset': 100}
    	{'label': 'Method of Analysis', 'start_offset': 101, 'end_offset':

### use label and text pairs

In [23]:
info[1]['text']

'For example, materials based on 10B are demanded in nuclear power engineering, namely, in reactor control and protection systems and in production of steel for transportation racks and containers for spent nuclear fuel (SNF). Boron-10 is used in neutron capture therapy for cancer treatment.'

In [24]:
info[1]['entities']

[{'label': 'Element', 'start_offset': 32, 'end_offset': 35},
 {'label': 'Element', 'start_offset': 226, 'end_offset': 234}]

In [27]:
info[1]['text'][226:234]

'Boron-10'

In [57]:
def prompt_generate(info_item):
    s = ""
    for e in info_item['entities']:
        temp_dict = dict()
        temp_dict['word'] = info_item['text'][e['start_offset']:e['end_offset']]
        # temp_dict['start_offset'] = e['start_offset']
        # temp_dict['end_offset'] = e['end_offset']
        temp_dict['label'] = e['label']

        
        s += f"{temp_dict}\n"

    return s

temp_info = info[2]

temp_info['text'] = temp_info['text'][:500]
temp_info['entities'] = [e for e in temp_info['entities'] if e['end_offset']<500]
prompt = f"""Context: Please perform Named Entity Recognition (NER) on the following text with label entities [{"] [".join(label_list)}]: 
Q: {info[0]['text']}
A: {prompt_generate(info[0])}
Q: {temp_info['text']}
A: {prompt_generate(temp_info)}
Q: {info[4]['text'][:500]}
A: 
    """
print(prompt)

Context: Please perform Named Entity Recognition (NER) on the following text with label entities [Metal] [Element] [Acid] [Yield] [Separation Method] [Resin] [Method of Analysis] [pH] [Chemical Compound] [Organic solvent] [Element Group] [Inorganic Solvent] [Flowrate] [Acid Concentration] [Reagent]: 
Q: Extraction systems based on extraction of tetrafluoroboric acid (HBF4) with tributyl phosphate (TBP) and triamylphosphine oxide (TAPO) for boron isotope separation were studied. Tetrafluoroboric acid was studied in comparison with boric acid (H3BO3). The extraction isotherms at 20°C for four systems (boric and tetrafluoroboric acids extracted with TBP and TAPO in o-xylene) were obtained, and single-stage isotope separation factors for these four systems were determined.
A: {'word': ' tetrafluoroboric acid', 'label': 'Acid'}
{'word': 'HBF4', 'label': 'Acid'}
{'word': 'tributyl phosphate', 'label': 'Separation Method'}
{'word': 'TBP', 'label': 'Separation Method'}
{'word': ' triamylphosph

In [53]:
prompt_generate(temp_info)

"{'word': 'Lead-203', 'start_offset': 0, 'end_offset': 8, 'label': 'Metal'}\n{'word': 'thallium', 'start_offset': 63, 'end_offset': 71, 'label': 'Element'}\n{'word': '203Pb', 'start_offset': 98, 'end_offset': 103, 'label': 'Element'}\n{'word': 'natTl', 'start_offset': 105, 'end_offset': 110, 'label': 'Element'}\n{'word': 'copper ', 'start_offset': 132, 'end_offset': 139, 'label': 'Element'}\n{'word': ' 1  M  HNO3', 'start_offset': 254, 'end_offset': 265, 'label': 'Acid Concentration'}\n{'word': 'Ion  exchange', 'start_offset': 268, 'end_offset': 281, 'label': 'Separation Method'}\n{'word': 'solvent  extraction', 'start_offset': 284, 'end_offset': 303, 'label': 'Separation Method'}\n{'word': ' nanomagnetic    ion-imprinted  polymer  methods  ', 'start_offset': 310, 'end_offset': 360, 'label': 'Separation Method'}\n{'word': '203Pb', 'start_offset': 389, 'end_offset': 394, 'label': 'Element'}\n{'word': 'thallium', 'start_offset': 402, 'end_offset': 410, 'label': 'Element'}\n{'word': 'γ-Ra

In [44]:
len(info[2]['text'])

794

In [58]:
print("\n\n*** Generate:")

input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))



*** Generate:
<s> Context: Please perform Named Entity Recognition (NER) on the following text with label entities [Metal] [Element] [Acid] [Yield] [Separation Method] [Resin] [Method of Analysis] [pH] [Chemical Compound] [Organic solvent] [Element Group] [Inorganic Solvent] [Flowrate] [Acid Concentration] [Reagent]: 
Q: Extraction systems based on extraction of tetrafluoroboric acid (HBF4) with tributyl phosphate (TBP) and triamylphosphine oxide (TAPO) for boron isotope separation were studied. Tetrafluoroboric acid was studied in comparison with boric acid (H3BO3). The extraction isotherms at 20°C for four systems (boric and tetrafluoroboric acids extracted with TBP and TAPO in o-xylene) were obtained, and single-stage isotope separation factors for these four systems were determined.
A: {'word': ' tetrafluoroboric acid', 'label': 'Acid'}
{'word': 'HBF4', 'label': 'Acid'}
{'word': 'tributyl phosphate', 'label': 'Separation Method'}
{'word': 'TBP', 'label': 'Separation Method'}
{'wo