In [None]:
!pip install transformers

# transformers：machine learning models for Natural Language Processing (NLP) tasks like text classification, information extraction, question answering, and more.



In [None]:
!pip install datasets

 # used for accessing and managing large collections of datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
!pip install islab-opendeid

 # support de-identification. De-identification is a process aimed at removing or obscuring personally identifiable information from data sets to protect individual privacy.

Collecting islab-opendeid
  Downloading islab_opendeid-0.0.1.1-py3-none-any.whl (3.0 kB)
Installing collected packages: islab-opendeid
Successfully installed islab-opendeid-0.0.1.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### import package

In [None]:
import os                                                                       # interacting with the file system, like reading, writing, and modifying file and directory paths

import numpy as np                                                              # working with arrays and matrices of numbers
from tqdm import tqdm, trange                                                   # displaying progress bars in loops
from torch.optim import AdamW                                                   # an optimization algorithm often used in deep learning for training neural networks

from torch.utils.data import DataLoader                                         # efficiently loading and processing data in batches during model training
import torch                                                                    # a widely used library for machine learning and deep learning
import torch.optim as optim                                                     # various optimization algorithms for training neural networks
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, get_linear_schedule_with_warmup #automatic causal language models, automatic tokenization, model configuration, a scheduler for the learning rate

import re
import random
import matplotlib.pyplot as plt
from torch.nn import functional as F                                            # building neural networks
from torch.utils.data import Dataset
import multiprocessing                                                          # parallelize tasks and potentially speed up execution

In [None]:
# set a fixed random seed to ensure reproducibility in machine learning or deep learning tasks using PyTorch.
def set_torch_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True                                   # Ensures that the behavior of Convolutional Neural Networks (CNNs) is deterministic
    torch.backends.cudnn.benckmark = False                                      # Prevents PyTorch from performing automatic optimizations, which might introduce randomness
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
set_torch_seed()

# return the contents of the entire file as a list of strings, where each element of the list represents a line in the file
def read_file(path):
    with open(path , 'r' , encoding = 'utf-8-sig') as fr:
        return fr.readlines()

### 資料處理

In [None]:
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
ner = '\n\n####\n\n'
special_tokens_dict ={'bos_token': bos,
            'eos_token': eos,
            'pad_token': pad,
            'sep_token': ner}

def process_annotation_file(lines):
    entity_dict = {}                                                            # an empty dictionary to store the information

    # Separate by the number of elements in each line
    for line in lines:
        items = line.strip('\n').split('\t')
        if len(items) == 5:
            item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' : int(items[3]),
                'entity' : items[4],
            }
        elif len(items) == 6:
            item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' : int(items[3]),
                'entity' : items[4],
                'normalize_time' : items[5],
            }

        # Check if the file has been read
        if items[0] not in entity_dict:
            entity_dict[items[0]] = [item_dict]
        else:
            entity_dict[items[0]].append(item_dict)                             # add a sub dictionary under the main dictionary

    return entity_dict

def process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict):   # txt_name = 10, 11, 12....  annos_dict = entity_dict
    file_name = txt_name + '.txt'
    sents = read_file(os.path.join(medical_report_folder, file_name))           # foder path + file path
    article = "".join(sents)

    bounary , item_idx , temp_seq , seq_pairs = 0 , 0 , "" , []
    new_line_idx = 0
    flag= 0

    for w_idx, word in enumerate(article):

      # When read till the end of the sentence, it starts to progress the record.
      if word == '\n':
        new_line_idx = w_idx + 1
        if article[bounary:new_line_idx] == '\n':
            continue
        if temp_seq == "":
            temp_seq = "PHI:Null"
            sentence = article[bounary:new_line_idx].strip().replace('\t' , ' ')
            temp_seq = temp_seq.strip('\\n')
            seq_pair = f"{txt_name}\t{new_line_idx}\t{sentence}\t{temp_seq}\n"       # file23060	25	199903040000	DATE:19990304=>1999-03-04
            bounary = new_line_idx
            seq_pairs.append(seq_pair)
            temp_seq = ""
        else:
            sentence = article[bounary:new_line_idx].strip().replace('\t' , ' ')
            s=''
            seq_pair = []
            flag = 0
            for i in temp_seq:
              # print('i:', i)
              if flag == 1:
                flag = 0
                continue
              if i == '\\':
                flag = 1
                # print(i)
                seq_pair = f"{txt_name}\t{new_line_idx}\t{sentence}\t{s}\n"
                seq_pairs.append(seq_pair)
                s=''
                continue
              s += i
            bounary = new_line_idx
            temp_seq = ""
        # sentence = article[bounary:new_line_idx].strip().replace('\t' , ' ')
        # temp_seq = temp_seq.strip('\\n')
        # seq_pair = f"{txt_name}\t{new_line_idx}\t{sentence}\t{temp_seq}\n"       # file23060	25	199903040000	DATE:19990304=>1999-03-04
        # bounary = new_line_idx
        # seq_pairs.append(seq_pair)
        # print(temp_seq)
        # temp_seq = ""


      # When the word is the phi_value, it teaches the model this is.
      if w_idx == annos_dict[txt_name][item_idx]['st_idx']:
        phi_key = annos_dict[txt_name][item_idx]['phi']
        phi_value = annos_dict[txt_name][item_idx]['entity']
        if 'normalize_time' in annos_dict[txt_name][item_idx]:
            temp_seq += f"{phi_key}:{phi_value}=>{annos_dict[txt_name][item_idx]['normalize_time']}\\n"
        else:
            temp_seq += f"{phi_key}:{phi_value}\\n"
        if item_idx == len(annos_dict[txt_name]) - 1:
            continue
        item_idx += 1

    return seq_pairs

def generate_annotated_medical_report_parallel(anno_file_path, medical_report_folder , tsv_output_path , num_processes=4):
    anno_lines = read_file(anno_file_path)
    annos_dict = process_annotation_file(anno_lines)
    txt_names = list(annos_dict.keys())                                         # retuen a list of file names

    all_seq_pairs = []
    for txt_name in txt_names:
        all_seq_pairs.extend(process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict))
    with open(tsv_output_path , 'w' , encoding = 'utf-8') as fw:
        for seq_pair in all_seq_pairs:
            print(seq_pair)
            fw.write(seq_pair)

def generate_annotated_medical_report_parallel2(anno_file_path, medical_report_folder , tsv_output_path , num_processes=4):
    anno_lines = read_file(anno_file_path)
    annos_dict = process_annotation_file(anno_lines)
    txt_names = list(annos_dict.keys())                                         # retuen a list of file names

    all_seq_pairs = []
    for txt_name in txt_names:
        all_seq_pairs.extend(process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict))
    with open(tsv_output_path , 'a' , encoding = 'utf-8') as fw:
        for seq_pair in all_seq_pairs:
            print(seq_pair)
            fw.write(seq_pair)

# anno_info_path = r"/content/drive/MyDrive/aicup/First_Phase_Release/answer.txt"
# report_folder = r"/content/drive/MyDrive/aicup/First_Phase_Release/First_Phase_Text_Dataset"
# tsv_output_path = r"/content/drive/MyDrive/aicup/First_Phase_Release/train.tsv"
# generate_annotated_medical_report_parallel(anno_info_path, report_folder, tsv_output_path, num_processes=4)

###############################################################################################
anno_info = r"/content/drive/MyDrive/aicup/First_Phase_Release/answer.txt" #, r"/content/drive/MyDrive/aicup/Second_Phase_Dataset/answer.txt"
report = r"/content/drive/MyDrive/aicup/First_Phase_Release/First_Phase_Text_Dataset" #, r"/content/drive/MyDrive/aicup/Second_Phase_Dataset/Second_Phase_Text_Dataset"
anno_info2 = r"/content/drive/MyDrive/aicup/Second_Phase_Dataset/answer.txt"
report2 = r"/content/drive/MyDrive/aicup/Second_Phase_Dataset/Second_Phase_Text_Dataset"
anno_info3 = r"/content/drive/MyDrive/aicup/Validation_Dataset/answer.txt"
report3 = r"/content/drive/MyDrive/aicup/Validation_Dataset/Validation_Release"

# for i in range (len(anno_info)):
anno_info_path = anno_info
report_folder = report
anno_info_path2 = anno_info2
report_folder2 = report2
anno_info_path3 = anno_info3
report_folder3 = report3
tsv_output_path = r"/content/drive/MyDrive/aicup/train/train.tsv"
generate_annotated_medical_report_parallel(anno_info_path, report_folder, tsv_output_path, num_processes=4)
generate_annotated_medical_report_parallel2(anno_info_path2, report_folder2, tsv_output_path, num_processes=4)
generate_annotated_medical_report_parallel2(anno_info_path3, report_folder3, tsv_output_path, num_processes=4)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m

434	138	NARRABRI  Australian Capital Territory  5522	STATE:Australian Capital Territory

434	138	NARRABRI  Australian Capital Territory  5522	ZIP:5522

434	155	Specimen: Tissue	PHI:Null

434	173	D.O.B:  17/3/1988	DATE:17/3/1988=>1988-03-17

434	181	Sex:  F	PHI:Null

434	212	Collected: 02/05/2013 at 15:04	TIME:02/05/2013 at 15:04=>2013-05-02T15:04

434	260	Location:  Hunter Area - BYRON CENTRAL HOSPITAL	DEPARTMENT:Hunter Area

434	260	Location:  Hunter Area - BYRON CENTRAL HOSPITAL	HOSPITAL:BYRON CENTRAL HOSPITAL

434	293	DR NORBERT STEFAN ROBERTO-MATTES	DOCTOR:NORBERT STEFAN ROBERTO-MATTES

434	342	Distribution:   FILE-COPY,   NSW-CANCER-REGISTRY	PHI:Null

434	352	CLINICAL:	PHI:Null

434	614	Rt breast lump @ 10 o'clock 22cm FN - malignant on core bx.  WLE + sentinel node biopsy.  HTN, thyroid oblation.  Rt breast lump (10 o'c, 2cm FN) -short stitch superior, long lateral, medium medial.  Rt axillary sentinel nodes x 3 (2 x blue, 1x hot and blue).	PHI

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m

1606	1701	F/S : Result for specimens A-C given to anaesthetist by Dr. Michal at 10:10hrs on 20/3/14.  Result for specimen D&E given by phone to Prof. Flori at1155Hrs.  Result for specimen F given by phone to Prof. Sensabaugh at 1310Hrs.	DOCTOR:Flori

1606	1701	F/S : Result for specimens A-C given to anaesthetist by Dr. Michal at 10:10hrs on 20/3/14.  Result for specimen D&E given by phone to Prof. Flori at1155Hrs.  Result for specimen F given by phone to Prof. Sensabaugh at 1310Hrs.	TIME:1155Hrs=>2014-03-20T11:55

1606	1701	F/S : Result for specimens A-C given to anaesthetist by Dr. Michal at 10:10hrs on 20/3/14.  Result for specimen D&E given by phone to Prof. Flori at1155Hrs.  Result for specimen F given by phone to Prof. Sensabaugh at 1310Hrs.	DOCTOR:Sensabaugh

1606	1701	F/S : Result for specimens A-C given to anaesthetist by Dr. Michal at 10:10hrs on 20/3/14.  Result for specimen D&E given by phone to Prof. Flori at1155Hrs.  Result for specimen 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m

file2179	4287	DCIS extends within 0.3mm of the superior and 1mm of posterior margin without intervening  normal ducts.	PHI:Null

file2179	4375	Completeness of excision will depend on spatial relationship between specimens B and C.	PHI:Null

file2179	4386	&amp;#160;	PHI:Null

file2179	4403	ANCILLARY TESTS:	PHI:Null

file2179	4428	IMMUNOSTAINS, block - B5	PHI:Null

file2179	4476	HORMONE RECEPTORS (IMMUNOPEROXIDASE TECHNIQUE)	PHI:Null

file2179	4531	OESTROGEN RECEPTOR:         positive(90% of cells +++)	PHI:Null

file2179	4582	PROGESTERONE RECEPTOR:  positive(90% of cells +++)	PHI:Null

file2179	4606	HER-2 IMMUNOPEROXIDASE	PHI:Null

file2179	4650	EQUIVOCAL (score 1-2+) HER-2 SISH to follow	PHI:Null

file2179	4692	Ki-67 proliferative index: 20%&amp;#160;	PHI:Null

file2179	4732	SEPARATE FINDINGS: Fibrocystic changes	PHI:Null

file2179	4743	&amp;#160;	PHI:Null

file2179	4755	LYMPH NODES	PHI:Null

file2179	4831	SENTINEL LYMPH NODES: Not involved(including 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Read Tsv Dataset

In [None]:
from datasets import load_dataset, Features, Value

dataset = load_dataset("csv", data_files= r"/content/drive/MyDrive/aicup/train/train.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)
print(dataset['train']['label'][:10])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

['IDNUM:09F016547J', 'MEDICALRECORD:091016.NMT', 'PATIENT:SIZAR, HOWARD', 'IDNUM:09F01654', 'STREET:Runford', 'CITY:RENMARK', 'STATE:TAS', 'ZIP:5084', 'PHI:Null', 'DATE:24/8/1993=>1993-08-24']


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

plm = "EleutherAI/pythia-70m"                                                   # character change into token(digit)

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

<|pad|>: 50278


In [None]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template
from islab.aicup import OpenDeidBatchSampler

BATCH_SIZE = 8
train_data = list(dataset['train'])
train_dataloader = DataLoader(train_data, batch_size = 8, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer), pin_memory=True) #batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE)
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

torch.Size([8, 23])


[tensor([[50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,     0,
           5587, 38498,    27, 39218, 50279,  6663,    42,    27, 13141,   209,
          50277],
         [50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278,     0,   399,    15,    48,    15,    35,    27, 50276,
           1348,    16,    25,    16, 13549, 50279, 33762,    27,  1348,    16,
             25,    16, 13549, 14490, 13549,    14,  2904,    14,  1348,   209,
          50277],
         [50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,     0,
          16678,    27, 50276,    46, 50279,  6663,    42,    27, 13141,   209,
    

### DataLoader For training

In [None]:
from transformers import AutoConfig

# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 7
optimizer = AdamW(model.parameters(),lr=1e-4)
scheduler = StepLR(optimizer, step_size=1, gamma=0.95)

model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [None]:
from tqdm import tqdm,trange


model_name = "aicup_model"
model_dir = f"/content/drive/MyDrive/aicup/models/{model_name}"

if not os.path.isdir(model_dir):
    os.mkdir(model_dir)
min_loss = 9999

global_step = 0
total_loss = 0

model.train()
for _ in trange(EPOCHS, desc="Epoch"):

    model.train()
    total_loss = 0
    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_Finial.pt'))
    if avg_train_loss < min_loss:
        min_loss = avg_train_loss
        torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_best.pt'))

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Average train loss: 3.3646550675543856


Epoch:  14%|█▍        | 1/7 [17:09<1:42:58, 1029.72s/it]

Average train loss: 3.360902778207648


Epoch:  29%|██▊       | 2/7 [34:09<1:25:18, 1023.76s/it]

Average train loss: 3.360902778207648


Epoch:  43%|████▎     | 3/7 [51:08<1:08:06, 1021.74s/it]

In [None]:
model.load_state_dict(torch.load(os.path.join(model_dir , 'GPT_best.pt')))
model = model.to(device)

def sample_text(model, tokenizer, text, n_words=20):
    model.eval()
    text = tokenizer.encode(text)
    inputs, past_key_values = torch.tensor([text]).to(device), None

    with torch.no_grad():
        for _ in range(n_words):
            out = model(inputs, past_key_values=past_key_values)
            logits = out.logits
            past_key_values = out.past_key_values
            log_probs = F.softmax(logits[:, -1], dim=-1)
            inputs = torch.multinomial(log_probs, 1)
            # if tokenizer.decode(inputs.item()) == '\n':
            #   text.append(tokenizer.encode(eos))
            #   break
            text.append(inputs.item())
            if tokenizer.decode(inputs.item()) == eos:
                break

    return tokenizer.decode(text)

text = special_tokens_dict['bos_token'] + "D.O.B:  29/9/2000" + special_tokens_dict['sep_token']
print(sample_text(model, tokenizer, text=text , n_words=20))

In [None]:
def process_valid_data(test_txts , out_file):
    with open(out_file , 'w' , encoding = 'utf-8') as fw:
      # t=0
      for txt in test_txts:
          m_report = read_file(txt)
          boundary = 0
          # temp = ''.join(m_report)
          fid = txt.split('/')[-1].replace('.txt' , '')
          for idx,sent in enumerate(m_report):
              if sent.replace(' ' , '').replace('\n' , '').replace('\t' , '') != '':
                  sent = sent.replace('\t' , ' ')
                  # if(t<100):
                  #   t+=1
                  #   print(f"{fid}\t{boundary}\t{sent}\n")

                  fw.write(f"{fid}\t{boundary}\t{sent}\n")
              # else:
              #     print(f"{fid}\t{boundary}\t{sent}\n")
              #     assert 1==2
              boundary += len(sent)

test_phase_path = r'/content/drive/MyDrive/aicup/opendid_test'
valid_out_file_path = '/content/drive/MyDrive/aicup/train/opendid.tsv'
test_txts = list(map(lambda x:os.path.join(test_phase_path , x) , os.listdir(test_phase_path)))
test_txts = sorted(test_txts)
valid_data = process_valid_data(test_txts , valid_out_file_path)

In [None]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files=valid_out_file_path, delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'])
valid_list= list(valid_data['train'])
valid_list

In [None]:
train_phi_category = ['PATIENT', 'DOCTOR', 'USERNAME',
             'PROFESSION',
             'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
             'AGE',
             'DATE', 'TIME', 'DURATION', 'SET',
             'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDR',
             'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT', 'LICENSE', 'VEHICLE', 'DEVICE', 'BIOID', 'IDNUM']

def get_anno_format(sentence , infos , boundary):
    anno_list = []
    lines = infos.split("\n")
    normalize_keys = ['DATE' , "TIME" , "DURATION" , "SET"]
    phi_dict = {}
    for line in lines:
        parts = line.split(":")
        if parts[0] not in train_phi_category or parts[1] == '':
            continue
        if len(parts) == 2:
            phi_dict[parts[0]] = parts[1].strip()
    for phi_key, phi_value in phi_dict.items():
        normalize_time = None
        if phi_key in normalize_keys:
            if '=>' in phi_value:
                temp_phi_values = phi_value.split('=>')
                phi_value = temp_phi_values[0]
                normalize_time = temp_phi_values[-1]
            else:
                normalize_time = phi_value
        try:
            matches = [(match.start(), match.end()) for match in re.finditer(phi_value, sentence)]
        except:
            continue
        for start, end in matches:
            if start == end:
                continue
            item_dict = {
                        'phi' : phi_key,
                        'st_idx' : start + int(boundary),
                        'ed_idx' : end + int(boundary),
                        'entity' : phi_value,
            }
            if normalize_time is not None:
                item_dict['normalize_time'] = normalize_time
            anno_list.append(item_dict)
    return anno_list

def aicup_predict_new(model, tokenizer, input, template = "<|endoftext|> __CONTENT__\n\n####\n\n"):
    seeds = []
    for data in input:
        if 'content' in data and data['content'] is not None:  # 確保 'content' 鍵存在於 data 字典中
            seed = template.replace("__CONTENT__", data['content'])
        else:
            seed = template.replace("__CONTENT__", "")  # 如果 'content' 鍵不存在，使用空字符串
        seeds.append(seed)

    sep = tokenizer.sep_token
    eos = tokenizer.eos_token
    pad = tokenizer.pad_token
    pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    """Generate text from a trained model."""
    model.eval()
    device = model.device
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
    outputs = []
    #return
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**texts, max_new_tokens=400, pad_token_id = pad_idx,
                                        eos_token_id=tokenizer.convert_tokens_to_ids(eos))
        preds = tokenizer.batch_decode(output_tokens)
        for idx , pred in enumerate(preds):
          if "NULL" in pred:
              continue
          phi_infos = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(eos, "").strip()
          annotations = get_anno_format(input[idx]['content'] , phi_infos , input[idx]['idx'])

          for annotation in annotations:
              if 'normalize_time' in annotation:
                  outputs.append(f'{input[idx]["fid"]}\t{annotation["phi"]}\t{annotation["st_idx"]}\t{annotation["ed_idx"]}\t{annotation["entity"]}\t{annotation["normalize_time"]}')
              else:
                  outputs.append(f'{input[idx]["fid"]}\t{annotation["phi"]}\t{annotation["st_idx"]}\t{annotation["ed_idx"]}\t{annotation["entity"]}')

    return outputs

In [None]:
from tqdm.notebook import tqdm

import io
BATCH_SIZE = 32

with open("/content/drive/MyDrive/aicup/answer.txt",'w',encoding='utf8') as f:
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = aicup_predict_new(model, tokenizer, input=seeds)

            for o in outputs:
              f.write(o)
              print(o)
              f.write('\n')