# Upload & Unzip txt files (Run only if on Google Drive)

On first pass, use the following four blocks of code to upload the relevant text files as a zipped folder to drive, and unzip them in the notebook.

In [None]:
# Colab-only cell: relies on the google.colab package, so it cannot run in a local kernel.
from google.colab import files
# Start Colab's upload helper and keep whatever it returns in `uploaded`.
uploaded = files.upload()

In [None]:
# Colab-only cell: mount Google Drive at /content/drive so the shell cells below can reach it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract clean_ocr_txt.zip into the Capstone folder on the mounted Drive.
# The archive path is relative ("./drive/..."), so it resolves against the current working directory.
!unzip ./drive/MyDrive/Capstone/clean_ocr_txt.zip -d "/content/drive/My Drive/Capstone/"

In [None]:
!rm -rf "/content/drive/My Drive/Capstone/_MACOSX" "/content/drive/My Drive/Capstone/clean_ocr_txt/.DS_Store/"

# Setup Targets and Read Txt Docs

On future notebook use, run from here.

Read in UUIDs and targets for annotated data.

In [None]:
import pandas as pd
import numpy as np

# Project root prefix; every other path in the notebook is built by appending to it.
# When working from a mounted Drive on Colab, use "/content/drive/My Drive/Capstone/" instead.
base_filepath = "./"

# Read the annotation table, then discard its first column.
df = pd.read_csv(f'{base_filepath}assets/100annotations.csv')
df = df.drop(columns=df.columns[0])

# After the drop, column 0 is read as the uuid column and everything from column 2 on
# is treated as a target variable.
targets = df.columns[2:].tolist()
annotated_uuids = df[df.columns[0]].unique().tolist()

print(
    f"Targets: {targets}",
    f"Working with data for {len(annotated_uuids)} annotated uuids",
    sep="\n",
)

Read in the cleaned OCR .txt files (only those that have targets).

In [None]:
import os
from tqdm import tqdm, trange

ocr_txt_path = f"{base_filepath}data/clean_ocr_txt/"

# Set copy of the annotated uuids so each membership test is O(1) instead of a list scan.
annotated_uuid_set = set(annotated_uuids)

# Maps uuid -> full text of its cleaned OCR document (annotated uuids only).
cleaned_docs = {}
# The third tuple element is named `filenames` rather than `files` so it does not
# rebind the `files` name that the Colab upload cell imports from google.colab.
for root, dirs, filenames in os.walk(ocr_txt_path):
    for name in filenames:
        # os.path.splitext removes exactly one extension. str.rstrip(".txt") instead strips
        # ANY trailing '.', 't' or 'x' characters and would corrupt stems ending in them.
        uuid, extension = os.path.splitext(name)
        if extension != ".txt" or uuid not in annotated_uuid_set:
            continue

        # Join with `root` (the directory os.walk is currently visiting) so that files
        # found in sub-directories of ocr_txt_path resolve to the right location.
        file_path = os.path.join(root, name)
        with open(file_path, 'r') as file:
            cleaned_docs[uuid] = file.read()

In [None]:
# Coverage report: how many of the annotated uuids have a cleaned OCR document loaded.
print(f"We have cleaned ocr documents for {len(cleaned_docs)} / {len(annotated_uuids)} of these uuids.")

In [None]:
# cleaned_docs

# Model





### Dependencies

Install dependencies for HuggingFace.

If notebook fails to run after pip installs, restart the session.

In [None]:
!pip install transformers
!pip install peft
!pip install sentencepiece
!pip install bitsandbytes
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install accelerate

### Import and Load Model

Imports

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_model
import torch
import time

Specify model

In [None]:
# Model identifier handed to from_pretrained() for both the model and the tokenizer below.
model_name="teknium/OpenHermes-2.5-Mistral-7B"

Quantization config

In [None]:
# Quantization settings passed to from_pretrained() via `quantization_config`.
bnb_config = BitsAndBytesConfig(
    # 4-bit loading using the "nf4" quantization type, with double quantization enabled.
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # Compute dtype used for the 4-bit layers.
    bnb_4bit_compute_dtype=torch.bfloat16,
    # Permit fp32 CPU offload (relevant together with device_map="auto" at load time).
    llm_int8_enable_fp32_cpu_offload=True,
)

Finetune config

In [None]:
# LoRA adapter settings; consumed by get_peft_model() further down.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter rank, scaling numerator and dropout.
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Names of the projection modules that receive adapters.
    target_modules=[
        'k_proj',
        'gate_proj',
        'v_proj',
        'up_proj',
        'q_proj',
        'o_proj',
        'down_proj',
    ],
)

Model

In [None]:
# Load the causal LM with the quantization settings defined above; device placement is
# left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


In [None]:
# Turn off the `use_cache` flag on the model config before wrapping.
model.config.use_cache=False
# Wrap the loaded model with the LoRA configuration. `model` is rebound to the wrapped
# object, so re-running this cell would pass the already-wrapped model in again.
model=get_peft_model(model, peft_config)

Tokenizer

In [None]:
# Tokenizer loaded from the same identifier as the model; it is used below for the chat
# template, for decoding, and to supply the pad token id in the generation config.
tokenizer=AutoTokenizer.from_pretrained(model_name)

Generation config

In [None]:
from transformers import GenerationConfig

# Decoding settings shared by every model.generate() call in this notebook.
generation_config = GenerationConfig(
    # Upper bound on the number of generated tokens per document.
    max_new_tokens=1024,
    # Sampling is enabled, so outputs can differ between runs.
    do_sample=True,
    temperature=0.9,
    top_k=10,
    top_p=0.95,
    repetition_penalty=1.1,
    # The end-of-sequence token id doubles as the padding token id.
    pad_token_id=tokenizer.eos_token_id,
)

### Extraction Helper Functions

Helper function that builds the extraction prompt (a list of chat messages) sent to the model for a given document and set of target variables.

In [None]:
def extraction_prompt(text, targets):
    """Build the chat messages that ask the model to extract `targets` from `text`.

    Args:
        text: Document text; appended verbatim at the end of the user message.
        targets: Variables to extract; interpolated into the user message through
            f-string formatting (so a list appears as its Python repr).

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the fixed
        system message followed by the user message.
    """
    system_message = {
        "role": "system",
        "content": "You are in charge of information extraction from medical pathology reports. Accuracy is imperative.",
    }
    user_message = {
        "role": "user",
        "content": f"Please return the following following variables from the document, {targets} please make sure you are accurate, and include all occurences. Here is the document to extract from: {text}",
    }
    return [system_message, user_message]

In [None]:
# Show the current working directory; base_filepath is relative ("./"), so the data
# and assets paths used in this notebook resolve against it.
!pwd

### Extraction loop

Make sure to set up file structure as described in `README.md`

In [None]:
# Output folder for the extracted text. os.path.join avoids the doubled slash that
# f"{base_filepath}/data/..." produced when base_filepath already ends in "/".
extracted_filepath = os.path.join(base_filepath, "data", "extracted_txt", "")
os.makedirs(extracted_filepath, exist_ok=True)

# Maps uuid -> text generated by the model for that document.
extracted_docs = {}

# Trial run: only the first cleaned document is processed.
for uuid in tqdm(list(cleaned_docs)[:1]):
  print(f"Processing UUID: {uuid}")
  t0 = time.time()

  # Chat messages actually sent to the model. The original cell referenced `prompt`
  # without ever defining it, which raises NameError on a fresh kernel.
  prompt = extraction_prompt(cleaned_docs[uuid], targets)
  old_prompt = prompt[1]["content"]

  # Consistency check between the inline prompt and the one stored on disk.
  with open(f"{base_filepath}assets/prompt1.txt", 'r') as file:
      new_prompt = file.read()
  new_prompt += cleaned_docs[uuid]
  print(new_prompt)
  print("="*80)
  print(old_prompt)
  # Compare string with string: the original compared the list of message dicts with
  # the file-based string, which can never be equal, so the assert always fired.
  assert old_prompt == new_prompt, "prompts are not the same, check!"

  tokenized_chat = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
  outputs = model.generate(input_ids=tokenized_chat, generation_config=generation_config)
  decoded = tokenizer.decode(outputs[0])
  del outputs
  # Keep only the text after the last assistant marker in the decoded sequence.
  output = decoded.split("<|im_start|> assistant\n")[-1]
  extracted_docs[uuid] = output

  torch.cuda.empty_cache()

  # Save output to file
  file_path = os.path.join(extracted_filepath, uuid + ".txt")
  try:
    with open(file_path, 'w') as file:
      file.write(output)
  except OSError as error:
    # Best-effort save: report the failure and carry on; the text stays in extracted_docs.
    print(error)
    print("Writing to file failed.\n")
  del output
  print(f"Extraction completed in {round(time.time() - t0,2)} seconds\n")


print("\n----------------------------------")
print("       PROCESSING COMPLETE")
print("----------------------------------")

In [None]:
# Ask PyTorch to release its cached GPU memory once more after the loop has finished.
torch.cuda.empty_cache()

In [None]:
# Print the GPU status report to inspect memory usage after the extraction run.
!nvidia-smi

### Prompt Engineering
We first modify the current setup so that prompts are stored on disk and we can track which prompt was used for which extraction.

In [None]:
# Scratch check of the placeholder substitution used by load_prompt below: the literal
# text "{targets}" is replaced with the string form of the targets list.
a = "{targets}_test"
a = a.replace("{targets}", str(targets))
# Last expression of the cell, so the substituted string is displayed.
a

In [None]:
def load_prompt(prompt_number, text, targets):
  """Build the extraction chat messages from a prompt template stored on disk.

  The template is read from assets/prompts/prompt<prompt_number>.txt (relative to
  base_filepath). The literal placeholders "{targets}" and "{text}" in it are
  replaced, and the result becomes the user message.

  prompt_number (int): which prompt template to load
  text (str): the full text of the document from which we are extracting
  targets (list): the target variables to extract; inserted as str(targets)

  Returns a list of two dicts with "role" and "content" keys (the fixed system
  message followed by the user message), the same shape extraction_prompt returns.
  """
  template_path = f"{base_filepath}assets/prompts/prompt{prompt_number}.txt"
  with open(template_path, 'r') as template_file:
      template = template_file.read()

  # "{targets}" is substituted before "{text}", so placeholder-like text inside the
  # document itself is never expanded.
  user_content = template.replace("{targets}", str(targets)).replace("{text}", text)

  system_message = {
      "role": "system",
      "content": "You are in charge of information extraction from medical pathology reports. Accuracy is imperative.",
  }
  user_message = {
      "role": "user",
      "content": user_content,
  }
  return [system_message, user_message]

# Quick check with a stand-in document; as the cell's last expression the messages are displayed.
load_prompt(1, "sample", targets)

In [None]:
prompt_number = 1

# Outputs go to data/extracted_txt/prompt<N>/ so results from different prompts stay apart.
extracted_filepath = os.path.join(base_filepath, "data", "extracted_txt", "")
prompt_output_dir = os.path.join(extracted_filepath, f"prompt{prompt_number}")
# Create the per-prompt sub-folder too. Previously only extracted_filepath was created,
# so every write into prompt<N>/ failed and was merely printed by the except branch.
os.makedirs(prompt_output_dir, exist_ok=True)

# Maps uuid -> text generated by the model for that document.
extracted_docs = {}

# Positions (in cleaned_docs iteration order) that cause an out of memory error,
# presumably because the files are too large. NOTE: these positions depend on the order
# in which the documents were read from disk, so they may not be stable across machines.
skipped_positions = {66, 88}

for i, uuid in enumerate(tqdm(list(cleaned_docs))):
  if i in skipped_positions:
    print(f"Skipping UUID: {uuid}")
    continue
  print(f"Processing UUID: {uuid}")
  t0 = time.time()
  # Use prompt_number (not a hard-coded 1) so the prompt sent to the model always
  # matches the folder and file name the output is stored under.
  prompt = load_prompt(prompt_number, cleaned_docs[uuid], targets)
  tokenized_chat = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
  outputs = model.generate(input_ids=tokenized_chat, generation_config=generation_config)
  decoded = tokenizer.decode(outputs[0])
  del outputs
  # Keep only the text after the last assistant marker in the decoded sequence.
  output = decoded.split("<|im_start|> assistant\n")[-1]
  extracted_docs[uuid] = output

  torch.cuda.empty_cache()

  # Save output to file
  file_path = os.path.join(prompt_output_dir, f"prompt{prompt_number}_" + uuid + ".txt")
  try:
    with open(file_path, 'w') as file:
      file.write(output)
  except OSError as error:
    # Best-effort save: report the failure and carry on; the text stays in extracted_docs.
    print(error)
    print("Writing to file failed.\n")
  del output
  print(f"Extraction completed in {round(time.time() - t0,2)} seconds\n")


print("\n----------------------------------")
print("       PROCESSING COMPLETE")
print("----------------------------------")