In [None]:
from pathlib import Path
import json
import re
import os


import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig

from peft import PeftModel, PeftConfig

import torch

from tqdm.auto import tqdm

import networkx as nx
from networkx.algorithms.traversal.depth_first_search import dfs_tree

from datasets import load_dataset, Dataset

In [None]:
# Filesystem layout for the input data.
BASE_DIR = Path("/home/informatics/pdevkota")
DATA_DIR = BASE_DIR / "data"
DATASET_DIR = DATA_DIR / "model_input" / "dataset"

# Hugging Face id of the base model; the adapter directory name is derived from it.
FALCON_MODEL = "tiiuae/falcon-7b-instruct"
MODEL_DIR = Path("./MODELS") / Path(FALCON_MODEL.upper().replace("-", "_")).stem #"MODELS/FALCON_7B_INSTRUCT"
model_name = str(MODEL_DIR)

# Device string passed to the model loader; its trailing index is also used to pick the eval shard.
device = "cuda:0"

In [None]:
# Adapter configuration saved under MODEL_DIR; its base_model_name_or_path is used when loading the base model.
config = PeftConfig.from_pretrained(model_name)

# Quantization settings: 4-bit NF4 weights, double quantization, float16 compute.
quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quant_settings)

In [None]:
# Tokenizer and base weights both come from the checkpoint named in the adapter config.
base_checkpoint = config.base_model_name_or_path
load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": device,
    "trust_remote_code": True,
}

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, **load_kwargs)

In [None]:
# Attach the fine-tuned adapter saved under MODEL_DIR to the base model.
# The isinstance guard keeps this cell idempotent: without it, re-running the
# cell would pass an already-wrapped PeftModel back into from_pretrained.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(model, model_name)

In [None]:
def _pick_split_file(candidates, keyword):
    """Return the first path in `candidates` whose file name contains `keyword`.

    Raises:
        FileNotFoundError: if no candidate matches, so a missing split is
            reported by name instead of surfacing as a bare IndexError.
    """
    matches = [path for path in candidates if keyword in path.name]
    if not matches:
        raise FileNotFoundError(
            f"No v4 JSON file containing '{keyword}' found in {DATASET_DIR}"
        )
    return matches[0]


# iterdir() yields entries in arbitrary order; sorting makes the selected files
# reproducible. Keywords are matched against the file name only, so a directory
# component that happens to contain "v4", "train" or "test" cannot cause a match.
data_files = sorted(
    path for path in DATASET_DIR.iterdir()
    if path.suffix == ".json" and "v4" in path.name
)
train_file = _pick_split_file(data_files, "train")
eval_file = _pick_split_file(data_files, "test")

# Load both splits with the "json" builder under the names "train" and "eval".
dataset = load_dataset(
    "json",
    data_files={
        "train": str(train_file),
        "eval": str(eval_file)
    })

In [None]:
def annotstrflist(example):
    """Render the annotations of `example` as the three-line target string.

    Args:
        example: mapping with an "annot" list. Each annotation provides
            "spanned_text", "go_concept" and "parents", where "parents" is a
            list of mappings carrying a "GO Concept" entry.

    Returns:
        A string with the lines "terms: ...", "GO concepts: ..." and
        "parents: ...". With no annotations every field reads "none";
        otherwise each field is a bracketed, " | "-separated list with one
        entry per annotation. An annotation's parents collapse to "none",
        to the single parent, or to a nested bracketed list.
    """
    def _render_parents(parent_entries):
        # One entry per annotation: "none", the lone parent, or "[p1 | p2 | ...]".
        ids = [entry["GO Concept"] for entry in parent_entries]
        if len(ids) == 0:
            return "none"
        if len(ids) == 1:
            return ids[0]
        return "[" + " | ".join(ids) + "]"

    annotations = example["annot"]
    if len(annotations) == 0:
        return "terms: none\nGO concepts: none\nparents: none"

    # Build (term, concept, parents) triples in one pass, then transpose into columns.
    rows = [
        (item["spanned_text"], item["go_concept"], _render_parents(item["parents"]))
        for item in annotations
    ]
    term_col, concept_col, parent_col = (" | ".join(column) for column in zip(*rows))
    return f"terms: [{term_col}]\nGO concepts: [{concept_col}]\nparents: [{parent_col}]"

In [None]:
def generate_prompt_no_response(example):
    """Tokenize the long-preamble prompt for `example` and measure prompt-plus-answer length.

    Args:
        example: dataset row; "pre" holds the input sentence and the row is
            also passed to annotstrflist to build the expected answer.

    Returns:
        A dict with the tokenizer output for the prompt alone (PyTorch
        tensors), plus "output" (the expected answer string) and
        "len_with_response" (number of input ids when the prompt and the
        expected answer are tokenized together).
    """
    whitespace = re.compile(r"\s+")

    # The literals are indented for readability; runs of whitespace are collapsed to single spaces.
    preamble = whitespace.sub(" ", """Gene Ontology (GO) is a widely used bioinformatics resource that provides a structured
    vocabulary for annotating and categorizing genes and gene products based on their biological functions,
    cellular locations, and molecular activities. You are a gene ontology expert and your objective is to use
    your knowledge of the biological domain and the details provided below to write a response that appropriately
    completes the instruction.""")

    task = whitespace.sub(" ", """Use the input sentence below to label the tokens: terms, GO concepts and parents.
    A term is a word or a phrase (phrase is a sequence of words) that represents a GO concept. Each term
    MUST be present in the provided input sentence. A GO concept refers to a specific term or category with
    GO hierarchy. Each GO concept can have zero or more parents. A parent represents immediate predecessor
    of a GO concept. The response SHOULD have equal number of terms, GO concepts and parents.""")

    sentence = example["pre"]
    prompt_text = f"{preamble}\n\n### Instruction:\n{task}\n\n### Input:\n{sentence}\n\n### Response:\n"
    gold_answer = annotstrflist(example)

    prompt_encoding = tokenizer(prompt_text, return_tensors="pt")
    full_encoding = tokenizer(prompt_text + gold_answer)

    features = dict(prompt_encoding)
    features["output"] = gold_answer
    features["len_with_response"] = len(full_encoding["input_ids"])
    return features

In [None]:
# Tokenize every example, then keep only those whose prompt plus expected
# answer is shorter than 400 tokens. Both steps use one worker per CPU.
n_workers = os.cpu_count()
new_dataset = (
    dataset
    .map(generate_prompt_no_response, num_proc=n_workers)
    .filter(lambda row: row["len_with_response"] < 400, num_proc=n_workers)
)
new_dataset

In [None]:
# Decoding settings for model.generate.
# Fixes two mis-named arguments from the original cell:
#   * 0.85 is a probability mass, which is the nucleus-sampling `top_p`
#     parameter; `top_k` expects an integer token count.
#   * `top_n` is not a generation parameter; the integer 5 is the `top_k` value.
# NOTE(review): temperature / top_p / top_k only influence sampling-based
# decoding. do_sample is not set here, so decoding remains beam search with
# 3 beams as before — confirm whether sampling was intended.
gen_config = GenerationConfig(
    temperature=0.01,
    top_p=0.85,
    top_k=5,
    num_beams=3
)

In [None]:
# Number of shards the eval split is divided into; this notebook processes one of them.
NUM_PARTS = 3

# 1-based shard number derived from the CUDA ordinal ("cuda:0" -> 1).
# Splitting on ":" instead of reading only the last character keeps this
# correct for two-digit ordinals such as "cuda:10".
part = int(device.split(":")[-1]) + 1

# Predictions for this shard are written to predictions/<model dir>/outputs_<part>.json.
output_path = Path(".") / "predictions" / MODEL_DIR.stem / f"outputs_{part}.json"
output_path.parent.mkdir(exist_ok=True, parents=True)

# Shards are contiguous row ranges of equal size; the "+ 1" rounds the size up
# so NUM_PARTS shards always cover the whole split.
split_size = new_dataset["eval"].num_rows // NUM_PARTS + 1
shard_start = split_size * (part - 1)
shard_stop = split_size * part
reqd_dataset = Dataset.from_dict(new_dataset["eval"][shard_start:shard_stop])
print("Dataset Loaded from {0}: {1}".format(shard_start, shard_stop))

In [None]:
def save_to_json(data, filename):
    """Append the items of `data` to the JSON array stored in `filename`.

    The file is created when it does not exist. The updated array is first
    written to a sibling "<name>.tmp" file and then moved over the target with
    os.replace, so an interruption during the write cannot leave a truncated
    file behind (the generation loop reads this file to resume).

    Args:
        data: iterable of JSON-serializable records to append.
        filename: path (str or Path) of the JSON file holding a list.

    Raises:
        json.JSONDecodeError: if the existing file is not valid JSON.
    """
    target = Path(str(filename))
    if target.exists():
        with open(target, "r") as f:
            contents = json.load(f)
    else:
        contents = []
    contents.extend(data)

    tmp_path = target.with_name(target.name + ".tmp")
    with open(tmp_path, "w") as f:
        json.dump(contents, f)
    os.replace(tmp_path, target)

In [None]:
def minimal_prompt(example):
    """Build the short-preamble prompt for `example` and tokenize it.

    Args:
        example: dataset row; only "pre" (the input sentence) is read.

    Returns:
        The tokenizer output for the prompt, as PyTorch tensors.
    """
    whitespace = re.compile(r"\s+")

    # The literals are indented for readability; runs of whitespace are collapsed to single spaces.
    preamble = whitespace.sub(" ", """You are a gene ontology expert and your objective is to use
    your knowledge of the biological domain and the details provided below to write a response that appropriately
    completes the instruction.""")
    task = whitespace.sub(" ", """Use the input sentence below to label the tokens: terms, GO concepts and parents.
    A term is a word or a phrase (phrase is a sequence of words) that represents a GO concept. Each term
    MUST be present in the provided input sentence. A GO concept refers to a specific term or category with
    GO hierarchy. Each GO concept can have zero or more parents. A parent represents immediate predecessor
    of a GO concept. The response SHOULD have equal number of terms, GO concepts and parents.""")

    sentence = example["pre"]
    prompt_text = f"{preamble}\n\n### Instruction:\n{task}\n\n### Input:\n{sentence}\n\n### Response:\n"
    return tokenizer(prompt_text, return_tensors="pt")

In [None]:
# Flush accumulated predictions to disk every SAVE_EVERY examples (1 = after each example).
SAVE_EVERY = 1
# Preprocessing columns that are not written to the predictions file.
# Dropped by filtering rather than pop(), so a column the tokenizer did not
# produce (e.g. "token_type_ids") no longer raises KeyError.
DROP_KEYS = ("input_ids", "attention_mask", "len_with_response", "token_type_ids")

model.eval()

# Resume support: the number of records already saved is the index of the next example to process.
checkpoint_idx = 0
if output_path.exists():
    with open(output_path, "r") as f:
        contents = json.load(f)
    checkpoint_idx = len(contents)

# The context manager closes the progress bar even if generation raises.
with torch.no_grad(), tqdm(
    total=reqd_dataset.num_rows, desc="Generating responses from given prompts"
) as pbar:
    pbar.update(checkpoint_idx)
    chunk_data = []
    for idx in range(checkpoint_idx, reqd_dataset.num_rows):
        example = reqd_dataset[idx]  # fetch the row once and reuse it below
        input_ids = minimal_prompt(example)["input_ids"]
        # NOTE(review): scores are requested but nothing below reads them.
        output = model.generate(
            input_ids=input_ids.to(device),
            generation_config=gen_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=100,
        )
        # The decoded text contains the prompt too: keep what follows the
        # response marker and cut at the first "#" after it.
        response = tokenizer.decode(output.sequences[0]).strip().split("### Response:\n")[1].split("#")[0].strip()
        data = {key: value for key, value in example.items() if key not in DROP_KEYS}
        data["response"] = response
        chunk_data.append(data)
        if (idx + 1) % SAVE_EVERY == 0:
            save_to_json(chunk_data, output_path)
            chunk_data = []
        pbar.update(1)
    # Persist any remainder when SAVE_EVERY does not divide the number of processed examples.
    if chunk_data:
        save_to_json(chunk_data, output_path)
        chunk_data = []