In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install -q accelerate peft bitsandbytes transformers trl datasets torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import accelerate
import peft
import bitsandbytes
import transformers
import trl
import datasets

In [4]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import TrainingArguments
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Report how many CUDA devices are visible, plus the name and total memory
# (in GB, base 10) of device 0. Say so explicitly when no GPU is visible
# instead of printing nothing.
if torch.cuda.is_available():
    print(
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0),
        torch.cuda.get_device_properties(0).total_memory / 1e9,
    )
else:
    print("No CUDA device available")

2 Tesla T4 15.828320256


In [6]:
# Load the MedTuned instruction dataset by its repository id.
dataset = load_dataset(path="nlpie/Llama2-MedTuned-Instructions")

README.md:   0%|          | 0.00/2.96k [00:00<?, ?B/s]

(…)-00000-of-00001-a8790d88efc2bc45.parquet:   0%|          | 0.00/91.1M [00:00<?, ?B/s]

(…)-00000-of-00001-b543c64b1786c03e.parquet:   0%|          | 0.00/6.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200252 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/70066 [00:00<?, ? examples/s]

In [7]:
# Display the loaded splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'source'],
        num_rows: 200252
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output', 'source'],
        num_rows: 70066
    })
})

In [9]:
# Preview the first three training examples, one field per line.
for i, data in enumerate(dataset['train'].select(range(3))):
    print(f"Data Point {i + 1}:")
    print("Instruction >>>", data['instruction'])
    print("Input       >>>", data['input'])
    print("Output      >>>", data['output'])
    print("\n-----------------------------\n")

Data Point 1:
Instruction >>> In your role as a medical professional, address the user's medical questions and concerns.
Input       >>> My relative suffering from secondary lever cancer ( 4th stage as per Allopathic doctor) and primary is in rectum. He is continuously with 103 to 104 degree F fever. Allpathic doctor suggested chemo only after fever subsidises. Is treatment possible at Lavanya & what is the time scale of recover.
Output      >>> Hi, dairy have gone through your question. I can understand your concern. He has rectal cancer with liver metastasis. It is stage 4 cancer. Surgery is not possible at this stage. Only treatment options are chemotherapy and radiotherapy according to type of cancer. Inspite of all treatment prognosis is poor. Life expectancy is not good. Consult your doctor and plan accordingly. Hope I have answered your question, if you have any doubts then contact me at bit.ly/ Chat Doctor. Thanks for using Chat Doctor. Wish you a very good health.

-----------

In [10]:
# Keep the run fast by working with a small slice of the data.
# The test rows are taken from the validation split, not from the training
# slice: selecting them from dataset["train"] would make every test example
# a row the model was trained on. Reading from "validation" also keeps this
# cell safe to re-run.
dataset["train"] = dataset["train"].select(range(3500))
dataset["test"]  = dataset["validation"].select(range(300))

In [11]:
# Display the splits again to confirm the row counts after subsetting.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'source'],
        num_rows: 3500
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output', 'source'],
        num_rows: 70066
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'source'],
        num_rows: 300
    })
})

In [17]:
# Build the single-turn training prompt for one dataset row.
def create_prompt(sample) -> str:
    """Format one example as a single-turn instruction/response string.

    Args:
        sample: Mapping with the keys ``"instruction"``, ``"input"`` and
            ``"output"``.

    Returns:
        ``"Instruction: <instruction> <input><|end_of_turn|>AI Assistant: <output>"``.
        When ``input`` is empty or ``None`` it is left out, so no dangling
        space ends up in front of ``<|end_of_turn|>``.
    """
    parts = [sample["instruction"]]
    user_input = sample["input"]
    # Only append the input when there is one; this avoids a trailing space
    # (empty string) or a TypeError (None) on rows without an input.
    if user_input:
        parts.append(user_input)
    prompt = " ".join(parts)

    single_turn_prompt = f"Instruction: {prompt}<|end_of_turn|>AI Assistant: {sample['output']}"
    return single_turn_prompt

In [18]:
# Render the prompt for the first training example to check the template.
create_prompt(dataset['train'][0])

"Instruction: In your role as a medical professional, address the user's medical questions and concerns. My relative suffering from secondary lever cancer ( 4th stage as per Allopathic doctor) and primary is in rectum. He is continuously with 103 to 104 degree F fever. Allpathic doctor suggested chemo only after fever subsidises. Is treatment possible at Lavanya & what is the time scale of recover.<|end_of_turn|>AI Assistant: Hi, dairy have gone through your question. I can understand your concern. He has rectal cancer with liver metastasis. It is stage 4 cancer. Surgery is not possible at this stage. Only treatment options are chemotherapy and radiotherapy according to type of cancer. Inspite of all treatment prognosis is poor. Life expectancy is not good. Consult your doctor and plan accordingly. Hope I have answered your question, if you have any doubts then contact me at bit.ly/ Chat Doctor. Thanks for using Chat Doctor. Wish you a very good health."

In [19]:
# Render the prompt for the training example at index 10.
create_prompt(dataset['train'][10])

'Instruction: In the clinical text, your objective is to identify relationships between medical problems, treatments, and tests. Medical problems are tagged as @problem$, medical tests as @test$, and treatments as @treatment$. Classify the relationship between two entities as one of the following:\nTreatment improves medical problem (TrIP)\nTreatment worsens medical problem (TrWP)\nTreatment causes medical problem (TrCP)\nTreatment is administered for medical problem (TrAP)\nTreatment is not administered because of medical problem (TrNAP)\nTest reveals medical problem (TeRP)\nTest conducted to investigate medical problem (TeCP)\nMedical problem indicates medical problem (PIP)\nNo Relations Digoxin 0.125 mg q.d. , @treatment$ 80 mg q.a.m. and 40 mg q.p.m. aspirin 1 q.d. , and @treatment$ three puffs b.i.d.<|end_of_turn|>AI Assistant: No Relations'