<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/TineTunning_Testing_For_AviationQADataset_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Pytorch & other libraries
!pip install torch tensorboard --quiet

# Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

#FlashAttention only supports Ampere GPUs or newer. #NEED A100 IN GOOGLE COLAB
#!pip install -U transformers
!pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install trl ninja packaging --quiet

# Uncomment only if you're using A100 GPU
#!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet


!pip install huggingface_hub -q

In [None]:
import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

Mounted at /content/gdrive


In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)

In [None]:
import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
#model_name = "frankmorales2020/Mistral-7B-Instruct-v0.1_AviationQA" # 07/06/2024

model_name = 'mistralai/Mistral-7B-Instruct-v0.3'

# set device
device = 'cuda'

#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# We redefine the pad_token and pad_token_id with out of vocabulary token (unk_token)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

In [None]:
#del model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# Hugging Face model id
model_id = "mistralai/Mistral-7B-Instruct-v0.1" #01 march 2024, 08 MARCH 2024 and 27 MAY 2024

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    #attn_implementation="flash_attention_2", ## BertLMHeadModel does not support Flash Attention 2.0 yet
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    is_decoder=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True)
tokenizer.padding_side = 'right' # to prevent warnings

# We redefine the pad_token and pad_token_id with out of vocabulary token (unk_token)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

#peft_model_id = "frankmorales2020/Mistral-7B-Instruct-v0.1_AviationQA" # 07/06/2024
#peft_model_id = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2"
peft_model_id = 'mistralai/Mistral-7B-Instruct-v0.3'

# Load Model with PEFT adapter
#model = AutoPeftModelForCausalLM.from_pretrained(

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# load into pipeline
pipe = pipeline("document-question-answering", model=model, tokenizer=tokenizer)

In [67]:
# Convert dataset to OAI messages
system_message = """You are an AVIATION EXPERT. Users will ask you questions in English and you will generate ANSWERS based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message},
      {"role": "user", "content": sample["Question"]},
      {"role": "assistant", "content": sample["Answer"]}
    ]
  }


In [None]:
from datasets import load_dataset
print("Preprocessing dataset AviationQA")

dataset = load_dataset("sakharamg/AviationQA")
#dataset = dataset.map(create_prompt_formats)

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation)


# save datasets to disk
dataset["train"].to_json("train_dataset_AviationQA.json", orient="records")
dataset["validation"].to_json("validation_dataset_AviationQA.json", orient="records")
dataset["test"].to_json("test_dataset_AviationQA.json", orient="records")

In [None]:
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

In [None]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="/content/test_dataset_AviationQA.json", split="train")

In [71]:
dataset

Dataset({
    features: ['id', 'Question', 'Answer', 'messages'],
    num_rows: 10807
})

In [72]:
nrec=10
dataset_final_id=dataset['id'][0:nrec]
dataset_final_Question=dataset['Question'][0:nrec]
dataset_final_Answer=dataset['Answer'][0:nrec]
dataset_final_Messages=dataset['messages'][0:nrec]

In [75]:
dataset_final_Messages[1]

[{'content': 'You are an AVIATION EXPERT. Users will ask you questions in English and you will generate ANSWERS based on the provided SCHEMA.\nSCHEMA:\n{schema}',
  'role': 'system'},
 {'content': 'When was the Last Flight Review or Equivalent done of the pilot for the accident no. GAA17CA495?',
  'role': 'user'},
 {'content': 'May 13, 2017', 'role': 'assistant'}]

In [65]:
import pandas as pd

datasetF = pd.DataFrame() # Create an empty DataFrame
datasetF['id'] = dataset_final_id
datasetF['Question'] = dataset_final_Question
datasetF['Answer'] = dataset_final_Answer

In [66]:
datasetF

Unnamed: 0,id,Question,Answer
0,a4905077116,What is the name of the Engine Manufacturer of...,Lycoming
1,a7510929859,When was the Last Flight Review or Equivalent ...,"May 13, 2017"
2,a5430892651,What was the flight conducted under for the ac...,Part 91: General aviation - Personal
3,a9405153070,What was the Observation Time of the accident ...,13:54 Local
4,a8193000066,What is the name of the Engine Manufacturer of...,LYCOMING
5,a9610719319,What is the Airworthiness Certificate of the a...,Normal
6,a7001234003,Was there any Operating Certificate(s) held of...,
7,a9135502144,How many Seats are there in the aircraft beari...,2
8,a157434574,What is the Serial Number of the aircraft bear...,07-12-03
9,a1523521076,What is the Airframe Total Time of the acciden...,10663 Hrs as of last inspection


In [76]:
from datasets import load_dataset
eval_dataset = load_dataset("json", data_files="/content/test_dataset_AviationQA.json", split="train")

In [77]:
eval_dataset

Dataset({
    features: ['id', 'Question', 'Answer', 'messages'],
    num_rows: 10807
})

In [None]:
eval_dataset[0]["messages"][:2]

In [80]:
from transformers import pipeline

#qa_pipeline = pipeline("question-answering")
#qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

#generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
eval_dataset[2]['Question']

'What was the flight conducted under for the accident no. GAA18CA423?'

In [57]:
eval_dataset[2]['Answer']

'Part 91: General aviation - Personal'

https://nbviewer.org/github/frank-morales2020/MLxDL/blob/main/upload_model_hf.ipynb

In [None]:
eval_dataset[rand_idx]["messages"][0]["content"]

In [158]:
rand_idx = randint(0, len(eval_dataset))

In [164]:
#rand_idx = randint(0, len(eval_dataset))
modified_conversation = [
    {"role": "user", "content": eval_dataset[rand_idx]["messages"][:3][1]['content']},
    {"role": "assistant", "content": eval_dataset[rand_idx]["messages"][:3][2]['content']}
]

In [165]:
modified_conversation

[{'role': 'user',
  'content': 'What kind of Medical Certification were done by the pilot for the accident no. ERA10CA469?'},
 {'role': 'assistant', 'content': 'Class 2 With waivers/limitations'}]

In [173]:
from datasets import load_dataset
from random import randint

#model.generation_config.pad_token_id = tokenizer.pad_token_id

# Inspect the structure of the messages
#print(eval_dataset[rand_idx]["messages"][:2])

# Modify the conversation to ensure alternating roles if needed
# For example, if the first two messages are both user messages:
modified_conversation = [
    {"role": "user", "content": eval_dataset[rand_idx]["messages"][:3][1]['content']},
    {"role": "assistant", "content": eval_dataset[rand_idx]["messages"][:3][2]['content']}
]

#print(modified_conversation)

#Use the chat template to format the input for the language model
prompt = pipe.tokenizer.apply_chat_template(modified_conversation, tokenize=False, add_generation_prompt=True)


# Replace 'text-generation' with the appropriate task for your model
# For example, if you're using a causal language model:
generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Pass the prompt to the generation pipeline
outputs = generation_pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.9,
                              top_k=50, top_p=0.1, eos_token_id=tokenizer.eos_token_id,
                              pad_token_id=tokenizer.eos_token_id)

In [174]:
outputs

[{'generated_text': "<s>[INST] What kind of Medical Certification were done by the pilot for the accident no. ERA10CA469? [/INST]Class 2 With waivers/limitations</s> The accident you're referring to is ERA10CA469, which occurred on October 10, 2010, involving an Embraer ERJ-190 operated by Era Aviation. To find out the specific medical certifications held by the pilot, I would need to access the official accident investigation report, which is typically published by the National Transportation Safety Board (NTSB) in the United States.\n\nHowever, I can tell you that for commercial airline pilots in the United States, the minimum medical certification required is a First Class Medical Certificate. This certification is issued by an Aviation Medical Examiner (AME) and requires a physical examination to ensure the pilot is in good health and meets the physical and mental requirements for flying.\n\nIn some cases, pilots may have waivers or limitations on their medical certificates due to 

In [175]:
#rand_idx = randint(0, len(eval_dataset))
print(f"Question:\n{eval_dataset[rand_idx]['Question']}")
print()
print(f"Original Answer:\n{eval_dataset[rand_idx]['Answer']}")
print()
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Question:
What kind of Medical Certification were done by the pilot for the accident no. ERA10CA469?

Original Answer:
Class 2 With waivers/limitations

Generated Answer:
The accident you're referring to is ERA10CA469, which occurred on October 10, 2010, involving an Embraer ERJ-190 operated by Era Aviation. To find out the specific medical certifications held by the pilot, I would need to access the official accident investigation report, which is typically published by the National Transportation Safety Board (NTSB) in the United States.

However, I can tell you that for commercial airline pilots in the United States, the minimum medical certification required is a First Class Medical Certificate. This certification is issued by an Aviation Medical Examiner (AME) and requires a physical examination to ensure the pilot is in good health and meets the physical and mental requirements for flying.

In some cases, pilots may have waivers or limitations on their medical certificates due to