### Fine-tuning gpt-4o-mini for AI Tutor creation

In [None]:
!pip -q install langchain-core
!pip -q install langchain-community
!pip -q install langchain_huggingface
!pip -q install langchain_chroma
!pip -q install PyPDF2
!pip -q install transformers
!pip -q install datasets
!pip -q install -U accelerate bitsandbytes peft trl

In [None]:
!pip -q install jsonlines

In [None]:
!pip install tiktoken



In [None]:
# Authentication for the Hugging Face API

import os
from getpass import getpass

# Ask for the token without echoing it. Surrounding whitespace is stripped
# because a pasted token often carries a stray space or newline, which would
# make an otherwise valid token fail authentication.
hfapi_key = getpass("Enter your HuggingFace access token:").strip()

# Export the token under both variable names
# (presumably so that both huggingface_hub and the LangChain Hugging Face
# integrations can pick it up - confirm against the libraries in use).
os.environ["HF_TOKEN"] = hfapi_key
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hfapi_key

Enter you HuggingFace access token:··········


In [None]:
from collections import defaultdict
format_errors = defaultdict(int)

def validate_dataset(output_data):
    """Check chat-format examples and print a count for every kind of error.

    Each example is expected to be a dict with a non-empty "messages" list
    whose items are dicts carrying at least "role" and "content". Error counts
    are collected in the module-level ``format_errors`` mapping, which is reset
    at the start of every call so that a report only describes the dataset that
    was just passed in (previously counts from an earlier call leaked into
    later reports).

    Args:
        output_data: iterable of examples to validate.

    Returns:
        None. The summary is printed to stdout.
    """
    # Reset in place rather than rebinding, so any code that already holds a
    # reference to the module-level mapping sees this run's results.
    format_errors.clear()

    for ex in output_data:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            # A non-dict message cannot be inspected with .get(); count it
            # instead of letting the validator crash with AttributeError.
            if not isinstance(message, dict):
                format_errors["message_data_type"] += 1
                continue

            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        # Every example needs at least one assistant turn.
        if not any(
            isinstance(message, dict) and message.get("role", None) == "assistant"
            for message in messages
        ):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("\nNo errors found in the Formatted dataset \n")

In [None]:
import tiktoken

def counting_no_tokens(output_data):
  """Print the total number of tokens across all entries of the dataset.

  For every entry the 'content' values of its 'messages' are joined with single
  spaces and encoded with the tiktoken encoding registered for "gpt-4o-mini";
  the lengths of the encodings are summed.
  """
  encoding = tiktoken.encoding_for_model("gpt-4o-mini")
  total_tokens = 0
  for entry in output_data:
    joined_contents = " ".join(msg['content'] for msg in entry['messages'])
    total_tokens += len(encoding.encode(joined_contents))
  print(f"Total number of tokens in the Dataset: {total_tokens} \n")

In [None]:
from huggingface_hub import hf_hub_download
import json
import jsonlines
from pprint import pprint

def dataset_preparation(file_name):
    """Download a Q&A JSONL file from the Hub and save it in chat format.

    The file is fetched from the "jaiganesan/GPT_4o_mini_Fine_tune" dataset
    repository into /content. Every record's 'question' and 'answer' are
    wrapped in a system/user/assistant "messages" list, the result is passed to
    validate_dataset and counting_no_tokens, and finally written to
    ``formatted_<base name>.jsonl`` in the current working directory.

    Args:
        file_name: name of the JSONL file inside the dataset repository.

    Raises:
        KeyError: if a record has no 'question' or 'answer' field.
    """
    file_path = hf_hub_download(
        repo_id="jaiganesan/GPT_4o_mini_Fine_tune",
        filename=file_name,
        repo_type="dataset",
        local_dir="/content"
    )

    # JSONL holds one JSON object per line. Read it as UTF-8 regardless of the
    # platform default and tolerate blank lines (e.g. a trailing newline).
    with open(file_path, "r", encoding="utf-8") as file:
        data = [json.loads(line) for line in file if line.strip()]

    # Show the fifth record as a sample, or the last one when the file is
    # shorter, instead of failing with IndexError on small datasets.
    preview_index = min(4, len(data) - 1)

    print("Total entries in the dataset:", len(data))
    print("-_"*30)
    if data:
        print(data[preview_index])

    output_data = []

    for entry in data:
        formatted_entry = {
            "messages": [
                {"role": "system", "content": "As AI Tutor, answer questions related to AI topics in an in-depth and factual manner."},
                {"role": "user", "content": entry['question']},
                {"role": "assistant", "content": entry['answer']}
            ]
        }
        output_data.append(formatted_entry)

    # Validate and analyze the output data
    validate_dataset(output_data)
    counting_no_tokens(output_data)

    print("-_"*30)
    if output_data:
        print(output_data[preview_index])

    # Use only the final path component so that a repo file living in a
    # sub-folder still yields a valid local output name.
    base_file_name = os.path.splitext(os.path.basename(file_name))[0]
    output_file_path = f'formatted_{base_file_name}.jsonl'

    with jsonlines.open(output_file_path, mode='w') as writer:
        writer.write_all(output_data)

    print(f"\nFormatted dataset has been saved to {output_file_path}.")

In [None]:
# Training Dataset
dataset_preparation("question_answers_data_100.jsonl")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Total entries in the dataset: 100
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
{'source': 'tai_blog', 'question': 'What are the key advantages of using BiFPN in object detection compared to conventional methods?', 'answer': "BiFPN, or Bi-directional Feature Pyramid Network, offers several advantages in object detection when compared to conventional methods. It's part of the EfficientDet family of object detectors developed by Google Research and is designed to enhance the efficiency and scalability of object detection models.\n\n### Key Advantages of BiFPN:\n\n1. **Weighted Feature Fusion:**\n   Unlike conventional methods that simply sum up input features during feature fusion, BiFPN introduces learnable weights to adjust the importance of different input features. This means that during multi-scale fusion, input features are not merely combined indiscriminately but are weighted according to their relevance, which enhances the accuracy of the fusion process.\n\n2. **Bi

In [None]:
# Evaluation Dataset
dataset_preparation("question_answers_data_30.jsonl")

Total entries in the dataset: 30
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
{'source': 'openai_cookbooks', 'question': 'How can creating high-quality evaluations for large language models like GPT-4 improve the stability and reliability of AI applications?', 'answer': "Creating high-quality evaluations for large language models (LLMs), like GPT-4, significantly enhances the stability and reliability of AI applications. Evaluations serve as a robust mechanism to monitor and assess how well these models perform across various scenarios, ultimately leading to improvements in model robustness and reliability.\n\nFirstly, high-quality evaluations can help identify and address areas where models may be underperforming. For instance, systematic evaluations can uncover issues such as drifting performance or deteriorating accuracy over time. By regularly evaluating LLMs against a comprehensive set of benchmarks, developers can detect and correct potential degradation in model 

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

checkpoint = "openai-community/gpt2"

# Base GPT-2 weights that the rest of the notebook fine-tunes.
model = GPT2LMHeadModel.from_pretrained(checkpoint)

# Matching tokenizer. Padding reuses the unknown token; the tokenizer repr
# shown further down reports both as '<|endoftext|>'.
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.unk_token

In [None]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
tokenizer

GPT2Tokenizer(name_or_path='openai-community/gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [None]:
from datasets import load_dataset

train_file_name = 'question_answers_data_100.jsonl'
val_file_name = 'question_answers_data_30.jsonl'

# Load both raw JSONL files as the "train" and "validation" splits.
# The names are relative paths: this assumes the working directory is
# /content, where dataset_preparation downloaded the files.
dataset = load_dataset(
    "json",
    data_files=dict(train=train_file_name, validation=val_file_name),
)

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'question', 'answer'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['source', 'question', 'answer'],
        num_rows: 30
    })
})

In [None]:
# Collapse the three raw columns into a single training string
def merge_columns(example):
    """Replace 'source', 'question' and 'answer' with one 'text' field.

    The three values are joined with single spaces, in that order. The mapping
    is modified in place and the same object is returned.
    """
    raw_columns = ('source', 'question', 'answer')
    example['text'] = " ".join(f"{example[column]}" for column in raw_columns)
    # Drop the originals so that only 'text' remains
    for column in raw_columns:
        del example[column]
    return example

# Apply the function to both train and validation datasets.
# Note: merge_columns deletes the raw columns, so re-running this cell on the
# already-merged dataset raises KeyError; reload the dataset first.
for split_name in ('train', 'validation'):
    dataset[split_name] = dataset[split_name].map(merge_columns)

# Each row now carries a single 'text' field
print(dataset['train'][0])  # Check the first entry

{'text': "tai_blog What are the benefits of using debate among AI agents to improve their learning process compared to traditional feedback methods? Debate among AI agents introduces a novel and compelling method for improving the learning process by applying concepts from game theory, specifically zero-sum games, where agents are pit against each other to argue a point, with a human judge determining the winner. This approach offers several benefits:\n\n1. **Enhanced Feedback Quality**: In a debate, adversarial relationships are created, forcing AI agents to constructively criticize each other's arguments. This process improves the feedback provided by human judges, as each agent attempts to present the most truthful and useful information to win the debate. The pressure to defend a position against criticism naturally elevates the quality and truthfulness of information exchanged.\n\n2. **Overcoming Human Feedback Limitations**: In traditional AI learning environments, continuous hum

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 30
    })
})

In [None]:
dataset['train']['text'][0]

"tai_blog What are the benefits of using debate among AI agents to improve their learning process compared to traditional feedback methods? Debate among AI agents introduces a novel and compelling method for improving the learning process by applying concepts from game theory, specifically zero-sum games, where agents are pit against each other to argue a point, with a human judge determining the winner. This approach offers several benefits:\n\n1. **Enhanced Feedback Quality**: In a debate, adversarial relationships are created, forcing AI agents to constructively criticize each other's arguments. This process improves the feedback provided by human judges, as each agent attempts to present the most truthful and useful information to win the debate. The pressure to defend a position against criticism naturally elevates the quality and truthfulness of information exchanged.\n\n2. **Overcoming Human Feedback Limitations**: In traditional AI learning environments, continuous human feedba

In [None]:
dataset['validation']['text'][0]

'langchain What types of file formats can be processed using Azure AI Document Intelligence? Azure AI Document Intelligence can process a variety of file formats. Supported file formats include:\n\n- **PDF**: Portable Document Format files.\n- **JPEG/JPG**: Image files in the JPEG format.\n- **PNG**: Image files in the Portable Network Graphics format.\n- **BMP**: Bitmap image files.\n- **TIFF**: Tagged Image File Format files, often used in professional photography and publishing.\n- **HEIF**: High-Efficiency Image File format, commonly used in modern digital photography.\n- **DOCX**: Microsoft Word document files.\n- **XLSX**: Microsoft Excel spreadsheet files.\n- **PPTX**: Microsoft PowerPoint presentation files.\n- **HTML**: Hypertext Markup Language files, used for web pages.\n\nThis broad range of supported formats allows Azure AI Document Intelligence to handle both textual and graphical content, making it versatile for various document types, whether scanned or digital. If you 

In [None]:
block_size = 256     # maximum number of tokens kept per input sample

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch to a fixed length.

    Texts are truncated and padded to ``block_size`` tokens and the tokenizer
    output is requested as PyTorch tensors.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=block_size,
        return_tensors='pt',
    )
    return tokenizer(examples["text"], **tokenizer_options)

# batched=True passes several rows to tokenize_function per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 30
    })
})

In [None]:
# Add the 'labels' feature to the tokenized_datasets
def add_label_feature(example):
    """Attach a 'labels' entry that refers to the row's 'input_ids'.

    The row is modified in place and returned; 'labels' is the very same
    object as 'input_ids', not a copy.
    """
    token_ids = example['input_ids']
    example['labels'] = token_ids
    return example

tokenized_datasets = tokenized_datasets.map(add_label_feature)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 30
    })
})

In [None]:
# Set up the training arguments
from transformers import Trainer, TrainingArguments
model_output_path = "/content/tutor_model"

training_args = TrainingArguments(
    output_dir = model_output_path,
    overwrite_output_dir = True,
    per_device_train_batch_size = 4, # try with 2
    per_device_eval_batch_size = 4,  #  try with 2
    num_train_epochs = 10,
    # 100 samples / batch size 4 * 10 epochs = 250 optimizer steps in total.
    # The library's default logging interval (500 steps) is never reached, which
    # leaves the "Training Loss" table empty; log every 10 steps instead.
    logging_steps = 10,
    # NOTE: save_steps exceeds the 250 total steps, so no intermediate
    # checkpoint is written; the model is saved explicitly after training.
    save_steps = 1_000,
    save_total_limit = 2,
    logging_dir = './logs',
)

In [None]:
from transformers import DataCollatorForLanguageModeling
# Batch collator that is handed to the Trainer. mlm=False switches off
# masked-language-modelling (presumably selecting plain causal-LM batches for
# GPT-2 - confirm against the transformers docs); return_tensors="pt" requests
# PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

In [None]:
# Train the model
from transformers import Trainer
# Wire together the GPT-2 model, the training arguments, the language-modelling
# collator and the two tokenized splits.
# NOTE(review): eval_dataset is supplied, but training_args sets no evaluation
# schedule, so it is presumably only used when trainer.evaluate() is called
# explicitly - confirm.
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"]
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchsubhasis[0m ([33mchsubhasis-enphase-energy[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


TrainOutput(global_step=250, training_loss=2.20593701171875, metrics={'train_runtime': 94.5875, 'train_samples_per_second': 10.572, 'train_steps_per_second': 2.643, 'total_flos': 130646016000000.0, 'train_loss': 2.20593701171875, 'epoch': 10.0})

In [None]:
# Save the model
saved_model_path = "/content/finetuned_aitutor_model"
trainer.save_model(saved_model_path)

# Save the tokenizer
tokenizer.save_pretrained(saved_model_path)

('/content/finetuned_aitutor_model/tokenizer_config.json',
 '/content/finetuned_aitutor_model/special_tokens_map.json',
 '/content/finetuned_aitutor_model/vocab.json',
 '/content/finetuned_aitutor_model/merges.txt',
 '/content/finetuned_aitutor_model/added_tokens.json')

In [None]:
#Now reuse this new model

my_model_finetuned = GPT2LMHeadModel.from_pretrained(saved_model_path)
my_tokenizer_finetuned = GPT2Tokenizer.from_pretrained(saved_model_path)

In [None]:
import torch
def generate_response(model, tokenizer, prompt, max_length=200):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        model: object exposing ``parameters()`` and ``generate(...)``.
        tokenizer: object exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: text to continue.
        max_length: forwarded unchanged to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Run on whichever device holds the model's weights.
    target_device = next(model.parameters()).device

    # 'pt' makes the tokenizer return a PyTorch tensor.
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(target_device)

    generation_kwargs = {
        "max_length": max_length,
        "num_return_sequences": 1,
        # Single prompt: the mask is all ones, same shape as the ids.
        "attention_mask": torch.ones_like(prompt_ids),
        # The end-of-sequence token id doubles as the padding id.
        "pad_token_id": tokenizer.eos_token_id,
    }
    sequences = model.generate(prompt_ids, **generation_kwargs)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)

In [None]:
prompt = "What is Artifician Intelligence?"
response = generate_response(my_model_finetuned, my_tokenizer_finetuned, prompt)
print("Generated response:")
response

Generated response:


'What is Artifician Intelligence? Artifician intelligence is a measure of the ability of a mathematician to grasp the complexities of mathematical problems and applications without resorting to traditional methods. It is defined as the ability to grasp the intricate mathematical structures of mathematical problems and applications without resorting to traditional methods.\n\n### What Is Artifician Intelligence? Artifician intelligence is defined as the ability to grasp the intricate mathematical structures of mathematical problems and applications without resorting to traditional methods. It is defined as the ability to grasp the intricate mathematical structures of mathematical problems and applications without resorting to traditional methods.\n\n### What Is Its Benefits? Artifician intelligence is crucial for understanding the intricate mathematical structures of mathematical applications. It helps in understanding the relationships between mathematical operations and data structure

In [None]:
#Push your fine-tuned model to HuggingFace Model Hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) 
Token is valid (permission: fineGr

In [None]:
# Push model
my_repo = "ai-tutor-towardsai"
my_model_finetuned.push_to_hub(repo_id= my_repo, commit_message= "Upload fine-tuned model")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/chsubhasis/ai-tutor-towardsai/commit/2e60190e12d8fc4159456e18aae6965519c1135e', commit_message='Upload fine-tuned model', commit_description='', oid='2e60190e12d8fc4159456e18aae6965519c1135e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/chsubhasis/ai-tutor-towardsai', endpoint='https://huggingface.co', repo_type='model', repo_id='chsubhasis/ai-tutor-towardsai'), pr_revision=None, pr_num=None)

In [None]:
# Push tokenizer
my_tokenizer_finetuned.push_to_hub(repo_id= my_repo, commit_message= "Upload tokenizer used")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/chsubhasis/ai-tutor-towardsai/commit/67f2981cd2c5771359d5f49381d95b49ac0c8975', commit_message='Upload tokenizer used', commit_description='', oid='67f2981cd2c5771359d5f49381d95b49ac0c8975', pr_url=None, repo_url=RepoUrl('https://huggingface.co/chsubhasis/ai-tutor-towardsai', endpoint='https://huggingface.co', repo_type='model', repo_id='chsubhasis/ai-tutor-towardsai'), pr_revision=None, pr_num=None)

In [None]:
#Load the model and tokenizer back from Hub and test it with user input prompts
from transformers import AutoModelForCausalLM, AutoTokenizer

my_checkpoint = "chsubhasis/ai-tutor-towardsai"
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# supported auto class for decoder-only language models such as this
# fine-tuned GPT-2.
loaded_model = AutoModelForCausalLM.from_pretrained(my_checkpoint)
loaded_tokenizer = AutoTokenizer.from_pretrained(my_checkpoint)



config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

In [None]:
prompt = "What is Artifician Intelligence?"           # Replace with your desired prompt
response = generate_response(loaded_model, loaded_tokenizer, prompt)
print("Generated response:")
response

Generated response:


'What is Artifician Intelligence? Artifician intelligence is a measure of the ability of a mathematician to grasp the complexities of mathematical problems and applications without resorting to traditional methods. It is defined as the ability to grasp the intricate mathematical structures of mathematical problems and applications without resorting to traditional methods.\n\n### What Is Artifician Intelligence? Artifician intelligence is defined as the ability to grasp the intricate mathematical structures of mathematical problems and applications without resorting to traditional methods. It is defined as the ability to grasp the intricate mathematical structures of mathematical problems and applications without resorting to traditional methods.\n\n### What Is Its Benefits? Artifician intelligence is crucial for understanding the intricate mathematical structures of mathematical applications. It helps in understanding the relationships between mathematical operations and data structure