In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Collecting pip
  Downloading pip-23.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.2.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m101.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.3 MB/s[0m eta [36m0:00:

In [2]:
import json
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
with open(r'/content/drive/MyDrive/Badminton ChatBot/badmintondata.json') as f:
  data = json.load(f)

In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [6]:
prompt_template = """### Question:
{question}

### Answer:"""

train_datadict = {
    'question': [prompt_template.format(question = ele['question']) for ele in data['train']],
    'answer': [ele['answer'] for ele in data['train']]
}

val_datadict = {
    'question': [prompt_template.format(question = ele['question']) for ele in data['val']],
    'answer': [ele['answer'] for ele in data['val']]
}

train_dataset = Dataset.from_dict(train_datadict)
val_dataset = Dataset.from_dict(val_datadict)

In [7]:
model_name='google/flan-t5-base'
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [8]:
def tokenize_function(example):
    example['input_ids'] = tokenizer(example["question"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["answer"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

train_datasets = train_dataset.map(tokenize_function, batched=True)
val_datasets = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

Instruction Finetuned

In [10]:
output_dir = f'/content/drive/MyDrive/Badminton ChatBot/badminton_qa-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_steps=20,
    max_steps=500,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=val_datasets
)

trainer.train()
trainer.save_model(r"/content/drive/MyDrive/Badminton ChatBot/instruction-fined-tuned-flanT5")

results = trainer.evaluate()
print(results)

Step,Training Loss
20,39.275
40,38.0
60,38.2875
80,37.275
100,37.1875
120,37.0375
140,36.45
160,36.275
180,36.5375
200,36.1375


{'eval_loss': 38.122039794921875, 'eval_runtime': 25.0687, 'eval_samples_per_second': 8.417, 'eval_steps_per_second': 2.114, 'epoch': 9.43}


In [11]:
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Badminton ChatBot/instruction-fined-tuned-flanT5")

In [12]:
questions = [ele['question'] for ele in data['val'][0:10]]
val_ques = val_dataset['question'][0:10]
val_ans = val_dataset['answer'][0:10]
tuned_model_answer = []
for ques in val_ques:
  question_ids = tokenizer(ques,return_tensors = 'pt').input_ids

  answer_pred = finetuned_model.generate(input_ids=question_ids, generation_config=GenerationConfig(max_new_tokens=200))
  answer_pred_text = tokenizer.decode(answer_pred[0], skip_special_tokens=True)
  tuned_model_answer.append(answer_pred_text)

zipped_summaries = list(zip(questions, val_ans, tuned_model_answer))

df = pd.DataFrame(zipped_summaries, columns = ['User Question','actual_answer', 'tuned_model_answer'])
df

Unnamed: 0,User Question,actual_answer,tuned_model_answer
0,Who won the last BWF World Championships in ba...,The winner of the last BWF World Championships...,samuel khan
1,"What is the ""shuttlecock"" in badminton made of?",Shuttlecocks can be made with feathers or plas...,a splinter
2,"What is ""court coverage"" in badminton, and why...",Court coverage means positioning yourself effe...,The court is the only one that can be defended.
3,"Can you explain the ""singles sideline"" in badm...",The singles sideline is the boundary line that...,The singles sideline is when players are paire...
4,"Can you explain the concept of ""followthrough""...",Followthrough is the continuation of your swin...,The following is a list of players who have be...
5,"Tell me about the ""Uber Cup"" in badminton.",The Uber Cup is an international women's team ...,"The ""Uber Cup"" is a tournament played in the U..."
6,Share some strategies for returning a powerful...,"To return a powerful smash, anticipate the sho...","You can use a slam to knock out a shot, but yo..."
7,"What is a ""let rally"" in badminton?",A let rally is a rally that must be replayed d...,a round of ten
8,Share some common badminton etiquette rules.,"In badminton, common etiquette includes shakin...",- Always wear a tennis racket. - Always wear a...
9,Share some tips for maintaining a badminton ra...,"To maintain a racquet, keep it in a protective...","You can use a racquet to play badminton, but y..."


In [13]:
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
val_ques = val_dataset['question']
val_ans = val_dataset['answer']
tuned_model_answer = []
for ques in val_ques:
  question_ids = tokenizer(ques,return_tensors = 'pt').input_ids

  answer_pred = finetuned_model.generate(input_ids=question_ids, generation_config=GenerationConfig(max_new_tokens=200))
  answer_pred_text = tokenizer.decode(answer_pred[0], skip_special_tokens=True)
  tuned_model_answer.append(answer_pred_text)

rouge = evaluate.load('rouge')
tuned_model_results = rouge.compute(
    predictions=tuned_model_answer,
    references=val_ans,
    use_aggregator=True,
    use_stemmer=True,
)
print(tuned_model_results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.103752622484373, 'rouge2': 0.01666938112804986, 'rougeL': 0.08958394325660252, 'rougeLsum': 0.08999591352002446}


In [14]:
def inference(question, tokenizer, model):
  prompt_template = """### Question:
  {question}

  ### Answer:"""
  text = prompt_template.format(question = question)
  text_token = tokenizer(text, return_tensors = 'pt').input_ids
  answer_out = model.generate(input_ids=text_token, generation_config=GenerationConfig(max_new_tokens=200))
  answer_text = tokenizer.decode(answer_out[0], skip_special_tokens=True)
  return answer_text


tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Badminton ChatBot/instruction-fined-tuned-flanT5")

In [None]:
!pip install gradio
import gradio as gr

In [17]:
def question_answer(question):
  return inference(question, tokenizer,finetuned_model)
gr.close_all()
demo = gr.Interface(fn=question_answer,
                    inputs=[gr.Textbox(label="Ask anything about Badminton game:", lines=3)],
                    outputs=[gr.Textbox(label="Answer:", lines=3)],
                    title="Badminton Insights"
                   )
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://f4df8d7e9b02a3732f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




PFET

In [18]:
from peft import LoraConfig, get_peft_model, TaskType

In [19]:
lora_config = LoraConfig(
    r=16, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

peft_model = get_peft_model(base_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1769472
all model parameters: 249347328
percentage of trainable model parameters: 0.71%


In [20]:
output_dir = f'/content/drive/MyDrive/Badminton ChatBot/pfet/badminton_qa-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_steps=10,
    max_steps=500,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=val_datasets
)

trainer.train()
trainer.save_model(r"/content/drive/MyDrive/Badminton ChatBot/pfet-fined-tuned-flanT5")

results = trainer.evaluate()
print(results)



Step,Training Loss
10,35.375
20,35.85
30,35.6
40,35.175
50,35.35
60,34.825
70,34.75
80,34.675
90,34.475
100,33.675


{'eval_loss': 30.563980102539062, 'eval_runtime': 26.1904, 'eval_samples_per_second': 8.056, 'eval_steps_per_second': 2.024, 'epoch': 9.43}


In [21]:
pfettuned_model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Badminton ChatBot/pfet-fined-tuned-flanT5")

Some weights of the model checkpoint at /content/drive/MyDrive/Badminton ChatBot/pfet-fined-tuned-flanT5 were not used when initializing T5ForConditionalGeneration: ['encoder.block.4.layer.0.SelfAttention.q.lora_B.default.weight', 'decoder.block.0.layer.1.EncDecAttention.q.lora_B.default.weight', 'decoder.block.4.layer.0.SelfAttention.q.lora_A.default.weight', 'encoder.block.2.layer.0.SelfAttention.v.lora_B.default.weight', 'encoder.block.10.layer.0.SelfAttention.v.lora_A.default.weight', 'decoder.block.3.layer.1.EncDecAttention.q.lora_A.default.weight', 'decoder.block.1.layer.1.EncDecAttention.q.lora_B.default.weight', 'encoder.block.2.layer.0.SelfAttention.q.lora_A.default.weight', 'decoder.block.3.layer.0.SelfAttention.v.lora_A.default.weight', 'encoder.block.3.layer.0.SelfAttention.q.lora_B.default.weight', 'decoder.block.10.layer.1.EncDecAttention.v.lora_A.default.weight', 'decoder.block.10.layer.1.EncDecAttention.v.lora_B.default.weight', 'encoder.block.0.layer.0.SelfAttention.v.

In [None]:
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
val_ques = val_dataset['question']
val_ans = val_dataset['answer']
tuned_model_answer = []
for ques in val_ques:
  question_ids = tokenizer(ques,return_tensors = 'pt').input_ids

  answer_pred = pfettuned_model.generate(input_ids=question_ids, generation_config=GenerationConfig(max_new_tokens=200))
  answer_pred_text = tokenizer.decode(answer_pred[0], skip_special_tokens=True)
  tuned_model_answer.append(answer_pred_text)

rouge = evaluate.load('rouge')
tuned_model_results = rouge.compute(
    predictions=tuned_model_answer,
    references=val_ans,
    use_aggregator=True,
    use_stemmer=True,
)
print(tuned_model_results)