In [None]:
!pip install -q accelerate==0.20.3
!pip install -q transformers==4.30.0
!pip install -q sentence-transformers==2.2.2

In [None]:
import transformers

# Show which transformers release the kernel actually imported.
installed_transformers_version = transformers.__version__
print(installed_transformers_version)

In [None]:
# standard
import os
import numpy as np
import pandas as pd

# DL
import torch
import transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import sentence_transformers

# ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize

In [None]:
from datasets import load_dataset

# Map each split name to its text file, then load all three with the "text" builder.
split_files = {
    "train": "/kaggle/input/finetuning-data/finetuning_train.txt",
    "test": "/kaggle/input/finetuning-data/finetuning_test.txt",
    "val": "/kaggle/input/finetuning-data/finetuning_val.txt",
}
dataset = load_dataset("text", data_files=split_files)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5TokenizerFast, AutoModelForCausalLM

# Tokenizer and model are both loaded from the same checkpoint id.
flan_checkpoint = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(flan_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(flan_checkpoint)

In [None]:
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` column of a batch.

    Args:
        examples: Mapping that exposes a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded

In [None]:
# Length (in tokens) of each chunk produced by group_texts, which reads this global.
block_size = 128

In [None]:
def group_texts(examples, chunk_size=None):
    """Concatenate every column of a tokenized batch and re-split it into fixed-size chunks.

    Args:
        examples: Mapping of column name to a list of per-example lists
            (for instance ``input_ids`` and ``attention_mask``).
        chunk_size: Length of each output chunk. When ``None`` (the default)
            the module-level ``block_size`` is used, so existing
            single-argument callers keep working unchanged.

    Returns:
        dict: The same columns, each a list of ``chunk_size``-long lists, plus
        a ``"labels"`` column that is a shallow copy of ``"input_ids"``.
        Tokens left over after the last full chunk are dropped.

    Raises:
        ValueError: If the effective chunk size is not positive.
    """
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size
    if size <= 0:
        raise ValueError(f"chunk size must be positive, got {size}")

    # chain.from_iterable flattens in linear time; sum(lists, []) copies the
    # growing accumulator on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder that does not fill a whole chunk (no padding is added).
    total_length = (total_length // size) * size
    # Split every column into consecutive chunks of `size` items.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Tokenize every split in batches across 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_datasets2 = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [None]:
# Regroup the tokenized examples into fixed-size chunks, 1000 examples per batch.
grouping_options = dict(batched=True, batch_size=1000, num_proc=4)
lm_datasets2 = tokenized_datasets2.map(group_texts, **grouping_options)

In [None]:
#from transformers import AutoModelForMaskedLM, AutoModelForSeq2SeqLM
#model = AutoModelForMaskedLM.from_pretrained(model_checkpoint2)

In [None]:
# Hub id of the base checkpoint; its last path component is used to name the training run.
model_checkpoint2 = "google/flan-t5-large"

In [None]:
from transformers import Trainer, TrainingArguments

# Single source of truth for the epoch count, so the run/output name cannot
# drift from the value actually used for training (it previously said
# "5_epochs" while training for 10).
num_train_epochs = 10

model_name = model_checkpoint2.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-finetuning_final_data-{num_train_epochs}_epochs.txt",
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    learning_rate=4e-5,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator configured with masked-language-modeling switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    mlm_probability=0.15,
)

In [None]:
# Train on the "train" split and evaluate on "val"; the "test" split is not
# handed to the Trainer here.
train_split = lm_datasets2["train"]
val_split = lm_datasets2["val"]
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=val_split,
)

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

In [None]:
import pickle

filename = 'flan-t5-large-finetuned-finetuning_final_data-10_epochs.h5'
# NOTE: despite the .h5 suffix, this file is a Python pickle of the model object.
# Only unpickle it from a trusted source (pickle.load can execute arbitrary code).
# The context manager guarantees the handle is flushed and closed even if
# serialization fails part-way.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Every prompt shares the same instruction prefix; only the question text varies.
_instruction_prefix = "Answer this question elaborately: "
questions = [
    _instruction_prefix + question_text
    for question_text in (
        "When did the GARDASIL 9 recommendations change?",
        "What were the past 3 recommendation changes for GARDASIL 9?",
        "Is GARDASIL 9 recommended for Adults?",
        "Does the ACIP recommend one dose GARDASIL 9?",
    )
]

In [None]:
# Hand-written prompts: each is one of the questions followed directly by inline
# context text. string1 ends right after "Given this as context: " with no
# context attached. Of these, only string4 is passed to the generator in the
# cells visible here.
string1 = "Answer this question elaborately: When did the GARDASIL 9 recommendations change?Given this as context: "
string2 = "Answer this question elaborately: What were the past 3 recommendation changes for GARDASIL 9?Given this context is true: Evidence supporting 9vHPV use was evaluated using  the Grading of Recommendations, Assessment, Development,  and Evaluation (GRADE) framework ( 5) and determined to  be type 2 (moderate level of evidence) among females and 3 (low level of evidence) among males; the recommendation was categorized as a Category A recommendation (for all persons  in an age- or risk-factor–based group) (6), The evidence supporting 9vHPV vaccination was evaluated using the Grading of  Recommendations, Assessment, Development, and Evaluation  (GRADE) framework and determined to be type 2 (moderate level of evidence) among females and 3 (low level of evidence) among males; the recommendation was designated as a  Category A recommendation (recommendation for all persons  in an age- or risk-factor–based group), The main analyses  were restricted to participants who received all 3 doses, had no evidence of current or past infection with the relevant vaccine HPV type through 1 month after the third dose (month 7), and did not deviate from protocol,  What are the new recommendations? 9vHPV, 4vHPV or 2vHPV can be used for routine vaccination of  females aged 11 or 12 years and females through age 26 years who have not been vaccinated previously or who have not  completed the 3-dose series, The GMTs were noninferior for all nine HPV vaccine types in the co-administered group (all p<0"
string3 = "Answer this question elaborately: Is GARDASIL 9 recommended for Adults?Given this context is true: December 10, 2014 Approval letter— GARDASIL 9, These  recommendations for children and adults aged 9 through 26 years and for adults aged >26 years apply to all persons,   For persons initiating vaccination before their 15th birthday, the recommended  immunization schedule is 2 doses of HPV vaccine (0, 6–12 month schedule), Therefore, vaccination  is recommended through the recommended age for females regardless of whether they have an abnormal Pap test result, and for females or males regardless of known HPV infection, HPV-associated precancer lesions, or anogenital warts, Vaccination of males is  recommended with 4vHPV (as long as this formulation is  available) or 9vHPV,   Vaccination of females  is recommended with 2vHPV, 4vHPV (as long as this for-mulation is available), or 9vHPV"
string4 = "Answer this question elaborately: Does the ACIP recommend one dose GARDASIL 9?Given this as context: ACIP did not recommend \n catch-up vaccination for all adults aged 27 through 45 years, \n but recognized that some persons who are not adequately vaccinated might be at risk for new HPV infection and might benefit from vaccination in this age range; therefore, ACIP recommended shared clinical decision-making regarding potential HPV vaccination for these persons, 11During its February 2015 meeting, the Advisory Committee \n on Immunization Practices (ACIP) recommended 9-valent \n human papillomavirus (HPV) vaccine (9vHPV) (Gardasil 9, \n Merck and Co, FDA licensure of quadrivalent human papillomavirus vaccine (HPV4, Gardasil) for use in males and guidance from the Advisory Committee on Immunization Practices (ACIP), Characteristics of the three human papillomavirus (HPV) vaccines licensed for use in the United States \n Characteristic Bivalent (2vHPV)* Quadrivalent (4vHPV)† 9-valent (9vHPV)§\n Brand name Cervarix Gardasil Gardasil 9\n VLPs 16, 18 6, 11, 16, 18 6, 11, 16, 18, 31, 33, 45, 52, 58\n Manufacturer GlaxoSmithKline Merck and Co, December 10, 2014 Approval letter—\n GARDASIL 9"

In [None]:
from transformers import pipeline

# Build the generation pipeline around the in-memory (fine-tuned) model.
# The device is passed explicitly so the pipeline places its inputs on the same
# device the model currently lives on; without it the pipeline may default to
# CPU while training left the model on the GPU, causing a device mismatch.
text2text_generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer='google/flan-t5-large',
    device=model.device,
)

In [None]:
# Generate an answer for the string4 prompt, passing max_length=50 to the generator.
text2text_generator(string4, max_length=50)

In [None]:
# Ask each context-free question in turn and print the raw pipeline output,
# followed by a blank line as a separator.
for question in questions:
    generated = text2text_generator(question)
    print(generated)
    print()