In [None]:
!pip -q install langchain-core
!pip -q install langchain-community
!pip -q install langchain_huggingface
#!pip -q install langchain_chroma
!pip -q install PyPDF2
!pip -q install transformers
!pip -q install datasets
!pip -q install -U accelerate bitsandbytes peft trl
!pip -q install jsonlines
!pip install tiktoken

In [None]:
import os
import re
import PyPDF2
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, DataCollatorForLanguageModeling#
#from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Authentication for the Hugging Face API.
# The token is read interactively (never hard-coded) and exported under both
# environment variable names set below.

import os
from getpass import getpass

# getpass hides the typed/pasted value; strip() removes stray spaces or newlines
# that commonly get copied along with a token and would otherwise end up in the
# environment variables.
hfapi_key = getpass("Enter your HuggingFace access token:").strip()
os.environ["HF_TOKEN"] = hfapi_key
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hfapi_key

In [None]:
# Function to read document pdf files
def read_pdf(pdf_path, start_page=4):
    """
    Extract the text of a PDF, skipping its leading pages.

    :param pdf_path: Path to the PDF file to read
    :param start_page: Zero-based index of the first page to extract. The
                       default of 4 skips the first four pages, i.e. extraction
                       starts at page 5. Negative values are treated as 0.
    :return: Text of all extracted pages concatenated into one string
             (empty string if the document has no pages at or after start_page)
    """
    first_page = max(start_page, 0)

    # Open the PDF file; pages must be read while the file handle is still open
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)

        # "or ''" guards against a page for which no text is returned
        page_texts = [
            reader.pages[page_num].extract_text() or ""
            for page_num in range(first_page, len(reader.pages))
        ]

    return "".join(page_texts)


In [None]:
# Read the source document: extract its text with read_pdf.
# The path is relative, so AIML.pdf is resolved against the current working directory.

pdf_path = 'AIML.pdf'
text_file = read_pdf(pdf_path)

In [None]:
#print(text_file[:8000])

In [None]:
# Clean-up passes over the extracted text, applied in the listed order.
# Each entry is (regex, replacement, strip_afterwards):
#   1. collapse runs of newlines into a single newline
#   2. collapse runs of spaces into a single space
#   3./4. drop header and page-number fragments
# NOTE(review): the two header patterns name "International Gita Society" and
# "Bhagavad -Gita" - confirm they match the headers of the PDF actually loaded.
cleanup_steps = (
    (r'\n+', '\n', True),
    (r' +', ' ', True),
    (r' \d+ International Gita Society', '', False),
    (r' Bhagavad -Gita \d+', '', False),
)

for pattern, replacement, strip_afterwards in cleanup_steps:
    text_file = re.sub(pattern, replacement, text_file)
    if strip_afterwards:
        text_file = text_file.strip()

In [None]:
#print(text_file[:8000])

In [None]:
# Keep a fixed number of words (100 by default) per line inside the text
def rechunk_words(text, words_per_line=100):
    """
    Re-flow text so that every line holds exactly `words_per_line` words.

    Words are whitespace-separated tokens; the original line breaks are ignored.
    The last line holds the remaining words and may be shorter.

    :param text: Input text
    :param words_per_line: Number of words to place on each output line (>= 1)
    :return: Re-flowed text, each line terminated by a newline
             (empty string when the input contains no words)
    :raises ValueError: If words_per_line is smaller than 1
    """
    if words_per_line < 1:
        raise ValueError("words_per_line must be at least 1")

    words = text.split()

    # Build all lines first and join once instead of growing a string in a loop
    return ''.join(
        ' '.join(words[start:start + words_per_line]) + '\n'
        for start in range(0, len(words), words_per_line)
    )


new_text_file = rechunk_words(text_file)

In [None]:
#print(new_text_file[:8000])

In [None]:
#len(new_text_file.split('\n')[0].split())

In [None]:
# Split the text into training and validation sets

train_fraction = 0.8

# Split on line boundaries rather than on a character offset: a character-based
# cut lands in the middle of a line (and usually in the middle of a word), which
# leaves a truncated sample at the end of the training text and at the start of
# the validation text.
text_lines = new_text_file.splitlines(keepends=True)
split_index = int(train_fraction * len(text_lines))   # number of lines in the training set

train_text = ''.join(text_lines[:split_index])
val_text = ''.join(text_lines[split_index:])

In [None]:
#len(train_text)

In [None]:
# Save the training and validation data as text files.
# The encoding is given explicitly: text extracted from a PDF can contain
# non-ASCII characters, and without it open() falls back to the platform's
# default encoding, which can fail on write or produce files that are decoded
# differently when they are read back.

with open("train.txt", "w", encoding="utf-8") as f:
    f.write(train_text)

with open("val.txt", "w", encoding="utf-8") as f:
    f.write(val_text)

In [None]:
# Set up the tokenizer for the base checkpoint that will be fine-tuned
checkpoint = "openai-community/gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)    # other sizes to try: gpt2-medium, gpt2-large, gpt2-xl

# Reuse the unknown token as the padding token, so the tokenizer has a pad token
# available when samples are padded to a fixed length during tokenization.
tokenizer.pad_token = tokenizer.unk_token

In [None]:
# Tokenize a sample text using GPT2Tokenizer and display the resulting encoding
sample_ids = tokenizer("Hello world")
sample_ids

In [None]:
# Map the sample's token ids back to their token strings
sample_tokens = tokenizer.convert_ids_to_tokens(sample_ids['input_ids'])
sample_tokens

In [None]:
# Join the token strings back into text to check the round trip
tokenizer.convert_tokens_to_string(sample_tokens)

In [None]:
# Load the saved text files as a dataset with "train" and "validation" splits
train_file_path = 'train.txt'
val_file_path = 'val.txt'

data_files = {
    "train": train_file_path,
    "validation": val_file_path,
}
dataset = load_dataset("text", data_files=data_files)

In [None]:
#dataset

In [None]:
#dataset['train']['text'][0]

In [None]:
block_size = 256     # number of tokens every sample is padded/truncated to

def tokenize_function(examples):
    """
    Tokenize a batch of dataset rows to a fixed length.

    :param examples: Batch of rows; its "text" entry is passed to the tokenizer
    :return: Tokenizer output, padded and truncated to block_size tokens
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=block_size,
        return_tensors='pt',
    )
    return tokenizer(examples["text"], **tokenizer_options)

# batched=True hands tokenize_function several rows at a time
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
#len(tokenized_datasets['train']['input_ids'][0])

In [None]:
# Sanity check: decode the second training sample (row index 1) back into text
tokenizer.decode(tokenized_datasets['train'][1]['input_ids'])

In [None]:
# Create the data collator that assembles training batches as PyTorch tensors.
# mlm=False turns off masked-language-modelling, which is what a causal model like GPT-2 needs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

In [None]:
# Set up the model: load the pretrained GPT-2 weights from the same checkpoint as the tokenizer
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [None]:
# Set up the training arguments
# NOTE(review): the output path is an absolute /content path (Colab-style);
# adjust it when running elsewhere.
# NOTE(review): no evaluation schedule is configured here, so the validation
# split is presumably only used if evaluation is triggered explicitly - confirm.

model_output_path = "/content/tutor_model"

training_args = TrainingArguments(
    output_dir = model_output_path,
    overwrite_output_dir = True,
    per_device_train_batch_size = 4, # lower to 2 if memory is tight
    per_device_eval_batch_size = 4,  # lower to 2 if memory is tight
    num_train_epochs = 100,
    save_steps = 1_000,
    save_total_limit = 2,
    logging_dir = './logs',
    )

In [None]:
# Build the Trainer from the model, training arguments, data collator and the
# tokenized train/validation splits. Training itself is started in the next cell.

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
)

In [None]:
# Run the fine-tuning loop (long-running cell)
trainer.train()

In [None]:
# Save the fine-tuned model
saved_model_path = "/content/finetuned_aitutor_model"
trainer.save_model(saved_model_path)

# Save the tokenizer into the same directory so both can be reloaded from one path
tokenizer.save_pretrained(saved_model_path)

In [None]:
def generate_response(model, tokenizer, prompt, max_length=200):

    """
    Generate a response using the fine-tuned model

    :param model: Model used for generation; must provide `device` and `generate`
    :param tokenizer: Tokenizer matching the model, used to encode the prompt
                      and decode the generated ids
    :param prompt: Input prompt
    :param max_length: Value passed to `generate` as `max_length`; it limits the
                       total sequence length (prompt tokens plus generated
                       tokens), not only the newly generated part
    :return: Generated text (decoded with special tokens removed)
    """
    # Tokenize input and move it to the device the model lives on
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=True
    ).to(model.device)

    # Pass the padding id explicitly instead of leaving it for generate() to
    # pick; fall back to the end-of-sequence id when the tokenizer has no pad
    # token. "is None" is used because 0 is a valid token id.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    # Generate response (sampling, so repeated calls give different outputs)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        pad_token_id=pad_token_id
    )

    # Decode and return response
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Load the fine-tuned model and tokenizer back from the local save directory
# (same path that the model and tokenizer were saved to above)
saved_model_path = "/content/finetuned_aitutor_model"
my_model_finetuned = GPT2LMHeadModel.from_pretrained(saved_model_path)
my_tokenizer_finetuned = GPT2Tokenizer.from_pretrained(saved_model_path)

In [None]:
# Testing: generate a response from the locally reloaded fine-tuned model.
# Generation uses sampling, so the output differs between runs.

prompt = "What is Artificial Intelligence?"
response = generate_response(my_model_finetuned, my_tokenizer_finetuned, prompt)
print("Generated response:")
response

In [None]:
#Push your fine-tuned model to HuggingFace Model Hub
!huggingface-cli login

In [None]:
# Push the fine-tuned model to the Hugging Face Hub repository named by my_repo
my_repo = "ai-tutor-towardsai-updated"
my_model_finetuned.push_to_hub(repo_id= my_repo, commit_message= "Upload updated fine-tuned model")

In [None]:
# Push the tokenizer to the same Hub repository as the model
my_tokenizer_finetuned.push_to_hub(repo_id= my_repo, commit_message= "Upload updated tokenizer used")

In [None]:
# Load the model and tokenizer back from the Hub and test them with user input prompts.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead; it is the
# auto class for causal language models such as GPT-2.
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE(review): the checkpoint id carries an account namespace ("chsubhasis/");
# replace it with the namespace the model was actually pushed to.
my_checkpoint = "chsubhasis/ai-tutor-towardsai-updated"
loaded_model = AutoModelForCausalLM.from_pretrained(my_checkpoint)
loaded_tokenizer = AutoTokenizer.from_pretrained(my_checkpoint)

In [None]:
# Test the model loaded from the Hub. The prompt previously contained a typo
# ("Artifician"), which changes the tokens the model is conditioned on.
prompt = "What is Artificial Intelligence?"           # Replace with your desired prompt
response = generate_response(loaded_model, loaded_tokenizer, prompt)
print("Generated response:")
response