Installing the dependencies

In [None]:
!pip install transformers==4.17   #using this specfic version of transformers leads to no error
!pip install torch
!pip install pandas
!pip install sklearn
!pip install datasets
!pip install rouge_score
!pip install nltk

Importing the dependencies

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
from datasets import load_metric
import nltk
# nltk.word_tokenize (used when computing BLEU) needs the Punkt tokenizer data.
# Older NLTK releases load the 'punkt' resource, newer ones look for 'punkt_tab',
# so fetch both to work with whichever NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')

Pre-processing the dataset

In [None]:
# Load the question/answer dataset.
raw_df = pd.read_csv("intern_screening_dataset.csv")

# Keep only complete, unique questions.
# Later cells read the first two columns positionally and cast them with str(),
# which would turn a missing value into the literal text "nan"; drop such rows
# first, then drop rows that repeat a question.
# NOTE(review): assumes the first two columns are the question and its answer -
# confirm against the CSV header.
df = (
    raw_df
    .dropna(subset=list(raw_df.columns[:2]))
    .drop_duplicates(subset=["question"])
)

# Split the dataset into training and validation sets. A fixed random_state
# makes the split (and therefore the reported metrics) reproducible across runs.
SPLIT_SEED = 42
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Tokenizer for the t5-small checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

#defining the tokenization function

def tokenize_function(examples_ques, examples_ans, tok=None):
    """Tokenize questions and answers into model inputs with loss-ready labels.

    Args:
        examples_ques: list of question strings (the encoder inputs).
        examples_ans: list of answer strings (the targets), same length as
            ``examples_ques``.
        tok: optional tokenizer to use; defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The encoding produced for the questions, with any ``token_type_ids``
        entry removed and an extra ``labels`` entry holding the answer token
        ids. Padding positions in ``labels`` are set to -100 so the loss skips
        them instead of training the model to emit pad tokens.
    """
    active_tok = tokenizer if tok is None else tok

    inputs = active_tok(examples_ques, padding='max_length', truncation=True, return_tensors='pt')
    targets = active_tok(examples_ans, padding='max_length', truncation=True, return_tensors='pt')
    inputs.pop('token_type_ids', None)

    # -100 is the sentinel that compute_metrics maps back to the pad token
    # before decoding; masked_fill returns a new tensor, leaving the
    # tokenizer output untouched.
    target_ids = targets['input_ids']
    inputs['labels'] = target_ids.masked_fill(target_ids == active_tok.pad_token_id, -100)
    return inputs

#converting the datasets into lists to pass it through the tokenization function

# Turn each split into plain Python rows so the first column (question) and the
# second column (answer) can be passed to the tokenization function as lists of strings.
train_examples = train_df.to_numpy().tolist()
val_examples = val_df.to_numpy().tolist()

train_ques = [str(row[0]) for row in train_examples]
train_ans = [str(row[1]) for row in train_examples]
val_ques = [str(row[0]) for row in val_examples]
val_ans = [str(row[1]) for row in val_examples]

# Tokenize both splits.
train_encodings = tokenize_function(train_ques, train_ans)
val_encodings = tokenize_function(val_ques, val_ans)

In [None]:
# Sanity check: print the 'labels' entry to confirm the tokenization step produced it.
print(train_encodings['labels'])

In [None]:
#Creating Dataset class
class QADataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized question/answer encodings.

    ``encodings`` is a mapping from field name (it must include 'input_ids')
    to an indexable collection holding one entry per example - either a
    tensor or a nested list.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, omitting 'token_type_ids'."""
        item = {}
        for key, val in self.encodings.items():
            if key == 'token_type_ids':
                continue
            value = val[idx]
            if torch.is_tensor(value):
                # torch.tensor(<tensor>) warns on every call; clone().detach()
                # is the recommended way to take an independent copy.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        """Number of examples, taken from the 'input_ids' field."""
        return len(self.encodings['input_ids'])

# Wrap each split's encodings in a QADataset for the trainer.
train_dataset, val_dataset = (
    QADataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

Model Training and Evaluation

In [None]:
# Pretrained checkpoint to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output and logging locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and regularisation
    # NOTE(review): warmup_steps=500 may exceed the total number of optimizer
    # steps on a small dataset - confirm against the dataset size.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch, producing predictions with the generate method
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Metrics used by compute_metrics.
bleu_metric = load_metric('bleu')
rouge_metric = load_metric('rouge')

#Defining compute metrics function
def compute_metrics(pred):
    """Compute BLEU and ROUGE scores for generated predictions.

    Args:
        pred: evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids).

    Returns:
        dict with keys 'bleu', 'rouge1', 'rouge2' and 'rougeL'; the ROUGE
        values are the mid F-measure of each aggregate score.
    """
    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some models return extra outputs alongside the token ids.
        pred_ids = pred_ids[0]

    # Work on copies so the arrays held by the caller are not modified in place.
    pred_ids = pred_ids.copy()
    labels_ids = pred.label_ids.copy()

    # -100 is a padding/ignore sentinel, not a vocabulary id, so it cannot be
    # decoded. It marks ignored label positions and can also appear in the
    # predictions as concatenation padding; map it to the pad token in both.
    pred_ids[pred_ids == -100] = pad_id
    labels_ids[labels_ids == -100] = pad_id

    # Decode generated and reference texts
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Compute BLEU on word tokens; each prediction has a single reference
    pred_tokens = [nltk.word_tokenize(s) for s in pred_str]
    label_tokens = [[nltk.word_tokenize(s)] for s in label_str]
    bleu = bleu_metric.compute(predictions=pred_tokens, references=label_tokens)

    # Compute ROUGE on the raw decoded strings
    rouge = rouge_metric.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1", "rouge2", "rougeL"])

    return {
        'bleu': bleu['bleu'],
        'rouge1': rouge['rouge1'].mid.fmeasure,
        'rouge2': rouge['rouge2'].mid.fmeasure,
        'rougeL': rouge['rougeL'].mid.fmeasure,
    }

# Wire the model, training configuration, datasets, tokenizer and metric
# function into the trainer.
trainer = Seq2SeqTrainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    tokenizer=tokenizer, compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so they can be
# reloaded together for inference.
save_dir = './saved_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Inference

In [None]:
from transformers import pipeline

# Reload the saved artefacts and wrap them in a text2text-generation pipeline.
saved_model_dir = './saved_model'
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(saved_model_dir)
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def get_answer(question):
    """Return the text the QA pipeline generates for ``question``."""
    generations = qa_pipeline(question)
    first_generation = generations[0]
    return first_generation['generated_text']

# Interactive chat loop: type "exit" (any case, surrounding spaces ignored) to stop.
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted: end the chat cleanly
        # instead of surfacing a traceback.
        break
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing was typed; prompt again rather than querying the model
        # with an empty string.
        continue
    response = get_answer(user_input)
    print("QABot:", response)