# 1. Prepare headset product question answer (PQA) data

In [2]:
import json
import pandas as pd

# Maximum number of lines (questions) read from the PQA file
number_of_rows_to_process = 10000

# Rows are collected in a plain list and the DataFrame is built once at the end:
# inserting with `qa_df.loc[i] = ...` inside the loop copies the frame on every
# insert, which makes the loop quadratic in the number of rows.
qa_records = []
previous_answer_text = None
rows_processed = 0

with open('amazon_pqa_headsets.json', encoding='utf-8') as file:
    for line in file:
        if rows_processed >= number_of_rows_to_process:
            break

        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing in json.loads
            continue

        # Each line of the file is one JSON object
        data = json.loads(line)

        answers = data.get('answers')
        if not answers:
            # A question without any answer cannot form a positive pair, so skip it
            continue

        question_text = data['question_text']
        answer_text = answers[0]['answer_text']

        # Label 1.0: the answer belongs to this question
        qa_records.append((question_text, answer_text, 1.0))

        if previous_answer_text is not None:
            # Label 0.0: the answer belongs to the previous line's question, not to this one
            qa_records.append((question_text, previous_answer_text, 0.0))

        previous_answer_text = answer_text
        rows_processed += 1

# N processed lines yield N positive and N-1 negative pairs (the first line has no predecessor)
qa_df = pd.DataFrame(qa_records, columns=['question', 'answer', 'label'])

# 2. Split the now labeled dataset into training, validation and test datasets

In [None]:
# Install sentance transfomers
!pip install -U sentence-transformers

In [4]:
from sklearn.model_selection import train_test_split
from sentence_transformers.readers import InputExample

# Fixed seed so that both splits below are identical on every run
RANDOM_SEED = 42

# Shuffle qa_df, then hold out 20% of all rows as test_set; the other 80% go to train_set
train_set, test_set = train_test_split(qa_df, test_size=0.2, shuffle=True, random_state=RANDOM_SEED)

# Optional: export the split for manual inspection
#train_set.to_excel("train_set.xlsx")
#test_set.to_excel("test_set.xlsx")

# Hold out 20% of train_set as validation_set; the remaining 80% becomes training_set
# (overall roughly 64% training / 16% validation / 20% test of qa_df)
training_set, validation_set = train_test_split(train_set, test_size=0.2, random_state=RANDOM_SEED)

# Optional: export the split for manual inspection
#training_set.to_excel("training_set.xlsx")
#validation_set.to_excel("validation_set.xlsx")

# Create a python list of sentence pairings with a similarity score
def create_input_sample(data_set: pd.DataFrame) -> list:
    """Turn labeled question/answer rows into a list of InputExample objects.

    Args:
        data_set: DataFrame with the columns 'question', 'answer' and 'label'.

    Returns:
        One InputExample per row, in row order. Each example carries
        texts=[question, answer] and the row's label. An empty frame
        gives an empty list.
    """
    # Walk the three columns in parallel instead of using iterrows(), which
    # builds a fresh Series object for every row.
    return [
        InputExample(texts=[question, answer], label=label)
        for question, answer, label in zip(
            data_set['question'], data_set['answer'], data_set['label']
        )
    ]

# Convert every split (training, validation, test - in that order) into InputExample lists
training_samples, validation_samples, test_samples = map(
    create_input_sample,
    (training_set, validation_set, test_set),
)

# 3. Fine-tune BERT

In [None]:
!pip install --upgrade torch

In [6]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

# Pre-trained checkpoint that gets fine-tuned on the PQA question/answer pairs
model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"

train_batch_size = 16
num_epochs = 1
evaluation_steps = 1000  # passed to fit(): how often the validation evaluator runs
warmup_fraction = 0.1    # share of all training steps used for warm-up

# '/' is replaced so the model name maps to a single local directory name
model_save_path = 'fine_tuned_' + model_name.replace("/", "-")

# Assemble the model from a transformer module followed by mean pooling over the tokens
word_embedding_model = models.Transformer(model_name)

pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_dataloader = DataLoader(training_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# The evaluator name was misspelled 'pqa-valucation'; the name is a runtime
# string handed to the evaluator, so the typo is corrected here.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(validation_samples, name='pqa-validation')

# len(train_dataloader) is the number of batches per epoch, so this is 10% of all training steps
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * warmup_fraction)

print("Training the model - this could take 30 minutes")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=evaluation_steps,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

# Reload the model that fit() wrote to model_save_path and score it on the held-out test pairs
model = SentenceTransformer(model_save_path)

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='pqa-test')
test_evaluator(model, output_path=model_save_path)

Training the model - this could take 30 minutes


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/273 [00:00<?, ?it/s]

0.4749404868965764