In [None]:
# !pip install -Uq sentence-transformers

## Load Configurations & Libraries

In [7]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
## Running this notebook on a GPU machine will be faster.
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# @title Load Configuration
# hbqaconfig is a project-local module. The object returned by setEnv('local')
# is indexed by key in later cells (e.g. conf['QAGS_FOLDER']).
import hbqaconfig
conf = hbqaconfig.setEnv('local')
# When True, the embedding model is loaded further down in the notebook.
embed_now = True
# for k,v in conf.items(): print (k,":",v)

In [None]:
# !pip install huggingface
# import huggingface

# !huggingface-cli login

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DEVICE

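## Selecting and Loading the Embedding Model
- Different models are trained on different kinds of corpora.
- Different models produce vectors of different sizes.
- Some models output normalized embeddings; in that case the dot product and cosine similarity give the same result, and the dot product is cheaper to compute.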

In [3]:
#Select Model Function

# https://www.sbert.net/docs/pretrained_models.html

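# 250MB, multi-qa-distilbert-cos-v1, Max Sequence Length: 512, Dimensions: 768, Normalized Embeddings: true
# 80MB, all-MiniLM-L6-v2, Max Sequence Length: 256, Dimensions: 384, Normalized Embeddings: true
# 290MB, all-distilroberta-v1, Max Sequence Length: 512, Dimensions: 768, Normalized Embeddings: true
# 420MB, all-mpnet-base-v2, Max Sequence Length: 384, Dimensions: 768, Normalized Embeddings: true
# NOTE: select_embmodel below lists multi-qa-mpnet-base-dot-v1 (not all-mpnet-base-v2) at index 3.
#       Its name indicates dot-product scoring - TODO confirm its size, sequence length and
#       normalization on the page linked above before relying on "dot == cosine".
# 1.36GB, all-roberta-large-v1, Max Sequence Length: 256, Dimensions: 1024, Normalized Embeddings: true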

def select_embmodel(num):
    """Pick an embedding model by its position in the catalogue.

    Parameters
    ----------
    num : int
        Index into the catalogue below (normal list indexing applies).

    Returns
    -------
    tuple of (str, str, str)
        The full model name, its short alias, and the full model name
        prefixed with an underscore.
    """
    # (short alias, full model name) pairs, in selection order.
    catalogue = [
        ('distilbert', 'multi-qa-distilbert-cos-v1'),
        ('minilm', 'all-MiniLM-L6-v2'),
        ('distilroberta', 'all-distilroberta-v1'),
        ('mpnet', 'multi-qa-mpnet-base-dot-v1'),
        ('roberta', 'all-roberta-large-v1'),
    ]

    embmodelshort, embmodelname = catalogue[num]
    embmodelname1 = f"_{embmodelname}"

    print (embmodelname,'\t',embmodelshort,'\t', embmodelname1)
    return embmodelname, embmodelshort, embmodelname1

In [4]:
# Index 3 selects the 'mpnet' entry of the lists inside select_embmodel.
embmodelname, embmodelshort, embmodelname1 = select_embmodel(3)

multi-qa-mpnet-base-dot-v1 	 mpnet 	 _multi-qa-mpnet-base-dot-v1


In [8]:
# Load the selected SentenceTransformer model only when embedding is enabled.
# NOTE(review): when embed_now is False, embmodel is never assigned here, so
# the embmodel.encode() calls in later cells would fail with NameError.
if embed_now:
  embmodel = SentenceTransformer(embmodelname)

Downloading (…)16ebc/.gitattributes: 100%|██████████| 737/737 [00:00<00:00, 47.2kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 12.2kB/s]
Downloading (…)b6b5d16ebc/README.md: 100%|██████████| 8.65k/8.65k [00:00<00:00, 2.12MB/s]
Downloading (…)b5d16ebc/config.json: 100%|██████████| 571/571 [00:00<00:00, 83.2kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 7.43kB/s]
Downloading (…)ebc/data_config.json: 100%|██████████| 25.5k/25.5k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [03:46<00:00, 1.93MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 123kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<?, ?B/s] 
Downloading (…)16ebc/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.42MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<?, ?B/s] 
Downloading (…)6ebc/train_script.py: 100%|██████████| 13.9k/13.9k [0

# Util Functions

In [10]:
# Decorator that reports how long the decorated function takes to run.
# It is used below to time the embedding calls.
def get_time(func):
    """Wrap ``func`` so that each call prints its elapsed time.

    Parameters
    ----------
    func : callable
        The function to time. It may take any positional or keyword
        arguments; they are forwarded unchanged.

    Returns
    -------
    callable
        A wrapper that calls ``func``, prints
        ``Time Taken to Process <seconds>`` and returns ``func``'s result.
        If ``func`` raises, the exception propagates and nothing is printed.
    """
    import functools
    import time

    # functools.wraps keeps the decorated function's __name__ and docstring.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments during a long-running call.
        start_time = time.perf_counter()

        result = func(*args, **kwargs)

        time_spent = time.perf_counter() - start_time

        print(f"Time Taken to Process {time_spent}")
        return result

    return wrapper

# Creating Chunk Embedding

## Load Chunk Dataset

In [19]:
# The path is built by plain string concatenation, so conf['QAGS_FOLDER']
# must already end with a path separator.
FileToProcess = '06.10-ChatGPTAPI_Chunk.csv' # Chunk file, based on Chapter 01, 03 Parva
df= pd.read_csv(conf['QAGS_FOLDER']+FileToProcess) # This CSV was created from a Google Drive file. It has the chunk info.

print(df.shape)
# Optional filters on a ChatGPT_Response column (currently disabled):
# df = df.loc[df['ChatGPT_Response'].notna(),:]
# df = df.loc[df['ChatGPT_Response'].str.len()>10,:]
# print(df.shape)

(285, 3)


In [20]:
df.tail(10)

Unnamed: 0,Chunk_Id,Section_Id,Chunk
275,275,Book03_210,"A1:Section CCX\n""Markandeya continued, 'O Bhar..."
276,276,Book03_212,"A1:Section CCXII\n""The Brahmana enquired, 'How..."
277,277,Book03_213,"A1:Section CCXIII\n""Markandeya said, 'When, O ..."
278,278,Book03_214,"A1:Section CCXIV\n""Markandeya continued, 'The ..."
279,279,Book03_215,"A1:Section CCXV\n""The fowler continued, 'Thus ..."
280,280,Book03_218,"A1:Section CCXVIII\n""Markandeya continued, 'Vr..."
281,281,Book03_220,"A1:Section CCXX\n""Markandeya continued, 'The f..."
282,282,Book03_221,"A1:Section CCXXI\nMarkandeya continued, ""Mudit..."
283,283,Book03_223,"A1:Section CCXXIII\n""The lady replied, 'I am a..."
284,284,Book03_224,"A1:Section CCXXIV\n""Markandeya continued, 'O l..."


In [23]:
# Use Chunk_Id as the row index.
# The guard keeps this cell idempotent: after the first run Chunk_Id is the
# index and no longer a column, so an unguarded set_index would raise KeyError
# when the cell is executed again.
if 'Chunk_Id' in df.columns:
    df = df.set_index('Chunk_Id')

In [24]:
df.head(3)

Unnamed: 0_level_0,Section_Id,Chunk
Chunk_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Book01_002,THE MAHABHARATA ADI PARVA Section I\nOm! Havin...
1,Book01_002,The Rishi Vyasa published this mass of knowled...
2,Book01_002,"Vyasa executed the compilation of the Bharata,..."


In [26]:
# Encode every chunk text into an embedding vector (timed via get_time).
@get_time
def get_chunk_vector():
  """Return the embeddings of all texts in the Chunk column of df."""
  chunk_texts = df["Chunk"].tolist()
  return embmodel.encode(chunk_texts)

Chunk_Embeddings = get_chunk_vector()

Time Taken to Process 831.6819748878479


In [17]:
# What is the type of Chunk_Embeddings?
# type(Chunk_Embeddings)

In [27]:
# Convert the chunk embeddings into a torch tensor of dtype torch.float and
# move it to the selected DEVICE.
# NOTE(review): DEVICE comes from the device-selection cell near the top of the
# notebook; that cell must have been run before this one.
ChunkVectors= torch.tensor(Chunk_Embeddings, dtype=torch.float).to(DEVICE)

In [19]:
# Flatten the tensors into 1D arrays
# ChunkVec_list = ChunkVectors.tolist()

In [28]:
# Store the embeddings in the dataset: tolist() converts the tensor into
# nested Python lists, so each row of df receives its own list of floats.
# The new column is named 'ChunkVector' plus the model-name suffix.
df['ChunkVector'+embmodelname1 ] = ChunkVectors.tolist()

In [29]:
print(df.shape)
df.head(3)

(285, 3)


Unnamed: 0_level_0,Section_Id,Chunk,ChunkVector_multi-qa-mpnet-base-dot-v1
Chunk_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Book01_002,THE MAHABHARATA ADI PARVA Section I\nOm! Havin...,"[-0.014535784721374512, -0.08219598978757858, ..."
1,Book01_002,The Rishi Vyasa published this mass of knowled...,"[0.2731589078903198, -0.15328504145145416, -0...."
2,Book01_002,"Vyasa executed the compilation of the Bharata,...","[-0.019269350916147232, -0.4155203402042389, -..."


In [31]:
# Save the chunk vectors to disk.
# Chunk_Id was moved into the index by the earlier set_index cell, so a named
# index has to be written out; otherwise the saved file loses the chunk
# identifier. An unnamed default index is still omitted, as before.
filenm = '06.10-ChatGPTAPI_Chunk_Vector'+embmodelname1+'.csv'
df.to_csv(conf['QAGS_FOLDER'] + filenm, index=(df.index.name is not None))
print(embmodelname)

multi-qa-mpnet-base-dot-v1


# Question Embedding

In [32]:
# NOTE: this rebinds df, replacing the chunk DataFrame used in the previous section.
FileToProcess = '06.12-ChatGPTAPI_QA_with_Chunk.csv' # Question/answer file, based on Chapter 01, 03 Parva
df= pd.read_csv(conf['QAGS_FOLDER']+FileToProcess) # This CSV was created from a Google Drive file. It has the questions and reference answers used below.

print(df.shape)

(1084, 8)


In [33]:
# Encode the questions and the reference answers (timed via get_time).
@get_time
def get_qa_vector():
  """Return (question embeddings, reference-answer embeddings) for df."""
  # Both columns are converted to lists first, then encoded in order.
  questions, answers = (df[column].tolist() for column in ("Question", "Ref_Answer"))
  question_vecs = embmodel.encode(questions)
  answer_vecs = embmodel.encode(answers)
  return question_vecs, answer_vecs

Question_Embeddings, Ref_Answer_Embeddings = get_qa_vector()

Time Taken to Process 361.96942472457886


In [34]:
# Convert the question and reference-answer embeddings into torch tensors of
# dtype torch.float and move them to the selected DEVICE.
QuestionVectors= torch.tensor(Question_Embeddings, dtype=torch.float).to(DEVICE)
Ref_AnswerVectors= torch.tensor(Ref_Answer_Embeddings, dtype=torch.float).to(DEVICE)

# Store the embeddings in the dataset as nested Python lists (one list per row).
# The new columns are named 'QuestionVector' / 'Ref_AnswerVector' plus the model-name suffix.
df['QuestionVector'+embmodelname1 ] = QuestionVectors.tolist()
df['Ref_AnswerVector'+embmodelname1 ] = Ref_AnswerVectors.tolist()

In [36]:
# Save the question/answer vectors to disk.
# NOTE(review): index=None is presumably treated like index=False (no index
# column written) - confirm against the pandas to_csv documentation.
filenm = "06.12-ChatGPTAPI_QA_Vector" + embmodelname1 +".csv"
df.to_csv(conf['QAGS_FOLDER'] + filenm, index=None)

print(embmodelname)

multi-qa-mpnet-base-dot-v1


In [37]:
df.head(3)

Unnamed: 0,Ques_Id,Chunk_Id,Section_Id,Question,Ref_Answer,Chunk,WordsInQues,WordsInAns,QuestionVector_multi-qa-mpnet-base-dot-v1,Ref_AnswerVector_multi-qa-mpnet-base-dot-v1
0,0,0,Book01_002,Who asked Sauti about his journey?,One of the Rishis beginning the conversation a...,THE MAHABHARATA ADI PARVA Section I\nOm! Havin...,6,12,"[0.2826710641384125, -0.3960507810115814, -0.3...","[0.028256800025701523, -0.1837993562221527, -0..."
1,1,0,Book01_002,What did the Rishis wish to hear from Sauti?,The Rishis wished to hear the wonderful narrat...,THE MAHABHARATA ADI PARVA Section I\nOm! Havin...,9,10,"[0.47221648693084717, -0.4937230348587036, -0....","[0.246647909283638, -0.41256460547447205, -0.3..."
2,2,0,Book01_002,What did the Rishis ask Sauti to recite?,The Rishis asked Sauti to recite the sacred st...,THE MAHABHARATA ADI PARVA Section I\nOm! Havin...,8,32,"[0.4244302809238434, -0.7060792446136475, -0.3...","[0.21940578520298004, -0.16441603004932404, -0..."
