In [92]:
import pandas as pd
import numpy as np
import PyPDF2
from PyPDF2 import PdfMerger
import re
from nltk.corpus import stopwords
from chromadb import Client, Settings
from chromadb.utils import embedding_functions
from openai import OpenAI  
from dotenv import load_dotenv
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Trainer, TrainingArguments
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the spaCy "en_core_web_sm" English pipeline once at module level;
# `tokenize` below reuses this shared `nlp` object for every call.
nlp = spacy.load("en_core_web_sm")

In [3]:
# --- Configuration: project paths and the pretrained GPT-2 objects ---
# All paths are derived from the current working directory, so they depend on
# where the kernel was started.

# Parent of the current working directory (cwd + separator + "..", normalised).
root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)
# Absolute path of the current working directory.
Code_Path = os.path.abspath(os.curdir)
# <root_dir>/Data
Data_Path = os.path.join(root_dir,"Data")
# <root_dir>/Data/Input_PDF -- the directory handed to add_text_to_collection.
Input_Data = os.path.join(Data_Path,"Input_PDF")

# Disabled ChromaDB collection setup, kept for reference only.
#ef = embedding_functions.ONNXMiniLM_L6_V2()
#client = Client(settings = Settings(persist_directory="./", is_persistent=True))
#collection_ = client.get_or_create_collection(name="SinglePDFRead", embedding_function=ef)

# Load the pretrained "gpt2" tokenizer and LM-head model by name.
# NOTE(review): neither `tokenizer` nor `model` is referenced by the other
# visible cells -- confirm they are still needed before paying the load cost.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [4]:
# Split a string into consecutive chunks of a fixed number of characters

def chunkstring(string: str, length: int):
    """Lazily split ``string`` into consecutive pieces of ``length`` characters.

    Args:
        string: The text to split.
        length: Number of characters per piece; the final piece may be shorter.

    Returns:
        A generator yielding the pieces in order. It can be consumed only once.
    """
    # The start offsets are computed up front, so an invalid step (e.g. 0)
    # fails at call time rather than on first iteration.
    offsets = range(0, len(string), length)
    return (string[start:start + length] for start in offsets)

In [5]:
def read_pdf(directory: str,length: int):
    """Extract the text of every PDF in ``directory`` as fixed-length chunks.

    No text preprocessing (lower-casing, stop-word removal, ...) is applied;
    the chunks hold the raw text returned by ``page.extract_text()``.

    Args:
        directory: Folder that is scanned (non-recursively) for ``*.pdf`` files.
        length: Chunk size in characters, forwarded to ``chunkstring``.

    Returns:
        dict mapping the zero-based page index to a list of text chunks.
        When the directory holds several PDFs, chunks from the same page index
        of different files are appended to the same list instead of the later
        file overwriting the earlier one.
    """
    documents = {}

    # os.listdir order is arbitrary; sort so repeated runs produce the same chunk order.
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith('.pdf'):
            continue

        reader = PyPDF2.PdfReader(os.path.join(directory, filename))

        for page_no, page in enumerate(reader.pages):
            texts = page.extract_text()

            # Materialise the chunk generator into a list (re-iterable) and
            # extend rather than assign, so a second PDF does not clobber the
            # chunks already stored for this page index.
            documents.setdefault(page_no, []).extend(chunkstring(texts, length))

    return documents

In [6]:
def tokenize(txt):
    """Return the spaCy vector of every token in ``txt``.

    Despite the name, the token strings themselves are not returned; only the
    per-token ``.vector`` values produced by the module-level ``nlp`` pipeline.

    Args:
        txt: Text to run through the ``nlp`` pipeline.

    Returns:
        list with one vector per token, in document order.
    """
    # The original also built a list of token texts that was never used.
    return [token.vector for token in nlp(txt)]

In [90]:
def add_text_to_collection(directory: str,length: int):
    """Read the PDFs in ``directory`` and build a TF-IDF index over their chunks.

    Args:
        directory: Folder of PDF files, passed on to ``read_pdf``.
        length: Chunk size in characters, passed on to ``read_pdf``.

    Returns:
        dict with four parallel entries:
        ``'ids'`` (sequential ints starting at 0), ``'documents'`` (chunk text
        with newlines removed), ``'metadatas'`` (``{'Page no': <page>}`` per
        chunk) and ``'Vectors'`` (result of ``TfidfVectorizer.fit_transform``
        over the documents).
    """
    pages = read_pdf(directory, length)

    # Flatten {page: chunks} into (page number, newline-free chunk) pairs,
    # preserving page order and chunk order within each page.
    flattened = [
        (page_no, chunk.replace("\n", ""))
        for page_no, page_chunks in pages.items()
        for chunk in page_chunks
    ]

    docs_strings = [text for _, text in flattened]
    metadatas = [{'Page no': page_no} for page_no, _ in flattened]
    ids = list(range(len(flattened)))

    # Fit TF-IDF on all chunks at once; each chunk becomes one row.
    embedding_vectors = TfidfVectorizer().fit_transform(docs_strings)

    return {
        'ids': ids,
        'documents': docs_strings,
        'metadatas': metadatas,
        'Vectors': embedding_vectors,
    }

In [60]:
# Build the TF-IDF index over every PDF in Input_Data, using 1000-character chunks.
# The result is kept at module level because get_answer reads `embedded_data` directly.
embedded_data = add_text_to_collection(Input_Data,1000)

# GPT-2 Model Training

In [91]:
def query_collection(question,answer,embedding_model=None):
    """Return the candidate in ``answer`` that is most similar to ``question``.

    Args:
        question: The query string.
        answer: Sequence of candidate text chunks to rank.
        embedding_model: Optional model forwarded to ``string_similarity``.
            When omitted (``None``) the ranking falls back to TF-IDF cosine
            similarity, which needs nothing beyond the candidates themselves.

    Returns:
        The element of ``answer`` with the highest similarity score.

    Raises:
        ValueError: If ``answer`` is empty or ``None``.
    """
    if answer is None or len(answer) == 0:
        raise ValueError("query_collection needs at least one candidate answer")

    # The original re-read and re-vectorised every PDF on each call and fitted a
    # throwaway vectorizer on the question alone; none of those results were
    # used, so that work has been removed.
    if embedding_model is not None:
        # NOTE(review): `string_similarity` is not defined in any visible cell;
        # this branch keeps the original call for callers that pass a model.
        similarities = string_similarity(question, answer, embedding_model)
    else:
        # Fit the vocabulary on the candidates, then project the question into
        # the same space so the cosine scores are comparable.
        vectorizer = TfidfVectorizer()
        answer_vectors = vectorizer.fit_transform(answer)
        question_vector = vectorizer.transform([question])
        similarities = cosine_similarity(question_vector, answer_vectors).ravel()

    best_answer_index = np.argmax(similarities)

    return answer[best_answer_index]

In [67]:
# Sanity check: retrieve the stored chunk most similar to a sample question.
# NOTE(review): `embedding_model` is not defined in any visible cell, so this
# call relies on leftover kernel state -- confirm where it is created.
query_collection("What is Machine Learning",embedded_data.get("documents"),embedding_model)

'Understanding Body Movement '

In [38]:
# Open AI Model 
def get_response(queried_texts: list[str],):
    """Send the retrieved context and question to the OpenAI chat API.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    (``load_dotenv()`` runs first so a local ``.env`` file is honoured) instead
    of being embedded in the notebook.

    Args:
        queried_texts: Text fragments that are concatenated with ``''.join``
            into the single user message. A plain string works as well, since
            joining its characters reproduces the string.

    Returns:
        The message content of the first completion choice.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.

    Side effects:
        Rebinds the module-level ``messages`` list to the system prompt, the
        user message and the assistant reply of this call.
    """
    global messages

    # Credentials must never live in source. The key that used to be
    # hard-coded here has been exposed and should be revoked and rotated.
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it or add it to a .env file"
        )
    client = OpenAI(api_key=api_key)

    # The system prompt text (including its continuation-line whitespace) is
    # kept exactly as before so the model receives the same instructions.
    messages = [
                {"role": "system", "content": "You are a helpful assistant.\
                 And will always answer the question asked in 'ques:' and \
                 will quote the page number while answering any questions,\
                 It is always at the start of the prompt in the format 'page n'."},
                {"role": "user", "content": ''.join(queried_texts)}
          ]

    response = client.chat.completions.create(
                            model = "gpt-3.5-turbo",
                            messages = messages,
                            temperature=0.2,
                     )
    response_msg = response.choices[0].message.content
    messages = messages + [{"role":'assistant', 'content': response_msg}]
    return response_msg

In [39]:
def get_answer(query: str, embedding_model=None):
    """Answer ``query`` using the best-matching PDF chunk as context.

    Retrieves the most similar chunk from the module-level ``embedded_data``,
    appends ``"ques: <query>"`` to it, sends that to ``get_response`` and
    post-processes the reply.

    Args:
        query: The user's question.
        embedding_model: Optional model forwarded to ``query_collection``.
            It is always passed explicitly (``None`` by default) so the call
            no longer fails on a missing required argument.

    Returns:
        The model reply with the words ``Page``/``page`` and any token ending
        in ``:`` dropped, and newlines removed.
    """
    queried_texts = query_collection(
        question = query,
        answer = embedded_data.get('documents'),
        embedding_model = embedding_model,
    )

    # query_collection returns the single best chunk as a string. The previous
    # code iterated over that string character by character and kept only its
    # first character, so the model received almost no context.
    if isinstance(queried_texts, str):
        context = queried_texts
    else:
        # Fallback for a list-of-texts result: use the first entry, as before.
        context = ''.join(queried_texts[0])

    # NOTE(review): no page number is prepended to the context here, although
    # the system prompt in get_response tells the model to expect 'page n'.
    queried_string = context + f"ques: {query}"
    answer = get_response(queried_texts = queried_string,)

    ignore = ['Page','page']
    final_answer = ' '.join(i for i in answer.split(' ') if not i.endswith(':') and i not in ignore)
    return final_answer.replace('\n','')

In [51]:
# Example question used to exercise the retrieval + chat pipeline end to end.
question = "What is Yoga and different types"

In [52]:
# Run the full pipeline: retrieve the best chunk for `question`, then ask the chat model.
answer = get_answer(question)