In [4]:
import PyPDF2
import re
import warnings
from sentence_transformers import SentenceTransformer, util
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages)
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
            text += "\n\n"  # Add two newline characters to separate pages

    # Add delimiter before each section title
    text = text.replace("\n\n", "\n     \n\n")  # Replace double newline characters with a delimiter

    # Add new paragraph after the text "Question"
    text = text.replace("Question", "\nQuestion\n")

    return text

def segment_text(text):
    # Improved pattern to match various newline combinations
    pattern = r"\n{2,}"  # Matches two or more consecutive newlines
    paragraphs = re.split(pattern, text)
    return paragraphs

def embed_sentence(sentence):
    """Embeds a sentence into a vector using the sentence transformer model"""
    sentence_embeddings = model.encode(sentences=[sentence])
    return sentence_embeddings[0]  # Extract the first embedding (for single sentence)

def search_with_similarity(query, text):
    """
    Searches for relevant passages based on contextual similarity using sentence transformers.

    Args:
        query: The user's query as a string.
        text: The segmented text from the PDF document (list of paragraphs).

    Returns:
        A list of the top K most similar passages to the query.
    """
    passages = []
    query_embedding = embed_sentence(query)

    for paragraph in text:
        passage_embedding = embed_sentence(paragraph)
        similarity_score = util.cos_sim(query_embedding, passage_embedding)
        passages.append((paragraph, similarity_score))

    # Sort passages by similarity score in descending order (most similar first)
    passages.sort(key=lambda x: x[1], reverse=True)

    # Return only the top K passages
    return passages[:2]

def generate_text(prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.95, num_return_sequences=1):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, num_return_sequences=num_return_sequences)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def generate_response(passages, max_length=200, temperature=0.7, top_k=50, top_p=0.95, num_return_sequences=1):
    combined_passages = " ".join([passage[0] for passage in passages])
    input_text = combined_passages[:512]  # Limit input length for GPT-2 model
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    with torch.no_grad():
        output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, 
                                temperature=temperature, top_k=top_k, top_p=top_p, 
                                num_return_sequences=num_return_sequences, pad_token_id=tokenizer.eos_token_id)

    return tokenizer.decode(output[0], skip_special_tokens=True)

warnings.filterwarnings('ignore')

# Path to the PDF document
pdf_path = r"C:\Users\chaitanya\Downloads\areete\Knowledge base for RAG-Handbook-of-Good-Dairy-Husbandry-Practices_.pdf"

# Extract text from the PDF document
pdf_text = extract_text_from_pdf(pdf_path)

# Segment text into paragraphs
text = segment_text(pdf_text)

# Load the pre-trained sentence transformer model
model = SentenceTransformer('all-mpnet-base-v2')  # Replace with your desired model
# Example usage (assuming you have segmented text 'text')
query = "PREVENTION OF CALF DIARRHOEA"
relevant_passages = search_with_similarity(query, text)
# Load pre-trained GPT model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")



# Generate text based on retrieved passages
generated_response = generate_response(relevant_passages)

# Print generated response
print("Generated Response:", generated_response)




Generated Response: 11
B. CALF DIARRHOEA
 yDiarrhoea in calves can occur due to various reasons.
 yCalves with diarrhoea lose considerable amounts of water and electrolytes.
 yDiarrhoea and rapid loss of fluid and ions can cause the calf to die very quickly.
MANAGEMENT OF CALF DIARRHOEA
 yReplace the lost water and electrolytes at the 
earliest - Feed 2-4 litres of electrolyte solution every day.
 yThe electrolyte solution provided should be over and above the normal feeding.
 yConsult a veterinarian at the earliest to determi-

tion.

YDiarrhoea can be prevented by following the following steps:

1. Feed 2-4 litres of electrolyte solution every day.

2. Feed 2-4 litres of electrolyte solution every day.

3. Feed 2-4 litres of electrolyte solution every day
