In [1]:
from docx import Document
from PyPDF2 import PdfReader
from pptx import Presentation
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [16]:
import regex

# Pre-tokenization pattern. Alternatives, tried left to right:
#   - the contraction suffixes 's 't 're 've 'm 'll 'd
#   - a run of letters, a run of digits, or a run of other non-space symbols,
#     each optionally preceded by a single space
#   - whitespace not followed by a non-space character, then any other whitespace
# The pattern contains no rule for a "wyn" suffix, so words ending in "wyn"
# are kept whole (see the expected matches noted next to each case).
pattern = r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

# Compile once instead of re-parsing the pattern for every test case.
compiled_pattern = regex.compile(pattern)

test_cases = [
    "hellowyn",      # one letter run            -> ['hellowyn']
    "good",          # one letter run            -> ['good']
    "wyn",           # one letter run            -> ['wyn']
    "hello wyn",     # leading space is attached -> ['hello', ' wyn']
    "wynn",          # one letter run            -> ['wynn']
    "wynx",          # one letter run            -> ['wynx']
    "hello",         # one letter run            -> ['hello']
    "it's wyn!",     # contraction + punctuation -> ['it', "'s", ' wyn', '!']
    "unhappy",       # one letter run            -> ['unhappy']
]

for text in test_cases:
    matches = compiled_pattern.findall(text)
    print(f"Input: {text!r} -> Matches: {matches}")

Input: 'hellowyn' -> Matches: ['hellowyn']
Input: 'good' -> Matches: ['good']
Input: 'wyn' -> Matches: ['wyn']
Input: 'hello wyn' -> Matches: ['hello', ' wyn']
Input: 'wynn' -> Matches: ['wynn']
Input: 'wynx' -> Matches: ['wynx']
Input: 'hello' -> Matches: ['hello']
Input: "it's wyn!" -> Matches: ['it', "'s", ' wyn', '!']
Input: 'unhappy' -> Matches: ['unhappy']




## Data Loading

For this notebook, two types of data are listed below; only the PDF is actually loaded in the cells that follow.

    PDF 
    JSON



In [2]:
# Keep only the path here rather than an open file object: a handle opened in
# this cell is never closed anywhere in the notebook. PdfReader accepts a path
# as well as a file object, so the extraction cell works unchanged.
pdf_file = 'data/Sub-teams and descriptions.pdf'



## Data Extraction

    PDF :- Pdf data is extracted using PyPDF2 and all text is stored in a string.
    JSON

After extracting all data separately, I have combined all the text into a single string for further text processing.

In [3]:
# Read every page of the PDF and glue the page texts together.
# Pages are concatenated directly, with no separator inserted between them.
pdf_reader = PdfReader(pdf_file)
page_texts = [page.extract_text() for page in pdf_reader.pages]
pdf_text = "".join(page_texts)
print(pdf_text)

1 
 College of Education for Humanities - Department of English 4th year – poetry 2nd course  
 
 “The thought fox”    by  Ted Hughes      
 
I imagine this midnight moment‟s forest:  
Something else is alive  
Beside the clock‟s loneliness  
And this blank page where my fingers move.    
Through the window I see no star:  
Something more near  
Though deeper within darkness  
Is entering the loneliness:  
 
Cold, delicately as the dark snow,  
A fox‟s nose touches twig, leaf;  
Two eyes serve a movement, that now  
And again now, and now, and now  
 
Sets neat prints into the snow  
Between trees, and wari ly a lame  
Shadow lags by stump and in hollow  
Of a body that is bold to come  
 
Across clearings, an eye,  
A widening deepening greenness,  
Brilliantly, concentratedly,  
Coming about its own business  
Till, with a sudden sharp hot stink of fox  
It enters the dark hole of the head.  
The window is starless still; the clock ticks,  
The page is printed.  
  
 
 
 
 
 
 
 2 
 

In [8]:

# The PDF is the only source loaded above, so the combined corpus is just its text.
all_text = pdf_text
# Cell output: total number of characters in the corpus.
len(all_text)

20155



## Chunking

In this step I create the chunks of data using the Recursive Character Text Splitter, which breaks large documents into smaller chunks. This is useful both for indexing data and for passing it to a model, since large chunks are harder to search over and won't fit in a model's finite context window.


In [9]:
# Split the text into overlapping chunks for embeddings creation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=900,
    chunk_overlap=200,  # overlap limits the context lost at chunk boundaries
    length_function=len,
    # Separators are tried in order, so the coarsest must come first:
    # paragraphs, then lines, then words, then single characters.
    # With '\n' listed before '\n\n', the paragraph separator could never be
    # used, because splitting on '\n' leaves no piece that contains '\n\n'.
    separators=['\n\n', '\n', ' ', ''],
)

chunks = text_splitter.split_text(text=all_text)

In [70]:
# Number of chunks produced by text_splitter.split_text.
len(chunks)

30

## Embeddings Creation

Embeddings creation is a crucial preprocessing step in the development of document-based Question and Answering (Q&A) systems. This process involves converting textual data from documents and questions into dense, high-dimensional vectors known as embeddings. These embeddings are designed to capture the semantic meaning of words, sentences, or even entire documents, enabling the Q&A system to understand and process natural language more effectively.

In [10]:
# Load the sentence-embedding model that is handed to the FAISS index below.
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

## Indexing

Indexing data using Facebook AI Similarity Search (FAISS) is a pivotal step in developing efficient and scalable document-based Question and Answering (Q&A) systems. FAISS is a library that facilitates the efficient search for similarities in large datasets, especially useful for tasks involving high-dimensional vectors like text embeddings. When applied to document-based Q&A, FAISS indexes the embeddings of document chunks (e.g., paragraphs, sentences) to optimize the retrieval process.

In [11]:
# Build the FAISS vector store from the text chunks using the embeddings model.
vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)


## Retriever

In the development of document-based Question and Answering (Q&A) systems, creating a retriever is a crucial step that directly impacts the system's ability to find relevant information efficiently. The retriever utilizes the pre-indexed embeddings of document chunks, searching through them to find the most relevant pieces of content in response to a user query. This process involves setting up a retrieval mechanism that leverages similarity search to identify the best matches for the query embeddings within the indexed data.

In [12]:
# Wrap the vector store in a retriever: similarity search, 2 results per query.
top_k_search = {"k": 2}
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=top_k_search)

In [28]:
# Smoke test of the retriever.
# NOTE(review): the query is an empty string, so the chunks returned are
# presumably just those closest to the embedding of "" — use a real question
# to check retrieval quality.
retrieved_docs = retriever.invoke("")

In [29]:
# Number of documents returned by the retriever (configured with k=2).
len(retrieved_docs)

2

In [30]:
# Show the text of the first retrieved chunk.
print(retrieved_docs[0].page_content)

of the most fundam ental characteristics of the human identity.  
Any full investigation of the conflict and of its cultural significance would inevitably need to 
take account both of what Mark Spilka has called „Lawrence‟s quarrel with tenderness‟ and of 
Ian Suttie‟s discussi on of the extent and rigour of the „taboo on tenderness‟ in our own 
culture.[6] But such an investigation would also need to take into consideration a much larger 
cultural context, and perhaps above all to examine the way in which the Christian ideal of lo ve 
has itself traditionally been expressed within the medium of violent apocalyptic fantasies.  
The investigation which I describe is clearly beyond the scope of this essay. My more modest 
aim here has been to draw attention to the role which is played by t his conflict in two of the




LLM Models

    Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand, generate, and interact with human language in a way that mimics human-like understanding. These models are trained on vast amounts of text data, allowing them to grasp the nuances of language, including grammar, context, and even cultural references. The capabilities of LLMs extend beyond simple text generation; they can perform a variety of tasks such as translation, summarization, question answering, and even code generation.
    One of the key technologies behind LLMs is the Transformer architecture, which enables the model to pay attention to different parts of the input text differently, thereby understanding the context and relationships between words and phrases more effectively. This architecture has led to significant improvements in natural language processing tasks and is the foundation of many state-of-the-art LLMs.





## LLM


In [31]:
# Import from langchain_core, like the other prompt imports in this notebook,
# so the cell does not depend on the separate `langchain` package.
from langchain_core.prompts import PromptTemplate

# The template is assembled from adjacent string literals so that no source
# indentation leaks into the text sent to the model. The context placeholder
# is inserted as-is (no trailing "?" appended to the retrieved text).
prompt_template = (
    'Answer the question as precise as possible using the provided context. '
    'If the answer is not contained in the context, '
    'say "answer not available in context"\n\n'
    'Context:\n{context}\n\n'
    'Question:\n{question}\n\n'
    'Answer:'
)

# Prompt with two input variables: `context` and `question`.
prompt = PromptTemplate.from_template(template=prompt_template)



In [25]:


# Collapse the documents returned by FAISS into one context string.
def format_docs(docs):
    """Join the `page_content` of each document, separated by a blank line.

    Args:
        docs: iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(page_texts)



In [33]:
# RAG Chain
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_llm(model_id):
    """Build the local text-generation LLM for `model_id`, once per model id.

    Loading the tokenizer and model weights is the expensive part of answering
    a question, so the result is cached and reused by later calls.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,  # cap on the number of generated tokens
        device=-1,           # run on CPU
    )
    return HuggingFacePipeline(pipeline=pipe)


def generate_answer(question):
    """Answer `question` with the retrieval-augmented generation chain.

    The question is sent to the retriever, the retrieved documents are merged
    into one context string by `format_docs`, and context plus question are
    filled into `prompt` before being passed to the local LLM.

    Args:
        question: the user question as a string.

    Returns:
        The model output as a string.
    """
    # Lightweight, free model; loaded on first use and cached afterwards.
    llm = _load_llm("distilgpt2")

    rag_chain = (
        # "context" comes from retrieval; "question" is passed through unchanged.
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain.invoke(question)

## Results


In [35]:
# Run the RAG chain on a sample question and print the generated answer.
ans = generate_answer("tell me about the fox poem")
print(ans)

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


KeyboardInterrupt: 

## Conclusion