In [78]:
import os
import pdfplumber
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import markdown
from bs4 import BeautifulSoup
import re
import unicodedata
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fati1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Document Parsing

### Data Loading 

File Path here

In [79]:
file_path = "D:\Tarot-Card-Meanings.pdf"

  file_path = "D:\Tarot-Card-Meanings.pdf"


Get the file extension and parse it accordingly

In [80]:
# get file extension
def get_file_extension(file_path):
    return os.path.splitext(file_path)[-1].lower()


In [81]:
# use the appropriate text parser based on file extension (pdf, markdown, txt)
def parse_file(file_path):
    file_extension = get_file_extension(file_path)
    
    if file_extension == '.pdf':
        with pdfplumber.open(file_path) as pdf:
            text = ''
            for page in pdf.pages:
                text += page.extract_text() + '\n'
            return text
    elif file_extension == '.md' or file_extension == '.markdown':
        with open(file_path, "r", encoding="utf-8") as file:
            html = markdown.markdown(file.read())
        return BeautifulSoup(html, "html.parser").get_text()
    elif file_extension == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    

    

In [82]:
raw_text = parse_file(file_path)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

### Preprocessing

Cleaning the text

In [83]:

def clean_text(text):
    # Normalize line breaks and spaces
    text = re.sub(r'\r\n|\r', '\n', text)           # Convert \r\n or \r to \n
    text = re.sub(r'\n{2,}', '\n\n', text)          # Collapse many newlines into 2
    text = re.sub(r'[ \t]+', ' ', text)             # Remove extra spaces/tabs

    # Normalize unicode 
    text = unicodedata.normalize("NFKD", text)
    def add_period_to_bullet(match):
        line = match.group(0).strip()
        if not line.endswith('.'):
            return line + '.'
        return line

    # Add periods to lines that start with bullet markers (before removing markers)
    text = re.sub(r'(?m)^\s*[-*+]\s+(.*)', lambda m: "- " + add_period_to_bullet(m), text)
    # Remove common bullet points
    text = re.sub(
        r'[\u2022\u2023\u25E6\u2043\u2219\u25AA\u25AB\u25CB\u25CF\u25A0\u25B8\u29BE\u29BF]',
          '', text)

    # Remove markdown or ASCII-style tables
    text = re.sub(r'\|.*?\|', '', text)      # Remove markdown tables
    text = re.sub(r'[-=]{3,}', '', text)     # Remove underlines in tables
    text = re.sub(r'^\s*[\-\*+]\s+', '', text, flags=re.MULTILINE)  # Bulleted list lines

    # Remove figure/table/image captions
    text = re.sub(r'(Figure|Table|Image|Chart|Diagram)\s*\d+[\.:]?', '', text, flags=re.IGNORECASE)

    # Remove bracketed footnotes like [1], [12], (Fig. 3), etc.
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\(.*?fig.,\'*?\)', '', text, flags=re.IGNORECASE)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Fix line breaks and hyphens split across lines
    text = re.sub(r'-\n', '', text)  # Remove hyphenated line-breaks
    text = re.sub(r'\n+', '\n', text)  # Collapse newlines
    text = re.sub(r'[ \t]+', ' ', text)  # Normalize spaces

    # Strip remaining non-ASCII or odd symbols
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # before every \n add a period if it doesn't end with one special character
    text = re.sub(r'(?<![.!?:])\n', '. \n', text)     

    return text.strip()


## Chunking

Download the embedding model from hugging face

In [84]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

We will be using the semantic Chunker from langchain

In [85]:
semantic_chunker = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")


In [86]:
chunks = semantic_chunker.split_text(text)



In [87]:
metadatas = [
    {
        "source": file_path,
        "chunk_index": i,
        "length": len(chunks[i])

    }
    for i in range(len(chunks))
]


## Vector Storage

In [88]:
db = Chroma(
    persist_directory="chroma_store",
    embedding_function=embedding_model
)

In [89]:
docs = list(set(chunks))  
db.add_texts( texts=chunks, metadatas=metadatas)

['a0759bfd-b9f8-4548-b58b-8ddc0e3246c2',
 '17fbfde6-1c29-42ca-8481-c50d6e542e72',
 '2fb0f32b-8e78-4363-9d52-a37b50e6e236',
 '23f57c5f-1357-4fea-b50b-76942e2ef984',
 'a7520d4b-6904-4966-8812-2a1015dd30b8',
 '0d636eb6-a801-46c7-a549-4ff5f55d1621',
 '933b25b0-396f-4041-9463-9661e09ab675',
 '83fbe326-94d8-44f3-93cf-a395586ad542',
 'd012e2fb-e711-4718-8273-d1114a1c3aed',
 '51a01471-a773-4cfe-ae61-a2100c49190e',
 '3ebb3244-40b3-411e-b00c-3486d9139890',
 'd9e6f6d3-72ad-4d28-9a5a-c7f77357dcc6',
 '08dd0a60-5ea7-46d0-979f-2f23126119c5',
 'ded89f22-63c4-4b6e-90e2-d19a5aed7d77',
 'd51583c1-0da5-45d8-84e9-8da9d0e2799a',
 'b3d3b483-5595-4016-8d6c-899a50ffbe7f',
 'cfae656e-569e-43eb-9e20-016d569413d8',
 'b9238a14-211e-4aa8-8f68-559b8a4c1d17',
 '42bbb5e8-e49a-41be-bc57-8f8d3fb614e7',
 '5f8c983f-707d-4c0c-b15e-3d87d8b89cc3',
 '4259d1bd-8041-467c-b04b-e0707eacc283',
 'dd0e76b4-399a-4cda-8649-3fd328455b13',
 'c535228a-f1f2-4cac-823f-0d258a818b1c',
 '04b844ce-be8c-4cc1-9ece-4bc06b961d1f',
 'bb4085f6-a3e8-

In [90]:
db.persist()

## General Query Answering

### Query Embedding

Embedd the query

In [91]:
query = "What does the card \"The Fool\" mean ?"
query_embedding = embedding_model.embed_query(query)

### Similarity Search

In [92]:
# make the chroma store act like a retriever
retriever = db.as_retriever()

In [93]:
# perform the similarity search to get the most relevant chunks
results = retriever.get_relevant_documents(query)

In [94]:

print(f"Query: {query}\n")
for i, result in enumerate(results):
    print(f"Result {i+1}:\n{result.page_content}\n")
    print(f"Metadata: {result.metadata}\n")

Query: What does the card "The Fool" mean ?

Result 1:
Liz Dean 2018
Tarot Card Meanings
For easy reference and to help you get started with your readings, in the following pages I
have produced a short divinatory meaning for each card. You will find lists of meanings for
the Major Arcana and the Minor Arcana suits of Wands, Pentacles, Swords and Cups. Have fun  
Liz Dean
P a g e | 2
  Liz Dean 2018
The Major Arcana
0 The Fool says: Look before you leap! It s time for a new adventure, but there is a level of risk. Consider your options carefully,
and when you are sure, take that leap of faith. Home: If you are a parent, The Fool can show a young person leaving home. Otherwise, it
predicts a sociable time, with lots of visitors   who may also help you with a new project. Love and Relationships: A new path takes you towards love; this card often appears after a
break-up. Career and Money: A great opportunity awaits. Seize it while you can. Spiritual Development: New discoveries. You are 

### Generating Response

In [95]:
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

# Wrap it with LangChain
llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cpu


Now generating the final prompt by combining the query and the chunks

In [96]:
def trim_context_to_token_limit(docs, tokenizer, max_tokens):
    context = ""
    total_tokens = 0

    for doc in docs:
        doc_text = doc.page_content.strip() if hasattr(doc, 'page_content') else str(doc)
        doc_tokens = tokenizer(doc_text, return_tensors='pt', truncation=False)['input_ids'][0]
        if total_tokens + len(doc_tokens) <= max_tokens:
            context += doc_text + "\n\n"
            total_tokens += len(doc_tokens)
        else:
            break
    return context.strip()

In [97]:
context = trim_context_to_token_limit(results, tokenizer, 512)

In [98]:

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    Use the following context to answer the question at the end. 
    Even if the question is not directly answered in the context, say "It's not clearly mentioned but my best guess is"
    and use the context to provide a guess.
    Give a detailed answer based on the context provided.
    Context:
    {context}

    Question:
    {question}

    Answer:"""
)


In [99]:
from langchain.chains import LLMChain

rag_chain = LLMChain(
    llm=llm,
    prompt=prompt_template
)


In [100]:

response = rag_chain.run({
    "context": context,
    "question": query
})

print(response)


The Fool says: Look before you leap! It s time for a new adventure, but there is a level of risk. Consider your options carefully, and when you are sure, take that leap of faith. Home: If you are a parent, The Fool can show a young person leaving home. Otherwise, it predicts a sociable time, with lots of visitors who may also help you with a new project. Career and Money: A great opportunity awaits. Seize it while you can. Spiritual Development: New discoveries. You are finding your soul 's path Is he upside down? Beware false promises and naivete.


## Summarization

In [101]:
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
summarization_model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer_sum = AutoTokenizer.from_pretrained(summarization_model_name)
model_sum = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_name)

pipe_sum = pipeline(
    "text2text-generation",
    model=model_sum,
    tokenizer=tokenizer_sum,
    max_length=1024
)

# Wrap it with LangChain
llm_sum = HuggingFacePipeline(pipeline=pipe_sum)


Device set to use cpu


In [102]:

def get_summary(vectorstore="", llm_sum=llm_sum, k=20):
    vectorstore = db
    collection = vectorstore._collection
    all_docs = collection.get(include=["documents", "embeddings"])
    
    embeddings = np.array(all_docs["embeddings"])
    documents = all_docs["documents"]

    centroid = np.mean(embeddings, axis=0).reshape(1, -1)
    salience_scores = np.linalg.norm(embeddings - centroid, axis=1)

    lower_bound = np.percentile(salience_scores, 5)
    upper_bound = np.percentile(salience_scores, 86)

    mid_band_indices = [
        i for i, score in enumerate(salience_scores)
        if lower_bound < score < upper_bound
    ]

    mid_band_indices.sort(key=lambda i: salience_scores[i], reverse=True)
    k = min(k, len(mid_band_indices))
    salient_chunks = [documents[i] for i in mid_band_indices[:k]]
    # if a token is larger than 500 characters, get rid of it
    salient_chunks = [chunk for chunk in salient_chunks if len(chunk) <= 500]

    text = trim_context_to_token_limit(
        salient_chunks, tokenizer_sum, 1024
    )
    summary = llm_sum.invoke(text)
    return summary


print(
    get_summary(db, llm_sum)
)



 Suit of Wands are associated with the element of fire . They represent heat, passion and creativity . When reading be mindful of burning desires and what ignites passion . Self-centeredness when what s needed is generosity; egotism . You may feel ignored by someone busy feathering his or her own nest .


## Empty Collection

In [103]:
db.delete_collection()