In [1]:
import os
import pdfplumber
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import markdown
from bs4 import BeautifulSoup
import re
import unicodedata
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fati1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## Document Parsing

### Data Loading 

File Path here

In [2]:
file_path = "D:\Tarot-Card-Meanings.pdf"

  file_path = "D:\Tarot-Card-Meanings.pdf"


Get the file extension and parse it accordingly

In [3]:
# get file extension
def get_file_extension(file_path):
    return os.path.splitext(file_path)[-1].lower()


In [4]:
# use the appropriate text parser based on file extension (pdf, markdown, txt)
def parse_file(file_path):
    file_extension = get_file_extension(file_path)
    
    if file_extension == '.pdf':
        with pdfplumber.open(file_path) as pdf:
            text = ''
            for page in pdf.pages:
                text += page.extract_text() + '\n'
            return text
    elif file_extension == '.md' or file_extension == '.markdown':
        with open(file_path, "r", encoding="utf-8") as file:
            html = markdown.markdown(file.read())
        return BeautifulSoup(html, "html.parser").get_text()
    elif file_extension == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    

    

In [5]:
raw_text = parse_file(file_path)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

In [6]:
print(raw_text)


© Liz Dean 2018
Tarot Card Meanings
For easy reference and to help you get started with your readings, in the following pages I
have produced a short divinatory meaning for each card. You will find lists of meanings for
the Major Arcana and the Minor Arcana suits of Wands, Pentacles, Swords and Cups.
Have fun ☺
Liz Dean
P a g e | 2
© Liz Dean 2018
The Major Arcana
0 The Fool says: Look before you leap!
It’s time for a new adventure, but there is a level of risk. Consider your options carefully,
and when you are sure, take that leap of faith.
Home: If you are a parent, The Fool can show a young person leaving home. Otherwise, it
predicts a sociable time, with lots of visitors – who may also help you with a new project.
Love and Relationships: A new path takes you towards love; this card often appears after a
break-up.
Career and Money: A great opportunity awaits. Seize it while you can.
Spiritual Development: New discoveries. You are finding your soul’s path
Is he upside down? Beware f

### Preprocessing

Cleaning the text

In [7]:

def clean_text(text):
    # Normalize line breaks and spaces
    text = re.sub(r'\r\n|\r', '\n', text)           # Convert \r\n or \r to \n
    text = re.sub(r'\n{2,}', '\n\n', text)          # Collapse many newlines into 2
    text = re.sub(r'[ \t]+', ' ', text)             # Remove extra spaces/tabs

    # Normalize unicode 
    text = unicodedata.normalize("NFKD", text)
    def add_period_to_bullet(match):
        line = match.group(0).strip()
        if not line.endswith('.'):
            return line + '.'
        return line

    # Add periods to lines that start with bullet markers (before removing markers)
    text = re.sub(r'(?m)^\s*[-*+]\s+(.*)', lambda m: "- " + add_period_to_bullet(m), text)
    # Remove common bullet points
    text = re.sub(
        r'[\u2022\u2023\u25E6\u2043\u2219\u25AA\u25AB\u25CB\u25CF\u25A0\u25B8\u29BE\u29BF]',
          '', text)

    # Remove markdown or ASCII-style tables
    text = re.sub(r'\|.*?\|', '', text)      # Remove markdown tables
    text = re.sub(r'[-=]{3,}', '', text)     # Remove underlines in tables
    text = re.sub(r'^\s*[\-\*+]\s+', '', text, flags=re.MULTILINE)  # Bulleted list lines

    # Remove figure/table/image captions
    text = re.sub(r'(Figure|Table|Image|Chart|Diagram)\s*\d+[\.:]?', '', text, flags=re.IGNORECASE)

    # Remove bracketed footnotes like [1], [12], (Fig. 3), etc.
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\(.*?fig.*?\)', '', text, flags=re.IGNORECASE)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Fix line breaks and hyphens split across lines
    text = re.sub(r'-\n', '', text)  # Remove hyphenated line-breaks
    text = re.sub(r'\n+', '\n', text)  # Collapse newlines
    text = re.sub(r'[ \t]+', ' ', text)  # Normalize spaces

    # Strip remaining non-ASCII or odd symbols
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
   
    return text.strip()


In [8]:
text = clean_text(raw_text)
print(text)

Liz Dean 2018
Tarot Card Meanings
For easy reference and to help you get started with your readings, in the following pages I
have produced a short divinatory meaning for each card. You will find lists of meanings for
the Major Arcana and the Minor Arcana suits of Wands, Pentacles, Swords and Cups.
Have fun  
Liz Dean
P a g e | 2
  Liz Dean 2018
The Major Arcana
0 The Fool says: Look before you leap!
It s time for a new adventure, but there is a level of risk. Consider your options carefully,
and when you are sure, take that leap of faith.
Home: If you are a parent, The Fool can show a young person leaving home. Otherwise, it
predicts a sociable time, with lots of visitors   who may also help you with a new project.
Love and Relationships: A new path takes you towards love; this card often appears after a
break-up.
Career and Money: A great opportunity awaits. Seize it while you can.
Spiritual Development: New discoveries. You are finding your soul s path
Is he upside down? Beware fals

## Chunking

Download the embedding model from hugging face

In [9]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


We will be using the semantic Chunker from langchain

In [10]:
semantic_chunker = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")


In [11]:
chunks = semantic_chunker.split_text(text)



In [12]:
metadatas = [
    {
        "source": file_path,
        "chunk_index": i,
        "length": len(chunks[i])

    }
    for i in range(len(chunks))
]


In [13]:
print(f"Total number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1} length: {len(chunk)} characters")
    print(f"Chunk {i+1}:\n{chunk}\n")

Total number of chunks: 29

Chunk 1 length: 1024 characters
Chunk 1:
Liz Dean 2018
Tarot Card Meanings
For easy reference and to help you get started with your readings, in the following pages I
have produced a short divinatory meaning for each card. You will find lists of meanings for
the Major Arcana and the Minor Arcana suits of Wands, Pentacles, Swords and Cups. Have fun  
Liz Dean
P a g e | 2
  Liz Dean 2018
The Major Arcana
0 The Fool says: Look before you leap! It s time for a new adventure, but there is a level of risk. Consider your options carefully,
and when you are sure, take that leap of faith. Home: If you are a parent, The Fool can show a young person leaving home. Otherwise, it
predicts a sociable time, with lots of visitors   who may also help you with a new project. Love and Relationships: A new path takes you towards love; this card often appears after a
break-up. Career and Money: A great opportunity awaits. Seize it while you can. Spiritual Development: New discove

## Vector Storage

In [14]:
db = Chroma(
    persist_directory="chroma_store",
    embedding_function=embedding_model
)

  db = Chroma(


In [15]:
docs = list(set(chunks))  
db.add_texts( texts=chunks, metadatas=metadatas)

['4a550f61-7dad-4195-a54b-cd27dd320c33',
 '2b4962ff-0a27-4a48-9e90-ccb20f505dae',
 'c9f518e0-c35b-42e5-86da-d8b43cbd9246',
 'eae30dfa-fecd-4c05-a842-6ec2f0a545d6',
 '2a3c7ed8-fd13-4824-a926-3c90aac5366a',
 'eb454e56-6f98-49eb-8169-7d4d90818455',
 '32aa5695-2ef3-4468-9f35-0bc135774c15',
 'fc308b9c-bce2-40c3-a223-28c225928480',
 '4b2afeb2-97b8-4190-8e2c-26f927a6ab28',
 '59a0f21f-c333-43f3-8554-fe3a1fcf0184',
 '354c6266-bd5d-474d-8312-77de0fbc4c60',
 '7143d928-509f-415a-ae79-fbe83de2b549',
 '994a56a1-e25c-486f-bdc1-e37de61f18d1',
 '66dcfe5c-ca99-42d0-aca7-b479552ef1b8',
 'ea79b486-038a-45e5-a78e-e7763772dd02',
 '17c74adc-9ae4-4691-9762-6415862377f7',
 'd8e012f5-6a4d-44a1-844b-7b63d6adf876',
 '5bfc2c0e-69bf-4111-a2d5-e5996d614e9d',
 '000a0ace-529d-472f-a1db-44a88f50acb6',
 '94424748-1821-43a7-8b4c-ba06fb2927ce',
 '012a7316-4a27-4ac4-9e7e-2beeac1a518b',
 '5978be88-e195-4f73-98a3-a9e1a107ed9f',
 'abba776b-fbf3-4d5a-8c65-f9ca93687cdb',
 'dbcbb038-8a85-466b-be60-980362ed33a5',
 '3133d197-dcbb-

In [16]:
db.persist()

  db.persist()


## General Query Answering

### Query Embedding

Embedd the query

In [17]:
query = "What do butterflies eat?"
query_embedding = embedding_model.embed_query(query)

### Similarity Search

In [18]:
# make the chroma store act like a retriever
retriever = db.as_retriever()

In [19]:
# perform the similarity search to get the most relevant chunks
results = retriever.get_relevant_documents(query)

  results = retriever.get_relevant_documents(query)


In [20]:

print(f"Query: {query}\n")
for i, result in enumerate(results):
    print(f"Result {i+1}:\n{result.page_content}\n")
    print(f"Metadata: {result.metadata}\n")

Query: What do butterflies eat?

Result 1:
Pay attention to communication in relationships and you ll make progress - and know where
you stand. Page of Cups
A great time for children and young people, new projects and socializing. As a person, the Page
is kind and fun-loving - just what you need right now. Is he upside down? Having fun feels like hard work. Alternatively, your social circle is dominated by those who are
superficial and immature. Knight of Cups
A proposal; a potentially good opportunity. Look at the surrounding cards to see if this is a
genuine offer, or all talk. Is he upside down? Romantic schemes and dreams that don t have substance. As a person, this Knight is
selfobsessed and misleading; be skeptical. Queen of Cups
It s a time for giving and receiving love, and also creative expression. As a person, the Queen is
nurturing and sensitive. Is she upside down? Issues with mother-figures; in general, missing out on the support you deserve. As a person,
the reversed Quee

### Generating Response

In [21]:
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

# Wrap it with LangChain
llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


Now generating the final prompt by combining the query and the chunks

In [29]:

def trim_context_to_token_limit(docs, tokenizer, max_tokens):
    context = ""
    total_tokens = 0

    for doc in docs:
        doc_text = doc.page_content.strip() if hasattr(doc, 'page_content') else str(doc)
        doc_tokens = tokenizer(doc_text, return_tensors='pt', truncation=False)['input_ids'][0]
        if total_tokens + len(doc_tokens) <= max_tokens:
            context += doc_text + "\n\n"
            total_tokens += len(doc_tokens)
        else:
            break
    return context.strip()

In [23]:
context = trim_context_to_token_limit(results, tokenizer, 512)

Token indices sequence length is longer than the specified maximum sequence length for this model (614 > 512). Running this sequence through the model will result in indexing errors


In [24]:

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    Use the following context to answer the question at the end. 
    Even if the question is not directly answered in the context, say "It's not clearly mentioned but my best guess is"
    and use the context to provide a guess.
    Give a detailed answer based on the context provided.
    Context:
    {context}

    Question:
    {question}

    Answer:"""
)


In [25]:
from langchain.chains import LLMChain

rag_chain = LLMChain(
    llm=llm,
    prompt=prompt_template
)


  rag_chain = LLMChain(


In [26]:

response = rag_chain.run({
    "context": context,
    "question": query
})

print(response)


  response = rag_chain.run({


Animals.


## Summarization

In [27]:
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
summarization_model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer_sum = AutoTokenizer.from_pretrained(summarization_model_name)
model_sum = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_name)

pipe_sum = pipeline(
    "text2text-generation",
    model=model_sum,
    tokenizer=tokenizer_sum,
    max_length=1024
)

# Wrap it with LangChain
llm_sum = HuggingFacePipeline(pipeline=pipe_sum)


Device set to use cpu


In [30]:
def get_summary(vectorstore=None, llm_sum=None, k=20):

    collection = vectorstore._collection
    all_docs = collection.get(include=["documents", "embeddings"])
    
    embeddings = np.array(all_docs["embeddings"])
    documents = all_docs["documents"]

    centroid = np.mean(embeddings, axis=0).reshape(1, -1)
    salience_scores = np.linalg.norm(embeddings - centroid, axis=1)

    lower_bound = np.percentile(salience_scores, 5)
    upper_bound = np.percentile(salience_scores, 86)

    mid_band_indices = [
        i for i, score in enumerate(salience_scores)
        if lower_bound < score < upper_bound
    ]

    mid_band_indices.sort(key=lambda i: salience_scores[i], reverse=True)
    k = min(k, len(mid_band_indices))
    salient_chunks = [documents[i] for i in mid_band_indices[:k]]
    # if a token is larger than 500 characters, get rid of it
    salient_chunks = [chunk for chunk in salient_chunks if len(chunk) <= 500]

    print(f"Number of chunks: {len(salient_chunks)}\n")
    text = trim_context_to_token_limit(
        salient_chunks, tokenizer_sum, 1024
    )
    summary = llm_sum.invoke(text)
    
    return summary



print(
    get_summary(db, llm_sum)
)



Number of chunks: 6

 Wands are associated with the element of fire and represent heat, passion and creativity . Ace of Wands: New work and relationships; a time to travel, and to be creative and inspired . Ten of Cups: Happiness, peace, and time for loved ones . Ace: Time to travel and encounter exciting new opportunities will come .


## Empty Collection

In [31]:
db.delete_collection()