In [50]:
# !pip install qdrant_client
# !pip install --quiet langchain_experimental
# %pip install -qU langchain-text-splitters

In [51]:
import pymupdf
import datetime
import os

from qdrant_client import QdrantClient, models
from langchain_openai import OpenAIEmbeddings
import asyncio
from qdrant_client.http.models import PointStruct
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [52]:
q_client = QdrantClient(url='https://1fbc28ed-4ddf-46fc-bc6c-4c0a7b3c1ea4.us-east4-0.gcp.cloud.qdrant.io', port=6333, api_key=os.getenv('QDRANT_API_KEY', ''), timeout=30000)

In [53]:
# This import is required only for jupyter notebooks, since they have their own eventloop
import nest_asyncio
nest_asyncio.apply()

In [54]:
embeddings_model = OpenAIEmbeddings()

In [55]:
def extract_text_from_pdf(pdf_path):
    doc = pymupdf.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [56]:
pdf_text = extract_text_from_pdf('tata_motors_report.pdf')

In [57]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=500,
    chunk_overlap=80,
    length_function=len,
    is_separator_regex=False,
)

In [58]:
docs = text_splitter.create_documents([pdf_text])

In [59]:
docs

[Document(page_content='ICICI Securities – Retail Equity Research \n \nResult Update \nMay 13, 2024 \nCMP: ₹ 1,000     \nTarget: ₹ 1,120  (12%)  \nTarget Period: 12 months  \nTata Motors (TATMOT) \n \nHOLD \n  \nFiring on all cylinders, record margins across divisions \nAbout the stock: Tata Motors (TML) is an auto OEM from the house of Tata’s, \noperating in domestic (PV, CV) and global markets (Jaguar Land Rover i.e., JLR)  \n• \nFY24 consolidated sales mix– JLR ~69%, India CV & PV combined ~30%. \n•'),
 Document(page_content='• \nFY24 consolidated sales mix– JLR ~69%, India CV & PV combined ~30%. \n• \nTML India: CV market share: 38%; PV market share ~14% as of FY24 \nQ4FY24 Results: Tata Motors reported robust Q4FY24 results. Consolidated \ntopline stood at ₹1.2 lakh crore (up 13%YoY) with EBITDA at ~₹20,247 crore and \nEBITDA margins at 16.9%, up 50 bps QoQ. JLR reported EBITDA & EBIT margins of \n16.3% (up 10 bps QoQ) & 9.2% (up 50 bps QoQ) respectively. Indian CV business'),
 Do

In [60]:
embeddings = embeddings_model.embed_documents([doc.page_content for doc in docs])

In [61]:
q_client.recreate_collection(collection_name='financial_reports', vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE))

  q_client.recreate_collection(collection_name='financial_reports', vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE))


True

In [62]:
q_client.upsert(collection_name='financial_reports', points=[PointStruct(id=idx, vector=embeddings[idx], payload={"content": doc.page_content}) for idx, doc in enumerate(docs)])

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [64]:
q_client.search(collection_name='financial_reports', query_vector=embeddings_model.embed_documents(["target price tata motors"])[0], limit=10)

[ScoredPoint(id=0, version=0, score=0.8648046, payload={'content': 'ICICI Securities – Retail Equity Research \n \nResult Update \nMay 13, 2024 \nCMP: ₹ 1,000     \nTarget: ₹ 1,120  (12%)  \nTarget Period: 12 months  \nTata Motors (TATMOT) \n \nHOLD \n  \nFiring on all cylinders, record margins across divisions \nAbout the stock: Tata Motors (TML) is an auto OEM from the house of Tata’s, \noperating in domestic (PV, CV) and global markets (Jaguar Land Rover i.e., JLR)  \n• \nFY24 consolidated sales mix– JLR ~69%, India CV & PV combined ~30%. \n•'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=7, version=0, score=0.85880244, payload={'content': 'leadership position in E-PV space with target to sell ~1 lakh E-PVs in FY25E \nand is also a prominent player winning orders in CESL E-bus tenders  \nRating and Target Price \n• \nTata Motors is reporting record performance across all its segments, \nhowever due to lack of near-term triggers we retain our HOLD rating on \nthe 