### Inserting Text Chunks into Pinecone

In [None]:
from unstructured.partition.pdf import partition_pdf
import pdfplumber


def process_pdf(pdf_path):
    """Extract the plain text and the tables from a PDF file.

    Args:
        pdf_path: Path of the PDF file to read. It is opened twice: once by
            ``partition_pdf`` for the text and once by ``pdfplumber`` for the
            tables.

    Returns:
        A ``(text_content, tables)`` tuple. ``text_content`` is the text of
        every element with non-empty text, joined with newlines. ``tables``
        is a flat list of the non-empty tables found, in page order.
    """
    # Extract text content
    elements = partition_pdf(filename=pdf_path)
    text_content = "\n".join(elem.text for elem in elements if elem.text)

    # Extract tables
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            try:
                # extract_tables() returns every table on the page, whereas
                # extract_table() only returns the largest one and silently
                # drops the rest.
                page_tables = page.extract_tables()
            except Exception as e:
                # Best effort: report the failing page and keep going.
                print(f"Error extracting table: {e}")
                continue
            tables.extend(table for table in page_tables if table)

    return text_content, tables


# PDF to ingest; a bare file name, so it is resolved against the current working directory.
pdf_path = "NetSol_Financial_Statement_2024_Part_1.pdf"
# Run the extraction once: text_content is one newline-joined string, tables is the list of extracted tables.
text_content, tables = process_pdf(pdf_path)

In [None]:
# Save text_content and tables to files so they can be reloaded quickly
# without running the PDF extraction again.
import json

with open("text_content.txt", "w") as text_file:
    text_file.write(text_content)

# Echo both extraction results (tables first, then the text) for inspection.
for extracted in (tables, text_content):
    print(extracted)

with open("tables.json", "w") as tables_file:
    json.dump(tables, tables_file)

In [None]:
import json
# Reload the cached extraction results: the raw text from text_content.txt
# and the extracted tables from tables.json.
with open("text_content.txt", "r") as text_file:
    text_content = text_file.read()

with open("tables.json", "r") as tables_file:
    tables = json.load(tables_file)

def clean_text(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is dropped as well, so an all-whitespace
    or empty input yields an empty string.
    """
    words = text.split()
    return " ".join(words)

def process_tables(tables):
    """Render extracted tables as ``header: value`` text.

    The first row of each table is used as the header row. Every following
    row becomes one line of ``header: value`` pairs separated by `` | ``.
    Each table is prefixed with a ``Table:`` line.

    Args:
        tables: Iterable of tables, each a list of rows and each row a list
            of cell values. Falsy entries (``None``, empty lists) are skipped,
            as are tables with no data rows or an empty header row.

    Returns:
        The formatted tables joined by a newline, or an empty string when
        there is nothing to format.
    """

    def _cell(value):
        # Empty cells arrive as None (null after the JSON round-trip);
        # render them as empty text instead of the literal string "None".
        return "" if value is None else value

    formatted_tables = []
    for table in tables:
        if not table:
            continue

        headers, rows = table[0], table[1:]
        if len(rows) == 0 or len(headers) == 0:
            continue

        lines = ["Table:"]
        for row in rows:
            # zip() pairs cells with headers and ignores any surplus on either side.
            lines.append(
                " | ".join(
                    f"{_cell(header)}: {_cell(value)}"
                    for header, value in zip(headers, row)
                )
            )
        # Every line, including the last one, is newline-terminated.
        formatted_tables.append("\n".join(lines) + "\n")
    return "\n".join(formatted_tables)


# Clean text content and tables
# NOTE: clean_text collapses all whitespace, so after this line text_content
# contains no newlines. Rebinding the name is safe to re-run because cleaning
# already-cleaned text returns it unchanged.
text_content = clean_text(text_content)
table_text = process_tables(tables)

# Print the formatted table text for a quick visual check
print(table_text)


Table:
Table of Content: About Us 59
Company Profile 4 Auditors’ Report to the Members 66
Vision & Mission 6 Statement of Compliance 67
Major Customers 8 Pattern of Shareholding 69
Global Offices 9 Information Required as per Code of 70
Corporate Governance
Quality Focus 10
Our Portfolio 11 NETSOL & Sustainability
Development Goals
Core Products 12
Core Services 13 NETSOL & Sustainability Development Goals 72
Our Leadership 14 Notice of Annual General
Meeting
Business Review
Notice of Annual General Meeting 80
Chairman’s Review Report 19
Financial Statements
CEO’s Message 22
Awards and Recognition 24 Auditors’ Report to the Members 85
Global Marketing Activities 26 Statement of Financial Position 90
Activities During the Year 28 Statement of Profit or Loss 92
Share Price/Volume 30 Comprehensive Income 93
Shareholders’ Information 31 Statement of Cash Flows 94
Changes in Equity 95
Financial Highlights
Notes to Finacial Statements 96
Statement of Value Addition 34
Consolidated Financial


In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv so the API key does not have to be hard-coded here.
load_dotenv()

# Pinecone API key; os.getenv returns None when PINE_API_KEY is not set.
pine_api_key = os.getenv("PINE_API_KEY")


def chunk_text(text, chunk_size=800, chunk_overlap=100):
    """Split ``text`` into overlapping chunks with a recursive character splitter.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as its ``chunk_size``.
        chunk_overlap: Passed to the splitter as its ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_text`` call for ``text``.
    """
    # Separators are listed from coarsest (blank line) to finest (empty string).
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_text(text)

# Requires text_content and table_text from the earlier cleaning cell; running
# this cell on a fresh kernel without that cell raises NameError.
chunks_text = chunk_text(text_content, chunk_size=800, chunk_overlap=100)
chunks_table = chunk_text(table_text, chunk_size=800, chunk_overlap=100)
# Single list of everything to embed: prose chunks first, then table chunks.
chunks = chunks_text + chunks_table
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

# Sentence-transformers model used to embed every chunk.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
pc = Pinecone(api_key=pine_api_key)

# Name of the Pinecone index, defined once so creation and lookup cannot drift apart.
INDEX_NAME = 'netsol-finance-asm4'
# Vector size of the index; must equal the embedding size of the model above.
EMBEDDING_DIM = 384

# create_index fails when the index already exists, so only create it when it
# is missing. This keeps the cell safe to re-run.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIM,
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(INDEX_NAME)
def generate_embeddings(chunks):
    """Encode ``chunks`` with the module-level ``embedding_model``.

    Returns whatever ``embedding_model.encode`` produces for the given chunks.
    """
    vectors = embedding_model.encode(chunks)
    return vectors

# Store chunks and embeddings in Pinecone
def store_chunks_in_pinecone(chunks, index, batch_size=100):
    """Embed ``chunks`` and upsert them into ``index`` in batches.

    Each record is stored under the id ``chunk-<position>`` with the chunk
    text kept in the ``text`` metadata field. Because ids depend only on
    position, a later run overwrites records with the same position.

    Args:
        chunks: Sequence of text chunks to embed and store.
        index: Pinecone index handle that exposes ``upsert``.
        batch_size: Maximum number of records sent per upsert request.
            Batching avoids one network round trip per chunk.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if len(chunks) == 0:
        # Nothing to embed or send.
        return

    embeddings = generate_embeddings(chunks)
    # Guard against a length mismatch the same way zip() would: stop at the shorter one.
    total = min(len(chunks), len(embeddings))
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        batch = [
            (f"chunk-{i}", embeddings[i].tolist(), {"text": chunks[i]})
            for i in range(start, stop)
        ]
        index.upsert(batch)

# Example usage
# Embeds every chunk and writes it to the Pinecone index opened above (network calls).
store_chunks_in_pinecone(chunks, index)


NameError: name 'text_content' is not defined