<a href="https://colab.research.google.com/github/duyguhalisyama1/LLM/blob/main/pdf_embeddings_table_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import (
    Pinecone, ServerlessSpec, CloudProvider, AwsRegion, Metric, VectorType
)
from unstructured.partition.pdf import partition_pdf
from groq import Groq
import fitz

import os
import certifi
os.environ['SSL_CERT_FILE'] = certifi.where()


# API keys: load_dotenv() runs first so the os.getenv lookups below can see any
# values it loaded. os.getenv returns None when a variable is not set.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Model and clients: created once at module level and shared by the functions below.
# NOTE(review): the recorded output of this cell is a GroqError about a missing
# api_key, i.e. GROQ_API_KEY was not set in that run — confirm how the key is
# meant to be supplied in this environment.
client = Groq(api_key=GROQ_API_KEY)
# NOTE(review): trust_remote_code=True is presumably required by this model's
# repository — verify before reusing with other models.
embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)

# Extract all text from the PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``fitz.open`` as a context manager and each
    page's ``get_text()`` result is concatenated in page order; no separator
    is inserted between pages.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [pg.get_text() for pg in pdf]
    return "".join(page_texts)

# Generate a table description plus the table in markdown format
def enrich_table_with_context(table_content, document_context):
    """Ask the chat model to describe a table and render it as markdown.

    Args:
        table_content: Text of the table to describe.
        document_context: Text of the surrounding document, embedded in the
            prompt as context.

    Returns:
        The ``content`` of the first choice in the chat completion response.
    """
    # The prompt body, including its leading indentation, is sent verbatim.
    prompt = f"""
    Given the following table and its context from the original document,
    provide a detailed description of the table. Then, include the table in markdown format.

    Original Document Context:
    {document_context}

    Table Content:
    {table_content}

    Please provide:
    1. A comprehensive description of the table.
    2. The table in markdown format.
    """

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that describes tables and formats them in markdown.",
    }
    user_message = {"role": "user", "content": prompt}

    response = client.chat.completions.create(
        messages=[system_message, user_message],
        model="llama3-70b-8192",
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Convert text into an embedding
def get_embedding(text):
    """Encode ``text`` with the shared embedding model and return it as a plain list.

    The value returned by ``embedding_model.encode`` is converted with
    ``.tolist()`` before being handed back.

    NOTE(review): no prompt or prefix is passed to ``encode``, and this helper
    is used for both stored passages and search queries. If the model expects
    different prompts for the two, callers cannot select one here — confirm
    against the model card.
    """
    encoded = embedding_model.encode(text)
    return encoded.tolist()

# Initialize the Pinecone index
def init_pinecone(index_name, dimension=768):
    """Connect to the Pinecone index ``index_name``, creating it when it is missing.

    Args:
        index_name: Name of the index to open or create.
        dimension: Vector dimension; only used when a new index is created.

    Returns:
        The index handle returned by ``pc.Index``.

    Raises:
        Exception: Any error from ``create_index`` whose message does not
            contain "ALREADY_EXISTS" is re-raised unchanged.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # list_indexes() yields index description objects, not plain strings, so a
    # bare `index_name in pc.list_indexes()` never matches. Compare against the
    # index names instead.
    existing_names = pc.list_indexes().names()

    # If the index already exists, connect to it directly.
    if index_name in existing_names:
        print(f"üîç Var olan Pinecone index'e baƒülanƒ±lƒ±yor: {index_name}")
        return pc.Index(index_name)

    try:
        # The index is missing, so try to create it.
        index_config = pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=Metric.COSINE,  # cosine similarity is used to compare vectors
            spec=ServerlessSpec(
                cloud=CloudProvider.AWS,
                region=AwsRegion.US_EAST_1  # change the region to suit your needs
            ),
            vector_type=VectorType.DENSE
        )
    except Exception as e:
        # Fallback for the case where the index appeared between the listing
        # and the create call: connect to the existing index.
        if "ALREADY_EXISTS" in str(e):
            return pc.Index(index_name)
        raise

    print(f"‚úÖ Yeni Pinecone index olu≈üturuldu: {index_name}")
    return pc.Index(host=index_config.host)

# Send the contents to Pinecone
def upsert_elements(index, elements, batch_size=100):
    """Embed every element that has text and upsert it into ``doc_namespace``.

    Each stored record is ``("element-<position>", embedding, {"content": text})``,
    where ``<position>`` is the element's position in ``elements`` (elements
    without text are skipped but still consume a position).

    Args:
        index: Pinecone index handle exposing ``upsert(vectors=..., namespace=...)``.
        elements: Iterable of objects with a ``text`` attribute.
        batch_size: Maximum number of records sent per upsert request, so that a
            large document does not go out as one oversized request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    vectors = [
        (f"element-{i}", get_embedding(el.text), {"content": el.text})
        for i, el in enumerate(elements)
        if el.text
    ]

    # Send the records in bounded chunks; with nothing to send, no request is made.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size], namespace="doc_namespace")

    print(f"üìå {len(vectors)} i√ßerik Pinecone'a y√ºklendi.")

# Retrieve information from Pinecone
def retrieve(index, query, top_k=3):
    """Return the stored content of the closest matches for ``query``.

    The query is embedded with ``get_embedding`` and looked up in the
    ``doc_namespace`` namespace with metadata included.

    Args:
        index: Index handle exposing ``query(...)``.
        query: Text to search for.
        top_k: Number of matches to request.

    Returns:
        The ``content`` metadata of each match joined by a ``---`` separator
        line, or ``"No relevant context found."`` when there are no matches.
    """
    query_vector = get_embedding(query)
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        namespace="doc_namespace",
    )

    hits = response["matches"]
    if not hits:
        return "No relevant context found."

    separator = "\n\n---\n\n"
    return separator.join(hit["metadata"]["content"] for hit in hits)

# Generate an answer with the LLM
def complete(prompt):
    """Send ``prompt`` to the chat model and return its reply text.

    Any exception raised while calling the API or reading the response is
    not propagated; instead the string ``"Groq API Error: <message>"`` is
    returned, so callers always receive a string.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        response = client.chat.completions.create(
            messages=conversation,
            model="llama3-70b-8192",
        )
        # Reading the reply stays inside the try so a malformed response is
        # reported the same way as a failed request.
        answer = response.choices[0].message.content
    except Exception as err:
        answer = f"Groq API Error: {err!s}"
    return answer

# Main process
def main(
    pdf_path="bav99.pdf",
    excel_file='Ek-3 Temel √ñznitelik Tanƒ±mlarƒ±.xlsx',
    filter_value='Rectifiers',
    index_name="pdf-embeddings-table-context",
    cleanup=False,
):
    """Index a PDF in Pinecone and ask the LLM for each field listed in a workbook.

    Steps, in order:
      1. Partition the PDF and read its full text.
      2. Replace the text of every ``Table`` element with an LLM-written
         description that uses the full document text as context.
      3. Open (or create) the index, empty ``doc_namespace`` and upsert the elements.
      4. For each field selected from the workbook, retrieve context and print
         ``"<field>: <answer>"``.

    Args:
        pdf_path: PDF file to partition and index.
        excel_file: Workbook that lists the fields to look up.
        filter_value: Rows are kept when their 7th column equals this value.
        index_name: Pinecone index to open or create.
        cleanup: When True, empty ``doc_namespace`` again at the end. The default
            (False) keeps the data, as before, when the cleanup call was commented
            out; the "cleaning" message is now printed only when cleanup runs.
    """
    print("üìÑ PDF i√ßeriƒüi i≈üleniyor...")
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        chunking_strategy="by_title"
    )
    document_context = extract_text_from_pdf(pdf_path)

    print("üîç Tablo √∂ƒüeleri baƒülamsal olarak zenginle≈ütiriliyor...")
    for element in elements:
        if element.category == 'Table':
            # The element is updated in place so the enriched text is what gets embedded.
            element.text = enrich_table_with_context(element.text, document_context)

    print("üìå Pinecone index ba≈ülatƒ±lƒ±yor...")
    index = init_pinecone(index_name)
    try:
        # Start from an empty namespace so stale vectors from earlier runs are not retrieved.
        index.delete(delete_all=True, namespace="doc_namespace")
    except Exception as e:
        # NOTE(review): a namespace that has never been written to is reported to
        # make this call fail with a "Namespace not found" error on serverless
        # indexes; that case is treated as "already empty". Confirm against the
        # SDK version in use. Every other error is re-raised.
        if "namespace not found" not in str(e).lower():
            raise

    print("üîó Embedding'ler olu≈üturuluyor ve Pinecone'a g√∂nderiliyor...")
    upsert_elements(index, elements)

    # Example queries driven by the workbook. Columns are addressed by position:
    # the 7th is compared with filter_value, the 9th supplies the field names.
    # NOTE(review): presumably a category column and an attribute-name column —
    # verify against the sheet.
    filter_col, field_col = 6, 8
    df = pd.read_excel(excel_file)
    fields = df[df.iloc[:, filter_col] == filter_value].iloc[:, field_col].dropna().tolist()

    for field in fields:
        query_prompt = f"Find the exact value for: {field} in the datasheet."
        retrieved_context = retrieve(index, query_prompt)

        full_prompt = f"""
        Answer the question based on the context below. Extract ONLY the exact numerical value and unit OR specified term.
        If the value is not available in the given context, return "Not Available" and nothing else.
        Consider synonyms, abbreviations, and variations of the term while searching.

        Context:
        {retrieved_context}

        Question: What is the exact value for {field} (including any synonymous terms or variations)?

        Answer:
        """
        answer = complete(full_prompt)
        print(f"{field}: {answer}")

    if cleanup:
        print("üßπ Pinecone temizleniyor...")
        index.delete(delete_all=True, namespace="doc_namespace")

# Run the pipeline only when this code is executed as the main module, not when it is imported.
if __name__ == "__main__":
    main()


GroqError: The api_key client option must be set either by passing api_key to the client or by setting the GROQ_API_KEY environment variable

In [15]:
# Environment setup.
# NOTE(review): this cell's execution count (In [15]) is later than the imports
# cell's (In [4]); on a fresh kernel it has to run before the imports.
# NOTE(review): `tools` is not imported anywhere in the visible notebook —
# confirm it is actually required.
%pip install tools
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install python-dotenv
# -y answers apt's confirmation prompt, which a notebook cell cannot do interactively.
!apt-get install -y poppler-utils



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.7 [186 kB]
Fetched 186 kB in 1s (234 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126332 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.7_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.7) ...
Setting up poppler-utils (22.02.0-2ubuntu0.7) ...
Processing triggers for man-db (2.10.2-1) ...


In [9]:
# Clone the repository referenced by the Colab badge at the top of the notebook
# and make it the working directory (recorded output: /content/LLM).
# NOTE(review): presumably the PDF and Excel inputs live in this repository — verify.
# NOTE(review): `%cd LLM` is relative, so the result depends on the directory
# the cell is run from.
!git clone https://github.com/duyguhalisyama1/LLM.git
%cd LLM

Cloning into 'LLM'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 17 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (17/17), 3.69 MiB | 16.27 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/LLM
