<a href="https://colab.research.google.com/github/duyguhalisyama1/LLM/blob/main/pdf_embeddings_table_llm_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import (
    Pinecone, ServerlessSpec, CloudProvider, AwsRegion, Metric, VectorType
)
from unstructured.partition.pdf import partition_pdf
from groq import Groq
import fitz, pymupdf

import os
import certifi
# Point SSL_CERT_FILE at certifi's CA bundle for this process.
os.environ['SSL_CERT_FILE'] = certifi.where()


# API keys: load_dotenv() runs first, then both keys are read from the
# environment. os.getenv returns None for a variable that is not set.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Model and clients, created once at module level and shared by the functions below.
# NOTE(review): trust_remote_code=True presumably allows the model repository's
# custom code to run locally — confirm the source is trusted.
client = Groq(api_key=GROQ_API_KEY)
embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)

# 📥 Extract all text from the PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``pymupdf.open`` and each page's
    ``get_text()`` result is concatenated in page order with no separator.
    """
    with pymupdf.open(pdf_path) as pdf:
        page_texts = [pg.get_text() for pg in pdf]
    return "".join(page_texts)

# 📊 Generate a table description plus its markdown rendering
def enrich_table_with_context(table_content, document_context):
    """Ask the Groq chat model to describe a table and restate it as markdown.

    Args:
        table_content: Text of the table to describe.
        document_context: Text of the source document; it is interpolated
            into the prompt verbatim as context for the table.

    Returns:
        The ``content`` of the first choice in the chat completion.
    """
    user_prompt = f"""
    Given the following table and its context from the original document,
    provide a detailed description of the table. Then, include the table in markdown format.

    Original Document Context:
    {document_context}

    Table Content:
    {table_content}

    Please provide:
    1. A comprehensive description of the table.
    2. The table in markdown format.
    """

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that describes tables and formats them in markdown.",
    }
    user_message = {"role": "user", "content": user_prompt}

    response = client.chat.completions.create(
        messages=[system_message, user_message],
        model="llama3-70b-8192",
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# 🧠 Convert text into an embedding
def get_embedding(text, prompt_name=None):
    """Encode ``text`` with the shared embedding model and return a plain list.

    Args:
        text: Input passed to ``embedding_model.encode``.
        prompt_name: Optional name of a prompt configured on the model,
            forwarded to ``encode``. When omitted, ``encode`` is called
            exactly as before, so existing callers are unaffected.
            NOTE(review): the model card for this embedding model presumably
            describes separate "query" and "passage" prompts — confirm
            before using them, and re-embed stored documents if the
            document-side prompt changes.

    Returns:
        The result of ``encode(...).tolist()``.
    """
    if prompt_name is None:
        return embedding_model.encode(text).tolist()
    return embedding_model.encode(text, prompt_name=prompt_name).tolist()

# 🧱 Initialise the Pinecone index
def init_pinecone(index_name, dimension=768):
    """Connect to the Pinecone index ``index_name``, creating it if needed.

    Args:
        index_name: Name of the index to connect to or create.
        dimension: Vector dimension used when the index has to be created.

    Returns:
        An index handle obtained from ``pc.Index``.

    Raises:
        Exception: Any error from ``create_index`` whose message does not
            contain ``"ALREADY_EXISTS"`` is re-raised unchanged.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # list_indexes() yields index descriptions rather than plain names, so a
    # membership test against it never matches a string; compare against the
    # list of names instead.
    existing_names = pc.list_indexes().names()
    if index_name in existing_names:
        print(f"🔍 Var olan Pinecone index'e bağlanılıyor: {index_name}")
        return pc.Index(index_name)

    try:
        index_config = pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=Metric.COSINE,  # vectors are compared by cosine similarity
            spec=ServerlessSpec(
                cloud=CloudProvider.AWS,
                region=AwsRegion.US_EAST_1,  # change the region to suit your deployment
            ),
            vector_type=VectorType.DENSE,
        )
    except Exception as e:
        # The index may have been created between the listing above and this
        # call; in that case connect to it instead of failing.
        if "ALREADY_EXISTS" in str(e):
            print(f"🔍 Var olan Pinecone index'e bağlanılıyor: {index_name}")
            return pc.Index(index_name)
        raise

    print(f"✅ Yeni Pinecone index oluşturuldu: {index_name}")
    return pc.Index(host=index_config.host)

# 📤 Send the contents to Pinecone
def upsert_elements(index, elements, batch_size=100):
    """Embed every element that has text and upsert it into ``doc_namespace``.

    Each vector is a ``(id, embedding, metadata)`` tuple where the id is
    ``element-<position in elements>`` and the metadata stores the element
    text under ``"content"``. Elements whose ``text`` is empty or ``None``
    are skipped but still consume a position, so ids stay stable.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        elements: Iterable of objects with a ``text`` attribute.
        batch_size: Maximum number of vectors sent per ``upsert`` call, so a
            large document is not sent as a single oversized request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors = [
        (f"element-{i}", get_embedding(el.text), {"content": el.text})
        for i, el in enumerate(elements)
        if el.text
    ]

    # With no vectors the range is empty, so no request is sent at all.
    for start in range(0, len(vectors), batch_size):
        index.upsert(
            vectors=vectors[start:start + batch_size],
            namespace="doc_namespace",
        )
    print(f"📌 {len(vectors)} içerik Pinecone'a yüklendi.")

# 🔎 Retrieve context from Pinecone
def retrieve(index, query, top_k=3):
    """Return the stored text of the ``top_k`` matches for ``query``.

    The query is embedded with ``get_embedding`` and searched in
    ``doc_namespace`` with metadata included.

    Returns:
        The ``content`` metadata of each match joined by a ``---`` separator
        line, or ``"No relevant context found."`` when there are no matches.
    """
    query_vector = get_embedding(query)
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        namespace="doc_namespace",
    )

    if not response["matches"]:
        return "No relevant context found."

    contents = [hit["metadata"]["content"] for hit in response["matches"]]
    return "\n\n---\n\n".join(contents)

# 🧠 Generate an answer with the LLM
def complete(prompt):
    """Send ``prompt`` to the Groq chat model and return its reply text.

    Any exception raised while calling the API or reading the response is
    caught, and a string of the form ``"Groq API Error: <message>"`` is
    returned in place of the reply.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        response = client.chat.completions.create(
            messages=conversation,
            model="llama3-70b-8192",
        )
        reply = response.choices[0].message.content
    except Exception as err:
        return f"Groq API Error: {err!s}"
    return reply

# 🚀 Main pipeline
def main(cleanup=False):
    """Run the datasheet pipeline: parse, enrich, index, then query.

    Steps:
      1. Read the list of fields to look up from the Excel sheet.
      2. Partition the PDF and replace the text of every ``Table`` element
         with the output of ``enrich_table_with_context``.
      3. Embed the elements and upsert them into the Pinecone index.
      4. For each field, retrieve context and ask the LLM for the value,
         printing ``<field>: <answer>``.

    Args:
        cleanup: When true, delete every vector in ``doc_namespace`` after
            the queries finish. Defaults to False, which leaves the index
            populated as before.
    """
    pdf_path = "/content/LLM/bav99.pdf"
    excel_file = '/content/LLM/Ek-3 Temel Öznitelik Tanımları.xlsx'
    filter_value = 'Rectifiers'
    index_name = "pdf-embeddings-table-context"

    # Positional spreadsheet columns: rows are kept when column `filter_col`
    # equals `filter_value`, and the field names are read from `field_col`.
    filter_col = 6
    field_col = 8

    # Load the field list first so a missing or malformed spreadsheet fails
    # before any PDF parsing, LLM calls, or index writes are made.
    df = pd.read_excel(excel_file)
    matching_rows = df[df.iloc[:, filter_col] == filter_value]
    fields = matching_rows.iloc[:, field_col].dropna().tolist()

    print("📄 PDF içeriği işleniyor...")
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        chunking_strategy="by_title"
    )
    document_context = extract_text_from_pdf(pdf_path)

    print("🔍 Tablo öğeleri bağlamsal olarak zenginleştiriliyor...")
    for element in elements:
        if element.category == 'Table':
            # Replace the raw table text so the enriched version is what gets embedded.
            element.text = enrich_table_with_context(element.text, document_context)

    print("📌 Pinecone index başlatılıyor...")
    index = init_pinecone(index_name)

    print("🔗 Embedding'ler oluşturuluyor ve Pinecone'a gönderiliyor...")
    upsert_elements(index, elements)

    # Query the index once per field taken from the spreadsheet.
    for field in fields:
        query_prompt = f"Find the exact value for: {field} in the datasheet."
        retrieved_context = retrieve(index, query_prompt)

        full_prompt = f"""
        Answer the question based on the context below. Extract ONLY the exact numerical value and unit OR specified term.
        If the value is not available in the given context, return "Not Available" and nothing else.
        Consider synonyms, abbreviations, and variations of the term while searching.

        Context:
        {retrieved_context}

        Question: What is the exact value for {field} (including any synonymous terms or variations)?

        Answer:
        """
        answer = complete(full_prompt)
        print(f"{field}: {answer}")

    # Announce cleanup only when it actually happens; previously the message
    # was printed although nothing was deleted.
    if cleanup:
        print("🧹 Pinecone temizleniyor...")
        index.delete(delete_all=True, namespace="doc_namespace")

if __name__ == "__main__":
    main()


📄 PDF içeriği işleniyor...
🔍 Tablo öğeleri bağlamsal olarak zenginleştiriliyor...
📌 Pinecone index başlatılıyor...
🔗 Embedding'ler oluşturuluyor ve Pinecone'a gönderiliyor...
📌 14 içerik Pinecone'a yüklendi.
Average Rectified Forward Current: 150 mA
Cathode Polarity: Not Available
Configuration: Not Available
Material: Not Available
Maximum Continuous Forward Current: 250 mA
Maximum DC Reverse Voltage: 70 V
Maximum Diode Capacitance: 1.5 pF
Maximum Junction Ambient Thermal Resistance: 430 K/W
Maximum Junction Case Thermal Resistance: 430 K/W
Maximum Operating Temperature: +150 °C
Maximum Power Dissipation: 300 mW
Maximum RMS Reverse Voltage: Not Available
Maximum Storage Temperature: +150 °C
Minimum Operating Temperature: -55 °C
Minimum Storage Temperature: -55
Operating Junction Temperature: -55...+150°C
Peak Forward Voltage: Not Available
Peak Non-Repetitive Surge Current: 1 A
Peak Reverse Current: 2.5 μA
Peak Reverse Recovery Time: 6 ns
Peak Reverse Repetitive Voltage: Not Available

In [None]:
%pip install tools
%pip install dotenv
%pip install groq
%pip install pandas
%pip install pi_heif
%pip install unstructured_inference
%pip install pdf2image
%pip install unstructured_pytesseract
%pip install pymupdf
%pip install unstructured
%pip install pdfminer.six==20221105
%pip install PyMuPDF

%pip install pinecone
!pip install python-dotenv
!apt-get install poppler-utils



In [1]:
# Clone the project repository and make it the working directory.
# main() reads its PDF and Excel inputs from absolute paths under /content/LLM.
!git clone https://github.com/duyguhalisyama1/LLM.git
%cd LLM

Cloning into 'LLM'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 30 (delta 8), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 3.70 MiB | 9.53 MiB/s, done.
Resolving deltas: 100% (8/8), done.
/content/LLM


In [None]:
# Diagnostic cell: show which installed module the name `fitz` resolves to
# and which attributes it exposes.
# NOTE(review): if neither 'open' nor 'Document' is listed, presumably a
# different distribution named `fitz` is shadowing PyMuPDF — verify.
import fitz
print(fitz.__file__)
print(dir(fitz))  # you should see 'open' and/or 'Document'


/usr/local/lib/python3.11/dist-packages/fitz/__init__.py
['__author__', '__author_email__', '__builtins__', '__cached__', '__doc__', '__downloadUrl__', '__file__', '__license__', '__loader__', '__maintainer_email__', '__name__', '__package__', '__path__', '__spec__', '__url__', '__version__', 'op', 'tools']
