In [1]:
# Fetch the project repository and make its checkout the working directory.
!git clone https://github.com/duyguhalisyama1/LLM.git
%cd LLM

Cloning into 'LLM'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 33 (delta 10), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (33/33), 3.70 MiB | 17.05 MiB/s, done.
Resolving deltas: 100% (10/10), done.
/content/LLM


In [2]:
%pip install tools
%pip install dotenv
%pip install groq
%pip install pandas
%pip install pi_heif
%pip install unstructured_inference
%pip install pdf2image
%pip install unstructured_pytesseract
%pip install pymupdf
%pip install unstructured
%pip install pdfminer.six==20221105
%pip install PyMuPDF
%pip install huggingface_hub[hf_xet]
!pip install "camelot-py[cv]"

%pip install pinecone
!pip install python-dotenv
!apt-get install poppler-utils



Collecting tools
  Downloading tools-1.0.2-py3-none-any.whl.metadata (1.4 kB)
Downloading tools-1.0.2-py3-none-any.whl (37 kB)
Installing collected packages: tools
Successfully installed tools-1.0.2
Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.0
Collecting groq
  Downloading groq-0.24.0-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.24.0-py3-none-any.whl (127 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.24.0
Collecting pi_heif
  Downloading pi_heif-0.22.0-cp311-cp311-man

In [7]:
import os
import certifi
from dotenv import load_dotenv
import pandas as pd
import pymupdf
from unstructured.partition.pdf import partition_pdf
import camelot
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, Metric, VectorType
from groq import Groq

# --- Environment, credentials & shared clients ----------------------
# Pull variables from a .env file (if one is found) into the environment.
load_dotenv()

# Point SSL certificate lookup at the CA bundle shipped with certifi.
os.environ["SSL_CERT_FILE"] = certifi.where()

# API credentials; each is None when its variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Module-level clients shared by the helper functions defined below.
llm_client = Groq(api_key=GROQ_API_KEY)
embed_model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v2-moe",
    trust_remote_code=True,
)

# --- PDF extraction ------------------------------------------------

def extract_full_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF.

    Returns:
        The ``get_text()`` output of each page, in page order, joined with a
        single newline between pages.
    """
    with pymupdf.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "\n".join(page_texts)

# --- Table extraction ------------------------------------------------

def extract_tables_unstructured(pdf_path: str) -> list[str]:
    """Collect the text of the table elements found by ``partition_pdf``.

    Args:
        pdf_path: Path of the PDF file to partition.

    Returns:
        The ``text`` of every returned element whose ``category`` equals
        ``'Table'``, in the order the elements were returned.
    """
    partitioned = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        chunking_strategy="by_title",
    )
    table_texts = []
    for element in partitioned:
        # Only table elements are kept; every other category is ignored.
        if element.category == 'Table':
            table_texts.append(element.text)
    return table_texts


def extract_tables_camelot(pdf_path: str, flavor: str = 'lattice') -> list[str]:
    """Extract tables with Camelot and render each one as a markdown string.

    Args:
        pdf_path: Path of the PDF file to scan (all pages are read).
        flavor: Camelot parsing flavor forwarded to ``camelot.read_pdf``.
            Defaults to ``'lattice'``.

    Returns:
        One markdown string per detected table, produced from the table's
        DataFrame without the index column.
    """
    # Forward the caller's flavor; it used to be ignored in favour of a
    # hard-coded 'lattice'.
    tables = camelot.read_pdf(pdf_path, pages='all', flavor=flavor)
    return [table.df.to_markdown(index=False) for table in tables]

# --- Embedding & Pinecone -------------------------------------------

def get_embedding(text: str) -> list[float]:
    """Encode ``text`` with the shared embedding model.

    Args:
        text: The string to embed.

    Returns:
        The model's encoding converted to a plain Python list via
        ``tolist()``.
    """
    encoded = embed_model.encode(text)
    return encoded.tolist()


def init_pinecone_index(name: str, dim: int = 768):
    """Return a handle to the Pinecone index ``name``, creating it if needed.

    Args:
        name: Name of the Pinecone index.
        dim: Vector dimension used only when the index has to be created.

    Returns:
        The object returned by ``pc.Index(...)`` for the existing or newly
        created index.

    Raises:
        Exception: Any error from ``create_index`` other than one whose
            message contains ``ALREADY_EXISTS`` is re-raised unchanged.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # list_indexes() yields index description objects rather than plain
    # strings, so `name in pc.list_indexes()` never matches; ask the client
    # whether the index exists instead.
    if pc.has_index(name):
        print(f"🔍 Var olan Pinecone index'e bağlanılıyor: {name}")
        return pc.Index(name)

    try:
        # The index does not exist yet: try to create it.
        index_config = pc.create_index(
            name=name,
            dimension=dim,
            metric=Metric.COSINE,  # vectors are compared by cosine similarity
            spec=ServerlessSpec(
                cloud=CloudProvider.AWS,
                region=AwsRegion.US_EAST_1  # change the region if required
            ),
            vector_type=VectorType.DENSE
        )
    except Exception as e:
        # If creation reports ALREADY_EXISTS (for example the index appeared
        # after the existence check), connect to the existing index instead.
        if "ALREADY_EXISTS" in str(e):
            return pc.Index(name)
        # Bare raise keeps the original traceback intact.
        raise
    else:
        print(f"✅ Yeni Pinecone index oluşturuldu: {name}")
        return pc.Index(host=index_config.host)


def upsert_embeddings(index, items: list[tuple[str, str]], namespace: str = 'doc_namespace', batch_size: int = 100):
    """Embed ``(id, text)`` pairs and upsert them into Pinecone in batches.

    Args:
        index: Pinecone index handle exposing ``upsert(vectors=, namespace=)``.
        items: ``(id, text)`` pairs; pairs whose text is empty or only
            whitespace are skipped.
        namespace: Namespace the vectors are written to.
        batch_size: Maximum number of vectors sent per upsert request, so a
            large document does not go out as one unbounded request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Each vector is (id, embedding, metadata); the source text is stored as
    # metadata so it can be returned with query matches.
    vectors = [
        (uid, get_embedding(text), {'text': text})
        for uid, text in items
        if text.strip()
    ]

    # No request is made when nothing survived the blank-text filter.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size], namespace=namespace)


def retrieve_context(index, query: str, top_k: int = 3, namespace: str = 'doc_namespace') -> str:
    """Query the index with the embedded ``query`` and join the matched texts.

    Args:
        index: Pinecone index handle exposing ``query(...)``.
        query: Text to embed and search with.
        top_k: Number of matches requested from the index.
        namespace: Namespace to search in.

    Returns:
        The ``metadata['text']`` of each match, separated by ``"\\n---\\n"``;
        an empty string when the response carries no matches.
    """
    response = index.query(
        vector=get_embedding(query),
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )
    snippets = []
    for match in response.get('matches', []):
        snippets.append(match['metadata']['text'])
    return "\n---\n".join(snippets)

# --- LLM-based table enrichment -------------------------------------

def enrich_table_with_llm(table_md: str, context: str) -> str:
    """Ask the LLM to return a single table in markdown format.

    Args:
        table_md: Table content inserted into the prompt.
        context: Accepted for the caller's convenience; it is not included
            in the prompt.

    Returns:
        The content of the first choice of the chat completion, unmodified.
    """
    user_prompt = f"""
Given the following table from the document, provide the table in markdown format.

Table Content:
    {table_md}

    Please provide the table in markdown format.

"""
    messages = [
        {"role": "system", "content": "You are a helpful assistant that formats tables in markdown."},
        {"role": "user", "content": user_prompt},
    ]
    completion = llm_client.chat.completions.create(
        model="llama3-70b-8192",
        messages=messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


# --- PDF QA via LLM -------------------------------------------------

def answer_query_with_llm(query: str, context: str) -> str:
    """Ask the LLM to extract the value for ``query`` from ``context``.

    Args:
        query: The field or question to look up.
        context: Retrieved text the model is instructed to search.

    Returns:
        The content of the first choice of the chat completion with leading
        and trailing whitespace removed. The prompt instructs the model to
        reply ``Not Available`` when it cannot find the field.
    """
    user_prompt = f"""
You will be given context below.

Your task:
  1. Find exactly that field in the context. Consider synonyms, abbreviations of the term while searching.
  2. Extract ONLY the exact numerical value and unit OR specified term.
  3. Do NOT add any prefixes, explanations, or extra keys.
  4. If you can’t find it, return exactly: Not Available

Context:
{context}

Question: {query}
"""
    messages = [
        {"role": "system", "content": "You are a precise data-extraction assistant."},
        {"role": "user", "content": user_prompt},
    ]
    completion = llm_client.chat.completions.create(
        model="llama3-70b-8192",
        messages=messages,
    )
    raw_answer = completion.choices[0].message.content
    return raw_answer.strip()

Average Rectified Forward Current: 150 mA
Cathode Polarity: Not Available
Configuration: Not Available
Material: Plastic
Maximum Continuous Forward Current: Not Available
Maximum DC Reverse Voltage: 100 V
Maximum Diode Capacitance: 1.5 pF
Maximum Junction Ambient Thermal Resistance: 430 K/W
Maximum Junction Case Thermal Resistance: Not Available
Maximum Operating Temperature: 150 °C
Maximum Power Dissipation: 300 mw
Maximum RMS Reverse Voltage: Not Available
Maximum Storage Temperature: +150 °C
Minimum Operating Temperature: -55°C
Minimum Storage Temperature: -55 °C
Operating Junction Temperature: -55...+150 °C
Peak Forward Voltage: Not Available
Peak Non-Repetitive Surge Current: 1 A
Peak Reverse Current: Not Available
Peak Reverse Recovery Time: 6 ns
Peak Reverse Repetitive Voltage: 70 V
Process Technology: Not Available
Repetitive Peak Forward Current: Not Available
Speed: Not Available
Supplier Temperature Grade: Not Available
Tradename: BAV99
Type: SOT23
Typical Junction Capacitan

In [None]:
# --- Main pipeline ---------------------------------------------------
def run_pipeline(
    pdf_path: str,
    index_name: str,
    excel_path: str,
    excel_filter: str = None
) -> dict[str, str]:
    """Index a PDF in Pinecone and answer each field listed in an Excel sheet.

    Steps: extract the full text and the tables of the PDF, clear the
    ``doc_namespace`` namespace of the index, upsert text chunks and tables,
    then look up every field name from the spreadsheet with retrieval plus an
    LLM call.

    Args:
        pdf_path: Path of the PDF to process.
        index_name: Name of the Pinecone index to use (created if missing).
        excel_path: Path of the spreadsheet that lists the fields to extract.
        excel_filter: When truthy, only rows whose column at position 6 equals
            this value are used; otherwise every row is used.

    Returns:
        A dict mapping each field (non-null values of the column at
        position 8) to the LLM's answer for it.
    """
    # Function-scope import: only this function needs the exception type, and
    # pinecone is already a dependency of the notebook.
    from pinecone.exceptions import NotFoundException

    # Positional spreadsheet columns read below.
    # NOTE(review): the meaning of these columns is assumed from how they are
    # used here — confirm against the workbook.
    filter_col = 6
    field_col = 8

    # Extract
    full_text = extract_full_text(pdf_path)
    tables_unst = extract_tables_unstructured(pdf_path)
    tables_cam = extract_tables_camelot(pdf_path)

    # Deduplicate (on whitespace-stripped text) but keep each table independent
    all_tables = []
    seen = set()
    for tbl in tables_unst + tables_cam:
        key = tbl.strip()
        if key and key not in seen:
            seen.add(key)
            all_tables.append(tbl)

    # Initialize Pinecone and clear data left over from a previous run
    idx = init_pinecone_index(index_name)
    try:
        idx.delete(delete_all=True, namespace="doc_namespace")
    except NotFoundException:
        # A freshly created index has no such namespace yet, so there is
        # nothing to clear; carry on instead of aborting the first run.
        pass

    # Upsert full-text chunks (split on blank lines) as separate docs
    full_chunks = full_text.split('\n\n')
    upsert_embeddings(idx, [(f'txt-{i}', chunk) for i, chunk in enumerate(full_chunks)])

    # Upsert each table as separate item
    upsert_embeddings(idx, [(f'tbl-{i}', tbl) for i, tbl in enumerate(all_tables)])

    # The per-table LLM enrichment call was dropped: its result was never
    # used, so it only added one LLM request (and failure point) per table.

    # Excel lookup & QA
    df = pd.read_excel(excel_path)
    if excel_filter:
        fields = df[df.iloc[:, filter_col] == excel_filter].iloc[:, field_col].dropna().tolist()
    else:
        fields = df.iloc[:, field_col].dropna().tolist()

    answers = {}
    for field in fields:
        ctx = retrieve_context(idx, field, top_k=3)
        answers[field] = answer_query_with_llm(field, ctx)

    return answers

if __name__ == "__main__":
    # Run the pipeline on the BAV99 datasheet from the cloned repository and
    # print one "field: answer" line per extracted attribute.
    results = run_pipeline(
        "/content/LLM/bav99.pdf",
        "pdf-embeddings-table-context",
        "/content/LLM/Ek-3 Temel Öznitelik Tanımları.xlsx",
        excel_filter="Rectifiers",
    )
    for field, answer in results.items():
        print(f"{field}: {answer}")