# 1. bibliothèques

In [8]:
import os

from huggingface_hub import login

# Never hard-code the Hugging Face token in the notebook: read it from the
# HF_TOKEN environment variable. When the variable is unset or empty,
# ``token=None`` lets ``login`` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [1]:
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
from typing import Dict, List
from groq import Groq
from sklearn.metrics import accuracy_score, f1_score, classification_report

2025-10-09 10:46:16.378132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759999576.401091 1434550 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759999576.408243 1434550 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-09 10:46:16.431909: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 2. Data

Ce script charge le corpus JSON brut, extrait les textes, les tables et les triplets d’annotations, puis les organise dans une structure unifiée.

In [2]:
# Load the raw annotated corpus (JSON) from disk.
with open("/projects/melodi/mettaleb/Annotation/corpus_challenge/test/F2_nous.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# d maps the document index to [text, tables, gold triplets string].
d = {}

# Row labels that identify a two-column key/value table worth transposing.
important_keys = ['Company type', 'Industry', 'Founded', 'Founder', 'Headquarters']

for idx, doc in enumerate(data.get("documents", [])):
    extraction_meta = doc.get("raw", {}).get("_source", {}).get("extractionMetadata", [])

    # Concatenate every text fragment of the document into one string.
    texts = []
    for meta in extraction_meta:
        for t in meta.get("texts", []):
            texts.append(t.get("value", ""))
    texts = " ".join(texts).strip()

    tables = []
    for meta in extraction_meta:
        for tbl in meta.get("tables", []):
            table_data = tbl.get("tableData", [])

            # A key/value table has a ['0', '1'] placeholder header, exactly two
            # cells per row and at least one known key in its first column.
            # The checks short-circuit in this order so that ``row[0]`` is only
            # read once every row is known to hold two cells; evaluating the key
            # check unconditionally raised IndexError on tables with empty rows.
            is_key_value_table = (
                len(table_data) > 0
                and table_data[0] == ['0', '1']
                and all(len(row) == 2 for row in table_data)
                and any(row[0] in important_keys for row in table_data[1:])
            )

            if is_key_value_table:
                # Transpose: keys become the header row, values the single data row.
                headers = [row[0] for row in table_data[1:]]
                values = [row[1] for row in table_data[1:]]
                new_table = {"tableData": [headers, values]}
                tables.append(new_table)
            else:
                tables.append({"tableData": table_data})

    # Gold triplets are serialised as "subject ; object ; predicate", joined by " | ".
    triplets_list = []
    for ann in doc.get("annotations", []):
        subj = ann.get("subject", {}).get("annotationValue", "")
        obj = ann.get("object", {}).get("annotationValue", "")
        pred_val = ann.get("predicate", {}).get("entityValue", "")
        # "pertinence" annotations are skipped: they are not kept as relation triplets.
        if pred_val.lower() == "pertinence":
            continue
        triplet_str = f"{subj} ; {obj} ; {pred_val}"
        triplets_list.append(triplet_str)

    triplets = " | ".join(triplets_list)

    d[idx] = [texts, tables, triplets]

#for k, v in list(d.items())[:5]:
#    print(f"Doc {k}:")
 #   print("  Text:", v[0][:100], "...")
  #  print("  Nb tables:", len(v[1]))
   # for t in v[1]:
    #    print("  Table:", t)
    #print("  Triplets:", v[2])
   # print("-" * 50)


#### Convertit une table en un format structuré de type CSV (chaîne de caractères).
Transforme une table extraite du corpus en un format lisible de type CSV, en normalisant les en-têtes et les lignes afin de faciliter son intégration dans les prompts des LLMs.

In [3]:
def table_to_csvlike(table_dict):
    """Render a table as a pipe-separated, CSV-like string.

    Args:
        table_dict: mapping with a "tableData" entry holding a list of rows;
            the first row is used as the header.

    Returns:
        One line per row, cells joined by " | ". Blank header cells are named
        "Col<n>" (1-based) and rows shorter than the header are padded with
        empty cells. Returns "" when the table is missing or empty.
    """
    table = table_dict.get("tableData", [])
    if not table:
        return ""

    def _cell(value):
        # Tables parsed from JSON may hold None or numbers; render them as text
        # instead of failing in str.strip / str.join.
        return "" if value is None else str(value)

    headers = [
        _cell(header).strip() or f"Col{i+1}"
        for i, header in enumerate(table[0])
    ]

    csv_lines = [" | ".join(headers)]
    for row in table[1:]:
        cells = [_cell(value) for value in row]
        # Pad short rows; a non-positive difference adds nothing.
        cells += [""] * (len(headers) - len(cells))
        csv_lines.append(" | ".join(cells))

    return "\n".join(csv_lines)


## Liste des relations

In [4]:
# Closed set of relation labels accepted by the prompts and by the post-processing.
relations = [
    'acquired_by',
    'brand_of',
    'client_of',
    'collaboration',
    'competitor_of',
    'merged_with',
    'product_or_service_of',
    'regulated_by',
    'shareholder_of',
    'subsidiary_of',
    'traded_on',
]

# 3. Prompt Engineering

Ce module définit les fonctions nécessaires pour interagir avec différents modèles de langage (via Groq ou Hugging Face), en configurant les appels API, les paramètres de génération (température, top-p, etc.) et les formats de messages.

## Avec Groq

In [109]:
import os

# Read the Groq API key from the environment instead of storing it in the
# notebook; falls back to an empty string when GROQ_API_KEY is not set.
api_key = os.environ.get("GROQ_API_KEY", "")

In [110]:
# Groq API client used by the chat helpers defined in this cell.
client = Groq(api_key= api_key)
# Default model identifier for the Groq helpers; the commented-out lines are
# alternative model identifiers.
#DEFAULT_MODEL = "llama-3.3-70b-versatile"
DEFAULT_MODEL = "deepseek-r1-distill-llama-70b"
#DEFAULT_MODEL = "meta-llama/llama-4-maverick-17b-128e-instruct"
def assistant(content: str):
    """Wrap ``content`` as a chat message with the "assistant" role."""
    message = {"role": "assistant"}
    message["content"] = content
    return message

def user(content: str):
    """Wrap ``content`` as a chat message with the "user" role."""
    return dict(role="user", content=content)

def chat_completion(
    messages: List[Dict],
    model = DEFAULT_MODEL,
    temperature: float = 0.1,
    top_p: float = 0.2,
) -> str:
    """Send a chat conversation through the module-level Groq ``client``.

    Args:
        messages: chat messages (dicts with "role" and "content").
        model: model identifier forwarded to the API.
        temperature: sampling temperature forwarded to the API.
        top_p: nucleus-sampling threshold forwarded to the API.

    Returns:
        The message content of the first choice in the response.
    """
    request = dict(
        messages=messages,
        model=model,
        temperature=temperature,
        top_p=top_p,
    )
    reply = client.chat.completions.create(**request)
    first_choice = reply.choices[0]
    return first_choice.message.content
        

def completion(
    prompt: str,
    model: str = DEFAULT_MODEL,
    temperature: float = 0.1,
    top_p: float = 0.2,
) -> str:
    """Run a single-turn completion: ``prompt`` is sent as one user message.

    The remaining arguments are forwarded unchanged to ``chat_completion``,
    whose result is returned.
    """
    single_turn = [user(prompt)]
    return chat_completion(single_turn, model=model, temperature=temperature, top_p=top_p)

def complete_and_print(prompt: str, model: str = DEFAULT_MODEL):
    """Return the model's answer to ``prompt``.

    Despite its name this variant prints nothing: it only forwards ``prompt``
    and ``model`` to ``completion`` and returns the result.
    """
    return completion(prompt, model)

## Avec HuggingFace

In [9]:
# Hugging Face model identifier loaded by this cell; the commented-out lines are
# alternative checkpoints.
#DEFAULT_MODEL = "deepseek-ai/DeepSeek-V3.2-Exp"
#DEFAULT_MODEL = "deepseek-ai/DeepSeek-R1"
DEFAULT_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
#DEFAULT_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
#DEFAULT_MODEL = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

# Directory passed as cache_dir to both from_pretrained calls below.
CACHE_DIR = "/projects/melodi/mettaleb/huggingface_cache"

# float16 when CUDA is available, float32 otherwise.
# NOTE(review): `dtype` is not referenced anywhere else in this cell, and the
# quantization config below uses bfloat16 as compute dtype — confirm which one is intended.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, cache_dir=CACHE_DIR)

# Load the quantized model with automatic device placement and switch it to eval mode.
# NOTE(review): the output recorded for this cell is a ValueError stating that some
# modules were dispatched on the CPU or the disk (not enough GPU RAM for the
# quantized model); the cell does not complete as written on that machine.
model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    cache_dir=CACHE_DIR,
    attn_implementation="flash_attention_2",
).eval()

# Allow TF32 in CUDA matmuls; this is set only after the model has been loaded.
torch.backends.cuda.matmul.allow_tf32 = True

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [18]:

# Text-generation pipeline built from the module-level `model` and `tokenizer`;
# used by the chat helpers defined in this cell.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


def assistant(content: str) -> Dict:
    """Build a chat message carrying ``content`` under the "assistant" role."""
    return dict(role="assistant", content=content)

def user(content: str) -> Dict:
    """Build a chat message carrying ``content`` under the "user" role."""
    message = {"role": "user"}
    message["content"] = content
    return message


def chat_completion(
    messages: List[Dict],
    max_new_tokens: int = 256,
    temperature: float = 0.1,
    top_p: float = 0.2,
) -> str:
    """Generate a reply with the module-level Hugging Face ``generator`` pipeline.

    Args:
        messages: chat messages (dicts with "role" and "content"); they are
            flattened into one prompt made of "ROLE:\\ncontent\\n\\n" blocks.
        max_new_tokens: maximum number of tokens to generate.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        Only the newly generated text. The prompt is not echoed back, which
        matches what the Groq-based helper returns and keeps the prompt's own
        examples out of the downstream triplet extraction.
    """
    prompt = "".join(
        f"{msg['role'].upper()}:\n{msg['content']}\n\n" for msg in messages
    )

    output = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        # Without this flag the pipeline returns prompt + completion.
        return_full_text=False,
    )

    return output[0]["generated_text"]


def completion(
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.1,
    top_p: float = 0.1
) -> str:
    """Single-turn helper: send ``prompt`` as one user message to ``chat_completion``.

    The generation settings are forwarded unchanged. Note that the default
    ``top_p`` here is 0.1.
    """
    return chat_completion(
        messages=[user(prompt)],
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )

def complete_and_print(prompt: str):
    """Run ``completion`` on ``prompt``, print the answer and return it."""
    answer = completion(prompt)
    print(answer)
    return answer


Device set to use cuda:0


## 3.1 Zero-shot Learning

### Prompts

In [9]:
def recupere_prompt(num_prompt, i):
    """Build the zero-shot relation-extraction prompt for document ``i``.

    Args:
        num_prompt: prompt variant; 1 is the generic prompt, 2 wraps the
            instructions in LLaMA chat-template tokens.
        i: key of the document in the module-level dict ``d``.

    Returns:
        The prompt string with the document text, its first table and the
        module-level ``relations`` list interpolated.

    Raises:
        ValueError: if ``num_prompt`` is neither 1 nor 2.
        KeyError: if ``i`` is not a key of ``d``.
    """
    # Look the document up from ``i`` instead of reading a leftover global loop
    # variable, so the prompt always describes the requested document.
    doc_text, doc_tables = d[i][0], d[i][1]
    # Only the first table is used; a document without any table gets an empty one.
    table_csv = table_to_csvlike(doc_tables[0]) if doc_tables else ""

    if num_prompt == 1:
        Prompt = f"""  
        You are an expert in Natural Language Processing (NLP) specializing in relation extraction.  
        Your task is to extract relations expressed strictly as triplets: (entity1, relation, entity2).  

        Constraints:
        - Both entity1 and entity2 must be valid named entities.  
        - At least one entity must come from the text and the other from the table.  
        - Only keep relations that are explicitly listed in the provided "Possible relation types".  
        - Ignore any relation not in this list.  
        - The extracted triplets must reflect connections valid, only when combining both sources (text + table), not when taken in isolation.  
        - If no valid triplet exists, return "NO_RELATION".  
        - Output must contain only the triplets in the required format. Do not include explanations, reasoning, or extra text.  

        Output Format (strict):  
        entity1, entity2: relation1 | entity3, entity4: relation2 | ...  

        
        Relation direction and semantics:
        
        - Use the conventional direction implied by the relation label. Examples if present in Possible relation types:
            . Acquired_by: e2 purchases controlling stake in e1. The relation is directed. The inverted relation is best described by the same relation type.
            . Brand of: e2 offers products or services of e1 (Brand). The relation is directed. The inverted relation is best described by the same relation type.
            . Client of: e1 uses (and presumably pays for) products or services offered by e2. The relation is directed. The inverted relation is best described by “Supplier of”.
            . Collaboration: e1 and e2 collaborate in (parts of their) business activities. The relation is undirected.
            . Competitor of: e1 competes for resources with e2. The relation is undirected.
            . Merged with: e1 and e2 merged (parts of) their business activities. The relation is undirected.
            . Product or service of: e1 is offered for commercial distribution by e2. The relation is directed. The inverted relation is best described by the same relation type.
            . Regulated by: e2 regulates (parts of) the business activity of e1. The relation is directed. The inverted relation is best described by the same relation type.
            . Shareholder of: e1 owns shares in e2. The relation is directed. The inverted relation is best described by the same relation type.
            . Subsidiary of: e2 legally owns e1. The relation is directed. The inverted relation is best described by “Parent of”.
            . Traded on: Shares of e1 are listed on e2 (Stock exchange). The relation is directed. The inverted relation is best described by “lists”.
        
        


        Available Data:  
        - Text segment: {doc_text}  
        - Table content:\n {table_csv}  
        - Possible relation types: [{relations}]  

        Your task:  
        1. Identify relations where one entity is in the text and the other in the table.  
        2. Keep only relations that match the provided list. 
"""

    elif num_prompt == 2: # intended for the LLaMA model
        Prompt = f"""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                As a Natural Language Processing (NLP) expert specializing in relation extraction.
        Your task is to identify and extract valid relations expressed as triplets (entity1, relation, entity2) 
        from both a given text segment and a table content.
        
        Constraints:
        - Both entity1 and entity2 must be valid named entities.
        - Only extract relations where one entity is from the text and the other is from the table.
        - Only use relations that are explicitly listed in the provided "Possible relation types".
        - If no valid relation exists, return "NO_RELATION".
        - The output must **only** contain the extracted triplets in the requested format, with no explanations, reasoning, or extra text.
        
        Output Format:
        entity1, entity2: relation1 | entity3, entity4: relation2 | ...
        
        <|eot_id|>
        
        <|start_header_id|>user<|end_header_id|>
        You are provided with:
        
        - Text segment: {doc_text}
        - Table content:\n {table_csv}
        - Possible relation types: [{relations}]
        
        Your task:
        1. Identify relations where at least one entity is in the text and the other in the table.
        2. Construct relation triplets combining entities from both sources.
        3. Only keep relations that match the provided list of relation types.
        4. Return the result strictly in the format: entity1, entity2: relation1 | entity3, entity4: relation2
        
        If no valid relation exists, return "NO_RELATION".
        
        <|eot_id|>
        
        <|start_header_id|>assistant<|end_header_id|>
        """
    else:
        # Fail with a clear message instead of an UnboundLocalError on ``Prompt``.
        raise ValueError(f"Unknown prompt variant: {num_prompt!r} (expected 1 or 2)")
    return Prompt
        

## 3.2 Few-shot Learning

### Prompt

In [53]:
# Parallel lists indexed by document position: gold triplet strings, raw texts
# and the CSV-like rendering of each document's first table.
trilets_gold, texts, tables = [], [], []
for k, v in d.items():
    doc_text, doc_tables, gold = v[0], v[1], v[2]
    trilets_gold.append(gold)
    texts.append(doc_text)
    tables.append(table_to_csvlike(doc_tables[0]))

In [54]:
def recupere_prompt(num_prompt, i):
    """Build the few-shot relation-extraction prompt for document ``i``.

    Args:
        num_prompt: prompt variant; 1 is the generic prompt with three worked
            examples, 2 is the LLaMA chat-template prompt.
        i: key of the document in the module-level dict ``d``.

    Returns:
        The prompt string. Variant 1 interpolates entries 1, 4 and 5 of the
        module-level lists ``texts``, ``tables`` and ``trilets_gold`` as
        examples, followed by the text and first table of document ``i``.

    Raises:
        ValueError: if ``num_prompt`` is neither 1 nor 2.
        KeyError: if ``i`` is not a key of ``d``.
    """
    # Look the document up from ``i`` instead of reading a leftover global loop
    # variable, so the prompt always describes the requested document.
    doc_text, doc_tables = d[i][0], d[i][1]
    # Only the first table is used; a document without any table gets an empty one.
    table_csv = table_to_csvlike(doc_tables[0]) if doc_tables else ""

    # NOTE(review): the examples are read from ``texts``/``tables``/``trilets_gold``
    # at call time; this function expects ``trilets_gold`` to still hold the raw
    # triplet strings. The example documents (1, 4, 5) appear to come from the same
    # corpus that is evaluated afterwards — confirm this is intended.
    if num_prompt == 1:
        Prompt = f"""  
        You are an expert in Natural Language Processing (NLP) specializing in relation extraction.  
        Your task is to extract relations expressed strictly as triplets: (entity1, relation, entity2).  

        * Constraints:
        - Both entity1 and entity2 must be valid named entities.  
        - At least one entity must come from the text and the other from the table.  
        - Only keep relations that are explicitly listed in the provided "Possible relation types".  
        - Ignore any relation not in this list.  
        - The extracted triplets must reflect connections valid, only when combining both sources (text + table), not when taken in isolation.  
        - If no valid triplet exists, return "NO_RELATION".  
        - Output must contain only the triplets in the required format. Do not include explanations, reasoning, or extra text.  

        * Output Format (strict):  
            entity1, entity2: relation1 | entity3, entity4: relation2 | ...  

        
        * Relation direction and semantics:
        
            - Use the conventional direction implied by the relation label. Examples if present in Possible relation types:
                . Acquired_by: e2 purchases controlling stake in e1. The relation is directed. The inverted relation is best described by the same relation type.
                . Brand of: e2 offers products or services of e1 (Brand). The relation is directed. The inverted relation is best described by the same relation type.
                . Client of: e1 uses (and presumably pays for) products or services offered by e2. The relation is directed. The inverted relation is best described by “Supplier of”.
                . Collaboration: e1 and e2 collaborate in (parts of their) business activities. The relation is undirected.
                . Competitor of: e1 competes for resources with e2. The relation is undirected.
                . Merged with: e1 and e2 merged (parts of) their business activities. The relation is undirected.
                . Product or service of: e1 is offered for commercial distribution by e2. The relation is directed. The inverted relation is best described by the same relation type.
                . Regulated by: e2 regulates (parts of) the business activity of e1. The relation is directed. The inverted relation is best described by the same relation type.
                . Shareholder of: e1 owns shares in e2. The relation is directed. The inverted relation is best described by the same relation type.
                . Subsidiary of: e2 legally owns e1. The relation is directed. The inverted relation is best described by “Parent of”.
                . Traded on: Shares of e1 are listed on e2 (Stock exchange). The relation is directed. The inverted relation is best described by “lists”.
            
        

        * Few-shot examples:
        
            - Example 1:
                . Text: {texts[1]}
                . Table:\n{tables[1]}
                . Output:\n{trilets_gold[1]}\n\n  
        
            - Example 2:
                . Text: {texts[4]}
                . Table:\n{tables[4]}
                . Output:\n{trilets_gold[4]} \n\n               
            - Example 3:
                . Text: {texts[5]}
                . Table:\n{tables[5]}
                . Output: NO_RELATION\n\n  
                
        Now process the new data
        
        - Text: {doc_text}
        - Table: {table_csv}
        - Output : """

    elif num_prompt == 2: # intended for the LLaMA model
        Prompt = f"""
          <|begin_of_text|><|start_header_id|>system<|end_header_id|>
            You are an NLP expert specializing in cross-source relation extraction. Your goal is to output only valid relation triplets (entity1, relation, entity2) that are jointly supported by a free-text passage and a tabular dataset.
            
            Inputs
            - Text
            - Table (CSV-like string)
            - Allowed relation types: [relations]
            
            Core requirements
            - Named entities only: entity1 and entity2 must be proper-noun entities (persons, organizations, companies, locations, products). Do not use generic/common nouns (e.g., "company", "city") or pure numbers/dates unless they are part of a named entity.
            - Cross-source constraint: each triplet must include at least one entity sourced from the text and at least one entity sourced from the table. If both entities appear in both sources, designate one as text-sourced and the other as table-sourced to satisfy the constraint.
            - Relation validity: use only labels from "Possible relation types". Respect semantic direction implied by the label (e.g., founded_by: subject=organization/company, object=person).
            - Evidence agreement: a triplet is valid only if (a) the text explicitly states or strongly implies the relation between the same two entities, and (b) the table contains those entities in the same row (across any columns). Table headers are not entities.
            - Matching and canonicalization:
              - Parse the first row as headers; subsequent rows are records.
              - Consider entity pairs formed within the same row across columns; do not form pairs using headers.
              - Match entities case-insensitively and after trimming whitespace.
              - When an entity appears in both sources, prefer the table cell’s spelling for output; otherwise, use the text surface form.
            - De-duplication and ordering: output each unique (entity1, relation, entity2) once. Sort triplets by relation, then entity1, then entity2 (case-insensitive) for deterministic output.
            
            Output format (strict)
            - If at least one valid triplet exists, output them on a single line:
              entity1, entity2: relation | entity3, entity4: relation | ...
            - Use ", " between entities, ": " before the relation, and " | " between triplets.
            - No trailing separator, no extra text, and no newline.
            - Escaping: if an entity contains a comma, colon, or pipe, wrap it in double quotes and escape embedded quotes by doubling them (e.g., "ACME, Inc.").
            - If no valid triplet exists, output exactly: NO_RELATION
            
            Procedure
            1) Extract named entities from the text.
            2) Parse the CSV-like table, collect cell values from each data row (ignore headers), and form candidate entity pairs within each row across columns.
            3) For each candidate pair, check whether the text expresses one of the allowed relations between the same two entities; assign the correct label and direction.
            4) Canonicalize entity strings, remove duplicates, sort (relation, entity1, entity2), and output in the strict format.
            
            Few-shot examples
            <|start_header_id|>user<|end_header_id|>
            \nExample 1:
            Text: 
            Table: 
            Relations: 
            Output: 
            
            Example 2:
            Text: 
            Table: 
            Relations: 
            Output: 
            
            \nExample 3:\n
            Text: 
            Table: 
            Relations: 
            Output: 
            
            Example 4 (no relation):
            Text: 
            Table: 
            Relations: 
            Output: NO_RELATION
            <|eot_id|>
            
            <|start_header_id|>user<|end_header_id|>
            Text: {doc_text}
            Table: {table_csv}
            Possible relation types: [{relations}]
            
            Return only the triplets in the strict format.
            <|eot_id|>
            <|start_header_id|>assistant<|end_header_id|>
            """
    else:
        # Fail with a clear message instead of an UnboundLocalError on ``Prompt``.
        raise ValueError(f"Unknown prompt variant: {num_prompt!r} (expected 1 or 2)")
    return Prompt
        

## 3.3 Lancer les prompts sur les LLMs

In [55]:
# Build one prompt (variant 1) per document of `d`, in document order.
# NOTE(review): keep the loop variable named `v` unless `recupere_prompt` is
# confirmed to look the document up from `i` itself; any version of that function
# that interpolates a global `v` depends on this binding.
demo_instructions=[]
for i, v in list(d.items()):
    prompt = recupere_prompt(1, i)
    demo_instructions.append(prompt)

In [11]:
#print(demo_instructions[0])

In [56]:
# Accumulator for the model responses of all batches; re-running this cell empties it.
resultatsF = []

In [111]:
# Query the model for one batch of prompts (indices 252 to 254 here) and keep
# the raw responses in `resultats`.
# NOTE(review): the range bounds are hard-coded; presumably earlier batches were
# produced by editing them and re-running this cell — confirm.
resultats = []
for inst in range(252,255):
    reponse = complete_and_print(demo_instructions[inst])
    resultats.append(reponse)

In [122]:
# Append the latest batch to the accumulated responses.
# Not idempotent: running this cell twice appends the same batch twice.
resultatsF = resultatsF + resultats

In [123]:
# Number of responses accumulated so far.
len(resultatsF)

255

### Save results

In [124]:
# Persist the raw responses, one per row, to a CSV file in the working directory.
df= pd.DataFrame(resultatsF)
df.to_csv("results_few-shot_DeepSeek.csv")

In [99]:
# Reload previously saved responses from a CSV file.
# NOTE(review): this reads "results_fewshot_Llama4.csv", not the
# "results_few-shot_DeepSeek.csv" file written by the save cell — confirm which run is wanted.
few_shot_result = pd.read_csv("results_fewshot_Llama4.csv")

In [100]:
# Replace the DataFrame by the plain list held in its "0" column
# (presumably the unnamed column of a DataFrame built from a flat list — verify).
# Not re-runnable: after this cell `few_shot_result` is a list, not a DataFrame.
few_shot_result= few_shot_result["0"].to_list()

In [70]:
# Extend the zero-shot responses with the accumulated batch results.
# NOTE(review): `zero_shot_result` is not defined in any cell visible in this
# notebook, so this cell relies on leftover kernel state.
zero_shot_result = zero_shot_result + resultatsF

#### Post-traitement (cas DeepSeek)

Ce script nettoie les réponses générées par le modèle DeepSeek en supprimant les balises de raisonnement internes (<think>...</think>) afin de ne conserver que la sortie finale pertinente pour l’évaluation.

In [125]:
def extract_final_output(text):
    """Strip the model's reasoning and return only its final answer.

    Removes every ``<think>...</think>`` block. Two malformed cases are also
    handled so that reasoning never reaches the triplet extraction:

    * a closing ``</think>`` without its opening tag: everything up to the
      last closing tag is dropped;
    * an opening ``<think>`` that is never closed (truncated generation):
      everything from the tag onwards is dropped.

    Args:
        text: raw model response.

    Returns:
        The remaining text, stripped of surrounding whitespace.
    """
    cleaned_text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

    if '</think>' in cleaned_text:
        cleaned_text = cleaned_text.rsplit('</think>', 1)[1]
    if '<think>' in cleaned_text:
        cleaned_text = cleaned_text.split('<think>', 1)[0]

    return cleaned_text.strip()

In [126]:
# Strip the reasoning blocks from every accumulated response.
few_shot_result = [extract_final_output(raw_answer) for raw_answer in resultatsF]

## Preprocessing results

In [127]:
#Ce script isole automatiquement les triplets valides (entité1 ; entité2 ; relation) à partir des réponses textuelles du modèle,
#en filtrant les explications ou textes parasites souvent générés par les LLMs.

import re

def extract_triplets(llm_output: str) -> str:
    """Pull ``entity1 <sep> entity2 <sep> relation`` triplets out of a model answer.

    Separators may be ",", ";" or ":". Each match is rewritten as
    "entity1; entity2; relation", duplicates are dropped (first occurrence
    wins) and the triplets are joined with " | ".

    Returns:
        The joined triplets, or "NO_RELATION" when nothing matches.
    """
    pattern = r'([A-Za-z0-9\s\(\)\.\-&]+)\s*[,;:]\s*([A-Za-z0-9\s\(\)\.\-&]+)\s*[,;:]\s*([A-Za-z_]+)'

    # A dict keeps insertion order, so it doubles as an ordered set.
    unique = {}
    for found in re.finditer(pattern, llm_output.strip()):
        formatted = "; ".join(group.strip() for group in found.groups())
        unique.setdefault(formatted, None)

    return " | ".join(unique) if unique else "NO_RELATION"

In [128]:
# Reduce every cleaned response to its " | "-joined triplets (or "NO_RELATION").
few_shot_resultF = [extract_triplets(answer) for answer in few_shot_result]

In [117]:
""" Cette fonction uniformise la structure des triplets extraits, en harmonisant la casse et la mise en forme (entité1 ; entité2 ; relation),
tout en filtrant les relations valides selon une liste prédéfinie."""
def extract_triplets_format(text, allowed_relations=None):
    """Normalise raw triplet text into ``"entity1; relation; entity2"`` strings.

    ``text`` is split on "|" and newlines. Each part is read either as a
    ";"-separated triplet (relation in second or third position) or as
    ``entity1, entity2: relation``. Entities and relations are lower-cased and
    stripped of spaces; parts whose relation is not allowed are dropped.

    Args:
        text: raw triplets, e.g. "A ; B ; client_of | C, D: traded_on".
        allowed_relations: accepted relation labels; defaults to the
            module-level ``relations`` list.

    Returns:
        A list of normalised triplet strings (empty for blank input). A
        ";"-separated part with fewer than three fields contributes the
        placeholder "x; NO_RELATION; x"; a part with more than three fields is
        ignored.
    """
    if allowed_relations is None:
        allowed_relations = relations

    triplets = []
    if not text.strip():
        return triplets

    def _norm(value):
        return value.strip().lower().replace(" ", "")

    for part in re.split(r"\||\n", text):
        part = part.strip()
        if not part:
            continue

        if ";" in part:
            elems = [p.strip() for p in part.split(";")]
            if len(elems) < 3:
                # Too few fields to be a triplet: emit the placeholder explicitly
                # rather than through a swallowed IndexError.
                triplets.append("x; NO_RELATION; x")
            elif len(elems) == 3:
                # The length check now guards both relation positions; it used
                # to bind to the first test only (``a and b or c``).
                first, second, third = elems
                if second.lower() in allowed_relations:
                    triplets.append(f"{_norm(first)}; {_norm(second)}; {_norm(third)}")
                elif third.lower() in allowed_relations:
                    triplets.append(f"{_norm(first)}; {_norm(third)}; {_norm(second)}")
            continue

        m = re.match(r"(.+?),\s*(.+?):\s*(\w+)", part)
        if m:
            e1, e2, rel = m.groups()
            # Case-insensitive, like the ";" branch.
            if rel.lower() in allowed_relations:
                triplets.append(f"{_norm(e1)}; {_norm(rel)}; {_norm(e2)}")
    return triplets



In [129]:
# Normalise the predicted triplets of every document into a list of
# "entity1; relation; entity2" strings.
few_shot_results = [extract_triplets_format(el) for el in few_shot_resultF]

In [119]:
# Rebuild the gold triplet strings from `d`, then normalise each one into a list
# of triplets with extract_triplets_format.
# After this cell `trilets_gold` holds lists instead of the raw strings, so code
# that interpolates `trilets_gold[...]` into a prompt sees a different value.
trilets_gold = []
for k, v in list(d.items()):
    trilets_gold.append(v[2])
    
trilets_gold = [extract_triplets_format(el) for el in trilets_gold]

In [106]:
# Drop the gold entry at position 140.
# NOTE(review): the reason is not visible in this notebook (presumably a document
# with no matching prediction — confirm). Not idempotent: every run removes one
# more entry and shifts the following ones.
del trilets_gold[140]

In [130]:
# Number of documents with normalised predictions.
len(few_shot_results)

255

# Evaluation

In [131]:
from Evaluation_triplets import evaluation_triplets

In [132]:
# Score the predictions against the gold triplets with the external
# evaluation_triplets helper; arguments are passed as (predictions, gold, relation labels).
# NOTE(review): the helper is defined outside this notebook — confirm that it
# expects both lists to be aligned document by document.
result = evaluation_triplets( few_shot_results,trilets_gold, relations)

In [133]:
# Show the evaluation scores returned by evaluation_triplets.
print(result)

{'Exact matching': {'precision': 0.656934306569343, 'recall': 0.24112525117213665, 'f1': 0.35276825085742286}, 'Partial matching (head+tail)': {'precision': 0.7235401459854015, 'recall': 0.2655726724715338, 'f1': 0.3885350318471338}, 'Partial matching (relation + 1 entity)': {'precision': 0.7436131386861314, 'recall': 0.2729403884795713, 'f1': 0.3993140617344439}}
