In [1]:
import json
import re
import os
import spacy
import random
import datasets
import torch
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig, BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pwd

'/projects/melodi/mettaleb/FinGPT/fingpt/FinGPT_Benchmark/data'

## Data Challenge

In [1]:
relations= ['acquired_by',
 'brand_of',
 'client_of',
 'collaboration',
 'competitor_of',
 'merged_with',
 'product_or_service_of',
 'regulated_by',
 'shareholder_of',
 'subsidiary_of',
 'traded_on',
 'undefined']

In [2]:
import json

with open("/projects/melodi/mettaleb/Annotation/corpus_challenge/test/F2_nous.json", "r", encoding="utf-8") as f:
    data = json.load(f)

d = {}

for idx, doc in enumerate(data.get("documents", [])):
    # ---- TEXTS ----
    texts = []
    extraction_meta = doc.get("raw", {}).get("_source", {}).get("extractionMetadata", [])
    for meta in extraction_meta:
        for t in meta.get("texts", []):
            texts.append(t.get("value", ""))
    texts = " ".join(texts).strip()

    # ---- TABLES ----
    tables = []
    for meta in extraction_meta:
        for tbl in meta.get("tables", []):
            tables.append({"tableData": tbl.get("tableData", [])})

    # ---- TRIPLETS ----
    triplets_list = []
    for ann in doc.get("annotations", []):
        subj = ann.get("subject", {}).get("annotationValue", "")
        obj = ann.get("object", {}).get("annotationValue", "")
        pred_val = ann.get("predicate", {}).get("entityValue", "")
        # on ignore si predicate == pertinence
        if pred_val.lower() == "pertinence":
            continue
        triplet_str = f"{subj} ; {obj} ; {pred_val}"
        triplets_list.append(triplet_str)

    triplets = " | ".join(triplets_list)

    d[idx] = [texts, tables, triplets]

for k, v in list(d.items())[:2]:
    print(f"Doc {k}:")
    print("  Text:", v[0][:150], "...")
    print("  Nb tables:", len(v[1]))
    print("  Triplets:", v[2])
    print("-" * 50)


Doc 0:
  Text: GV Filmsis an Indianfilm productionanddistributioncompany headed byIshari K. Ganesh. The firm had been a leading production studio in the Tamil film i ...
  Nb tables: 1
  Triplets: 
--------------------------------------------------
Doc 1:
  Text: Spanfeller Media Group(SMG), asubsidiaryof publishing companyTribune Publishing, is adigital mediacompany based inNew York City. It was founded in 201 ...
  Nb tables: 1
  Triplets: Tribune Publishing ; Spanfeller Media Group(SMG) ; subsidiary_of
--------------------------------------------------


In [6]:
Texts = []
Tables = []
Triplets = []
for k in d:
    Texts.append(d[k][0])
    Tables.append(d[k][1])
    Triplets.append(d[k][2])

In [9]:
text_train = Texts[:170]
text_test = Texts[170:]
tables_train = Tables[:170]
tables_test = Tables[170:]
triplets_train = Triplets[:170]
triplets_test = Triplets[170:]

In [12]:
with open("./mydata/train.tup", "w") as f:
    for triplet in triplets_train:
        f.write(f"{triplet}\n")
with open("./mydata/test.tup", "w") as f:
    for triplet in triplets_train:
        f.write(f"{triplet}\n")
with open("./mydata/train.sent", "w") as f:
    for text in text_train:
        text = text.strip().replace("\n","")
        f.write(f"{text}\n")
with open("./mydata/test.sent", "w") as f:
    for text in text_test:
        text = text.strip().replace("\n","")
        f.write(f"{text}\n")    

In [27]:
#xml_tables_train = []
xml_tables_test = []
for table_group in tables_test:
    for tbl in table_group:
        root = ET.Element("table")
        for row_data in tbl["tableData"]:
            row_el = ET.SubElement(root, "row")
            for cell_data in row_data:
                cell_el = ET.SubElement(row_el, "cell")
                cell_el.text = str(cell_data).strip()
        xml_str = ET.tostring(root, encoding="unicode")
        xml_tables_test.append(xml_str)


In [28]:
tables_train = xml_tables_train
tables_test = xml_tables_test

In [29]:
len(tables_train) , len(tables_test) 

(170, 85)

## Version data Challenge

In [3]:

Prompt = f"""  
    You are an expert in Natural Language Processing (NLP) specializing in relation extraction. Your task is to extract relations expressed as triplets: (entity1, relation, entity2).\n
    Ensuring that:
    - Both entity1 and entity2 are valid named entities.
    - Each identified relation must originate jointly from two sources: the text and the table.
    - The extracted triplets must reflect connections that are only valid when considering both sources together, not when taken in isolation.
    
    Available Data:
    - Text segment:
    - Table content: 
    - Possible relation types: [{relations}]
    
    Tasks:
    - Identify relations where at least one entity is found in the text and the other entity is found in the table.
    - Construct relation triplets combining entities from both sources.
    - Return triplets in the following standardized format: entity1, entity2: relation1 | entity3, entity4: relations2
    Output Expected: A set of triplets in the precise format above.
    """


In [4]:
print(Prompt)

  
    You are an expert in Natural Language Processing (NLP) specializing in relation extraction. Your task is to extract relations expressed as triplets: (entity1, relation, entity2).

    Ensuring that:
    - Both entity1 and entity2 are valid named entities.
    - Each identified relation must originate jointly from two sources: the text and the table.
    - The extracted triplets must reflect connections that are only valid when considering both sources together, not when taken in isolation.
    
    Available Data:
    - Text segment:
    - Table content: 
    - Possible relation types: [['acquired_by', 'brand_of', 'client_of', 'collaboration', 'competitor_of', 'merged_with', 'product_or_service_of', 'regulated_by', 'shareholder_of', 'subsidiary_of', 'traded_on', 'undefined']]
    
    Tasks:
    - Identify relations where at least one entity is found in the text and the other entity is found in the table.
    - Construct relation triplets combining entities from both sources.
  

In [42]:
def get_instruction(sent, table, tuples, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"""  
    You are an expert in Natural Language Processing (NLP) specializing in relation extraction. Your task is to extract relations expressed as triplets: (entity1, relation, entity2).\n
    Ensuring that:
    - Both entity1 and entity2 are valid named entities.
    - Each identified relation must originate jointly from two sources: the text and the table.
    - The extracted triplets must reflect connections that are only valid when considering both sources together, not when taken in isolation.
    
    Available Data:
    - Text segment
    - Table content
    - Possible relation types: [{relations}]
    
    Tasks:
    - Identify relations where at least one entity is found in the text and the other entity is found in the table.
    - Construct relation triplets combining entities from both sources.
    - Return triplets in the following standardized format: entity1, entity2: relation1 | entity3, entity4: relations2
    Output Expected: A set of triplets in the precise format above.
    """)
        inp = "Text segment: "+sent + ".\n" + "Table content: "+str(table)
        inputs.extend([inp])
        #inputs.extend([table])
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" if len(tup) >= 3 else "" for tup in tuples])])
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"What is the relationship between {tup[0]} and {tup[1]} in the context of the input sentence. Choose an answer from: {'; '.join(relations)}.")
            #instructions.append(f"""Analyze the following sentence and identify the relationship between the two mentioned entities. The relationship must be selected from the predefined list below. If none of the relationships apply, respond with "undefined".\nEntité 1 : {tup[0]}.\n \nEntité 2 : {tup[1]}.\nRelations : {'; '.join(relations)}.""")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(tup_file, paragraphes, tables, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []
    with open(paragraphes) as f:
        sentences = [s.strip() for s in f.readlines()]
    print(f"{len(sentences)} phrases")
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
    print(f"{len(tuples_list)} tuples")
        
    for sent, tuples, table in zip(sentences, tuples_list, tables):      
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        ins, i, o = get_instruction( sent, table, tuples, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
    print("longuer = ",len(outputs))
    print("exemple output : ",outputs[0])
    print("exemple input : ",inputs[0])
    print("instructions : ",instructions[0])
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [43]:
train_dataset = get_finred_dataset('mydata/train.tup','mydata/train.sent',tables_train,  with_orig=True, with_cls=False)


170 phrases
170 tuples
longuer =  170
exemple output :  
exemple input :  Text segment: GV Filmsis an Indianfilm productionanddistributioncompany headed byIshari K. Ganesh. The firm had been a leading production studio in the Tamil film industry in the 1990s and had been founded byG. Venkateswaranas Sujatha Films in 1986.[1][2].Sujatha Films was set up in 1986 byG. Venkateswaran, a chartered accountant, as a film production and distribution company. Operating as a family production house, Venkateswaran's brotherMani Ratnamalso often assisted on the production work of films that he directed for the studio.[3]Sujatha Films became GV Films as it became the first publicly listed company from the Indian media industry in 1989.[4].Following Venkateswaran's death, the studio continued to produce media content under the same name. Notably, actressManisha Koiralawas briefly a board member as the studio attempted to make a comeback through Hindi film content and 3D television serials.[13]The stu

In [44]:
test_dataset = get_finred_dataset('mydata/test.tup','mydata/test.sent',tables_test,  with_orig=True, with_cls=False)

85 phrases
170 tuples
longuer =  85
exemple output :  
exemple input :  Text segment: Phase 4 Filmswas a Canadianfilm distributioncompany headquartered in Toronto. It had two branches in theU.S.:Los Angeles, CaliforniaandFort Mill, South Carolina. Its subsidiary, Kaboom! Entertainment markets children's entertainment with companies such asCorus Entertainment..Phase 4 Films traces its history to Telegenic, a family-oriented film distributor that was founded in 1996. Berry Meyerowitz purchased Telegenic in 2000 and renamed it as "Kaboom! Entertainment". In 2006,Peace Arch EntertainmentGroup, which later merged with ContentFilm, purchased Kaboom!. Berry Meyerowitz founded Phase 4 Films in April 2009 when he bought back their North American distribution business.[1].
Table content: <table><row><cell>0</cell><cell>1</cell></row><row><cell /><cell /></row><row><cell>Company type</cell><cell>Subsidiary</cell></row><row><cell>Industry</cell><cell>Film Animation</cell></row><row><cell>Predecess

In [45]:
finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (1/1 shards): 100%|██████████| 170/170 [00:00<00:00, 2419.17 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85/85 [00:00<00:00, 5321.93 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 170
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 85
    })
})

## CORE Paragraphes

In [16]:
with open("/projects/melodi/mettaleb/Annotation/120_paragraphs.txt") as f:
    paragraphes = f.readlines()
with open("/projects/melodi/mettaleb/Annotation/Labelsmodifier.txt") as f:
    Labels = f.readlines()

In [36]:
len(Labels)

120

In [59]:
paragraphesF = []
LabelF = []
for i in range(len(Labels)):
    if "pas_de_label" not in Labels[i]:
        list_labels =  Labels[i].strip().replace('\n','').replace('"','').split(";")
        list_labels = [label for label in list_labels if label]
        for j in range(len(list_labels)):
            paragraphesF.append(paragraphes[i].strip())
            LabelF.append(list_labels[j])

In [60]:
len(paragraphesF), len(LabelF)

(553, 553)

In [61]:
LabelF[:15]

['the Kansas City Southern,The Baton Rouge Southern Railroad:client_of',
 'The Baton Rouge Southern Railroad,Watco:shareholder_of',
 'Energy Access Ventures,the company:shareholder_of',
 'Blue Haven Initiative,the company:shareholder_of',
 'Investisseurs & Partenaires,the company:shareholder_of',
 'ENGIE Rassembleurs d’Energies,the company:shareholder_of',
 'Impact Assets,the company:shareholder_of',
 'Acumen,the company:shareholder_of',
 'PCG Investments,the company:shareholder_of',
 'Blue-Tongue Films,Evermore:client_of',
 'Blue-Tongue Films,The Veronicas:client_of',
 'Blue-Tongue Films,Empire of the Sun:client_of',
 'Blue-Tongue Films,Rahzel:client_of',
 'Animal Kingdom,Blue-Tongue Films:product_or_service_of',
 'Hesher,Blue-Tongue Films:product_or_service_of']

In [62]:
with open("./mydata/116train667.sent", "w") as file:
    for i in range(len(paragraphesF[:400])):
        file.write(f"{paragraphesF[i]}\n")
with open("./mydata/116test667.sent", "w") as file:
    for i in range(400,len(paragraphesF)):
        file.write(f"{paragraphesF[i]}\n")
with open("./mydata/116train667.tup", "w") as file:
    for i in range(len(LabelF[:400])):
        label = LabelF[i].replace(",", " ; ").replace(":", " ; ")
        file.write(f"{label}\n")
with open("./mydata/116test667.tup", "w") as file:
    for i in range(400,len(LabelF)):
        label = LabelF[i].replace(",", " ; ").replace(":", " ; ")
        file.write(f"{label}\n")

In [76]:
#with open('mydata/relations.txt') as f:
#relations = [r.strip() for r in set(df_train['relation'].to_list())]

    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Options: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations include: {'; '.join(relations)}.")
        instructions.append(f"Provide a paragraph containing relationships between entities. Extract and identify the specific relations between entity pairs mentioned in the paragraph. Output the results in the format \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations should be determined based on the context provided in the paragraph. Relations include: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"""What is the relationship between '{{{tup[0]}}}' and '{{{tup[1]}}}' in the context of the input sentence. Choose an answer from: {{{'; '.join(relations)}}}.\nOutput the results in the format: {{relation}}""")
            #instructions.append(f"""Analyze the following sentence and identify the relationship between the two mentioned entities. The relationship must be selected from the predefined list below. If none of the relationships apply, respond with "undefined".\nEntité 1 : {tup[0]}.\n \nEntité 2 : {tup[1]}.\nRelations : {'; '.join(relations)}.""")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
        
    for sent, tuples in zip(sentences, tuples_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        
        ins, i, o = get_instruction(sent, tuples, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:10])
    print("exemple : ",instructions[:2])
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [77]:
train_dataset = get_finred_dataset('mydata/116train667.sent', 'mydata/116train667.tup', with_orig=False, with_cls=True)


longuer =  400
exemple :  ['client_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'client_of']
exemple :  ["What is the relationship between '{the Kansas City Southern}' and '{The Baton Rouge Southern Railroad}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}", "What is the relationship between '{The Baton Rouge Southern Railroad}' and '{Watco}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}"]


In [78]:
test_dataset = get_finred_dataset('mydata/116test667.sent', 'mydata/116test667.tup', with_orig=False, with_cls=True)


longuer =  153
exemple :  ['product_or_service_of', 'product_or_service_of', 'traded_on', 'traded_on', 'traded_on', 'collaboration', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'product_or_service_of']
exemple :  ["What is the relationship between '{Coins}' and '{Coins ’N Things}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}", "What is the relationship between '{silver bullion coins}' and '{Coins ’N Things}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}"]


In [79]:

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (1/1 shards): 100%|██████████| 400/400 [00:00<00:00, 29718.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 153/153 [00:00<00:00, 14160.88 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 400
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 153
    })
})