In [3]:
import json
import re
import os
import spacy
import random
import datasets
import torch
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig, BertTokenizer, BertModel

In [4]:
pwd

'/projects/melodi/mettaleb/FinGPT/fingpt/FinGPT_Benchmark/data'

# Prepare my data finRED

## CORE Phrases

In [5]:
nlp = spacy.load("en_core_web_lg")
def preprocess_text(text):
    processed_words = []
    text = text.replace('\n', '.')
    text = text.replace('\xa0', ' ')
    text = text.replace("\\xa"," ")
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\/.*?\/', ' ', text)
    text = text.replace('\\', '')
    text = ' '.join(re.findall(r'\w+', text))
    doc = nlp(text)
    for token in doc:
        if not token.is_space and not token.is_punct:
            lemma = token.lemma_
            if len(lemma)>1:
                processed_words.append(lemma.lower())
    processed_text = ' '.join(processed_words)
    return processed_text

In [3]:
with open("/projects/melodi/mettaleb/CORE/data/train.json", "r", encoding="utf-8") as file:
    data = json.load(file)
df_train = pd.DataFrame(data)
df_train["text"] = [" ".join(df_train['context'].to_list()[i]) for i in range(len(df_train['context'].to_list()))]
with open("/projects/melodi/mettaleb/CORE/data/test.json", "r", encoding="utf-8") as file:
    data = json.load(file)
df_test = pd.DataFrame(data)
df_test["text"] = [" ".join(df_test['context'].to_list()[i]) for i in range(len(df_test['context'].to_list()))]
df_train.head()

Unnamed: 0,id,relation,invert_relation,e1_start,e1_end,e2_start,e2_end,e1_name,e2_name,context,text
0,E8026501,shareholder_of,1,0,0,15,17,Badgeville,Norwest Venture Partners,"[Badgeville, subsequently, raised, a, $, 12M, ...",Badgeville subsequently raised a $ 12M Series ...
1,E8034594,competitor_of,0,0,0,3,3,Bioverativ Inc.,Baxalta,"[Bioverativ, competes, with, Baxalta, (, acqui...",Bioverativ competes with Baxalta ( acquired by...
2,E8029931,client_of,1,0,0,12,14,Baton Rouge Southern Railroad,Kansas City Southern,"[It, also, serves, as, a, switching, and, car,...",It also serves as a switching and car storage ...
3,E8202016,shareholder_of,1,3,4,36,38,PEG Africa Ltd.,Blue Haven Initiative,"[In, 2017, ,, the, company, raised, a, further...","In 2017 , the company raised a further $ 13.5 ..."
4,E8110590,product_or_service_of,1,10,14,34,35,"Lewis Galoob Toys, Inc.",Micro Machines,"[Products, Toys, ,, video, games, ,, consumer,...","Products Toys , video games , consumer electro..."


In [4]:
df_test.shape

(708, 11)

## EDA : Easy Data Augmentation

In [4]:
df2 = pd.read_csv("/projects/melodi/mettaleb/eda_nlp/data/eda_core.txt", sep="\t")
df2.head()

Unnamed: 0,id,schoolbook
0,id,text
1,E8026501,series round norwest stake partners
2,E8026501,badgeville subsequently raised a m series b ro...
3,E8034594,bioverativ competes baxalta shire horse pfizer
4,E8034594,bioverativ competes with baxalta acquired by s...


In [5]:
merged_df = pd.merge(df_train, df2, on='id')
merged_df.head()

Unnamed: 0,id,relation,invert_relation,e1_start,e1_end,e2_start,e2_end,e1_name,e2_name,context,text,schoolbook
0,E8026501,shareholder_of,1,0,0,15,17,Badgeville,Norwest Venture Partners,"[Badgeville, subsequently, raised, a, $, 12M, ...",Badgeville subsequently raised a $ 12M Series ...,series round norwest stake partners
1,E8026501,shareholder_of,1,0,0,15,17,Badgeville,Norwest Venture Partners,"[Badgeville, subsequently, raised, a, $, 12M, ...",Badgeville subsequently raised a $ 12M Series ...,badgeville subsequently raised a m series b ro...
2,E8034594,competitor_of,0,0,0,3,3,Bioverativ Inc.,Baxalta,"[Bioverativ, competes, with, Baxalta, (, acqui...",Bioverativ competes with Baxalta ( acquired by...,bioverativ competes baxalta shire horse pfizer
3,E8034594,competitor_of,0,0,0,3,3,Bioverativ Inc.,Baxalta,"[Bioverativ, competes, with, Baxalta, (, acqui...",Bioverativ competes with Baxalta ( acquired by...,bioverativ competes with baxalta acquired by s...
4,E8029931,client_of,1,0,0,12,14,Baton Rouge Southern Railroad,Kansas City Southern,"[It, also, serves, as, a, switching, and, car,...",It also serves as a switching and car storage ...,switching car storage adroitness southern


In [6]:
merged_df = pd.merge(df_train, df2, on='id')

# Renommer les colonnes pour plus de clarté
merged_df = merged_df.rename(columns={'text': 'original_text', 'schoolbook': 'text'})
del merged_df['original_text']
# Afficher le résultat final
df_train = pd.concat([df_train, merged_df], ignore_index=True)


In [7]:
df_train.head()

Unnamed: 0,id,relation,invert_relation,e1_start,e1_end,e2_start,e2_end,e1_name,e2_name,context,text
0,E8026501,shareholder_of,1,0,0,15,17,Badgeville,Norwest Venture Partners,"[Badgeville, subsequently, raised, a, $, 12M, ...",Badgeville subsequently raised a $ 12M Series ...
1,E8034594,competitor_of,0,0,0,3,3,Bioverativ Inc.,Baxalta,"[Bioverativ, competes, with, Baxalta, (, acqui...",Bioverativ competes with Baxalta ( acquired by...
2,E8029931,client_of,1,0,0,12,14,Baton Rouge Southern Railroad,Kansas City Southern,"[It, also, serves, as, a, switching, and, car,...",It also serves as a switching and car storage ...
3,E8202016,shareholder_of,1,3,4,36,38,PEG Africa Ltd.,Blue Haven Initiative,"[In, 2017, ,, the, company, raised, a, further...","In 2017 , the company raised a further $ 13.5 ..."
4,E8110590,product_or_service_of,1,10,14,34,35,"Lewis Galoob Toys, Inc.",Micro Machines,"[Products, Toys, ,, video, games, ,, consumer,...","Products Toys , video games , consumer electro..."


In [9]:
df_train.shape

(12000, 11)

In [5]:
l_actor1 = df_test['e1_name'].to_list()
l_actor2 = df_test['e2_name'].to_list()
l_relation = df_test['relation'].to_list()
with open("./mydata/test.tup", "w") as file:
    for i in range(len(l_actor1)):
        file.write(f"{l_actor1[i]} ; {l_actor2[i]} ; {l_relation[i]}\n")
        #file.write(f"{l_relation[i].strip()}\n")
        
l_actor1 = df_train['e1_name'].to_list()
l_actor2 = df_train['e2_name'].to_list()
l_relation = df_train['relation'].to_list()
with open("./mydata/train.tup", "w") as file:
    for i in range(len(l_actor1)):
        file.write(f"{l_actor1[i]} ; {l_actor2[i]} ; {l_relation[i]}\n")
        #file.write(f"{l_relation[i].strip()}\n")

In [6]:
with open("./mydata/train.sent", "w") as file:
    for sentence in df_train["text"].to_list():
        file.write(f"{sentence}\n")
with open("./mydata/test.sent", "w") as file:
    for sentence in df_test["text"].to_list():
        file.write(f"{sentence}\n")

In [7]:
set(df_train['relation'].to_list())

{'acquired_by',
 'brand_of',
 'client_of',
 'collaboration',
 'competitor_of',
 'merged_with',
 'product_or_service_of',
 'regulated_by',
 'shareholder_of',
 'subsidiary_of',
 'traded_on',
 'undefined'}

In [8]:
pwd

'/projects/melodi/mettaleb/FinGPT/fingpt/FinGPT_Benchmark/data'

In [9]:
relations= ['acquired_by',
 'brand_of',
 'client_of',
 'collaboration',
 'competitor_of',
 'merged_with',
 'product_or_service_of',
 'regulated_by',
 'shareholder_of',
 'subsidiary_of',
 'traded_on',
 'undefined']

In [10]:
#with open('mydata/relations.txt') as f:
#relations = [r.strip() for r in set(df_train['relation'].to_list())]

    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Options: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations include: {'; '.join(relations)}.")
        instructions.append(f"Provide a paragraph containing relationships between entities. Extract and identify the specific relations between entity pairs mentioned in the paragraph. Output the results in the format \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations should be determined based on the context provided in the paragraph. Relations include: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"What is the relationship between {tup[0]} and {tup[1]} in the context of the input sentence. Choose an answer from: {'; '.join(relations)}.")
            #instructions.append(f"""Analyze the following sentence and identify the relationship between the two mentioned entities. The relationship must be selected from the predefined list below. If none of the relationships apply, respond with "undefined".\nEntité 1 : {tup[0]}.\n \nEntité 2 : {tup[1]}.\nRelations : {'; '.join(relations)}.""")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
        
    for sent, tuples in zip(sentences, tuples_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        
        ins, i, o = get_instruction(sent, tuples, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:10])
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

## with multiple instruction 

In [19]:
#with open('mydata/relations.txt') as f:
#relations = [r.strip() for r in set(df_train['relation'].to_list())]

    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given phrases that describe the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"relation1: word1, word2; relation2: word3, word4\". Options: {', '.join(relations)}. ")
        instructions.append(f"Given the input sentence, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"relation1: word1, word2; relation2: word3, word4\". Relations include: {'; '.join(relations)}.")
        instructions.append(f"Provide a paragraph containing relationships between entities. Extract and identify the specific relations between entity pairs mentioned in the paragraph. Output the results in the format \'relation1: entity1, entity2; relation2: entity3, entity4\'. Relations should be determined based on the context provided in the paragraph. Relations include: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            #instructions.append(f"Refer to the input text as context and select the correct relationship between '{tup[0]}' and '{tup[1]}' from the available options.\nOptions: {', '.join(relations)}")
            #instructions.append(f"Refer to the input text as context and select the correct relationship between '{tup[0]}' and '{tup[1]}' from the available options.\nOptions: {', '.join(relations)}")
            #instructions.append(f"Take context from the input text and decide on the accurate relationship between '{tup[0]}' and '{tup[1]}' from the options provided.\nOptions: {', '.join(relations)}")
            #instructions.append(f"What is the relationship between '{tup[0]}' and '{tup[1]}' in the context of the input sentence.\nOptions: {', '.join(relations)}")
            instructions.append(f"In the context of the input sentence, determine the relationship between '{tup[0]}' and '{tup[1]}'.\nOptions: {', '.join(relations)}")
            #instructions.append(f"Analyze the relationship between '{tup[0]}' and '{tup[1]}' within the context of the input sentence.\nOptions: {', '.join(relations)}")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
    cpt=1   
    for sent, tuples in zip(sentences, tuples_list):
        #print(cpt)
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        
        ins, i, o = get_instruction(sent, tuples, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
        cpt+=1
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:10])
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

### Core

In [11]:
train_dataset = get_finred_dataset('mydata/train.sent', 'mydata/train.tup', with_orig=False, with_cls=True)


longuer =  4000
exemple :  ['shareholder_of', 'competitor_of', 'client_of', 'shareholder_of', 'product_or_service_of', 'product_or_service_of', 'collaboration', 'acquired_by', 'client_of', 'subsidiary_of']


In [12]:
test_dataset = get_finred_dataset('mydata/test.sent', 'mydata/test.tup', with_orig=False, with_cls=True)


longuer =  708
exemple :  ['undefined', 'product_or_service_of', 'shareholder_of', 'collaboration', 'subsidiary_of', 'subsidiary_of', 'client_of', 'product_or_service_of', 'undefined', 'competitor_of']


In [13]:

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (1/1 shards): 100%|██████████| 4000/4000 [00:00<00:00, 184722.28 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 708/708 [00:00<00:00, 62461.98 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 708
    })
})

In [14]:
train_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 4000
})

## CORE Paragraphes

In [16]:
with open("/projects/melodi/mettaleb/Annotation/120_paragraphs.txt") as f:
    paragraphes = f.readlines()
with open("/projects/melodi/mettaleb/Annotation/Labelsmodifier.txt") as f:
    Labels = f.readlines()

In [36]:
len(Labels)

120

In [59]:
paragraphesF = []
LabelF = []
for i in range(len(Labels)):
    if "pas_de_label" not in Labels[i]:
        list_labels =  Labels[i].strip().replace('\n','').replace('"','').split(";")
        list_labels = [label for label in list_labels if label]
        for j in range(len(list_labels)):
            paragraphesF.append(paragraphes[i].strip())
            LabelF.append(list_labels[j])

In [60]:
len(paragraphesF), len(LabelF)

(553, 553)

In [61]:
LabelF[:15]

['the Kansas City Southern,The Baton Rouge Southern Railroad:client_of',
 'The Baton Rouge Southern Railroad,Watco:shareholder_of',
 'Energy Access Ventures,the company:shareholder_of',
 'Blue Haven Initiative,the company:shareholder_of',
 'Investisseurs & Partenaires,the company:shareholder_of',
 'ENGIE Rassembleurs d’Energies,the company:shareholder_of',
 'Impact Assets,the company:shareholder_of',
 'Acumen,the company:shareholder_of',
 'PCG Investments,the company:shareholder_of',
 'Blue-Tongue Films,Evermore:client_of',
 'Blue-Tongue Films,The Veronicas:client_of',
 'Blue-Tongue Films,Empire of the Sun:client_of',
 'Blue-Tongue Films,Rahzel:client_of',
 'Animal Kingdom,Blue-Tongue Films:product_or_service_of',
 'Hesher,Blue-Tongue Films:product_or_service_of']

In [62]:
with open("./mydata/116train667.sent", "w") as file:
    for i in range(len(paragraphesF[:400])):
        file.write(f"{paragraphesF[i]}\n")
with open("./mydata/116test667.sent", "w") as file:
    for i in range(400,len(paragraphesF)):
        file.write(f"{paragraphesF[i]}\n")
with open("./mydata/116train667.tup", "w") as file:
    for i in range(len(LabelF[:400])):
        label = LabelF[i].replace(",", " ; ").replace(":", " ; ")
        file.write(f"{label}\n")
with open("./mydata/116test667.tup", "w") as file:
    for i in range(400,len(LabelF)):
        label = LabelF[i].replace(",", " ; ").replace(":", " ; ")
        file.write(f"{label}\n")

In [76]:
#with open('mydata/relations.txt') as f:
#relations = [r.strip() for r in set(df_train['relation'].to_list())]

    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Options: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations include: {'; '.join(relations)}.")
        instructions.append(f"Provide a paragraph containing relationships between entities. Extract and identify the specific relations between entity pairs mentioned in the paragraph. Output the results in the format \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations should be determined based on the context provided in the paragraph. Relations include: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"""What is the relationship between '{{{tup[0]}}}' and '{{{tup[1]}}}' in the context of the input sentence. Choose an answer from: {{{'; '.join(relations)}}}.\nOutput the results in the format: {{relation}}""")
            #instructions.append(f"""Analyze the following sentence and identify the relationship between the two mentioned entities. The relationship must be selected from the predefined list below. If none of the relationships apply, respond with "undefined".\nEntité 1 : {tup[0]}.\n \nEntité 2 : {tup[1]}.\nRelations : {'; '.join(relations)}.""")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
        
    for sent, tuples in zip(sentences, tuples_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        
        ins, i, o = get_instruction(sent, tuples, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:10])
    print("exemple : ",instructions[:2])
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [77]:
train_dataset = get_finred_dataset('mydata/116train667.sent', 'mydata/116train667.tup', with_orig=False, with_cls=True)


longuer =  400
exemple :  ['client_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'client_of']
exemple :  ["What is the relationship between '{the Kansas City Southern}' and '{The Baton Rouge Southern Railroad}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}", "What is the relationship between '{The Baton Rouge Southern Railroad}' and '{Watco}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}"]


In [78]:
test_dataset = get_finred_dataset('mydata/116test667.sent', 'mydata/116test667.tup', with_orig=False, with_cls=True)


longuer =  153
exemple :  ['product_or_service_of', 'product_or_service_of', 'traded_on', 'traded_on', 'traded_on', 'collaboration', 'shareholder_of', 'shareholder_of', 'shareholder_of', 'product_or_service_of']
exemple :  ["What is the relationship between '{Coins}' and '{Coins ’N Things}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}", "What is the relationship between '{silver bullion coins}' and '{Coins ’N Things}' in the context of the input sentence. Choose an answer from: {acquired_by; brand_of; client_of; collaboration; competitor_of; merged_with; product_or_service_of; regulated_by; shareholder_of; subsidiary_of; traded_on; undefined}.\nOutput the results in the format: {relation}"]


In [79]:

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (1/1 shards): 100%|██████████| 400/400 [00:00<00:00, 29718.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 153/153 [00:00<00:00, 14160.88 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 400
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 153
    })
})

## TextMine

### Strategie 1 : 
 * Donner le paragraphe , toutes les entities (avec ou **SANS** les types des entities), il doit tourver la relation.

In [2]:
    
def get_instruction(sent, tuples, entities, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two entities, extract All entities pair and the corresponding lexical relationship between them from the input paragraph. The output format should be \"{{relation1: entity1, entity2}}; {{relation2: entity3, entity4}}\". Entities must be in this list: {', '.join(entities)}. Relations must be in this list: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract All entities pair containing a certain relation in the paragraph according to the following relation types, in the format of \"{{relation1: entity1, entity2}}; {{relation2: entity3, entity}}\". Entities must be in this list: {', '.join(entities)}. Relations should be include: {'; '.join(relations)}.")
        instructions.append(f"I Provide you with paragraph containing entities. Generate the relations between entity pairs from the following list: {', '.join(entities)}. Output the results in the format \"{{relation1: entity1, entity}}; {{relation2: entity3, entity4}}\". Relations should be determined based on the context provided in the paragraph. Relations must be in this list: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, and choose the correct relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"What is the relationship between {tup[0]} and {tup[1]} in the context of the input sentence. Choose an answer from: {'; '.join(relations)}.")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs

def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]

    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
    
    entities_list=[]
    with open(tup_file) as f:
        for ligne in f.readlines():
            triplets = ligne.split(' | ')
            entities_set = set()
            for triplet in triplets:
                entity1, entity2, _ = triplet.split(' ; ')
                entities_set.add(entity1)
                entities_set.add(entity2)
            entities_list.append(entities_set)    
    cpt=1   
    for sent, tuples, entities in zip(sentences, tuples_list, entities_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        ins, i, o = get_instruction(sent, tuples, entities, with_orig, with_cls)
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
        cpt+=1
    #print(entities)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:1])
    for i in range(len(outputs)):
        if not outputs[i]:
            print(f"l'output {i} est vide")
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [73]:
print(relat)

['RESIDES_IN', 'HAS_CONTROL_OVER', 'IS_COOPERATING_WITH', 'IS_BORN_ON', 'HAS_LONGITUDE', 'HAS_LATITUDE', 'IS_OF_SIZE', 'IS_BORN_IN', 'IS_PART_OF', 'IS_AT_ODDS_WITH', 'IS_LOCATED_IN', 'STARTED_IN', 'IS_OF_NATIONALITY', 'HAS_COLOR', 'WEIGHS', 'IS_REGISTERED_AS', 'START_DATE', 'INJURED_NUMBER', 'DIED_IN', 'DEATHS_NUMBER', 'CREATED', 'GENDER_FEMALE', 'INITIATED', 'HAS_FOR_HEIGHT', 'HAS_FOR_LENGTH', 'HAS_CATEGORY', 'HAS_CONSEQUENCE', 'WAS_CREATED_IN', 'HAS_FAMILY_RELATIONSHIP', 'OPERATES_IN', 'WAS_DISSOLVED_IN', 'IS_DEAD_ON', 'HAS_QUANTITY', 'HAS_FOR_WIDTH', 'END_DATE', 'IS_IN_CONTACT_WITH', 'GENDER_MALE']


In [76]:
len(relat)

37

In [7]:
relations = ['RESIDES_IN', 'HAS_CONTROL_OVER', 'IS_COOPERATING_WITH', 'IS_BORN_ON', 'HAS_LONGITUDE', 'HAS_LATITUDE', 'IS_OF_SIZE', 'IS_BORN_IN', 'IS_PART_OF', 'IS_AT_ODDS_WITH', 'IS_LOCATED_IN', 'STARTED_IN', 'IS_OF_NATIONALITY', 'HAS_COLOR', 'WEIGHS', 'IS_REGISTERED_AS', 'START_DATE', 'INJURED_NUMBER', 'DIED_IN', 'DEATHS_NUMBER', 'CREATED', 'GENDER_FEMALE', 'INITIATED', 'HAS_FOR_HEIGHT', 'HAS_FOR_LENGTH', 'HAS_CATEGORY', 'HAS_CONSEQUENCE', 'WAS_CREATED_IN', 'HAS_FAMILY_RELATIONSHIP', 'OPERATES_IN', 'WAS_DISSOLVED_IN', 'IS_DEAD_ON', 'HAS_QUANTITY', 'HAS_FOR_WIDTH', 'END_DATE', 'IS_IN_CONTACT_WITH', 'GENDER_MALE']

In [8]:
relations_trad = {'RESIDES_IN': 'RÉSIDE_DANS','IS_OF_SIZE': 'EST_DE_TAILLE',    'IS_BORN_ON': 'EST_NÉ_LE',    'CREATED': 'CRÉÉ',    'HAS_CONSEQUENCE': 'A_DES_CONSÉQUENCES',    'HAS_FOR_LENGTH': 'A_POUR_LONGUEUR',    'DIED_IN': 'MORT_EN',    'START_DATE': 'DATE_DE_DÉBUT',    'INITIATED': 'INITIÉ',    'HAS_CATEGORY': 'A_UNE_CATEGORIE',    'HAS_LATITUDE': 'A_LA_LATITUDE',    'GENDER_FEMALE': 'GENRE_FEMININ',    'DEATHS_NUMBER': 'NOMBRE_DE_MORTS',    'GENDER_MALE': 'GENRE_MASCULIN',    'IS_PART_OF': 'FAIT_PARTIE_DE',    'WEIGHS': 'PESENT',    'IS_REGISTERED_AS': 'EST_ENREGISTRÉ_COMME',    'HAS_QUANTITY': 'A_UNE_QUANTITÉ',    'IS_OF_NATIONALITY': 'EST_DE_NATIONALITÉ',    'INJURED_NUMBER': 'NOMBRE_DE_BLESSÉS',    'END_DATE': 'DATE_DE_FIN',    'HAS_CONTROL_OVER': 'A_UNE_CE_CONTROLE_SUR',    'IS_COOPERATING_WITH': 'COOPÈRE_AVEC',    'IS_BORN_IN': 'EST_NÉ_A',    'HAS_FOR_WIDTH': 'A_POUR_LARGEUR',    'IS_AT_ODDS_WITH': 'EST_EN_CONFLIT_AVEC',    'HAS_COLOR': 'A_COULEUR',    'HAS_FAMILY_RELATIONSHIP': 'A_UNE_RELATION_FAMILIALE',    'WAS_DISSOLVED_IN': 'A_ÉTÉ_DISSOUTE_EN',    'HAS_FOR_HEIGHT': 'A_POUR_HAUTEUR',    'IS_DEAD_ON': 'EST_MORT_LE',    'STARTED_IN': 'A_COMMENCÉ_EN',    'OPERATES_IN': 'OPÈRE_EN',    'IS_LOCATED_IN': 'EST_LOCALISÉ_EN',    'WAS_CREATED_IN': 'A_ÉTÉ_CRÉÉ_EN',    'HAS_LONGITUDE': 'A_LA_LONGITUDE','IS_IN_CONTACT_WITH': 'EST_EN_CONTACT_AVEC', 'PAS_DE_RELATION': 'PAS_DE_RELATION'}

In [None]:
train_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/train.sent', '/projects/melodi/mettaleb/Textmine/train_small.tup', with_orig=True, with_cls=False)
test_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/test.sent', '/projects/melodi/mettaleb/Textmine/test_small.tup', with_orig=True, with_cls=False)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred-re')
finred_dataset

### Strategie 2 : 
 * Donner le paragraphe , toutes les entities (**AVEC** les types des entities), il doit tourver la relation.

In [9]:
dif_entities = {
    "Actor": ["Organization",  "Organisation", "Person"],
    "Organization": ["Gouvernement Organization"  , "Group of Individuals", "Intergovernmental Organization", "Intergovernmental Organisation", "Non Governmental Organization", "Non Governmental Organisation"],
    "Organisation": ["Gouvernement Organisation", "Group of Individuals", "Intergovernmental Organization", "Intergovernmental Organisation", "Non Governmental Organization", "Non Governmental Organisation"],
    "Gouvernement Organization": ["Military Organization", "Non Military Organization","NON_MILITARY_GOVERNMENT_ORGANIZATION"],
    "Gouvernement Organisation": ["Military Organisation", "Non Military Organisation","NON_MILITARY_GOVERNMENT_ORGANISATION"],
    "Person": ["Civilian", "Criminal", "Military"],
    "Event": ["Accident", "CBRN Event", "Civil Unrest", "Criminal Event", "Large Scale Event"],
    "Civil Unrest": ["Agitating Trouble Making", "Civil War Outbreak", "Coup d'État", "Demonstration", "Election",
                     "Gathering", "Illegal Civil Demonstration", "Natural Causes Death", "Riot", "Strike", "Suicide"],
    "Criminal Event": ["Bombing", "Criminal Arrest", "Drug Operation", "Hooliganism Trouble-making", "Political Violence", "Theft", "Trafficking"],
    "Large Scale Event": ["Economical Crisis", "Epidemic", "Fire", "Natural Event", "Pollution"],
    "Material": ["MATERIEL","Material"],
    "Place": ["Place"],
    "TERRORIST_OR_CRIMINAL": ['TERRORIST_OR_CRIMINAL']
}
dif_Attrinuts = {"Category": 0,"Color": 0,"First Name": 0,"Height": 0,"Last Name": 0,"Length": 0,"Material Reference": 0,"Nationality": 0,"Quantity": ["Quantity Exact","Quantity Fuzzy","Quantity Max","Quantity Min"],"Time": ["Time Exact","Time Fuzzy","Time Max","Time Min"]}
Relations_Definition = ['Actor,Is_Located_In,Place','Event,Is_Located_In,Place','Place,Is_Located_In,Place','Actor,Is_of_Nationality,Nationality','Place,Is_of_Nationality,Nationality', 'Actor,Created,Organization','Actor,Has_Control_Over,Actor','Actor,Has_Control_Over,Material','Actor,Has_Control_Over,Place','Actor,Initiated,Event', 'Actor,Is_At_Odds_With,Actor','Actor,Is_Cooperating_With,Actor','Actor,Is_In_Contact_With,Actor','Actor,Is_Part_Of,Organization','Event,Deaths_Number,Quantity', 'Event,End_Date,Time','Event,Has_Consequence,Event','Event,Injured_Number,Quantity','Event,Start_Date,Time','Event,Started_In,Place', 'Material,Has_Color,Color','Material,Has_for_Height,Height','Material,Has_for_Length,Length','Material,Has_for_Width,Width','Material,Has_Quantity,Quantity', 'Material,Is_Registered_As,Material reference', 'Material,Weighs,Weight', 'Organization,Was_Created_In,Time', 'Organization,Was_Dissolved_In,Time', 'Organization,Is_Of_Size,Quantity', 'Organization,Operates_In,Place', 'Person,Died_In,Event', 'Person,Has_Category,Category', 'Person,Has_Family_Relationship,Person',  'Person,Gender_Female,N/A', 'Person,Gender_Male,N/A', 'Person,Is_Born_In,Place','Person,Is_Born_On,Time', 'Person,Is_Dead_On,Time', 'Person,Resides_In,Place', 'Place,Is_Located_In,Place','Actor,Is_of_Nationality,Nationality','Place,Is_of_Nationality,Nationality', 'Place,Has_Latitude,Latitude','Place,Has_Longitude,Longitude']

In [5]:
"""def find_original_parent(term, dif_entities):
    for parent, subclasses in dif_entities.items():
        if term in subclasses:
            return find_original_parent(parent, dif_entities)
    return term
def find_attribute(term, dif_Attrinuts):
    for key, values in dif_Attrinuts.items():
        if term == key:
            return key
        elif isinstance(values, list) and term in values:
            return key
    return None
def find_type_entity(term):
    if find_attribute(term, dif_Attrinuts):
        type_entity = find_attribute(term, dif_Attrinuts)
    else:
        type_entity = find_original_parent(term, dif_entities)
    return type_entity"""

'def find_original_parent(term, dif_entities):\n    for parent, subclasses in dif_entities.items():\n        if term in subclasses:\n            return find_original_parent(parent, dif_entities)\n    return term\ndef find_attribute(term, dif_Attrinuts):\n    for key, values in dif_Attrinuts.items():\n        if term == key:\n            return key\n        elif isinstance(values, list) and term in values:\n            return key\n    return None\ndef find_type_entity(term):\n    if find_attribute(term, dif_Attrinuts):\n        type_entity = find_attribute(term, dif_Attrinuts)\n    else:\n        type_entity = find_original_parent(term, dif_entities)\n    return type_entity'

In [92]:
def normalize(term):
    return re.sub(r'[\W_]+', '', term).lower()
def find_superclasses(entity, dif_entities):
    superclasses = []
    normalized_entity = normalize(entity)
    
    def search_parents(subclass):
        normalized_subclass = normalize(subclass)
        for superclass, subclasses in dif_entities.items():
            normalized_superclass = normalize(superclass)
            if any(normalize(sub) == normalized_subclass for sub in subclasses):
                if normalized_superclass not in superclasses:
                    superclasses.append(superclass)
                    search_parents(superclass)
    
    # Commencer la recherche pour l'entité donnée
    search_parents(entity)
    return list(superclasses)

# Exemple d'utilisation
entity = "Non Military Organization"
print(f"Les classes mères de '{entity}' sont :", find_superclasses(entity, dif_entities))

entity = "person"
print(f"Les classes mères de '{entity}' sont :", find_superclasses(entity, dif_entities))

Les classes mères de 'Non Military Organization' sont : ['Gouvernement Organization', 'Organization', 'Actor']
Les classes mères de 'person' sont : ['Actor']


In [10]:
import re
import networkx as nx
import matplotlib.pyplot as plt

def create_graph(data):
    G = nx.DiGraph()
    for parent, children in data.items():
        for child in children:
            G.add_edge(parent.upper(), child.upper())
    return G
graph = create_graph(dif_entities)

def find_super_and_sub_classes(graph, entity):
    super_classes = list(nx.ancestors(graph, entity))  

    sub_classes = list(graph.successors(entity))  
    L_term = []
    L_term.append(entity)    
    return super_classes if super_classes else L_term


def normalize_term(term):
    return re.sub(r'[\W_]+', '', term).lower()


def normalize(term):
    return re.sub(r'[\W_]+', '', term).lower()

# Fonction pour récupérer les classes mères d'une entité
def find_superclasses(entity, dif_entities):
    superclasses = set()
    visited = set()  # Ensemble pour garder une trace des classes déjà visitées
    normalized_entity = normalize(entity)
    
    def search_parents(subclass):
        normalized_subclass = normalize(subclass)
        
        # Si déjà visité, arrêter pour éviter les boucles infinies
        if normalized_subclass in visited:
            return
        
        visited.add(normalized_subclass)  # Marquer comme visité
        for superclass, subclasses in dif_entities.items():
            normalized_superclass = normalize(superclass)
            if any(normalize(sub) == normalized_subclass for sub in subclasses):
                if normalized_superclass not in superclasses:
                    superclasses.add(superclass)
                    search_parents(superclass)
    
    # Commencer la recherche pour l'entité donnée
    search_parents(entity)
    return list(superclasses)

def find_original_parent(term, dif_entities):
    normalized_term = normalize_term(term)
    for parent, subclasses in dif_entities.items():
        normalized_subclasses = [normalize_term(subclass) for subclass in subclasses]
        if normalized_term in normalized_subclasses:
            return find_original_parent(parent, dif_entities)
    L_term = []
    L_term.append(term)
    return L_term

def find_attribute(term, dif_Attrinuts):
    normalized_term = normalize_term(term)
    for key, values in dif_Attrinuts.items():
        normalized_key = normalize_term(key)
        
        if normalized_term == normalized_key:
            return key
        elif isinstance(values, list):
            normalized_values = [normalize_term(value) for value in values]
            if normalized_term in normalized_values:
                return key
    return None

def find_type_entity(term):
    if find_attribute(term, dif_Attrinuts):
        type_entity = []
        type_entity_a = find_attribute(term, dif_Attrinuts)
        type_entity.append(type_entity_a)
    else:
        #type_entity = find_original_parent(term, dif_entities)
        type_entity = find_superclasses(term, dif_entities)
    return type_entity


In [6]:
    
def get_instruction(sent, tuples, entities, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two entities, extract All entities pair and the corresponding lexical relationship between them from the input paragraph. The output format should be \"{{relation1: entity1, entity2}}; {{relation2: entity3, entity4}}\". Entities must be in this list(in the form entity:Type_entity): {', '.join(entities)}. Relations must be in this list: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract All entities pair containing a certain relation in the paragraph according to the following relation types, in the format of \"{{relation1: entity1, entity2}}; {{relation2: entity3, entity}}\". Entities must be in this list(in the form entity:Type_entity): {', '.join(entities)}. Relations should be include: {'; '.join(relations)}.")
        instructions.append(f"I Provide you with paragraph containing entities. Generate the relations between entity pairs from the following list(in the form entity:Type_entity): {', '.join(entities)}. Output the results in the format \"{{relation1: entity1, entity}}; {{relation2: entity3, entity4}}\". Relations should be determined based on the context provided in the paragraph. Relations must be in this list: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, and choose the correct relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"What is the relationship between {tup[0]} and {tup[1]} in the context of the input sentence. Choose an answer from: {'; '.join(relations)}.")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs

def get_finred_dataset(sent_file, tup_file,entity_types_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]

    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
    
    entities_list=[]
    with open(entity_types_file) as f:
        for ligne in f.readlines():
            triplets = ligne.strip().split(';')
            entities_set = []
            for triplet in triplets:
                #entity,type_entity= triplet.split('_')
                entity, type_entity = triplet.split('_', 1)
                type_entity = find_type_entity(type_entity)
                final_entity = f"{entity}:{type_entity}"
                entities_set.append(final_entity)
            entities_list.append(entities_set)    
    cpt=1
    print(entities_list[0])
    for sent, tuples, entities in zip(sentences, tuples_list, entities_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        ins, i, o = get_instruction(sent, tuples, entities, with_orig, with_cls)
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
        cpt+=1
    #print(entities)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:1])
    for i in range(len(outputs)):
        if not outputs[i]:
            print(f"l'output {i} est vide")
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [59]:
train_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/train.sent', '/projects/melodi/mettaleb/Textmine/train_small.tup', '/projects/melodi/mettaleb/Textmine/train_entities_types.txt', with_orig=True, with_cls=False)
test_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/test.sent', '/projects/melodi/mettaleb/Textmine/test_small.tup','/projects/melodi/mettaleb/Textmine/test_entities_types.txt', with_orig=True, with_cls=False)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred-re')
finred_dataset

['ONG:Actor', 'moto:MATERIEL', 'accident de circulation:Event', 'passagers:Actor', 'Destresse:Last Name', 'deux:Quantity', 'garde du corps:Actor', 'Ma passion:Actor', 'Il:TERRORIST_OR_CRIMINAL', 'conducteur:Category', 'président:Category', 'blessés:Actor', 'Anam:First Name', 'Anam Destresse:Actor', 'accident:Event', '30 juin 2022:Time', 'conducteur:TERRORIST_OR_CRIMINAL', 'Italie:PLACE', 'panneaux de signalisation:MATERIEL', 'hélicoptère:MATERIEL', 'autoroute de Saint-Marin:PLACE', '20:Quantity', 'hôpital:PLACE', 'garde du corps:Category', 'bus:MATERIEL']
longuer =  1800
exemple :  ['STARTED_IN: accident de circulation, autoroute de Saint-Marin; GENDER_FEMALE: Anam Destresse, Anam Destresse; HAS_CATEGORY: Anam Destresse, président; IS_DEAD_ON: conducteur, 30 juin 2022; HAS_CATEGORY: garde du corps, garde du corps; GENDER_MALE: garde du corps, garde du corps; HAS_CATEGORY: conducteur, conducteur; STARTED_IN: accident, Italie; START_DATE: accident, 30 juin 2022; IS_LOCATED_IN: garde du c

Saving the dataset (1/1 shards): 100%|██████████| 1800/1800 [00:00<00:00, 45512.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 600/600 [00:00<00:00, 31340.15 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 600
    })
})

### Strategie 3 :
 * **Etape 1. Combinaison:** Extraire tous les paires d'entités possibles (**SANS** Type) </br>
 * **Etape 2. Filtrage:** (A) On cherche la classe mère de chaque entité. (B) Vérifier si la combinaison possible ou pas
 * **Etape 3. Prompt:** prompt pour chaque (ei,ej) avec que les relations possibles avec une relation **Autre**. On donne le paragraphe complet ou bien que le span entre les deux entitiés   

In [11]:
df_test = pd.read_csv('/projects/melodi/mettaleb/Textmine/data/test.csv')
#df_train = df_train.set_index("id")
df_test.entities = df_test.entities.apply(json.loads)
#df_test.relations = df_test.relations.apply(json.loads)
df_test.head()

Unnamed: 0,id,text,entities
0,1204,“FEAR” est une organisation spécialisée dans l...,"[{'id': 0, 'mentions': [{'value': 'crash', 'st..."
1,4909,"48 heures après le braquage du ""DC Supermarket...","[{'id': 0, 'mentions': [{'value': 'DC Supermar..."
2,2353,"À Genève, la boîte de nuit ""Pretty Woman"" a en...","[{'id': 0, 'mentions': [{'value': 'tombé', 'st..."
3,1210,"À Caracas, l'an dernier, une mine située à l'o...","[{'id': 0, 'mentions': [{'value': 'effondremen..."
4,41948,"À Genève, une industrie de fabrication d'arbre...","[{'id': 0, 'mentions': [{'value': 'incendiée',..."


#### ETAPE 1

In [12]:
combinaison_train=[]
with open("/projects/melodi/mettaleb/Textmine/train_entities_pairs.txt", "r") as f1:
    for ligne in f1.readlines():
        combinaison_train.append(ligne.replace("\n",""))
combinaison_test=[]
with open("/projects/melodi/mettaleb/Textmine/test_entities_pairs.txt", "r") as f1:
    for ligne in f1.readlines():
        combinaison_test.append(ligne.replace("\n",""))

In [13]:
combinaison_soumission=[]
with open("/projects/melodi/mettaleb/Textmine/soumission_entities_pairs.txt", "r") as f1:
    for ligne in f1.readlines():
        combinaison_soumission.append(ligne.replace("\n",""))

In [10]:
len(combinaison_soumission)

400

#### ETAPE 2 : Préprocessing

In [84]:
print(Relations_Definition)

['Actor,Is_Located_In,Place', 'Event,Is_Located_In,Place', 'Place,Is_Located_In,Place', 'Actor,Is_of_Nationality,Nationality', 'Place,Is_of_Nationality,Nationality', 'Actor,Created,Organization', 'Actor,Has_Control_Over,Actor', 'Actor,Has_Control_Over,Material', 'Actor,Has_Control_Over,Place', 'Actor,Initiated,Event', 'Actor,Is_At_Odds_With,Actor', 'Actor,Is_Cooperating_With,Actor', 'Actor,Is_In_Contact_With,Actor', 'Actor,Is_Part_Of,Organization', 'Event,Deaths_Number,Quantity', 'Event,End_Date,Time', 'Event,Has_Consequence,Event', 'Event,Injured_Number,Quantity', 'Event,Start_Date,Time', 'Event,Started_In,Place', 'Material,Has_Color,Color', 'Material,Has_for_Height,Height', 'Material,Has_for_Length,Length', 'Material,Has_for_Width,Width', 'Material,Has_Quantity,Quantity', 'Material,Is_Registered_As,Material reference', 'Material,Weighs,Weight', 'Organization,Was_Created_In,Time', 'Organization,Was_Dissolved_In,Time', 'Organization,Is_Of_Size,Quantity', 'Organization,Operates_In,Place

In [14]:
def check_entity_pair(entity1, entity2, relations_definitions):
    for relation in relations_definitions:
        entities = relation.split(',')
        if len(entities) == 3 and entities[0].lower() == entity1.lower() and entities[2].lower() == entity2.lower():
            #print(relation)
            return True
    return False

In [15]:
#combinaison_train_F=[]
#combinaison_test_F=[]
#combinaison_soumission_F=[]
X = ["(Bruno Alves_CIVILIAN,Lisbonne_PLACE)","(Bruno Alves_CIVILIAN,Lisbonne_PLACE)"]
def verif_combinaison(all_com):
    final_comb = []
    for i in range(len(all_com)):
        pairs = all_com[i].split(";")
        vrai_pairs=[]
        set_civilian = set()
        for j in range(len(pairs)):
            #print(pairs[j],j)
            if "_" in pairs[j]:
                if "_" in pairs[j].split(",")[0] and "_" in pairs[j].split(",")[1]:
                    e1,type_e1 = pairs[j].split(",")[0].replace("(","").split("_",1)
                    e2,type_e2 = pairs[j].split(",")[1].replace(")","").split("_",1)
                    #class_e1 = find_type_entity(type_e1).replace('MATERIEL',"MATERIAL")
                    #class_e2 = find_type_entity(type_e2).replace('MATERIEL',"MATERIAL")
                    if type_e1 == 'CIVILIAN':
                        entity_civil = f"({e1}_PERSON,{e1}_N/A)"
                        set_civilian.add(entity_civil)
                    
                    class_e1 = find_type_entity(type_e1)
                    class_e2 = find_type_entity(type_e2)
                    class_e1.reverse()
                    class_e2.reverse()
                    #print(class_e1,class_e2)
                    trouve = False
                    for ent_e1 in class_e1:
                        for ent_e2 in class_e2:
                            if check_entity_pair(ent_e1, ent_e2, Relations_Definition) and not trouve:
                                #print(ent_e1, ent_e2)
                                pair_exact = f"({e1}_{ent_e1},{e2}_{ent_e2})"
                                #vrai_pairs.append(pairs[j])
                                vrai_pairs.append(pair_exact)
                                trouve =True
        set_civilian_l = list(set_civilian)
        vrai_pairs.extend(set_civilian_l)
        final_pairs = ";".join(vrai_pairs)
        final_comb.append(final_pairs)
    return final_comb
    return True

combinaison_train_F = verif_combinaison(combinaison_train)
combinaison_test_F = verif_combinaison(combinaison_test)
combinaison_soumission_F = verif_combinaison(combinaison_soumission)
#Y = verif_combinaison(X)

In [144]:
Y

['(Bruno Alves_Person,Lisbonne_Place);(Bruno Alves_PERSON,Bruno Alves_N/A)',
 '(Bruno Alves_Person,Lisbonne_Place);(Bruno Alves_PERSON,Bruno Alves_N/A)']

In [27]:
#combinaison_train[2]

#### ETAPE 3

In [16]:
#phrases_train=[]
with open ("/projects/melodi/mettaleb/Textmine/train.sent", "r") as f:
    phrases_train=f.readlines()
phrases_test=[]
with open ("/projects/melodi/mettaleb/Textmine/test.sent", "r") as f:
    phrases_test=f.readlines()
phrases_soumission=[]
with open ("/projects/melodi/mettaleb/Textmine/soumission.sent", "r") as f:
    phrases_soumission=f.readlines()


In [17]:
tup_train=[]
with open ("/projects/melodi/mettaleb/Textmine/train.tup", "r") as f:
    for ligne in f.readlines():
        tup_train.append(ligne.strip())
tup_test=[]
with open ("/projects/melodi/mettaleb/Textmine/test.tup", "r") as f:
    for ligne in f.readlines():
        tup_test.append(ligne.strip())

### Filtrage 1 : 
 **Vérifie si une paire d'entités est présente dans triplets et retourne le triplet sous la forme "e1 ; e2 ; relation" ou "e1 ; e2 ; PAS_DE_RELATION" si aucune relation n'est trouvée.**

In [18]:
def extract_pairs(combinaison):
    pairs = []
    combi = combinaison.strip("()").split(");(")
    for pair in combi:
        e1, e2 = pair.split(",")
        pairs.append((e1.strip(), e2.strip()))
    return pairs

def clean_entity(entity):
    return entity.split("_")[0]

def find_triplet_for_pair(pair, triplets):
    e1, e2 = pair
    e1_clean, e2_clean = clean_entity(e1), clean_entity(e2)
    
    triplet_list = triplets.split(" | ")
    
    for triplet in triplet_list:
        t_e1, t_e2, relation = map(str.strip, triplet.split(" ; "))
        if clean_entity(t_e1) == e1_clean and clean_entity(t_e2) == e2_clean:
            return f"{e1} ; {e2} ; {relation}"
    
    return f"{e1} ; {e2} ; PAS_DE_RELATION"

def process_combinaison_triplets(combinaison, triplets):
    pairs = extract_pairs(combinaison)
    results = []
    
    for pair in pairs:
        triplet = find_triplet_for_pair(pair, triplets)
        results.append(triplet)
    
    return results

result_triplets_train=[]
result_triplets_test=[]
for i in range(len(combinaison_train_F)):
    triplets = process_combinaison_triplets(combinaison_train_F[i], tup_train[i])
    result_triplets_train.append(triplets)

for i in range(len(combinaison_test_F)):
    triplets = process_combinaison_triplets(combinaison_test_F[i], tup_test[i])
    result_triplets_test.append(triplets)


In [135]:
#combinaison_test_F[:1]

### Filtrage 2 : Attention Scores
On calcul le score d'attention de chaque pair d'entitié on garde que les pair qui ont des score >= seuil

In [36]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, output_attentions=True)
#'(volés_THEFT,Europe_PLACE);(Bruno Alves_CIVILIAN,Lisbonne_PLACE)'
def transform_pair_to_entities(pairs):
    list_pair = pairs.split(";")
    entities=[]
    for pair in list_pair:
        e1,e2 = pair.replace("(","").replace(")","").split(",")
        e1 = e1.split("_")[0]
        e2 = e2.split("_")[0]
        entities.append(e1)
        entities.append(e2)
    return entities
def find_entity_indices(paragraph, entities, tokenizer):
    tokens = tokenizer.tokenize(paragraph)
    entity_indices = {}
    entities = transform_pair_to_entities(entities)
    for entity in entities:
        entity_tokens = tokenizer.tokenize(entity)
        for i in range(len(tokens) - len(entity_tokens) + 1):
            if tokens[i:i + len(entity_tokens)] == entity_tokens:
                entity_indices[entity] = (i, i + len(entity_tokens) - 1)
                break
    return entity_indices

def get_attention_score(attentions, idx1, idx2):
    total_attention = torch.stack(attentions).sum(dim=0)
    score = total_attention[:, :, idx1, idx2].mean().item()
    #score = total_attention[:,:, idx1, idx2].mean(dim=0).item()
    return score


entites_SA=[]
for paragraph, entities in zip(phrases_test, combinaison_test_F):
    inputs = tokenizer(paragraph, return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask)
    attentions = outputs.attentions


    entity_indices = find_entity_indices(paragraph, entities, tokenizer)
    
    ents=[]
    list_pairs = entities.split(";")
    for i, pair in enumerate(list_pairs):
        entity1,entity2 = pair.replace("(","").replace(")","").split(",")
        entity1 = entity1.split("_")[0]
        entity2 = entity2.split("_")[0]
        if entity1 in entity_indices and entity2 in entity_indices:
            idx1 = entity_indices[entity1][0]
            idx2 = entity_indices[entity2][0]
            score = get_attention_score(attentions, idx1, idx2)
            ents.append(f"({entity1} ; {entity2}): {score:.4f}")
        else:
            print(f"Entity '{entity1}' or '{entity2}' not found in paragraph.")
    entites_SA.append(ents)


In [60]:
#entites_SA[0]

**Filtrage : on diminue le nombre de triplet represente la relation "PAS_DE_RELATION"**

In [19]:
result_triplets_train_F = []
def filtrage_etiquette(l_triplet):
    result_triplets_F =[]
    for i in range(len(l_triplet)):
        trouve = False
        L = []
        cpt=0
        for j in range(len(l_triplet[i])):
            _,_,rel = l_triplet[i][j].split(' ; ')       
            if rel == "PAS_DE_RELATION" and not trouve:
                L.append(l_triplet[i][j])
                cpt+=1
                if cpt ==20:
                    trouve =True
            elif rel != "PAS_DE_RELATION":
                L.append(l_triplet[i][j])
        result_triplets_F.append(L)
    return result_triplets_F
result_triplets_train_F = filtrage_etiquette(result_triplets_train)
result_triplets_test_F = filtrage_etiquette(result_triplets_test)

In [20]:
freq = {}
for l_triplet in result_triplets_train_F:
    for triplet in l_triplet:
        _,_,rel = triplet.split(" ; ")
        if rel in freq :
            freq[rel] += 1
        else:
            freq[rel] = 1

In [100]:
print(freq)

{'PAS_DE_RELATION': 11997, 'IS_LOCATED_IN': 5651, 'IS_OF_SIZE': 320, 'HAS_CATEGORY': 745, 'STARTED_IN': 882, 'START_DATE': 612, 'GENDER_FEMALE': 300, 'OPERATES_IN': 1730, 'END_DATE': 162, 'HAS_COLOR': 68, 'HAS_QUANTITY': 128, 'GENDER_MALE': 625, 'IS_PART_OF': 952, 'IS_IN_CONTACT_WITH': 1997, 'CREATED': 90, 'INITIATED': 415, 'HAS_CONTROL_OVER': 2455, 'HAS_FOR_LENGTH': 11, 'HAS_CONSEQUENCE': 405, 'IS_OF_NATIONALITY': 110, 'IS_AT_ODDS_WITH': 1026, 'HAS_FOR_HEIGHT': 9, 'RESIDES_IN': 67, 'IS_BORN_IN': 21, 'IS_BORN_ON': 18, 'IS_COOPERATING_WITH': 197, 'INJURED_NUMBER': 57, 'HAS_FAMILY_RELATIONSHIP': 146, 'IS_DEAD_ON': 53, 'WAS_CREATED_IN': 4, 'IS_REGISTERED_AS': 26, 'DEATHS_NUMBER': 55, 'WAS_DISSOLVED_IN': 8, 'DIED_IN': 20}


In [21]:
print(freq)

{'PAS_DE_RELATION': 11997, 'IS_LOCATED_IN': 5651, 'IS_OF_SIZE': 320, 'HAS_CATEGORY': 745, 'STARTED_IN': 882, 'START_DATE': 612, 'GENDER_FEMALE': 300, 'OPERATES_IN': 1730, 'END_DATE': 162, 'HAS_COLOR': 68, 'HAS_QUANTITY': 128, 'GENDER_MALE': 625, 'IS_PART_OF': 952, 'IS_IN_CONTACT_WITH': 1997, 'CREATED': 90, 'INITIATED': 415, 'HAS_CONTROL_OVER': 2455, 'HAS_FOR_LENGTH': 11, 'HAS_CONSEQUENCE': 405, 'IS_OF_NATIONALITY': 110, 'IS_AT_ODDS_WITH': 1026, 'HAS_FOR_HEIGHT': 9, 'RESIDES_IN': 67, 'IS_BORN_IN': 21, 'IS_BORN_ON': 18, 'IS_COOPERATING_WITH': 197, 'INJURED_NUMBER': 57, 'HAS_FAMILY_RELATIONSHIP': 146, 'IS_DEAD_ON': 53, 'WAS_CREATED_IN': 4, 'IS_REGISTERED_AS': 26, 'DEATHS_NUMBER': 55, 'WAS_DISSOLVED_IN': 8, 'DIED_IN': 20}


In [53]:
len(result_triplets_train_F[0])

34

**Résultat to files**

In [65]:
print(Relations_Definition)

['Actor,Is_Located_In,Place', 'Event,Is_Located_In,Place', 'Place,Is_Located_In,Place', 'Actor,Is_of_Nationality,Nationality', 'Place,Is_of_Nationality,Nationality', 'Actor,Created,Organization', 'Actor,Has_Control_Over,Actor', 'Actor,Has_Control_Over,Material', 'Actor,Has_Control_Over,Place', 'Actor,Initiated,Event', 'Actor,Is_At_Odds_With,Actor', 'Actor,Is_Cooperating_With,Actor', 'Actor,Is_In_Contact_With,Actor', 'Actor,Is_Part_Of,Organization', 'Event,Death_Number,Quantity', 'Event,End_Date,Time', 'Event,Has_Consequence,Event', 'Event,Injured_Number,Quantity', 'Event,Start_Date,Time', 'Event,Started_In,Place', 'Material,Has_Color,Color', 'Material,Has_for_Height,Height', 'Material,Has_for_Length,Length', 'Material,Has_for_Width,Width', 'Material,Has_Quantity,Quantity', 'Material,Is_Registered_As,Material reference', 'Material,Weighs,Weight', 'Organization,Created_In,Time', 'Organization,Dissolved_In,Time', 'Organization,Is_Of_Size,Quantity', 'Organization,Operates_In,Place', 'Perso

In [21]:
def chercher_tous_relations_possibles(e1,e2,relations_definitions):
    relations_possibles=""
    class_e1 = e1.split("_",1)[1]
    class_e2 = e2.split("_",1)[1]
    #type_e1 = find_type_entity(class_e1).replace('MATERIEL',"MATERIAL")
    #type_e2 = find_type_entity(class_e2).replace('MATERIEL',"MATERIAL")
    type_e1 = class_e1
    type_e2 = class_e2
    #print(type_e1,type_e2)
    for relation in relations_definitions:
        entities = relation.split(',')
        if len(entities) == 3 and entities[0].lower() == type_e1.lower() and entities[2].lower() == type_e2.lower():
            relations_possibles += f"{entities[1]};"
    relations_possibles += "PAS_DE_RELATION\n"
    return relations_possibles

In [103]:
x = 'gouvernements_NON_MILITARY_GOVERNMENT_ORGANISATION ; pièces détachées_MATERIEL ; PAS_DE_RELATION'
e1,e2,rel = x.split(" ; ")
rel_possible = chercher_tous_relations_possibles(e1,e2,Relations_Definition)


Actor MATERIAL


In [104]:
rel_possible

'Has_Control_Over;PAS_DE_RELATION\n'

In [99]:
result_triplets_test[0][13]

'gouvernements_NON_MILITARY_GOVERNMENT_ORGANISATION ; pièces détachées_MATERIEL ; PAS_DE_RELATION'

In [128]:
result_triplets_train[0][:2]

['Ma passion_NON_GOVERNMENTAL_ORGANISATION ; accident de circulation_ACCIDENT ; PAS_DE_RELATION',
 'Anam Destresse_CIVILIAN ; Italie_PLACE ; IS_LOCATED_IN']

In [94]:
with open("/projects/melodi/mettaleb/Textmine/train_strategie3.sent", "w") as fich1, open("/projects/melodi/mettaleb/Textmine/train_strategie3.tup", "w") as fich2, open("/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_train.sent", "w") as fich3:
    for i in range(len(result_triplets_train_F)):
        for j in range(len(result_triplets_train_F[i])):
            e1,e2,rel = result_triplets_train_F[i][j].split(" ; ")
            relations_possibles = chercher_tous_relations_possibles(e1,e2,Relations_Definition)
            e1 = e1.split("_")[0]
            e2 = e2.split("_")[0]
            triplet = f"{e1} ; {e2} ; {rel}\n"
            fich2.write(triplet)
            fich1.write(f"{phrases_train[i]}")
            fich3.write(relations_possibles)

In [95]:
sent=[]
tupp=[]
with open("/projects/melodi/mettaleb/Textmine/test_strategie3.sent", "w") as fich1, open("/projects/melodi/mettaleb/Textmine/test_strategie3.tup", "w") as fich2 , open("/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_test.sent", "w") as fich3:
    for i in range(len(result_triplets_test_F)):
        for j in range(len(result_triplets_test_F[i])):
            e1,e2,rel = result_triplets_test_F[i][j].split(" ; ")
            relations_possibles = chercher_tous_relations_possibles(e1,e2,Relations_Definition)
            e1 = e1.split("_")[0]
            e2 = e2.split("_")[0]
            triplet = f"{e1} ; {e2} ; {rel}\n"
            fich2.write(triplet)
            fich1.write(f"{phrases_test[i]}")
            fich3.write(relations_possibles)
            tupp.append(triplet)
            sent.append(phrases_test[i])

In [112]:
len(sent), len(tupp)

(10299, 10299)

In [96]:
ids = df_test['id'].to_list()
with open("/projects/melodi/mettaleb/Textmine/soumission_strategie3.sent", "w") as fich1, open("/projects/melodi/mettaleb/Textmine/soumission_pairs.sent", "w") as fich2, open("/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_soumission.sent", "w") as fich3:
    for i in range(len(combinaison_soumission_F)):
        result_pair_test = combinaison_soumission_F[i].split(";")
        for j in range(len(result_pair_test)):
            e1,e2 = result_pair_test[j].split(",")
            e1 = e1.replace("(","")
            e2 = e2.replace(")","")
            relations_possibles = chercher_tous_relations_possibles(e1,e2,Relations_Definition)
            e1 = e1.split("_")[0]
            e2 = e2.split("_")[0]
            pair = f"{ids[i]} ; {e1} ; {e2}\n"
            fich2.write(pair)
            fich1.write(f"{phrases_soumission[i]}")
            fich3.write(relations_possibles)

# Pour Strategie 5 

In [22]:
df_train = pd.read_csv('/projects/melodi/mettaleb/Textmine//data/train.csv')
#df_train = df_train.set_index("id")
df_train.entities = df_train.entities.apply(json.loads)
df_train.relations = df_train.relations.apply(json.loads)
df_train.head()

Unnamed: 0,id,text,entities,relations
0,181,"Anam Destresse, président de l'ONG ""Ma passion...","[{'id': 0, 'mentions': [{'value': 'accident', ...","[[0, STARTED_IN, 9], [7, IS_LOCATED_IN, 9], [5..."
1,31669,"À Paris, le 8 avril 2022, l'usine de déodorant...","[{'id': 0, 'mentions': [{'value': 'explosé', '...","[[9, IS_LOCATED_IN, 8], [11, OPERATES_IN, 8], ..."
2,51470,"En Espagne, dans une région agricole, une cont...","[{'id': 0, 'mentions': [{'value': 'contaminati...","[[7, IS_PART_OF, 8], [9, OPERATES_IN, 1], [0, ..."
3,51332,Un important incendie a fait des ravages dans ...,"[{'id': 0, 'mentions': [{'value': 'incendie', ...","[[12, IS_IN_CONTACT_WITH, 5], [0, IS_LOCATED_I..."
4,1131,« Je coule » : onze heures après avoir envoyé ...,"[{'id': 0, 'mentions': [{'value': 'renversé', ...","[[9, IS_LOCATED_IN, 2], [0, START_DATE, 17], [..."


In [24]:
ids_train =  df_train['id'].to_list()[:600]
ids_test =  df_train['id'].to_list()[600:]

In [29]:
with open("/projects/melodi/mettaleb/Textmine/strategie5/train_strategie3.sent", "w") as fich1, open("/projects/melodi/mettaleb/Textmine/strategie5/train_strategie3.tup", "w") as fich2, open("/projects/melodi/mettaleb/Textmine/strategie5/relation_possibles_strategie3_train.sent", "w") as fich3:
    for i in range(len(result_triplets_train_F)):
        for j in range(len(result_triplets_train_F[i])):
            e1,e2,rel = result_triplets_train_F[i][j].split(" ; ")
            relations_possibles = chercher_tous_relations_possibles(e1,e2,Relations_Definition)
            e1 = e1.split("_")[0]
            e2 = e2.split("_")[0]
            triplet = f"{ids_train[i]} ; {e1} ; {e2} ; {rel}\n"
            fich2.write(triplet)
            fich1.write(f"{phrases_train[i]}")
            fich3.write(relations_possibles)
with open("/projects/melodi/mettaleb/Textmine/strategie5/test_strategie3.sent", "w") as fich1, open("/projects/melodi/mettaleb/Textmine/strategie5/test_strategie3.tup", "w") as fich2 , open("/projects/melodi/mettaleb/Textmine/strategie5/relation_possibles_strategie3_test.sent", "w") as fich3:
    for i in range(len(result_triplets_test_F)):
        for j in range(len(result_triplets_test_F[i])):
            e1,e2,rel = result_triplets_test_F[i][j].split(" ; ")
            relations_possibles = chercher_tous_relations_possibles(e1,e2,Relations_Definition)
            e1 = e1.split("_")[0]
            e2 = e2.split("_")[0]
            triplet = f"{ids_test[i]} ; {e1} ; {e2} ; {rel}\n"
            fich2.write(triplet)
            fich1.write(f"{phrases_test[i]}")
            fich3.write(relations_possibles)

ids = df_test['id'].to_list()
with open("/projects/melodi/mettaleb/Textmine/strategie5/soumission_strategie3.sent", "w") as fich1, open("/projects/melodi/mettaleb/Textmine/strategie5/soumission_pairs.sent", "w") as fich2, open("/projects/melodi/mettaleb/Textmine/strategie5/relation_possibles_strategie3_soumission.sent", "w") as fich3:
    for i in range(len(combinaison_soumission_F)):
        result_pair_test = combinaison_soumission_F[i].split(";")
        for j in range(len(result_pair_test)):
            e1,e2 = result_pair_test[j].split(",")
            e1 = e1.replace("(","")
            e2 = e2.replace(")","")
            relations_possibles = chercher_tous_relations_possibles(e1,e2,Relations_Definition)
            e1 = e1.split("_")[0]
            e2 = e2.split("_")[0]
            pair = f"{ids[i]} ; {e1} ; {e2}\n"
            fich2.write(pair)
            fich1.write(f"{phrases_soumission[i]}")
            fich3.write(relations_possibles)

In [None]:
          A_COMMENCÉ_EN       0.16      0.32      0.21        19
               A_COULEUR       0.75      1.00      0.86         3
      A_DES_CONSÉQUENCES       0.63      0.73      0.68        30
         A_UNE_CATEGORIE       0.75      0.67      0.71        18
   A_UNE_CE_CONTROLE_SUR       0.67      0.36      0.47       119
          A_UNE_QUANTITÉ       0.57      0.57      0.57         7
A_UNE_RELATION_FAMILIALE       0.00      0.00      0.00         2
            COOPÈRE_AVEC       0.00      0.00      0.00         2
                    CRÉÉ       0.00      0.00      0.00         1
           DATE_DE_DÉBUT       0.55      0.84      0.67        19
             DATE_DE_FIN       0.00      0.00      0.00         7
      EST_DE_NATIONALITÉ       0.40      0.50      0.44         4
           EST_DE_TAILLE       0.91      0.77      0.83        13
    EST_ENREGISTRÉ_COMME       1.00      0.50      0.67         2
     EST_EN_CONFLIT_AVEC       0.23      0.58      0.33        12
     EST_EN_CONTACT_AVEC       0.32      0.47      0.38        43
         EST_LOCALISÉ_EN       0.64      0.77      0.70       191
             EST_MORT_LE       0.00      0.00      0.00         1
                EST_NÉ_A       0.00      0.00      0.00         3
          FAIT_PARTIE_DE       0.50      0.45      0.48        22
           GENRE_FEMININ       0.80      0.89      0.84         9
          GENRE_MASCULIN       0.77      0.91      0.83        11
                  INITIÉ       0.00      0.00      0.00         2
                 MORT_EN       0.00      0.00      0.00         2
       NOMBRE_DE_BLESSÉS       0.33      1.00      0.50         2
         NOMBRE_DE_MORTS       1.00      0.33      0.50         3
                OPÈRE_EN       0.78      0.74      0.76        58
         PAS_DE_RELATION       0.66      0.60      0.63       394
             RÉSIDE_DANS       0.00      0.00      0.00         1

**Préparer un prompt pour chaque triplet**

In [98]:
print(freq)

{'IS_LOCATED_IN': 1887, 'PAS_DE_RELATION': 4000, 'HAS_CONSEQUENCE': 346, 'STARTED_IN': 230, 'HAS_CONTROL_OVER': 1129, 'HAS_QUANTITY': 63, 'IS_OF_NATIONALITY': 63, 'OPERATES_IN': 587, 'START_DATE': 252, 'HAS_CATEGORY': 238, 'GENDER_MALE': 170, 'IS_OF_SIZE': 115, 'IS_AT_ODDS_WITH': 134, 'IS_IN_CONTACT_WITH': 511, 'IS_PART_OF': 208, 'DEATHS_NUMBER': 18, 'GENDER_FEMALE': 97, 'END_DATE': 74, 'HAS_COLOR': 23, 'HAS_FAMILY_RELATIONSHIP': 33, 'INITIATED': 18, 'IS_COOPERATING_WITH': 22, 'INJURED_NUMBER': 12, 'IS_BORN_IN': 11, 'RESIDES_IN': 4, 'IS_DEAD_ON': 13, 'IS_REGISTERED_AS': 8, 'DIED_IN': 19, 'CREATED': 3, 'HAS_FOR_HEIGHT': 3, 'IS_BORN_ON': 2, 'WAS_DISSOLVED_IN': 3, 'WAS_CREATED_IN': 1, 'HAS_FOR_LENGTH': 2}


In [91]:
LL = 0
for k in freq_reference_train:
    LL += freq_reference_train[k]
LL

9007

In [None]:
rel

In [102]:
freq_reference_train = {'PAS_DE_RELATION': 5000, 'IS_LOCATED_IN': 2000, 'IS_OF_SIZE': 400, 'HAS_CATEGORY': 500, 'STARTED_IN': 400, 'START_DATE': 250, 'GENDER_FEMALE': 300, 'OPERATES_IN': 700, 'END_DATE': 100, 'HAS_COLOR': 68, 'HAS_QUANTITY': 100, 'GENDER_MALE': 300, 'IS_PART_OF': 700, 'IS_IN_CONTACT_WITH': 1000, 'CREATED': 90, 'INITIATED': 400, 'HAS_CONTROL_OVER': 1000, 'HAS_FOR_LENGTH': 11, 'HAS_CONSEQUENCE': 300, 'IS_OF_NATIONALITY': 100, 'IS_AT_ODDS_WITH': 400, 'HAS_FOR_HEIGHT': 9, 'RESIDES_IN': 67, 'IS_BORN_IN': 21, 'IS_BORN_ON': 18, 'IS_COOPERATING_WITH': 450, 'INJURED_NUMBER': 57, 'HAS_FAMILY_RELATIONSHIP': 200, 'IS_DEAD_ON': 53, 'WAS_CREATED_IN': 100,  'IS_REGISTERED_AS': 26, 'DEATHS_NUMBER': 55, 'WAS_DISSOLVED_IN': 50 , 'DIED_IN': 20}
freq_reference_test  = {'PAS_DE_RELATION': 500, 'IS_LOCATED_IN': 300, 'IS_OF_SIZE': 100, 'HAS_CATEGORY': 100, 'STARTED_IN': 100, 'START_DATE': 100, 'GENDER_FEMALE': 100, 'OPERATES_IN': 100, 'END_DATE': 100, 'HAS_COLOR': 68, 'HAS_QUANTITY': 100, 'GENDER_MALE': 100, 'IS_PART_OF': 100, 'IS_IN_CONTACT_WITH': 100, 'CREATED': 90, 'INITIATED': 100, 'HAS_CONTROL_OVER': 100, 'HAS_FOR_LENGTH': 11, 'HAS_CONSEQUENCE': 100, 'IS_OF_NATIONALITY': 100, 'IS_AT_ODDS_WITH': 100, 'HAS_FOR_HEIGHT': 9, 'RESIDES_IN': 67, 'IS_BORN_IN': 21, 'IS_BORN_ON': 18, 'IS_COOPERATING_WITH': 100, 'INJURED_NUMBER': 57, 'HAS_FAMILY_RELATIONSHIP': 100, 'IS_DEAD_ON': 53, 'WAS_CREATED_IN': 50, 'IS_REGISTERED_AS': 26, 'DEATHS_NUMBER': 55, 'WAS_DISSOLVED_IN': 50 , 'DIED_IN': 20}
freq_train_initial = {k:0 for k in freq_reference_train}
freq_test_initial = {k:0 for k in freq_reference_test}

def choisi_instance(liste_tuple, freq_train_initial, freq_reference_train):
    final_indices = []
    indices = random.sample(range(len(liste_tuple)),len(liste_tuple))
    for indice in indices:
        tuples_fr =[]
        for tupl in liste_tuple[indice]:
            e1, e2, rel = tupl.split(" ; ")
            rel = rel.strip()
            if freq_train_initial[rel] <= freq_reference_train[rel]:
                freq_train_initial[rel]+=1
                final_indices.append(indice)
    return final_indices

with open('/projects/melodi/mettaleb/Textmine/train_strategie3.tup') as f:
    tuples_list_train = [s.split(' | ') for s in f.readlines()]
with open('/projects/melodi/mettaleb/Textmine/test_strategie3.tup') as f:
    tuples_list_test = [s.split(' | ') for s in f.readlines()]
final_indices_train = choisi_instance(tuples_list_train, freq_train_initial, freq_reference_train)
final_indices_test = choisi_instance(tuples_list_test, freq_test_initial, freq_reference_test)

In [30]:
max(final_indices_train)

31354

In [5]:
len(final_indices_test)

2405

In [59]:
with open('/projects/melodi/mettaleb/Textmine/train_strategie3.tup') as f :
    relationst = f.readlines()
    relationst = [x.strip().split(" ; ")[2] for x in relationst]
    relationst = list(set(relationst))

In [53]:
#relationst = [x.split(" ; ")[2].strip() for x in relationst]

In [None]:
relationst = 

In [42]:
relat = list(set(relat))
len(relat)

37

In [60]:
for r in relat : 
    if r not in relationst:
        print(r)

HAS_LONGITUDE
HAS_LATITUDE
WEIGHS
HAS_FOR_WIDTH


In [44]:
relationst = list(set(relationst))

In [57]:
len(relationst)

34

In [1]:
PHRASE  = "Lisbonne ; Europe ; IS_LOCATED_IN | vendus ; quartiers ; STARTED_IN | crise financière ; Europe ; IS_LOCATED_IN | médicaments ; certains ; HAS_QUANTITY | crise financière ; vendus ; HAS_CONSEQUENCE | gouvernement ; quartiers ; OPERATES_IN | gouvernement ; portugais ; IS_OF_NATIONALITY | crise économique ; Europe ; IS_LOCATED_IN | vendus ; Lisbonne ; IS_LOCATED_IN | volés ; Europe ; IS_LOCATED_IN | volés ; vendus ; HAS_CONSEQUENCE | Bruno Alves ; Europe ; IS_LOCATED_IN | augmenté ; vendus ; HAS_CONSEQUENCE | volés ; quartiers ; STARTED_IN | crise financière ; Lisbonne ; IS_LOCATED_IN | crise financière ; volés ; HAS_CONSEQUENCE | Bruno Alves ; Bruno Alves ; GENDER_MALE | augmenté ; Lisbonne ; IS_LOCATED_IN | vendus ; quartiers ; IS_LOCATED_IN | volés ; Lisbonne ; IS_LOCATED_IN | crise financière ; Europe ; STARTED_IN | crise financière ; plusieurs mois ; START_DATE | augmenté ; volés ; HAS_CONSEQUENCE | Bruno Alves ; sociologue ; HAS_CATEGORY | crise financière ; quartiers ; IS_LOCATED_IN | crise financière ; crise économique ; HAS_CONSEQUENCE | quartiers ; Lisbonne ; IS_LOCATED_IN | vendus ; Europe ; IS_LOCATED_IN | gouvernement ; Lisbonne ; OPERATES_IN | augmenté ; quartiers ; IS_LOCATED_IN | gouvernements ; Europe ; OPERATES_IN | Bruno Alves ; revue ; HAS_CONTROL_OVER | volés ; quartiers ; IS_LOCATED_IN"

In [2]:
len(PHRASE.split(" | "))

33

In [105]:
#with open('mydata/relations.txt') as f:
#relations = [r.strip() for r in set(df_train['relation'].to_list())]

    
def get_instruction(sent, tuples, rel_possibles, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Options: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations include: {'; '.join(relations)}.")
        instructions.append(f"Provide a paragraph containing relationships between entities. Extract and identify the specific relations between entity pairs mentioned in the paragraph. Output the results in the format \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations should be determined based on the context provided in the paragraph. Relations include: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"""Quelle est la relation entre «{tup[0]}» et «{tup[1]}» dans le contexte du texte d'entrée. Choisissez une réponse parmi : [ {rel_possibles} ].""")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, rel_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
        #indices = random.sample(range(len(sentences)), 5000)
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
    with open(rel_file) as f:
        relations = [s.strip() for s in f.readlines()]
    # Traduit les relations en francais 
    
    ###############################################################
    tuples_list_fr = []
    for tuples in tuples_list:
        tuples_fr =[]
        for tupl in tuples:
            e1, e2, rel = tupl.split(" ; ")
            rel = rel.strip().upper()
            rel_fr = relations_trad[rel]
            new_tuple = f"{e1} ; {e2} ; {rel_fr}"
            tuples_fr.append(new_tuple)
        tuples_list_fr.append(tuples_fr)
    tuples_list = tuples_list_fr
    ####################
    relations_fr = []
    for rel in relations :
        lis_rel = rel.split(";")
        list_rel_fr =[]
        for i in range(len(lis_rel)):
            rel = lis_rel[i].strip().upper()
            rel_fr = relations_trad[rel]
            list_rel_fr.append(rel_fr)
        rel_poss_fr = " ; ".join(list_rel_fr)
        relations_fr.append(rel_poss_fr)
    relations = relations_fr
    ###############################################################
    
    indices = final_indices_test
    sentences = [sentences[i] for i in indices]
    tuples_list = [tuples_list[i] for i in indices]
    relations = [relations[i] for i in indices]
    for sent, tuples, rel_possibles in zip(sentences, tuples_list, relations):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        rel_possibles = " ; ".join(list(set(rel_possibles.split(" ; "))))
        ins, i, o = get_instruction(sent, tuples, rel_possibles, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:10])
    print("exemple Instruciton: ",instructions[10])    
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [104]:
train_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/train_strategie3.sent', '/projects/melodi/mettaleb/Textmine/train_strategie3.tup', '/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_train.sent', with_orig=False, with_cls=True)

longuer =  14736
exemple :  ['PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'COOPÈRE_AVEC', 'EST_LOCALISÉ_EN', 'OPÈRE_EN', 'DATE_DE_DÉBUT', 'OPÈRE_EN']
exemple Instruciton:  Quelle est la relation entre «responsable» et «représentants» dans le contexte du texte d'entrée. Choisissez une réponse parmi : [ PAS_DE_RELATION ; A_UNE_CE_CONTROLE_SUR ; EST_EN_CONFLIT_AVEC ; EST_EN_CONTACT_AVEC ; COOPÈRE_AVEC ].


In [106]:
test_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/test_strategie3.sent', '/projects/melodi/mettaleb/Textmine/test_strategie3.tup','/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_test.sent', with_orig=False, with_cls=True)

longuer =  2405
exemple :  ['GENRE_MASCULIN', 'GENRE_MASCULIN', 'EST_DE_TAILLE', 'PAS_DE_RELATION', 'EST_LOCALISÉ_EN', 'A_UNE_CE_CONTROLE_SUR', 'PAS_DE_RELATION', 'A_DES_CONSÉQUENCES', 'EST_LOCALISÉ_EN', 'A_UNE_CATEGORIE']
exemple Instruciton:  Quelle est la relation entre «Pétrole pour tous» et «locaux» dans le contexte du texte d'entrée. Choisissez une réponse parmi : [ EST_LOCALISÉ_EN ; A_UNE_CE_CONTROLE_SUR ; PAS_DE_RELATION ].


In [107]:

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (1/1 shards): 100%|██████████| 14736/14736 [00:00<00:00, 112237.46 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2405/2405 [00:00<00:00, 79298.32 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 14736
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 2405
    })
})

#### Soumission Data

In [24]:
test_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/soumission_strategie3.sent', '/projects/melodi/mettaleb/Textmine/soumission_pairs_fake.tup','/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_soumission.sent', with_orig=False, with_cls=True)


longuer =  73693
exemple :  ['PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'PAS_DE_RELATION']
exemple Instruciton:  Quelle est la relation entre «FEAR» et «avion commercial» dans le contexte du texte d'entrée. Choisissez une réponse parmi : [ PAS_DE_RELATION ; A_UNE_CE_CONTROLE_SUR ].


In [None]:

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (1/1 shards): 100%|██████████| 2405/2405 [00:00<00:00, 90332.96 examples/s]
Saving the dataset (0/1 shards):   0%|          | 0/73693 [00:00<?, ? examples/s]

#### Version with entity:type

In [42]:
#with open('mydata/relations.txt') as f:
#relations = [r.strip() for r in set(df_train['relation'].to_list())]

    
def get_instruction(sent, tuples, rel_possibles, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Options: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations include: {'; '.join(relations)}.")
        instructions.append(f"Provide a paragraph containing relationships between entities. Extract and identify the specific relations between entity pairs mentioned in the paragraph. Output the results in the format \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations should be determined based on the context provided in the paragraph. Relations include: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            e1, type_e1 = tup[0].split("_")
            e2, type_e2 = tup[1].split("_")
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"""Quelle est la relation entre les deux entités «{e1}» (de type «{type_e1}» )  et «{e2}» (de type «{type_e2}» ) dans le contexte du texte d'entrée. Choisissez une réponse parmi : [ {rel_possibles} ].""")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, rel_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
        #indices = random.sample(range(len(sentences)), 5000)
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
    with open(rel_file) as f:
        relations = [s.strip() for s in f.readlines()]
    # Traduit les relations en francais 
    
    ###############################################################
    tuples_list_fr = []
    for tuples in tuples_list:
        tuples_fr =[]
        for tupl in tuples:
            e1, e2, rel = tupl.split(" ; ")
            rel = rel.strip().upper().replace('DEATHS_NUMBER', "DEATH_NUMBER").replace('HAS_GENDER_FEMALE', "GENDER_FEMALE").replace('HAS_GENDER_MALE', "GENDER_MALE").replace('WAS_CREATED_IN', "CREATED_IN")
            rel = rel.replace('WAS_DISSOLVED_IN', "DISSOLVED_IN")
            rel_fr = relations_trad[rel]
            new_tuple = f"{e1} ; {e2} ; {rel_fr}"
            tuples_fr.append(new_tuple)
        tuples_list_fr.append(tuples_fr)
    tuples_list = tuples_list_fr
    ####################
    relations_fr = []
    for rel in relations :
        lis_rel = rel.split(";")
        list_rel_fr =[]
        for i in range(len(lis_rel)):
            rel = lis_rel[i].strip().upper().replace('DEATHS_NUMBER', "DEATH_NUMBER").replace('HAS_GENDER_FEMALE', "GENDER_FEMALE").replace('HAS_GENDER_MALE', "GENDER_MALE").replace('WAS_CREATED_IN', "CREATED_IN")
            rel = rel.replace('WAS_DISSOLVED_IN', "DISSOLVED_IN")
            rel_fr = relations_trad[rel]
            list_rel_fr.append(rel_fr)
        rel_poss_fr = " ; ".join(list_rel_fr)
        relations_fr.append(rel_poss_fr)
    relations = relations_fr
    ###############################################################
    
    indices = final_indices_train
    sentences = [sentences[i] for i in indices]
    tuples_list = [tuples_list[i] for i in indices]
    relations = [relations[i] for i in indices]
    for sent, tuples, rel_possibles in zip(sentences, tuples_list, relations):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        rel_possibles = " ; ".join(list(set(rel_possibles.split(" ; "))))
        ins, i, o = get_instruction(sent, tuples, rel_possibles, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:10])
    print("exemple Instruciton: ",instructions[10])    
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [43]:
train_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/train_strategie3.sent', '/projects/melodi/mettaleb/Textmine/train_strategie3_entity_type.tup', '/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_train.sent', with_orig=False, with_cls=True)

longuer =  9026
exemple :  ['A_COMMENCÉ_EN', 'EST_EN_CONTACT_AVEC', 'PAS_DE_RELATION', 'EST_EN_CONTACT_AVEC', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'A_UNE_CATEGORIE', 'EST_LOCALISÉ_EN', 'OPÈRE_EN', 'CRÉÉ']
exemple Instruciton:  Quelle est la relation entre les deux entités «individus» (de type «Actor» )  et «disputaient» (de type «Event» ) dans le contexte du texte d'entrée. Choisissez une réponse parmi : [ PAS_DE_RELATION ; INITIÉ ].


In [41]:
test_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/test_strategie3.sent', '/projects/melodi/mettaleb/Textmine/test_strategie3_entity_type.tup','/projects/melodi/mettaleb/Textmine/relation_possibles_strategie3_test.sent', with_orig=False, with_cls=True)

longuer =  2405
exemple :  ['A_UNE_CE_CONTROLE_SUR', 'PAS_DE_RELATION', 'EST_LOCALISÉ_EN', 'EST_DE_TAILLE', 'A_UNE_CE_CONTROLE_SUR', 'PAS_DE_RELATION', 'PAS_DE_RELATION', 'A_UNE_CE_CONTROLE_SUR', 'PAS_DE_RELATION', 'OPÈRE_EN']
exemple Instruciton:  Quelle est la relation entre les deux entités «explosion» (de type «Event» )  et «brûlé» (de type «Event» ) dans le contexte du texte d'entrée. Choisissez une réponse parmi : [ A_DES_CONSÉQUENCES ; PAS_DE_RELATION ].


In [44]:

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (1/1 shards): 100%|██████████| 9026/9026 [00:00<00:00, 102385.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2405/2405 [00:00<00:00, 64341.72 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 9026
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 2405
    })
})

## Strategie 4 

### 4.1  Modele de classification binaire 

In [120]:
#with open('mydata/relations.txt') as f:
#relations = [r.strip() for r in set(df_train['relation'].to_list())]

    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given a paragraph that describes the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Options: {', '.join(relations)}. ")
        instructions.append(f"Given the input paragraph, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations include: {'; '.join(relations)}.")
        instructions.append(f"Provide a paragraph containing relationships between entities. Extract and identify the specific relations between entity pairs mentioned in the paragraph. Output the results in the format \"{{relation1: word1, word2}}; {{relation2: word3, word4}}\". Relations should be determined based on the context provided in the paragraph. Relations include: {'; '.join(relations)}.")

        inputs.extend([sent] * 3)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 3)
    
    if with_cls:
        for tup in tuples:
            #instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"""Analyser le texte suivant et déterminer s'il existe une relation directe entre les deux entités spécifiées. Répondez uniquement par OUI s’il existe une relation et par NON sinon.\nEntité 1 : {tup[0]}.\nEntité 2 : {tup[1]}""")
            inputs.extend([sent] * 1)
            outputs.extend([tup[-1]] * 1)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
        #indices = random.sample(range(len(sentences)), 5000)
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
    
    ###############################################################
    tuples_list_fr = []
    for tuples in tuples_list:
        tuples_fr =[]
        for tupl in tuples:
            e1, e2, rel = tupl.split(" ; ")
            if "PAS_DE_RELATION" in rel:
                rel_fr = "NON"
            else:
                rel_fr = "OUI"
            new_tuple = f"{e1} ; {e2} ; {rel_fr}"
            tuples_fr.append(new_tuple)
        tuples_list_fr.append(tuples_fr)
    tuples_list = tuples_list_fr
    ###############################################################

    for sent, tuples in zip(sentences, tuples_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        ins, i, o = get_instruction(sent, tuples, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
    print("longuer = ",len(outputs))
    print("exemple : ",outputs[:10])
    print("exemple Instruciton: ",instructions[10])    
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [121]:
train_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/train_strategie3.sent', '/projects/melodi/mettaleb/Textmine/train_strategie3.tup', with_orig=False, with_cls=True)

longuer =  31362
exemple :  ['NON', 'OUI', 'NON', 'NON', 'NON', 'NON', 'NON', 'OUI', 'NON', 'OUI']
exemple Instruciton:  Analyser le texte suivant et déterminer s'il existe une relation directe entre les deux entités spécifiées. Répondez uniquement par OUI s’il existe une relation et par NON sinon.
Entité 1 : hôpital.
Entité 2 : Italie


In [None]:
test_dataset = get_finred_dataset('/projects/melodi/mettaleb/Textmine/test_strategie3.sent', '/projects/melodi/mettaleb/Textmine/test_strategie3.tup', with_orig=False, with_cls=True)

In [9]:
train_dataset = get_finred_dataset('mydata/train.sent', 'mydata/train.tup', with_orig=True, with_cls=False)
test_dataset = get_finred_dataset('mydata/test.sent', 'mydata/test.tup', with_orig=True, with_cls=False)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred-re')
finred_dataset

longuer =  4000
exemple :  ['shareholder_of: Badgeville, Norwest Venture Partners', 'competitor_of: Bioverativ Inc., Baxalta', 'client_of: Baton Rouge Southern Railroad, Kansas City Southern', 'shareholder_of: PEG Africa Ltd., Blue Haven Initiative', 'product_or_service_of: Lewis Galoob Toys, Inc., Micro Machines', 'product_or_service_of: Blue-Tongue Films, The Gift', 'collaboration: Brightmail Inc., Excite', 'acquired_by: Elbit Systems Ltd., OIP Sensor Systems', 'client_of: Vision Crew Unlimited, Coca-Cola', 'subsidiary_of: United Business Media, UBM Technology Group']
longuer =  708
exemple :  ['undefined: Winnie, Inc., Android', 'product_or_service_of: Marker International, Duplex', 'shareholder_of: Stone & Wood Brewing Co., Lion', 'collaboration: Bell Canada, Cellport Systems, Inc.', 'subsidiary_of: Global Electronic Trading Company, GETCO', 'subsidiary_of: Caspian Drilling Company, State Oil Company of Azerbaijan Republic', 'client_of: oeFun, Wii', 'product_or_service_of: Roja, Aa

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/708 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 708
    })
})

In [None]:
all_relations

## CLS Ver. for Zero-shot Training

In [7]:
#with open('mydata/relations.txt') as f:
 #   all_relations = [r.strip() for r in f.readlines()]
all_relations = [r.strip() for r in set(df_train['relation'].to_list())]


def get_instruction(sent, tuples):
    
    instructions, inputs, outputs = [], [], []
    for tup in tuples:        
        #output = tup[-1].replace('_', ' ').replace(' / ', '/').replace(' or ', '/')
        output = tup[-1]
        relations = all_relations.copy()
        if output in relations:
            relations.remove(output)
        random.shuffle(relations)
        relations = relations[:3] + [output]
        random.shuffle(relations)
        instructions.append(f"Utilize the input text as a context reference, choose the right relationship between '{tup[0]}' and '{tup[1]}' from the options.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"Refer to the input text as context and select the correct relationship between '{tup[0]}' and '{tup[1]}' from the available options.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"Take context from the input text and decide on the accurate relationship between '{tup[0]}' and '{tup[1]}' from the options provided.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"What is the relationship between '{tup[0]}' and '{tup[1]}' in the context of the input sentence.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"In the context of the input sentence, determine the relationship between '{tup[0]}' and '{tup[1]}'.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"Analyze the relationship between '{tup[0]}' and '{tup[1]}' within the context of the input sentence.\nOptions: {', '.join(relations)}")
        inputs.extend([sent] * 6)
        outputs.extend([output] * 6)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file):
    
    random.seed(0)
    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
        
    for sent, tuples in zip(sentences, tuples_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        
        ins, i, o = get_instruction(sent, tuples)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })

In [28]:
train_dataset_instruct = get_finred_dataset('FinRED/train.sent', 'FinRED/train.tup')
test_dataset_instruct = get_finred_dataset('FinRED/test.sent', 'FinRED/test.tup')

finred_dataset_instruct = DatasetDict({
    'train': train_dataset_instruct,
    'test': test_dataset_instruct
})

finred_dataset_instruct.save_to_disk('fingpt-finred-cls-instruct')
finred_dataset_instruct

FileNotFoundError: [Errno 2] No such file or directory: 'FinRED/train.sent'

In [52]:
finred_dataset_instruct['train']['instruction'][:10]

["Utilize the input text as a context reference, choose the right relationship between 'Apple Inc' and 'Steve Jobs' from the options.\nOptions: industry, founded by, owner of, currency",
 "Refer to the input text as context and select the correct relationship between 'Apple Inc' and 'Steve Jobs' from the available options.\nOptions: industry, currency, owner of, founded by",
 "Take context from the input text and decide on the accurate relationship between 'Apple Inc' and 'Steve Jobs' from the options provided.\nOptions: industry, currency, owner of, founded by",
 "What is the relationship between 'Apple Inc' and 'Steve Jobs' in the context of the input sentence.\nOptions: currency, founded by, owner of, industry",
 "In the context of the input sentence, determine the relationship between 'Apple Inc' and 'Steve Jobs'.\nOptions: industry, founded by, owner of, currency",
 "Analyze the relationship between 'Apple Inc' and 'Steve Jobs' within the context of the input sentence.\nOptions: c

# Prepare my data :CLS Ver. for Zero-shot Training

In [8]:
train_dataset_instruct = get_finred_dataset('mydata/train.sent', 'mydata/train.tup')
test_dataset_instruct = get_finred_dataset('mydata/test.sent', 'mydata/test.tup')

finred_dataset_instruct = DatasetDict({
    'train': train_dataset_instruct,
    'test': test_dataset_instruct
})

finred_dataset_instruct.save_to_disk('fingpt-finred-cls-instruct')
finred_dataset_instruct

Saving the dataset (0/1 shards):   0%|          | 0/24000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4248 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 24000
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 4248
    })
})

In [10]:
finred_dataset_instruct['train']['output'][:10]

['shareholder of',
 'shareholder of',
 'shareholder of',
 'shareholder of',
 'shareholder of',
 'shareholder of',
 'competitor of',
 'competitor of',
 'competitor of',
 'competitor of']

In [54]:
doc_dict, question_dict = {}, {}
for i, row in docs.iterrows():
    doc_dict[row['docid']] = row['doc']
for i, row in questions.iterrows():
    question_dict[row['qid']] = row['question']
    
instruction_templates = [
    "Utilize your financial knowledge, give your answer or opinion to the input question or subject . Answer format is not limited.",
    "Offer your insights or judgment on the input financial query or topic using your financial expertise. Reply as normal question answering",
    "Based on your financial expertise, provide your response or viewpoint on the given financial question or topic. The response format is open.",
    "Share your insights or perspective on the financial matter presented in the input.",
    "Offer your thoughts or opinion on the input financial query or topic using your financial background."
]

inputs, outputs, instructions = [], [], []
for i, row in qa_pairs.iterrows():
    qid, docid = row['qid'], row['docid']
    q = str(question_dict[qid])
    doc = str(doc_dict[docid])
    inputs.append(q)
    outputs.append(doc)
    instructions.append(instruction_templates[i%5])

fiqa_qa_dataset = Dataset.from_dict({
    'input': inputs,
    'output': outputs,
    'instruction': instructions
})
fiqa_qa_dataset.save_to_disk('fingpt-fiqa_qa')
fiqa_qa_dataset

Saving the dataset (0/1 shards):   0%|          | 0/17110 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 17110
})

In [4]:
pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pyarrow.parquet as pq


In [2]:
table_parquet = pq.read_table("train.parquet")

dataframe_parquet = table_parquet.to_pandas()



In [4]:
dataframe_parquet.shape

(27558, 3)