In [1]:
import json
import re
import os
import random
import datasets
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig

  from .autonotebook import tqdm as notebook_tqdm


# NER

In [28]:
# Read and parse the CoNLL-2003 formatted dataset

# Maps the entity-type suffix of a tag (the text after the last '-') to the
# wording used in the generated answers. There is no entry for MISC.
ent_dict = {
    'PER': 'person',
    'ORG': 'organization',
    'LOC': 'location',
}

def read_conll_file(file_path):
    """Parse a CoNLL-style file into sentences of (token, label) pairs.

    Every non-blank line is split on whitespace; its first field is taken as
    the token and its last field as the label. Blank lines separate sentences.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sentences in file order, each a list of ``(token, label)``
        tuples. Runs of blank lines never produce empty sentences.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            sentence.append((parts[0], parts[-1]))

    # A file that does not end with a blank line still holds a pending
    # sentence here; without this flush it would be silently dropped.
    if sentence:
        sentences.append(sentence)

    return sentences
                

def get_ner_dataset(sentences):
    """Turn tagged sentences into instruction-tuning records for NER.

    A token counts as an entity token when its label is not ``'O'`` and does
    not end with ``'MISC'``. Consecutive entity tokens carrying the same label
    are merged into one mention. Sentences without any entity token are
    skipped.

    Args:
        sentences: Iterable of sentences, each a sequence of
            ``(token, label)`` pairs.

    Returns:
        A dict with three parallel lists: ``"input"`` (space-joined tokens),
        ``"output"`` (e.g. ``"X is a person, Y is an organization."``) and
        ``"instruction"`` (the fixed prompt).

    Raises:
        KeyError: If the suffix of an entity label is not a key of
            ``ent_dict``.

    Side effects:
        Prints the number of records and the per-type mention counts.
    """
    inputs, outputs, instructions = [], [], []
    count = {'person': 0, 'organization': 0, 'location': 0}

    for sentence in sentences:
        is_entity = [tup[1] != 'O' and not tup[1].endswith('MISC') for tup in sentence]
        if not any(is_entity):
            continue

        # Collect (mention text, label) pairs by grouping runs of entity
        # tokens that share the same label.
        mentions = []
        span_tokens, span_label = [], None
        for tup, keep in zip(sentence, is_entity):
            token, label = tup[0], tup[1]
            if span_tokens and (not keep or label != span_label):
                mentions.append((' '.join(span_tokens), span_label))
                span_tokens, span_label = [], None
            if keep:
                span_tokens.append(token)
                span_label = label
        # Flush the mention that runs up to the last token of the sentence;
        # previously such a mention lost its final token or was dropped.
        if span_tokens:
            mentions.append((' '.join(span_tokens), span_label))

        phrases = []
        for entity, label in mentions:
            entity_type = ent_dict[label.split('-')[-1]]
            article = 'an' if entity_type == 'organization' else 'a'
            phrases.append(f'{entity} is {article} {entity_type}')
            count[entity_type] += 1

        instructions.append('Please extract entities and their types from the input sentence, entity types should be chosen from {person/organization/location}.')
        inputs.append(' '.join([tup[0] for tup in sentence]))
        outputs.append(', '.join(phrases) + '.')

    print(len(instructions))
    print(count)

    return {"input": inputs, "output": outputs, "instruction": instructions}

In [29]:
# Parse the raw CoNLL files first, keeping the token/label sentences under
# their own names.
train_sentences = read_conll_file('./SEC-filings/CONLL-format/data/train/FIN5.txt')
test_sentences = read_conll_file('./SEC-filings/CONLL-format/data/test/FIN3.txt')

# Convert each split into instruction / input / output columns.
train_data = get_ner_dataset(train_sentences)
test_data = get_ner_dataset(test_sentences)

# Bundle both splits, persist them, and display the summary.
ner_splits = {
    'train': Dataset.from_dict(train_data),
    'test': Dataset.from_dict(test_data),
}
ner_dataset = DatasetDict(ner_splits)
ner_dataset.save_to_disk('fingpt-ner')
ner_dataset

511
{'person': 745, 'organization': 243, 'location': 168}
98
{'person': 216, 'organization': 56, 'location': 39}


Saving the dataset (0/1 shards):   0%|          | 0/511 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/98 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 511
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 98
    })
})

# FinRED

In [47]:
# Candidate relation labels: one per line of the file, surrounding whitespace removed.
with open('FinRED/relations.txt') as f:
    relations = [line.strip() for line in f]

    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    """Build prompt records for one sentence and its relation tuples.

    Args:
        sent: The input sentence.
        tuples: Sequence of tuples whose first two items are the two phrases
            and whose last item is the relation label.
        with_orig: When true, add two joint-extraction prompts whose answer
            lists every tuple as ``"relation: phrase1, phrase2"`` joined by
            ``"; "``.
        with_cls: When true, add two classification prompts per tuple whose
            answer is the relation label alone.

    Returns:
        ``(instructions, inputs, outputs)`` as three parallel lists. The
        option list in each prompt comes from the module-level ``relations``.
    """
    instructions, inputs, outputs = [], [], []

    if with_orig:
        comma_options = ', '.join(relations)
        semicolon_options = '; '.join(relations)
        answer = "; ".join(f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples)
        instructions += [
            f"Given phrases that describe the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"relation1: word1, word2; relation2: word3, word4\". Options: {comma_options}.",
            f"Given the input sentence, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"relation1: word1, word2; relation2: word3, word4\". Relations include: {semicolon_options}.",
        ]
        inputs += [sent, sent]
        outputs += [answer, answer]

    if with_cls:
        for tup in tuples:
            first, second, relation = tup[0], tup[1], tup[-1]
            comma_options = ', '.join(relations)
            semicolon_options = '; '.join(relations)
            instructions += [
                f"Utilize the input text as a context reference, choose the right relationship between {first} and {second} from the options. Options: {comma_options}.",
                f"What is the relationship between {first} and {second} in the context of the input sentence. Choose an answer from: {semicolon_options}.",
            ]
            inputs += [sent, sent]
            outputs += [relation, relation]

    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):
    """Build a ``Dataset`` of prompts from aligned sentence and tuple files.

    Line *k* of ``sent_file`` is paired with line *k* of ``tup_file``. A tuple
    line holds one or more tuples separated by ``' | '``; the fields of each
    tuple are separated by ``' ; '``. Pairing uses ``zip``, so extra lines in
    the longer file are ignored.

    Args:
        sent_file: Path of the file with one sentence per line.
        tup_file: Path of the file with the tuples of each sentence.
        with_orig: Forwarded to ``get_instruction``.
        with_cls: Forwarded to ``get_instruction``.

    Returns:
        A ``Dataset`` with the columns ``input``, ``output`` and
        ``instruction``.
    """
    with open(sent_file) as f:
        sentences = [line.strip() for line in f]
    with open(tup_file) as f:
        tuples_list = [line.split(' | ') for line in f]

    # Key order fixes the column order of the resulting dataset.
    columns = {'input': [], 'output': [], 'instruction': []}

    for sent, raw_tuples in zip(sentences, tuples_list):
        parsed = [[field.strip() for field in raw.split(' ; ')] for raw in raw_tuples]
        new_instructions, new_inputs, new_outputs = get_instruction(sent, parsed, with_orig, with_cls)
        columns['instruction'].extend(new_instructions)
        columns['input'].extend(new_inputs)
        columns['output'].extend(new_outputs)

    return Dataset.from_dict(columns)
    

In [48]:
# Build both FinRED splits with the joint-extraction prompts (with_orig) and
# the per-pair relation-classification prompts (with_cls).
train_dataset = get_finred_dataset('FinRED/train.sent', 'FinRED/train.tup', with_orig=True, with_cls=True)
test_dataset = get_finred_dataset('FinRED/test.sent', 'FinRED/test.tup', with_orig=True, with_cls=True)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# NOTE(review): a later cell in this notebook also saves to 'fingpt-finred';
# whichever cell runs last overwrites the other's output.
finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (0/1 shards):   0%|          | 0/27558 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5112 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 27558
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 5112
    })
})

In [None]:
# Rebuild both FinRED splits with the joint-extraction prompts only
# (with_cls=False). This rebinds train_dataset, test_dataset and finred_dataset.
train_dataset = get_finred_dataset('FinRED/train.sent', 'FinRED/train.tup', with_orig=True, with_cls=False)
test_dataset = get_finred_dataset('FinRED/test.sent', 'FinRED/test.tup', with_orig=True, with_cls=False)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Saved under a separate directory from the extraction+classification variant.
finred_dataset.save_to_disk('fingpt-finred-re')
finred_dataset

# Prepare my own data (BizRel) in the FinRED format
## Jointly

In [2]:
import pandas as pd
import random
import spacy

In [5]:
# Load the spaCy pipeline once at module level; preprocess_text calls it for every text.
nlp = spacy.load("en_core_web_sm")
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lowercase lemmas.

    Steps, in order: newlines become '.', non-breaking spaces and the literal
    sequence ``\\xa`` become spaces, ``[...]`` and ``/.../`` spans are replaced
    by a space, backslashes are removed, and only ``\\w+`` runs are kept. The
    result is passed through the module-level ``nlp`` pipeline; lemmas of
    tokens that are neither whitespace nor punctuation and are longer than one
    character are lowercased and joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lemmatised, lowercase string.
    """
    for old, new in (('\n', '.'), ('\xa0', ' '), ("\\xa", " ")):
        text = text.replace(old, new)
    for pattern in (r'\[.*?\]', r'\/.*?\/'):
        text = re.sub(pattern, ' ', text)
    text = text.replace('\\', '')

    words_only = ' '.join(re.findall(r'\w+', text))
    lemmas = (
        tok.lemma_
        for tok in nlp(words_only)
        if not (tok.is_space or tok.is_punct)
    )
    return ' '.join(lemma.lower() for lemma in lemmas if len(lemma) > 1)

In [6]:
# Load the three BizRel TSV splits.
# NOTE(review): absolute, user-specific paths — TODO move them to a
# configurable data directory so the notebook runs on another machine.
trainb = pd.read_csv("/users/melodi/mettaleb/ECLADATA/BizRel-Hadjer/Bert_re_Hadjer/data/bizrel/train_bizrel_luke.tsv", sep='\t') 
testb = pd.read_csv("/users/melodi/mettaleb/ECLADATA/BizRel-Hadjer/Bert_re_Hadjer/data/bizrel/test_bizrel_luke.tsv", sep='\t') 
devb =  pd.read_csv("/users/melodi/mettaleb/ECLADATA/BizRel-Hadjer/Bert_re_Hadjer/data/bizrel/dev_bizrel_luke.tsv", sep='\t')
# Merge train, dev and test into one frame; the original split is not kept.
# concat is called without ignore_index, so each part keeps its own row labels.
df = pd.concat([trainb,devb,testb])
# Integer relation id -> relation name.
mapping = {
    0: 'negative',
    1: 'Investment',
    2: 'Sale-Purchase',
    3: 'Competition',
    4: 'Partnership',
    5: 'Legal-proceeding'}

# Add the relation name as a new "label" column and preview the result.
df['label'] = df['relation'].map(mapping)
df.head()

Unnamed: 0,sentence,relation,org,actor1,s1,actor2,s2,token,subj_start,subj_end,obj_start,obj_end,label
0,"Video : [E11] Thales [E12] , [E21] Intelsat [E...",0,"Video : # ORG # , $ ORG $ , Bombardier , Etiha...",Thales Group,3.396017,Intelsat,11.42792,"['Video', ':', 'Thales', ',', 'Intelsat', ',',...",2,2,4,4,negative
1,"On the fuselage , [E11] Boeing [E12] has taken...",0,"On the fuselage , # ORG # has taken advantage ...",Boeing,11.025807,Beechcraft Premier I,0.102741,"['On', 'the', 'fuselage', ',', 'Boeing', 'has'...",4,4,20,20,negative
2,"[E11] NASA [E12] , New Stability , Samsung , ...",0,"# ORG # , New Stability , Samsung , Goal , Se...",NASA,7.224445,Volvo Cars,0.005962,"['NASA', ',', 'New', 'Stability', ',', 'Samsun...",0,0,12,12,negative
3,[E11] Thales [E12] s primary competitors are ...,3,"# ORG # s primary competitors are $ ORG $ , S...",Thales Group,3.755883,Rockwell Collins,11.022833,"['Thales', 's', 'primary', 'competitors', 'are...",0,0,5,6,Competition
4,The traditional large pharmaceutical and consu...,0,The traditional large pharmaceutical and consu...,Bayer,10.955735,GlaxoSmithKline,12.11345,"['The', 'traditional', 'large', 'pharmaceutica...",22,22,24,24,negative


In [7]:
def remplacer_balises(phrase):
    """Remove the four entity markers from ``phrase`` and normalise the rest."""
    for marker in ("[E11]", "[E12]", "[E21]", "[E22]"):
        phrase = phrase.replace(marker, "")
    return preprocess_text(phrase)
def remplacer_actor1(phrase):
    """Return the normalised text found between ``[E11]`` and ``[E12]``.

    Raises ``IndexError`` when ``phrase`` contains no ``[E11]`` marker.
    """
    after_open = phrase.split("[E11]", 2)[1]
    mention = after_open.split("[E12]", 1)[0]
    return preprocess_text(mention.strip())
def remplacer_actor2(phrase):
    """Return the normalised text found between ``[E21]`` and ``[E22]``.

    Raises ``IndexError`` when ``phrase`` contains no ``[E21]`` marker.
    """
    after_open = phrase.split("[E21]", 2)[1]
    mention = after_open.split("[E22]", 1)[0]
    return preprocess_text(mention.strip())



# Apply the helper functions to the "sentence" column: "actor1" and "actor2"
# are overwritten with the normalised marked mentions, and "text" receives the
# normalised sentence without markers.
df["actor1"] = df["sentence"].apply(remplacer_actor1)
df["actor2"] = df["sentence"].apply(remplacer_actor2)
df["text"] = df["sentence"].apply(remplacer_balises)

In [8]:
# Preview the frame after the actor and text columns were rewritten.
df.head()

Unnamed: 0,sentence,relation,org,actor1,s1,actor2,s2,token,subj_start,subj_end,obj_start,obj_end,label,text
0,"Video : [E11] Thales [E12] , [E21] Intelsat [E...",0,"Video : # ORG # , $ ORG $ , Bombardier , Etiha...",thale,3.396017,intelsat,11.42792,"['Video', ':', 'Thales', ',', 'Intelsat', ',',...",2,2,4,4,negative,video thales intelsat bombardier etihad hmgaer...
1,"On the fuselage , [E11] Boeing [E12] has taken...",0,"On the fuselage , # ORG # has taken advantage ...",boeing,11.025807,premier,0.102741,"['On', 'the', 'fuselage', ',', 'Boeing', 'has'...",4,4,20,20,negative,on the fuselage boeing have take advantage of ...
2,"[E11] NASA [E12] , New Stability , Samsung , ...",0,"# ORG # , New Stability , Samsung , Goal , Se...",nasa,7.224445,volvo,0.005962,"['NASA', ',', 'New', 'Stability', ',', 'Samsun...",0,0,12,12,negative,nasa new stability samsung goal seen physique ...
3,[E11] Thales [E12] s primary competitors are ...,3,"# ORG # s primary competitors are $ ORG $ , S...",thale,3.755883,rockwell collins,11.022833,"['Thales', 's', 'primary', 'competitors', 'are...",0,0,5,6,Competition,thale primary competitor be rockwell collins s...
4,The traditional large pharmaceutical and consu...,0,The traditional large pharmaceutical and consu...,bayer,10.955735,glaxosmithkline,12.11345,"['The', 'traditional', 'large', 'pharmaceutica...",22,22,24,24,negative,the traditional large pharmaceutical and consu...


In [9]:

from sklearn.model_selection import train_test_split

# Stratified split on the relation label, with a fixed seed.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training —
# confirm this proportion is intended.
train_df, test_df = train_test_split(df[["text","actor1","actor2","label"]], test_size=0.8, stratify=df['label'], random_state=42)

# Print the per-class counts of each split (the messages are in French).
print("Distribution des classes dans l'ensemble d'entraînement :\n", train_df['label'].value_counts())
print("Distribution des classes dans l'ensemble de test :\n", test_df['label'].value_counts())


# Disabled CSV export of both splits.
#train_df.to_csv('trainBizRel.csv', index=False)
#test_df.to_csv('testBizRel.csv', index=False)

Distribution des classes dans l'ensemble d'entraînement :
 negative            1328
Competition          394
Partnership          148
Investment            66
Sale-Purchase         58
Legal-proceeding      12
Name: label, dtype: int64
Distribution des classes dans l'ensemble de test :
 negative            5316
Competition         1577
Partnership          590
Investment           265
Sale-Purchase        234
Legal-proceeding      46
Name: label, dtype: int64


In [10]:
def generer_entiers_aleatoires(n):
    """Return ``n`` random integers, each drawn from ``0`` to ``n // 2`` inclusive.

    Uses the global ``random`` module state, so results depend on its seed.
    """
    return [random.randint(0, n // 2) for _ in range(n)]
# Write the test sentences to test.sent (one per line) and a matching line per
# sentence to test.dep.
l=test_df['text'].to_list()
with open("test.sent", "w") as fsent, open("test.dep", "w") as fichier:
    for sentence in l:
        sentence = sentence.strip()
        # Disabled: the "text" column was already passed through preprocess_text.
        #sentence = preprocess_text(sentence)
        fsent.write(sentence+"\n")
        # One random integer per whitespace-separated token.
        n = len(sentence.split())
        resultat = generer_entiers_aleatoires(n)
        # The list is formatted inside "[...]", so the line reads
        # {"adj_mat": [[...]]}. The values are random, not a real parse —
        # NOTE(review): presumably a placeholder; confirm what consumes test.dep.
        data = '{{"adj_mat": [{}]}}\n'.format(resultat)
        fichier.write(data)

In [21]:
# BizRel relation name -> alternative relation name; 'negative' and
# 'Legal-proceeding' both map to "undefined".
# This rebinds the name `mapping`, which an earlier cell uses for the
# integer-id -> relation-name table.
mapping = {
    'negative':"undefined",
    'Investment':"shareholder_of",
    'Sale-Purchase':"mergedwith",
    'Competition':"competitor_of",
    'Partnership':"collaboration",
    'Legal-proceeding':"undefined"}
# NOTE(review): the cells that write the .tup files use the "label" column,
# not "label_core" — confirm whether this column is meant to be exported.
test_df['label_core'] = test_df['label'].map(mapping)

In [22]:
# Preview the test split, including the new label_core column.
test_df.head()

Unnamed: 0,text,actor1,actor2,label,label_core
747,major player operate in the global artificial ...,icarbonx,intel,Competition,competitor_of
2001,consumer package good cpg company take part in...,nestle,clorox,negative,undefined
1617,he have represent various bank include us exim...,bank of nova scotia,barclay,negative,undefined
334,aldi sam club costco kroger whole foods ahold ...,costco,publix,negative,undefined
794,it also include airplane manufacturer airbus s...,safran,sony,negative,undefined


In [11]:
# Write the test tuples, one "actor1 ; actor2 ; label" line per row.
l_actor1 = test_df['actor1'].to_list()
l_actor2 = test_df['actor2'].to_list()
l_relation = test_df['label'].to_list()
with open("test.tup", "w") as file:
    for i in range(len(l_actor1)):
        file.write(f"{l_actor1[i]} ; {l_actor2[i]} ; {l_relation[i]}\n")
        # Disabled alternative: write the label alone.
        #file.write(f"{l_relation[i].strip()}\n")

# Same export for the training split; the list names are reused.
l=train_df['text'].to_list()
l_actor1 = train_df['actor1'].to_list()
l_actor2 = train_df['actor2'].to_list()
l_relation = train_df['label'].to_list()
with open("train.tup", "w") as file:
    for i in range(len(l_actor1)):
        file.write(f"{l_actor1[i]} ; {l_actor2[i]} ; {l_relation[i]}\n")
        # Disabled alternative: write the label alone.
        #file.write(f"{l_relation[i].strip()}\n")
# Training sentences, one per line, in the same row order as train.tup.
with open("train.sent", "w") as fsent:
    for sentence in l:
        sentence = sentence.strip()
        fsent.write(sentence+"\n")
    

In [12]:
# Distinct relation labels present in the training split.
set(train_df['label'].to_list())

{'Competition',
 'Investment',
 'Legal-proceeding',
 'Partnership',
 'Sale-Purchase',
 'negative'}

In [13]:
# Preview the test split.
test_df.head()

Unnamed: 0,text,actor1,actor2,label
4562,700 organization include 20th century fox jetb...,jetblue,nordstrom,negative
4362,stantec trade on the tsx and the nyse under th...,stantec,tsx,negative
6804,other global company envision this new form of...,volkswagen group,geely automobile holdings,negative
6848,2018 first championship lead sponsor include g...,google,magna international,negative
5645,well establish company such as procter gamble ...,johnson johnson,philip morris international,negative


In [14]:
# Relation labels used as answer options in the prompts. This is a set, so it
# has no defined iteration order.
relations_ = {'Competition',
 'Investment',
 'Legal-proceeding',
 'Partnership',
 'Sale-Purchase',
 'negative'}

In [1]:
#with open('mydata/relations.txt') as f:
#    relations = [r.strip() for r in f.readlines()]
# `relations_` is a set, and the iteration order of a set of strings can
# differ between interpreter runs. Sort so the option list embedded in every
# prompt is the same on each run.
relations = sorted(r.strip() for r in relations_)
    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    """Build prompt records for one sentence and its relation tuples.

    Args:
        sent: The input sentence.
        tuples: Sequence of tuples whose first two items are the two phrases
            and whose last item is the relation label.
        with_orig: When true, add two joint-extraction prompts; their answer
            lists every tuple as ``"relation: phrase1, phrase2"`` joined by
            ``"; "``.
        with_cls: When true, add two classification prompts per tuple; their
            answer is the relation label alone.

    Returns:
        ``(instructions, inputs, outputs)`` as three parallel lists. The
        option list in each prompt comes from the module-level ``relations``.
    """
    instructions, inputs, outputs = [], [], []

    def _record(prompt, answer):
        # Append one (instruction, input, output) triple.
        instructions.append(prompt)
        inputs.append(sent)
        outputs.append(answer)

    if with_orig:
        joint_answer = "; ".join(f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples)
        comma_options = ', '.join(relations)
        semicolon_options = '; '.join(relations)
        _record(
            f"Given phrases that describe the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"relation1: word1, word2; relation2: word3, word4\". Options: {comma_options}.",
            joint_answer,
        )
        _record(
            f"Given the input sentence, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"relation1: word1, word2; relation2: word3, word4\". Relations include: {semicolon_options}.",
            joint_answer,
        )

    if with_cls:
        for tup in tuples:
            first, second, relation = tup[0], tup[1], tup[-1]
            comma_options = ', '.join(relations)
            semicolon_options = '; '.join(relations)
            _record(
                f"Utilize the input text as a context reference, choose the right relationship between {first} and {second} from the options. Options: {comma_options}.",
                relation,
            )
            _record(
                f"What is the relationship between {first} and {second} in the context of the input sentence. Choose an answer from: {semicolon_options}.",
                relation,
            )

    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):
    """Build a ``Dataset`` of prompts from aligned sentence and tuple files.

    Line *k* of ``sent_file`` is paired with line *k* of ``tup_file``. A tuple
    line holds one or more tuples separated by ``' | '``; the fields of each
    tuple are separated by ``' ; '``. Pairing uses ``zip``, so extra lines in
    the longer file are ignored. The number of records and the first ten
    outputs are printed before returning.

    Args:
        sent_file: Path of the file with one sentence per line.
        tup_file: Path of the file with the tuples of each sentence.
        with_orig: Forwarded to ``get_instruction``.
        with_cls: Forwarded to ``get_instruction``.

    Returns:
        A ``Dataset`` with the columns ``input``, ``output`` and
        ``instruction``.
    """
    with open(sent_file) as f:
        sentences = [line.strip() for line in f]
    with open(tup_file) as f:
        tuples_list = [line.split(' | ') for line in f]

    # Key order fixes the column order of the resulting dataset.
    columns = {'input': [], 'output': [], 'instruction': []}

    for sent, raw_tuples in zip(sentences, tuples_list):
        parsed = [[field.strip() for field in raw.split(' ; ')] for raw in raw_tuples]
        new_instructions, new_inputs, new_outputs = get_instruction(sent, parsed, with_orig, with_cls)
        columns['instruction'].extend(new_instructions)
        columns['input'].extend(new_inputs)
        columns['output'].extend(new_outputs)

    # Quick sanity output: record count and a sample of the answers.
    print("longuer = ", len(columns['output']))
    print("exemple : ", columns['output'][:10])

    return Dataset.from_dict(columns)
    

NameError: name 'relations_' is not defined

In [16]:
# Build the splits from the files under mydata/ using only the per-pair
# classification prompts (with_orig=False).
# NOTE(review): the cells that write train/test .sent and .tup files put them
# in the working directory, not under mydata/ — confirm how they get there.
train_dataset = get_finred_dataset('mydata/train.sent', 'mydata/train.tup', with_orig=False, with_cls=True)
test_dataset = get_finred_dataset('mydata/test.sent', 'mydata/test.tup', with_orig=False, with_cls=True)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# NOTE(review): this is the same output directory as the FinRED cell earlier
# in the notebook, so running this cell replaces that dataset — confirm intent.
finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

longuer =  2006
exemple :  ['negative', 'Partnership', 'Competition', 'Partnership', 'negative', 'negative', 'Competition', 'negative', 'negative', 'Sale-Purchase']
longuer =  8028
exemple :  ['negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'Competition', 'negative', 'Competition']


Saving the dataset (1/1 shards): 100%|██████████| 2006/2006 [00:00<00:00, 59768.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8028/8028 [00:00<00:00, 149693.35 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 2006
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 8028
    })
})

In [30]:
# Display the summary of the current training split.
train_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 3210
})

In [None]:
# Rebuild the splits from mydata/ with the joint-extraction prompts only
# (with_cls=False). This rebinds train_dataset, test_dataset and finred_dataset.
train_dataset = get_finred_dataset('mydata/train.sent', 'mydata/train.tup', with_orig=True, with_cls=False)
test_dataset = get_finred_dataset('mydata/test.sent', 'mydata/test.tup', with_orig=True, with_cls=False)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# NOTE(review): same output directory as the FinRED extraction-only cell
# earlier in the notebook; whichever runs last wins.
finred_dataset.save_to_disk('fingpt-finred-re')
finred_dataset

In [54]:
# NOTE(review): `docs`, `questions` and `qa_pairs` are not defined in the
# visible part of this notebook; they must be loaded before this cell runs.

# Lookup tables: document id -> document text, question id -> question text.
doc_dict = dict(zip(docs['docid'], docs['doc']))
question_dict = dict(zip(questions['qid'], questions['question']))

instruction_templates = [
    "Utilize your financial knowledge, give your answer or opinion to the input question or subject . Answer format is not limited.",
    "Offer your insights or judgment on the input financial query or topic using your financial expertise. Reply as normal question answering",
    "Based on your financial expertise, provide your response or viewpoint on the given financial question or topic. The response format is open.",
    "Share your insights or perspective on the financial matter presented in the input.",
    "Offer your thoughts or opinion on the input financial query or topic using your financial background."
]

inputs, outputs, instructions = [], [], []
n_templates = len(instruction_templates)
for position, (qid, docid) in enumerate(zip(qa_pairs['qid'], qa_pairs['docid'])):
    inputs.append(str(question_dict[qid]))
    outputs.append(str(doc_dict[docid]))
    # Cycle through the templates by row position. The index label may be
    # non-integer or non-consecutive, and the template count is no longer
    # hard-coded.
    instructions.append(instruction_templates[position % n_templates])

fiqa_qa_dataset = Dataset.from_dict({
    'input': inputs,
    'output': outputs,
    'instruction': instructions
})
fiqa_qa_dataset.save_to_disk('fingpt-fiqa_qa')
fiqa_qa_dataset

Saving the dataset (0/1 shards):   0%|          | 0/17110 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 17110
})

In [4]:
pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pyarrow.parquet as pq


In [2]:
# Read the parquet file into an Arrow table.
table_parquet = pq.read_table("train.parquet")

# Convert the Arrow table to a pandas DataFrame for inspection.
dataframe_parquet = table_parquet.to_pandas()



In [4]:
# (rows, columns) of the loaded parquet data.
dataframe_parquet.shape

(27558, 3)