In [2]:
import json
import logging
from datasets import load_dataset
from together import Together

# Configure the root logger at INFO level and create a logger named after
# the current module for use in the rest of the notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Instantiate the Together client and load the FinRED dataset.
client = Together()
dataset = load_dataset("gtfintechlab/FinRed")
print("Dataset loaded successfully.")

# Display the first record of the test split as pretty-printed JSON.
sample = dataset['test'][0]  # type: ignore
print("Sample Record:", json.dumps(sample, indent=2), sep="\n")

Dataset loaded successfully.
Sample Record:
{
  "sentence": "Wednesday, July 8, 2015 10:30AM IST (5:00AM GMT) Rimini Street Comment on Oracle Litigation Las Vegas, United States Rimini Street, Inc., the leading independent provider of enterprise software support for SAP AG\u2019s (NYSE:SAP) Business Suite and BusinessObjects software and Oracle Corporation\u2019s (NYSE:ORCL) Siebel , PeopleSoft , JD Edwards , E-Business Suite , Oracle Database , Hyperion and Oracle Retail software, today issued a statement on the Oracle litigation.",
  "entities": [
    [
      "PeopleSoft",
      "JD Edwards"
    ]
  ],
  "relations": [
    "subsidiary"
  ]
}


In [None]:
# def finred_prompt(sentence: str, entity1: str, entity2: str, possible_relations: str):
#     system_prompt = """You are an expert in financial entity and relation extraction, particularly in entity-pair relationship classification."""
    
#     user_msg = f"""
#     Identify the relationship between [ENT1] {entity1} [/ENT1] and [ENT2] {entity2} [/ENT2] in the following sentence:
    
#     Sentence: "{sentence}"
    
#     Choose the relationship from this list:
#     {possible_relations}
    
#     If there is no valid relationship, respond with "NO_REL".
#     """
    
#     prompt = f"""<s>[INST] <<SYS>> {system_prompt} <</SYS>> {user_msg} [/INST]"""
#     return prompt


# Prompt builder for FinRED entity-pair relation classification
def finred_prompt(sentence: str, entity1: str, entity2: str, relationship_options: str):
    """Build the user prompt asking which relation links two tagged entities.

    Args:
        sentence: Sentence text; embedded in the prompt inside double quotes.
        entity1: First entity; rendered between ``[ENT1]`` and ``[/ENT1]``.
        entity2: Second entity; rendered between ``[ENT2]`` and ``[/ENT2]``.
        relationship_options: Candidate relation labels, inserted verbatim.

    Returns:
        The assembled prompt string. Every line after the first starts with a
        four-space indent, and the text ends with a line holding only that
        indent.
    """
    indent = " " * 4
    prompt_lines = [
        f"Classify the relationship between [ENT1] {entity1} [/ENT1] and [ENT2] {entity2} [/ENT2] within the following sentence:",
        f'{indent}"{sentence}"',
        indent,
        f"{indent}The relationship should match one of the following categories:",
        f"{indent}{relationship_options}",
        indent,
    ]
    return "\n".join(prompt_lines)

In [6]:
# Candidate relation labels (noted by the author as coming from 'relations.txt').
# NOTE(review): the test-split output in this notebook shows a label written
# with an underscore ('owned_by') while the labels below use spaces; confirm
# the two are normalised before comparing predictions to gold labels.
# The empty first and last items reproduce the leading and trailing newline
# of the newline-delimited label string.
possible_relationships = "\n".join([
    "",
    "product/material produced",
    "manufacturer",
    "distributed by",
    "industry",
    "position held",
    "original broadcaster",
    "owned by",
    "founded by",
    "distribution format",
    "headquarters location",
    "stock exchange",
    "currency",
    "parent organization",
    "chief executive officer",
    "director/manager",
    "owner of",
    "operator",
    "member of",
    "employer",
    "chairperson",
    "platform",
    "subsidiary",
    "legal form",
    "publisher",
    "developer",
    "brand",
    "business division",
    "location of formation",
    "creator",
    "NO_REL",
    "",
])
print("Possible relationships loaded.")


Possible relationships loaded.


In [15]:
# Inspect the structure of the test split, then classify every entity pair in
# its first five records and print the gold label next to the model's answer.
print("Dataset Structure Check")
test_split = dataset['test']  # type: ignore
print(type(test_split))  # Print the type of 'test' data
print(test_split[:2])    # Print the first two examples in the test data

# Iterate through the first 5 examples by index
for i in range(5):
    try:
        # Fetch the row once. Column-first indexing (split['sentence'][i]) can
        # load an entire column on every access, three times per iteration.
        record = test_split[i]
        sentence = record['sentence']
        entities = record['entities']
        relations = record['relations']

        # Loop through each entity pair and generate a prompt
        for entity_pair, true_relation in zip(entities, relations):
            # Raises ValueError if a pair does not hold exactly two items.
            entity1, entity2 = entity_pair
            prompt = finred_prompt(sentence, entity1, entity2, possible_relationships)

            # Make the API call
            response = client.chat.completions.create(
                model="mistralai/Mixtral-8x7B-Instruct-v0.1",
                messages=[{
                        "role": "system",
                        "content": "You are an expert in financial entity and relation extraction, particularly in entity-pair relationship classification."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }],
                max_tokens=128,
                temperature=0.7,
                top_k=50,
                top_p=0.7,
                repetition_penalty=1.1
            )

            # Extract response; fall back to an empty string when the message
            # has no content so .strip() cannot raise AttributeError on None.
            predicted_relation = (response.choices[0].message.content or "").strip() # type: ignore

            # Output comparison
            print(f"\nSentence: {sentence}")
            print(f"Entity 1: {entity1}, Entity 2: {entity2}")
            print(f"True Relation: {true_relation}")
            print(f"Predicted Relation: {predicted_relation}")

    except (TypeError, ValueError) as e:
        # ValueError covers a malformed entity pair during unpacking; the
        # message names the actual exception type (unchanged for TypeError).
        print(f"Encountered {type(e).__name__}:", e)
        print("Please verify dataset structure and re-run.")

Dataset Structure Check
<class 'datasets.arrow_dataset.Dataset'>
{'sentence': ['Wednesday, July 8, 2015 10:30AM IST (5:00AM GMT) Rimini Street Comment on Oracle Litigation Las Vegas, United States Rimini Street, Inc., the leading independent provider of enterprise software support for SAP AG’s (NYSE:SAP) Business Suite and BusinessObjects software and Oracle Corporation’s (NYSE:ORCL) Siebel , PeopleSoft , JD Edwards , E-Business Suite , Oracle Database , Hyperion and Oracle Retail software, today issued a statement on the Oracle litigation.', 'The Daily Show with Trevor Noah premieres tonight... and while the show will be based on Comedy Central, Viacom pans to simulcast the debut across all of its networks, including VH1 and MTV.'], 'entities': [[['PeopleSoft', 'JD Edwards']], [['VH1', 'Viacom']]], 'relations': [['subsidiary'], ['owned_by']]}

Sentence: Wednesday, July 8, 2015 10:30AM IST (5:00AM GMT) Rimini Street Comment on Oracle Litigation Las Vegas, United States Rimini Street, I