In [4]:
!pip install openai
!pip install tiktoken



In [5]:
import pandas as pd
import numpy as np
import re
import os
import json
import openai
import time
import tiktoken
from collections import defaultdict, Counter
from pydantic import BaseModel
from typing import List, Dict, Tuple, Set
from tqdm import tqdm  # For progress tracking
import matplotlib.pyplot as plt

In [None]:
#1.Process Data

# function to extract sentences and annotations
def extract_data(text):
    data = []
    sentences = text.strip().split('\n')
    for sentence in sentences:
        parts = sentence.split('|')
        if len(parts) < 2:  # Ensure valid format
            continue  # Skip invalid lines
        text = parts[0].strip()
        annotations = [part.strip() for part in parts[1:]]
        data.append({'text': text, 'annotations': annotations})
    return data

# function to read and process a file
def process_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        return extract_data(text)
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found.")
        return []
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return []

# Process train.txt
train_data = process_file('train.txt')
df_train = pd.DataFrame(train_data) if train_data else pd.DataFrame()
if not df_train.empty:
    print("\nTRAIN DATA:")
    pd.set_option('display.max_colwidth', None)
    print(df_train.head())
else:
    print("No data extracted from train.txt")

# Process dev.txt
dev_data = process_file('dev.txt')
df_dev = pd.DataFrame(dev_data) if dev_data else pd.DataFrame()
if not df_dev.empty:
    print("\nDEV DATA:")
    print(df_dev.head())
else:
    print("No data extracted from dev.txt")

# Process test.txt
test_data = process_file('test.txt')
df_test = pd.DataFrame(test_data) if test_data else pd.DataFrame()
if not df_test.empty:
    print("\nTEST DATA:")
    print(df_test.head())
else:
    print("No data extracted from test.txt")

# Now you have three separate DataFrames:
# df_train - containing training data
# df_dev - containing development/validation data
# df_test - containing test data


TRAIN DATA:
                                                                                                                                                                                                                                text  \
0              NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.   
1                                                          Last week, Citigroup Inc's ( C.N ) Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.   
2                                                             Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.   
3  Franz told Reuters that Fiat Chief Executive Sergio Marc

In [7]:
import re
import html

def clean_text(text):
    """
    Clean text by:
    1. Decoding HTML entities (like &#039; &#959; etc.)
    2. Removing HTML tags (like <p> </p>)
    """
    if not isinstance(text, str):
        return text
    
    # Decode HTML entities (converts &#039; to ', &#959; to ο, etc.)
    text = html.unescape(text)
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Replace Unicode right single quotation mark with a standard single quote
    text = text.replace('\u2019', "'")
    
    return text

# Apply cleaning to the 'text' column in all three dataframes
for df in [df_train, df_dev, df_test]:
    if not df.empty and 'text' in df.columns:
        df['text'] = df['text'].apply(clean_text)

print(df_train.iloc[2069])

text           New 'Bratz' more Taylor Swift than Britney: CEO Ashley Holt, CNBC CNBC.com SHARES Bratz makeover no child's play: CEO Isaac Larian, MGA Entertainment CEO, discusses the relaunch of the Bratz dolls and its battle to take on Mattel's Barbie.
annotations                                                                                                                                                                                                              [Bratz ; MGA Entertainment ; creator]
Name: 2069, dtype: object


In [8]:
import re

def clean_source_info(text):
    """
    Clean source information from the beginning of text.
    Looks for specific patterns in the first 50 characters and removes everything
    from the beginning to the end of the matched pattern.
    """
    if not isinstance(text, str):
        return text
    
    # Check only the first 50 characters (or all if less than 50)
    first_part = text[:50] if len(text) > 50 else text
    
    # Patterns to look for in the beginning
    patterns = [
        r'\(AP\) -+ ',
        r'\(AP\) _ ',
        r'\(AP\) — ',
        r'\(IANS\) ',
        r'\(Reuters\) - ',
        r'\(TheStreet\) -- ',
        r'\(GLOBE NEWSWIRE\) -- ',
        r'\(ShareCast\) - \(ShareCast News\) - ',
        r'CNBC CNBC\.com SHARES ',
        r'BST - '
    ]
    
    # Find the earliest match of any pattern
    earliest_match = None
    earliest_end = len(text)
    
    for pattern in patterns:
        match = re.search(pattern, first_part)
        if match and match.end() < earliest_end:
            earliest_match = match
            earliest_end = match.end()
    
    # If a match was found, remove everything up to the end of the match
    if earliest_match:
        return text[earliest_end:].strip()
    
    return text


def clean_source_info_2(text):
    """
    Cleans a text string based on the presence of " IST " and "|"
    """
    try:
        ist_index = text.find(" IST ")
        if ist_index != -1:
            pipe_index = text.rfind("|", 0, ist_index)  # Search backward for "|"

            if pipe_index != -1:
                return text[:pipe_index].strip() + text[ist_index + len(" IST "):].strip()
            else:
                return text[ist_index + len(" IST "):].strip()
        return text  # Return original text if " IST " is not found
    except AttributeError:
        return text  # Handle cases where the input might not be a string
    
# Apply cleaning to the 'text' column in all three dataframes
for df in [df_train, df_dev, df_test]:
    if not df.empty and 'text' in df.columns:
        df['text'] = df['text'].apply(clean_source_info)

for df in [df_train, df_dev, df_test]:
    if not df.empty and 'text' in df.columns:
        df['text'] = df['text'].apply(clean_source_info_2)


print(df_train.iloc[2119])

text           Oil and gas explorer Roxi Petroleum (Other OTC: ROXIF - news ) has reached an agreement to cancel royalty payments from its flagship asset.
annotations                                                                 [ROXI ; London ; location_of_formation, ROXI ; London ; headquarters_location]
Name: 2119, dtype: object


In [9]:
import re

def remove_parenthetical_expressions(text):
    """
    Remove all text within parentheses () and square brackets []
    from the given text.
    """
    if not isinstance(text, str):
        return text
    
    # Remove text within parentheses ()
    text = re.sub(r'\([^)]*\)', '', text)
    
    # Remove text within square brackets []
    text = re.sub(r'\[[^\]]*\]', '', text)
    
    # Clean up any double spaces created by the removals
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply cleaning to the 'text' column in all three dataframes
for df in [df_train, df_dev, df_test]:
    if not df.empty and 'text' in df.columns:
        df['text'] = df['text'].apply(remove_parenthetical_expressions)

print(df_train.iloc[2127])

text           Chemicals group Elementis saw its first half pre-tax profit decline, as its oil and gas related and businesses reported a drop in sales.
annotations                                                                                                [Elementis ; London ; headquarters_location]
Name: 2127, dtype: object


In [10]:
import re

def clean_location_dash(text):
    """
    Clean location information from the beginning of text.
    Looks for "WORD - " pattern in the first 50 characters and 
    removes everything from the beginning to the end of the matched pattern.
    """
    if not isinstance(text, str):
        return text
    
    # Check only the first 50 characters (or all if less than 50)
    first_part = text[:50] if len(text) > 50 else text
    
    # Pattern to look for location followed by dash: word(s) followed by " - "
    # This will match patterns like "DUBLIN - ", "NEW YORK - ", etc.
    pattern = r'^\w+(\s+\w+)* - '
    
    match = re.search(pattern, first_part)
    if match:
        return text[match.end():].strip()
    
    return text

# Apply cleaning to the 'text' column in all three dataframes
for df in [df_train, df_dev, df_test]:
    if not df.empty and 'text' in df.columns:
        df['text'] = df['text'].apply(clean_location_dash)

print(df_train.iloc[4183])

text           National Bank of Belgium Governor Luc Coene speaks at the Deloitte Global Financial Industry Partner Meeting - 1030 GMT.
annotations                                                                                      [Luc Coene ; governor ; position_held]
Name: 4183, dtype: object


In [11]:
# Load and update relations from the file
with open('relations.txt', 'r') as file:
    relations = [line.replace("product/material produced", "product or material produced")
                     .replace("director/manager", "director or manager") 
                 for line in file.read().splitlines()]


# Convert to formatted string
relations_list = ', '.join(f"'{relation}'" for relation in relations)
 
print(relations_list)  # Check the updated relations

'product or material produced', 'manufacturer', 'distributed by', 'industry', 'position held', 'original broadcaster', 'owned by', 'founded by', 'distribution format', 'headquarters location', 'stock exchange', 'currency', 'parent organization', 'chief executive officer', 'director or manager', 'owner of', 'operator', 'member of', 'employer', 'chairperson', 'platform', 'subsidiary', 'legal form', 'publisher', 'developer', 'brand', 'business division', 'location of formation', 'creator'


In [12]:
def clean_annotations(annotations):  
    return [annotation.replace('_', ' ') for annotation in annotations]

# Define the function to parse annotations
def parse_annotation(annotation_str):
    """
    Parse an annotation string in the format "entity1 ; entity2 ; relation"
    Returns a tuple (entity1, entity2, relation) or None if parsing fails
    """
    try:
        parts = annotation_str.split(" ; ")
        if len(parts) == 3:
            entity1, entity2, relation = parts
            return (entity1, entity2, relation)
        return None
    except Exception:
        return None

# Define the directional relationship pairs
relationship_pairs = {
    "product or material produced": "manufacturer",
    "manufacturer": "product or material produced",
    "position held": "chief executive officer",
    "position held": "director or manager",
    "position held": "chairperson",
    "chief executive officer": "position held",
    "director or manager": "position held",
    "chairperson": "position held",
    "owned by": "owner of",
    "owner of": "owned by",
    "parent organization": "subsidiary",
    "subsidiary": "parent organization",
    "creator": "founded by",
    "founded by": "creator"
}

def process_annotations(annotations):
    """
    Process annotations in the given list of annotations.
    This function identifies existing relationships and adds missing reverse relationships.
    """
    existing_relations = set()

    # Extract existing relationships
    for annotation in annotations:
        # Replace "director_/_manager" with "director_or_manager" before parsing
        annotation = annotation.replace("director_/_manager", "director or manager")

        parsed = parse_annotation(annotation)
        if parsed:
            existing_relations.add(parsed)

    # Identify missing reverse relationships
    new_relations = set()
    for entity1, entity2, relation in existing_relations:
        if relation in relationship_pairs:
            reverse_relation = relationship_pairs[relation]
            if (entity2, entity1, reverse_relation) not in existing_relations:
                new_relations.add((entity2, entity1, reverse_relation))
        elif relation in relationship_pairs.values():
            # Get all keys that map to this value
            reverse_relations = [k for k, v in relationship_pairs.items() if v == relation]
            if reverse_relations:
                reverse_relation = reverse_relations[0]  # Take the first one
                if (entity2, entity1, reverse_relation) not in existing_relations:
                    new_relations.add((entity2, entity1, reverse_relation))

    # Return all relations combined as a list of formatted strings
    return list(annotations) + [" ; ".join(relation) for relation in new_relations]

# Process each DataFrame
for df in [df_train, df_dev, df_test]:
    if not df.empty and 'annotations' in df.columns:
        df['annotations'] = df['annotations'].apply(process_annotations)
        df['annotations'] = df['annotations'].apply(clean_annotations)

# Display the updated DataFrames
df_train

Unnamed: 0,text,annotations
0,"Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.","[Apple Inc ; Steve Jobs ; founded by, Apple Inc ; Steve Jobs ; chief executive officer]"
1,"Last week, Citigroup Inc's Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.",[Vikram Pandit ; Citigroup ; employer]
2,"Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.","[Lehman Brothers ; investment bank ; product or material produced, Lehman Brothers ; investment ; industry]"
3,Franz told Reuters that Fiat Chief Executive Sergio Marchionne had said at a meeting on Monday he foresaw closing Opel's Kaiserslautern engine plant in Germany and other Fiat and Opel manufacturing sites in England and Italy.,[Sergio Marchionne ; Fiat ; employer]
4,"""In an industry that has a poor track record for M&A execution, Fiat CEO Sergio Marchionne has his work cut out for him,"" said Morgan Stanley analyst Adam Jonas.",[Sergio Marchionne ; Fiat ; employer]
...,...,...
5695,"In particular, he said leases of used A330-200 aircraft from Airbus Group SE would be five times cheaper than new ones.",[Airbus ; aircraft ; product or material produced]
5696,"The company is an omnipresent in households the world over thanks to its operations across multiple consumer goods markets -- from Nurofen pain suppressants and Dettol disinfectant through to French's mustard, Reckitt Benckiser has its fingers in many pies.",[Dettol ; Reckitt ; owned by]
5697,Related articles Mark Carney is Governor of the Bank of England,[Mark Carney ; Governor of the Bank of England ; position held]
5698,"E-commerce company eBay Inc is expected to report third-quarter revenue below expectations, according to some analysts.",[eBay ; e-commerce ; industry]


In [13]:
# Function to process annotations into a set of triplets
def convert_to_triplet_set(annotation):
    if isinstance(annotation, str):  # If stored as a string
        annotation = annotation.strip("[]")  # Remove brackets if mistakenly included
        annotation = annotation.split(",")  # Split into list
 
    if isinstance(annotation, list):  # Ensure it's a list now
        triplets = set()
        for item in annotation:
            parts = [x.strip() for x in item.split(";")]  # Split on ';' and remove extra spaces
            if len(parts) == 3:  # Only take valid triplets
                triplets.add(tuple(parts))
        return triplets
    return annotation  # Return original if not processable
 
# Apply transformation

for df in [df_train, df_dev, df_test]:
    df["annotations"] = df["annotations"].apply(convert_to_triplet_set)

df_train


Unnamed: 0,text,annotations
0,"Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.","{(Apple Inc, Steve Jobs, founded by), (Apple Inc, Steve Jobs, chief executive officer)}"
1,"Last week, Citigroup Inc's Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.","{(Vikram Pandit, Citigroup, employer)}"
2,"Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.","{(Lehman Brothers, investment, industry), (Lehman Brothers, investment bank, product or material produced)}"
3,Franz told Reuters that Fiat Chief Executive Sergio Marchionne had said at a meeting on Monday he foresaw closing Opel's Kaiserslautern engine plant in Germany and other Fiat and Opel manufacturing sites in England and Italy.,"{(Sergio Marchionne, Fiat, employer)}"
4,"""In an industry that has a poor track record for M&A execution, Fiat CEO Sergio Marchionne has his work cut out for him,"" said Morgan Stanley analyst Adam Jonas.","{(Sergio Marchionne, Fiat, employer)}"
...,...,...
5695,"In particular, he said leases of used A330-200 aircraft from Airbus Group SE would be five times cheaper than new ones.","{(Airbus, aircraft, product or material produced)}"
5696,"The company is an omnipresent in households the world over thanks to its operations across multiple consumer goods markets -- from Nurofen pain suppressants and Dettol disinfectant through to French's mustard, Reckitt Benckiser has its fingers in many pies.","{(Dettol, Reckitt, owned by)}"
5697,Related articles Mark Carney is Governor of the Bank of England,"{(Mark Carney, Governor of the Bank of England, position held)}"
5698,"E-commerce company eBay Inc is expected to report third-quarter revenue below expectations, according to some analysts.","{(eBay, e-commerce, industry)}"


In [14]:
prompt = f"""
You are an expert in relationship extraction. Your task is to extract relationships between entities in the given text.

## Output Format
Return a JSON object:
```json
{{
  "relations": [
    ["Entity1", "Entity2", "relation_type"],
    ...
  ]
}}

---



---

### Steps to follow :

Steps for Extraction
1. Read the entire sentence and understand the contextual meaning of the entire sentence.
2. Identify Entities

•⁠  Extract all relevant entities such as
- ORG: Companies, corporations, agencies (Apple, Goldman Sachs, NASA)
- PERSON: Individual humans (Tim Cook, Janet Yellen)
- PRODUCT: Goods, services, software (iPhone, Windows 11)
- LOCATION: Geographic places (New York, Japan)
- FINANCE: Currencies, exchanges (USD, NASDAQ)
- TITLE: Job positions (CEO, Director)
- BRAND: Consumer identities (Lexus, Instagram)
- PLATFORM: Digital ecosystems (iOS, AWS)

•⁠  Ignore entities without a direct connection to another entity.

3. Pair Entities Only If a Strong Connection Exists based on (ins_dic)
Do not generate all possible entity pairs.
- ORG-ORG:
- owned by
- parent organization
- owner of
- subsidiary
- business division
- platform
- operator
- brand
- stock exchange
- legal form

ORG-PERSON:
- owned by
- employer
- member of
- founded by

PERSON-ORG:
- creator
- member of
- chief executive officer
- director or manager
- chairperson
- owner of

ORG-LOCATION:
- headquarters location
- location of formation

ORG-PRODUCT:
- manufacturer
- distributed by
- developer
- industry

PRODUCT-ORG:
- original broadcaster
- publisher
- product or material produced

PERSON-PRODUCT:
- developer
- manufacturer

PERSON-TITLE:
- position held
- chief executive officer
- director or manager
- chairperson

PRODUCT-LOCATION:
- distribution format

PERSON-PERSON:
- position held
- member of

PRODUCT-PRODUCT:
- business division
- platform

ORG-FINANCE:
- stock exchange
- currency

ORG-TITLE:
- chief executive officer
- position held

ORG-BRAND:
- brand
- subsidiary



A relationship must be clearly stated in the text.

Example:

"Steve Jobs is the CEO of Apple." → ["Apple", "Steve Jobs", "chief executive officer"]

"Steve Jobs and Apple are mentioned in the same sentence." → No relation extracted.

4. Assign the Correct Relationship
Extract relationships only when explicitly stated or strongly implied.

Do not assume ownership, employment, or affiliation unless clearly described.

Certain relations require specific phrases:

"parent organization" → Only if the text states "X is the parent company of Y".

"owned by" → Only if ownership is explicitly stated (e.g., "X acquired Y").


If they are not mentioned here, ignore and do not extract the relationship.

IMPORTANT:
•⁠  A relation MUST be supported by clear indicators in the sentence (e.g., verbs like "owns", "developed by", or structural patterns).
•⁠  Do not assume relations just because entities are mentioned together.
•⁠  Directionality matters: ["X", "Y", "owner of"] ≠ ["Y", "X", "owned by"]
•⁠  Assume the first entity in the sentence by sequence is the subject.  relations and entity 2 follows.
•⁠  consider that there are bidirectional prompts: parent organization is the inverse of Subsidiary. The same way owner of is the inverse of owned by. Extract all where applies.
- handle overlap notations

## Examples

Text: "Apple CEO Tim Cook announced the new iPhone 12 at the company's headquarters in Cupertino."
CORRECT:
- [Apple, Tim Cook, employer] (ORG-PERSON)
- [Tim Cook, CEO, position held] (PERSON-TITLE)
- [Apple, iPhone 12, manufacturer] (ORG-PRODUCT)
- [Apple, Cupertino, headquarters location] (ORG-LOCATION)

Text: "Tim Cook visited London yesterday."
INCORRECT:
- [Tim Cook, London, visited] - "visited" is not in our relation list

Text: "Google partners with Microsoft on cloud solutions."
INCORRECT:
- [Google, Microsoft, partner] - "partner" is not in our relation list

---
Now extract relationships of these texts :
"""

In [15]:
# Output file paths
training_file = "train.jsonl"
validation_file = "validation.jsonl"

# Process the training dataset
with open(training_file, "w", encoding="utf-8") as outfile:
    for _, row in df_train.iterrows():
        text = row["text"].strip()
        triplets = row["annotations"]
        
        # Convert triplets set to JSON format
        triplets_str = "; ".join([f"({e1}, {e2}, {rel})" for e1, e2, rel in triplets])
        
        # Create JSONL entry
        entry = {
            "messages": [
                {"role": "system", "content": "You are an expert in relationship extraction that responds in JSON format. Your task is to extract only clear and explicitly stated relationships between entities in the given text. You MUST identify at least one relationship in each text."},
                {"role": "user", "content": f"{prompt}\nText to analyze: \n{text}"},
                {"role": "assistant", "content": triplets_str}
            ]
        }
        
        # Write to JSONL file
        outfile.write(json.dumps(entry) + "\n")

# Process the validation dataset
with open(validation_file, "w", encoding="utf-8") as outfile:
    for _, row in df_dev.iterrows():
        text = row["text"].strip()
        triplets = row["annotations"]
        
        # Convert triplets set to JSON format
        triplets_str = "; ".join([f"({e1}, {e2}, {rel})" for e1, e2, rel in triplets])
        
        # Create JSONL entry
        entry = {
            "messages": [
                {"role": "system", "content": "You are a relationship extraction model."},
                {"role": "user", "content": f"Extract relationships from the following text:\n{text}"},
                {"role": "assistant", "content": triplets_str}
            ]
        }
        
        # Write to JSONL file
        outfile.write(json.dumps(entry) + "\n")

print(f"Conversion complete. Saved as {training_file}")
print(f"Conversion complete. Saved as {validation_file}")

Conversion complete. Saved as train.jsonl
Conversion complete. Saved as validation.jsonl


In [16]:
import json
from collections import defaultdict

# Data paths
training_file = "train.jsonl"
validation_file = "validation.jsonl"

# Function to check format errors in a dataset
def check_dataset_format(data_path):
    # Load the dataset
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    # Initial dataset stats
    print(f"\nChecking {data_path}:")
    print("Num examples:", len(dataset))
    print("First example:")
    for message in dataset[0]["messages"]:
        print(message)

    # Format error checks
    format_errors = defaultdict(int)

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue
            
        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue
            
        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1
            
            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1
            
            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1
                
            content = message.get("content", None)
            function_call = message.get("function_call", None)
            
            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1
        
        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")
    
    return format_errors

# Check both datasets
training_errors = check_dataset_format(training_file)
validation_errors = check_dataset_format(validation_file)

# Overall summary
print("\nOverall Format Check Summary:")
if not training_errors and not validation_errors:
    print("Both datasets passed all format checks.")
else:
    if training_errors:
        print(f"Training dataset ({training_file}) has format errors.")
    else:
        print(f"Training dataset ({training_file}) passed all checks.")
        
    if validation_errors:
        print(f"Validation dataset ({validation_file}) has format errors.")
    else:
        print(f"Validation dataset ({validation_file}) passed all checks.")



Checking train.jsonl:
Num examples: 5700
First example:
{'role': 'system', 'content': 'You are an expert in relationship extraction that responds in JSON format. Your task is to extract only clear and explicitly stated relationships between entities in the given text. You MUST identify at least one relationship in each text.'}
{'role': 'user', 'content': '\nYou are an expert in relationship extraction. Your task is to extract relationships between entities in the given text.\n\n## Output Format\nReturn a JSON object:\n```json\n{\n  "relations": [\n    ["Entity1", "Entity2", "relation_type"],\n    ...\n  ]\n}\n\n---\n\n\n\n---\n\n### Steps to follow :\n\nSteps for Extraction\n1. Read the entire sentence and understand the contextual meaning of the entire sentence.\n2. Identify Entities\n\n•\u2060  Extract all relevant entities such as\n- ORG: Companies, corporations, agencies (Apple, Goldman Sachs, NASA)\n- PERSON: Individual humans (Tim Cook, Janet Yellen)\n- PRODUCT: Goods, services,

In [17]:
import tiktoken
import numpy as np
import json

# Initialize the tokenizer
encoding = tiktoken.get_encoding("cl100k_base")

# Define token counting functions
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

def analyze_dataset(file_path, dataset_name):
    print(f"\n\n{'='*50}")
    print(f"Analyzing {dataset_name} dataset: {file_path}")
    print(f"{'='*50}")
    
    # Load the dataset
    with open(file_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]
    
    print(f"Number of examples: {len(dataset)}")
    
    # Warnings and tokens counts
    n_missing_system = 0
    n_missing_user = 0
    n_messages = []
    convo_lens = []
    assistant_message_lens = []
    
    for ex in dataset:
        messages = ex["messages"]
        if not any(message["role"] == "system" for message in messages):
            n_missing_system += 1
        if not any(message["role"] == "user" for message in messages):
            n_missing_user += 1
        n_messages.append(len(messages))
        convo_lens.append(num_tokens_from_messages(messages))
        assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
        
    print("Num examples missing system message:", n_missing_system)
    print("Num examples missing user message:", n_missing_user)
    print_distribution(n_messages, "num_messages_per_example")
    print_distribution(convo_lens, "num_total_tokens_per_example")
    print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
    n_too_long = sum(l > 16385 for l in convo_lens)
    print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")
    
    # Pricing and default n_epochs estimate
    MAX_TOKENS_PER_EXAMPLE = 16385
    
    TARGET_EPOCHS = 3
    MIN_TARGET_EXAMPLES = 100
    MAX_TARGET_EXAMPLES = 25000
    MIN_DEFAULT_EPOCHS = 1
    MAX_DEFAULT_EPOCHS = 25
    
    n_epochs = TARGET_EPOCHS
    n_train_examples = len(dataset)
    if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
        n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
    elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
        n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
    
    n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
    print(f"\nDataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
    print(f"By default, we'll train for {n_epochs} epochs ~ {n_epochs * n_billing_tokens_in_dataset} tokens on this dataset")
    
    return {
        "examples": n_train_examples,
        "billing_tokens": n_billing_tokens_in_dataset,
        "recommended_epochs": n_epochs,
        "over_limit_examples": n_too_long
    }

# Define file paths
training_file = "train.jsonl"
validation_file = "validation.jsonl"

# Analyze both datasets
train_stats = analyze_dataset(training_file, "Training")
val_stats = analyze_dataset(validation_file, "Validation")

# Print combined summary
print("\n\n" + "="*50)
print("COMBINED SUMMARY")
print("="*50)
print(f"Training dataset: {train_stats['examples']} examples, ~{train_stats['billing_tokens']} tokens")
print(f"Validation dataset: {val_stats['examples']} examples, ~{val_stats['billing_tokens']} tokens")
print(f"Total billable tokens: ~{train_stats['billing_tokens'] + val_stats['billing_tokens']}")
print(f"Recommended epochs for training: {train_stats['recommended_epochs']}")
print(f"Total examples over token limit: {train_stats['over_limit_examples'] + val_stats['over_limit_examples']}")



Analyzing Training dataset: train.jsonl
Number of examples: 5700
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 1001, 2415
mean / median: 1053.897894736842, 1046.0
p5 / p95: 1026.0, 1078.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 6, 418
mean / median: 15.971929824561403, 11.0
p5 / p95: 8.0, 30.0

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning

Dataset has ~6007218 tokens that will be charged for during training
By default, we'll train for 3 epochs ~ 18021654 tokens on this dataset


Analyzing Validation dataset: validation.jsonl
Number of examples: 1007
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p9

In [None]:
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", API_KEY))

In [None]:
train_set_file = client.files.create(
    file = open('train.jsonl', 'rb'),
    purpose = "fine-tune"
)

validation_set_file = client.files.create(
    file = open('validation.jsonl', 'rb'),
    purpose = "fine-tune"
)

print(train_set_file.id)
print(validation_set_file.id)

file-AWi3taR96HGWEvfA2qJgsd
file-TZPByYeqjYWvkfcJaxhv7W


In [None]:
response = client.fine_tuning.jobs.create(
    training_file=train_set_file.id,
#    validation_file=validation_set_file.id,
    model='gpt-4o-mini-2024-07-18',
    hyperparameters={
#        "n_epochs": 5,
#        "batch_size": 32,
#        "learning_rate_multiplier": 0.5
    }
)
print(response.id)
print(response.fine_tuned_model)
print(response.result_files)
print(response.trained_tokens)

ftjob-tGLqiT9XbKfaCGS6RFD0M3At
None
[]
None


In [None]:
client.fine_tuning.jobs.retrieve(response.id)

FineTuningJob(id='ftjob-tGLqiT9XbKfaCGS6RFD0M3At', created_at=1743682051, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:university-edinburgh::BIEAkmsC', finished_at=1743684079, hyperparameters=Hyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-uAtoImi2lqXXzOw9bIRsBaOY', result_files=['file-82KA5cEotZ9a3siAK9X6o6'], seed=1336941539, status='succeeded', trained_tokens=12653469, training_file='file-AWi3taR96HGWEvfA2qJgsd', validation_file=None, estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix=None)

In [None]:
client.fine_tuning.jobs.retrieve('ftjob-M2bzBopasxGgnvkeydJwJfvF')

FineTuningJob(id='ftjob-M2bzBopasxGgnvkeydJwJfvF', created_at=1743681996, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:university-edinburgh::BIELCI6l', finished_at=1743684727, hyperparameters=Hyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-uAtoImi2lqXXzOw9bIRsBaOY', result_files=['file-9NtLYpcfLqxESoRMLcvsQo'], seed=1084706154, status='succeeded', trained_tokens=12653469, training_file='file-AWi3taR96HGWEvfA2qJgsd', validation_file='file-TZPByYeqjYWvkfcJaxhv7W', estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix=None)

In [None]:
client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-tGLqiT9XbKfaCGS6RFD0M3At', created_at=1743682051, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:university-edinburgh::BIEAkmsC', finished_at=1743684079, hyperparameters=Hyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-uAtoImi2lqXXzOw9bIRsBaOY', result_files=['file-82KA5cEotZ9a3siAK9X6o6'], seed=1336941539, status='succeeded', trained_tokens=12653469, training_file='file-AWi3taR96HGWEvfA2qJgsd', validation_file=None, estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix=None), FineTuningJob(id='ftjob-M2bzBopasxGgnvkeydJwJfvF', created_at=1743681996, error=Error(code=None, message=N

In [29]:
import base64
import pandas as pd
import io


result_files = 'file-82KA5cEotZ9a3siAK9X6o6'
job_id = 'ftjob-tGLqiT9XbKfaCGS6RFD0M3At'

def get_finetuning_metrics(client, job_id):
    def replace_colons_with_underscored(input_string):
        return(input_string).replace(':', '_')
    
    fine_tune_job = client.fine_tuning.jobs.retrieve(job_id)

    file_id = fine_tune_job.result_files[0]

    response = client.files.content(file_id)

    decoded_content = base64.b64decode(response.content).decode('utf-8')

    df = pd.read_csv(io.StringIO(decoded_content))

    return df

get_finetuning_metrics(client, job_id)

Unnamed: 0,step,train_loss,train_accuracy,valid_loss,valid_mean_token_accuracy,train_mean_reward,full_validation_mean_reward
0,1,4.34239,0.54545,,,,
1,2,3.75728,0.59600,,,,
2,3,4.92078,0.50649,,,,
3,4,4.28701,0.56186,,,,
4,5,5.72796,0.47656,,,,
...,...,...,...,...,...,...,...
1550,1551,0.06813,0.96471,,,,
1551,1552,0.02499,0.98788,,,,
1552,1553,0.13753,0.96648,,,,
1553,1554,0.04711,0.98658,,,,


In [None]:
# Get the metrics dataframe
metrics_df = get_finetuning_metrics(client, job_id)

# Display only train_loss and train_accuracy columns
print("Fine-tuning metrics for job:", job_id)
print("-" * 80)

# Check if the columns exist (column names might vary)
if 'train_loss' in metrics_df.columns and 'train_accuracy' in metrics_df.columns:
    # Print only these two columns with step for context
    print(metrics_df[['step', 'train_loss', 'train_accuracy']].to_string(index=False))
elif 'training_loss' in metrics_df.columns and 'training_accuracy' in metrics_df.columns:
    # Alternative column names
    print(metrics_df[['step', 'training_loss', 'training_accuracy']].to_string(index=False))
else:
    # If column names are different, print available columns
    print("Columns 'train_loss' and 'train_accuracy' not found.")
    print("Available columns:", metrics_df.columns.tolist())
    
    # Try to find columns with similar names
    loss_cols = [col for col in metrics_df.columns if 'loss' in col.lower()]
    accuracy_cols = [col for col in metrics_df.columns if 'accuracy' in col.lower() or 'acc' in col.lower()]
    
    if loss_cols and accuracy_cols:
        print(f"Using columns: {loss_cols[0]} and {accuracy_cols[0]}")
        print(metrics_df[['step', loss_cols[0], accuracy_cols[0]]].to_string(index=False))