In [None]:
!pip install google-genai
!pip install tiktoken







In [2]:
from google import genai
from google.genai import types
import pandas as pd
import numpy as np
import re
import os
import json
import base64
import time
import tiktoken
from collections import defaultdict, Counter
from pydantic import BaseModel
from typing import List, Dict, Tuple, Set
from tqdm import tqdm  # For progress tracking
import matplotlib.pyplot as plt

In [3]:
#1.Process Data
# function to extract sentences and annotations
def extract_data(text):
    data = []
    sentences = text.strip().split('\n')
    for sentence in sentences:
        parts = sentence.split('|')
        if len(parts) < 2:  # Ensure valid format
            continue  # Skip invalid lines
        text = parts[0].strip()
        annotations = [part.strip() for part in parts[1:]]
        data.append({'text': text, 'annotations': annotations})
    return data

# reading the file
with open(r'train.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# extracting data
data = extract_data(text)

# converting to DataFrame for easier handling
df = pd.DataFrame(data)
pd.set_option('display.max_colwidth', None)
print(df.head())

                                                                                                                                                                                                                                text  \
0              NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.   
1                                                          Last week, Citigroup Inc's ( C.N ) Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.   
2                                                             Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.   
3  Franz told Reuters that Fiat Chief Executive Sergio Marchionne had sa

In [4]:
# cleaning function

def clean_text(text: str) -> str:
    """
    Cleans text by removing:
    - HTML tags
    - HTML entities
    - Non-ASCII characters
    - Control characters
    - Extra spaces
    """
    #Replace &#039 with '
    text = text.replace('&#039;',"'")
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove HTML entities (like &quot;, &#039)
    text = re.sub(r'&[a-zA-Z0-9#]+;', ' ', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Remove control characters
    text = re.sub(r'[\x00-\x1F\x7F]', '', text)
    # Normalize quotes and apostrophes
    text = text.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
    text = re.sub(r'[“”‘’]', '"', text)  # Normalize quotes
    # Remove multiple spaces and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

     # Removing source citations and converting to lowercase
    #cleaned_text = re.sub(r'\([^)]*\)', '', text).lower()  # Remove citations in parentheses
    # Keep punctuation like periods and commas for sentence boundaries but remove other punctuation
    #cleaned_text = re.sub(r'[^\w\s.,!?]', '', cleaned_text)
    # Replace multiple spaces with a single space
    #cleaned_text = re.sub(r'[“”‘’]', '"', text)  # Normalize quotes
    #cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return text


def clean_annotations(annotations):  
    return [annotation.replace('_', ' ') for annotation in annotations] 
    
df['text'] = df['text'].apply(clean_text)
df['annotations'] = df['annotations'].apply(clean_annotations)
df

Unnamed: 0,text,annotations
0,"NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.","[Apple Inc ; Steve Jobs ; founded by, Apple Inc ; Steve Jobs ; chief executive officer]"
1,"Last week, Citigroup Inc's ( C.N ) Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.",[Vikram Pandit ; Citigroup ; employer]
2,"Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.","[Lehman Brothers ; investment bank ; product or material produced, Lehman Brothers ; investment ; industry]"
3,Franz told Reuters that Fiat Chief Executive Sergio Marchionne had said at a meeting on Monday he foresaw closing Opel's Kaiserslautern engine plant in Germany and other Fiat and Opel manufacturing sites in England and Italy.,[Sergio Marchionne ; Fiat ; employer]
4,"""In an industry that has a poor track record for M&A execution, Fiat CEO Sergio Marchionne has his work cut out for him,"" said Morgan Stanley analyst Adam Jonas.",[Sergio Marchionne ; Fiat ; employer]
...,...,...
5695,"In particular, he said leases of used A330-200 aircraft from Airbus Group SE (Xetra: A1XBMK - news ) would be five times cheaper than new ones.",[Airbus ; aircraft ; product or material produced]
5696,"The company is an omnipresent in households the world over thanks to its operations across multiple consumer goods markets -- from Nurofen pain suppressants and Dettol disinfectant through to French's mustard, Reckitt Benckiser has its fingers in many pies.",[Dettol ; Reckitt ; owned by]
5697,Related articles Mark Carney is Governor of the Bank of England,[Mark Carney ; Governor of the Bank of England ; position held]
5698,"E-commerce company eBay Inc is expected to report third-quarter revenue below expectations, according to some analysts.",[eBay ; e-commerce ; industry]


In [5]:
# Load and update relations from the file
with open('relations.txt', 'r') as file:
    relations = [line.replace("product/material produced", "product or material produced")
                     .replace("director/manager", "director / manager") 
                 for line in file.read().splitlines()]
 
# Convert to formatted string
relations_list = ', '.join(f"'{relation}'" for relation in relations)
 
print(relations_list)  # Check the updated relations

'product or material produced', 'manufacturer', 'distributed by', 'industry', 'position held', 'original broadcaster', 'owned by', 'founded by', 'distribution format', 'headquarters location', 'stock exchange', 'currency', 'parent organization', 'chief executive officer', 'director / manager', 'owner of', 'operator', 'member of', 'employer', 'chairperson', 'platform', 'subsidiary', 'legal form', 'publisher', 'developer', 'brand', 'business division', 'location of formation', 'creator'


In [6]:
# Define the function to parse annotations
def parse_annotation(annotation_str):
    """
    Parse an annotation string in the format "entity1 ; entity2 ; relation"
    Returns a tuple (entity1, entity2, relation) or None if parsing fails
    """
    try:
        parts = annotation_str.split(" ; ")
        if len(parts) == 3:
            entity1, entity2, relation = parts
            return (entity1, entity2, relation)
        return None
    except Exception:
        return None

# Define the directional relationship pairs
relationship_pairs = {
    "product or material produced": "manufacturer",
    "manufacturer": "product or material produced",
    "position held": "employer",
    "employer": "position held",
    "owned by": "owner of",
    "owner of": "owned by",
    "parent organization": "subsidiary",
    "subsidiary": "parent organization",
}

# Re-run the annotation processing with the defined relationship pairs
for index, row in df.iterrows():
    existing_relations = set()

    # Extract existing relationships
    for annotation in row["annotations"]:
        parsed = parse_annotation(annotation)
        if parsed:
            existing_relations.add(parsed)

    # Identify missing reverse relationships
    new_relations = set()
    for entity1, entity2, relation in existing_relations:
        if relation in relationship_pairs:
            reverse_relation = relationship_pairs[relation]
            if (entity2, entity1, reverse_relation) not in existing_relations:
                new_relations.add((entity2, entity1, reverse_relation))
        elif relation in relationship_pairs.values():
            # Note: This may have issues if multiple keys map to the same value
            # Get all keys that map to this value
            reverse_relations = [k for k, v in relationship_pairs.items() if v == relation]
            if reverse_relations:
                reverse_relation = reverse_relations[0]  # Take the first one
                if (entity2, entity1, reverse_relation) not in existing_relations:
                    new_relations.add((entity2, entity1, reverse_relation))

    # Append new relations to existing ones, ensuring all annotations remain in a single list per row
    row["annotations"].extend([" ; ".join(relation) for relation in new_relations])

# Display the updated dataframe
df


Unnamed: 0,text,annotations
0,"NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.","[Apple Inc ; Steve Jobs ; founded by, Apple Inc ; Steve Jobs ; chief executive officer]"
1,"Last week, Citigroup Inc's ( C.N ) Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.","[Vikram Pandit ; Citigroup ; employer, Citigroup ; Vikram Pandit ; position held]"
2,"Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.","[Lehman Brothers ; investment bank ; product or material produced, Lehman Brothers ; investment ; industry, investment bank ; Lehman Brothers ; manufacturer]"
3,Franz told Reuters that Fiat Chief Executive Sergio Marchionne had said at a meeting on Monday he foresaw closing Opel's Kaiserslautern engine plant in Germany and other Fiat and Opel manufacturing sites in England and Italy.,"[Sergio Marchionne ; Fiat ; employer, Fiat ; Sergio Marchionne ; position held]"
4,"""In an industry that has a poor track record for M&A execution, Fiat CEO Sergio Marchionne has his work cut out for him,"" said Morgan Stanley analyst Adam Jonas.","[Sergio Marchionne ; Fiat ; employer, Fiat ; Sergio Marchionne ; position held]"
...,...,...
5695,"In particular, he said leases of used A330-200 aircraft from Airbus Group SE (Xetra: A1XBMK - news ) would be five times cheaper than new ones.","[Airbus ; aircraft ; product or material produced, aircraft ; Airbus ; manufacturer]"
5696,"The company is an omnipresent in households the world over thanks to its operations across multiple consumer goods markets -- from Nurofen pain suppressants and Dettol disinfectant through to French's mustard, Reckitt Benckiser has its fingers in many pies.","[Dettol ; Reckitt ; owned by, Reckitt ; Dettol ; owner of]"
5697,Related articles Mark Carney is Governor of the Bank of England,"[Mark Carney ; Governor of the Bank of England ; position held, Governor of the Bank of England ; Mark Carney ; employer]"
5698,"E-commerce company eBay Inc is expected to report third-quarter revenue below expectations, according to some analysts.",[eBay ; e-commerce ; industry]


In [7]:
# Function to process annotations into a set of triplets
def convert_to_triplet_set(annotation):
    if isinstance(annotation, str):  # If stored as a string
        annotation = annotation.strip("[]")  # Remove brackets if mistakenly included
        annotation = annotation.split(",")  # Split into list
 
    if isinstance(annotation, list):  # Ensure it's a list now
        triplets = set()
        for item in annotation:
            parts = [x.strip() for x in item.split(";")]  # Split on ';' and remove extra spaces
            if len(parts) == 3:  # Only take valid triplets
                triplets.add(tuple(parts))
        return triplets
    return annotation  # Return original if not processable
 
# Apply transformation
df["structured_annotations"] = df["annotations"].apply(convert_to_triplet_set)
 
df

Unnamed: 0,text,annotations,structured_annotations
0,"NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.","[Apple Inc ; Steve Jobs ; founded by, Apple Inc ; Steve Jobs ; chief executive officer]","{(Apple Inc, Steve Jobs, founded by), (Apple Inc, Steve Jobs, chief executive officer)}"
1,"Last week, Citigroup Inc's ( C.N ) Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.","[Vikram Pandit ; Citigroup ; employer, Citigroup ; Vikram Pandit ; position held]","{(Vikram Pandit, Citigroup, employer), (Citigroup, Vikram Pandit, position held)}"
2,"Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.","[Lehman Brothers ; investment bank ; product or material produced, Lehman Brothers ; investment ; industry, investment bank ; Lehman Brothers ; manufacturer]","{(Lehman Brothers, investment bank, product or material produced), (Lehman Brothers, investment, industry), (investment bank, Lehman Brothers, manufacturer)}"
3,Franz told Reuters that Fiat Chief Executive Sergio Marchionne had said at a meeting on Monday he foresaw closing Opel's Kaiserslautern engine plant in Germany and other Fiat and Opel manufacturing sites in England and Italy.,"[Sergio Marchionne ; Fiat ; employer, Fiat ; Sergio Marchionne ; position held]","{(Sergio Marchionne, Fiat, employer), (Fiat, Sergio Marchionne, position held)}"
4,"""In an industry that has a poor track record for M&A execution, Fiat CEO Sergio Marchionne has his work cut out for him,"" said Morgan Stanley analyst Adam Jonas.","[Sergio Marchionne ; Fiat ; employer, Fiat ; Sergio Marchionne ; position held]","{(Sergio Marchionne, Fiat, employer), (Fiat, Sergio Marchionne, position held)}"
...,...,...,...
5695,"In particular, he said leases of used A330-200 aircraft from Airbus Group SE (Xetra: A1XBMK - news ) would be five times cheaper than new ones.","[Airbus ; aircraft ; product or material produced, aircraft ; Airbus ; manufacturer]","{(Airbus, aircraft, product or material produced), (aircraft, Airbus, manufacturer)}"
5696,"The company is an omnipresent in households the world over thanks to its operations across multiple consumer goods markets -- from Nurofen pain suppressants and Dettol disinfectant through to French's mustard, Reckitt Benckiser has its fingers in many pies.","[Dettol ; Reckitt ; owned by, Reckitt ; Dettol ; owner of]","{(Reckitt, Dettol, owner of), (Dettol, Reckitt, owned by)}"
5697,Related articles Mark Carney is Governor of the Bank of England,"[Mark Carney ; Governor of the Bank of England ; position held, Governor of the Bank of England ; Mark Carney ; employer]","{(Mark Carney, Governor of the Bank of England, position held), (Governor of the Bank of England, Mark Carney, employer)}"
5698,"E-commerce company eBay Inc is expected to report third-quarter revenue below expectations, according to some analysts.",[eBay ; e-commerce ; industry],"{(eBay, e-commerce, industry)}"


In [8]:
def classify_overlap_annotations(relations: Set[Tuple[str, str, str]]) -> Set[Tuple[str, str, str, str]]:
    """
    Identify overlap type for each relationship and return annotated tuples.
    Works with a set of (entity1, entity2, relation) tuples.
 
    Parameters:
        relations (Set[Tuple[str, str, str]]): Set of extracted relationships in the format:
        {
            ("Apple Inc", "Steve Jobs", "founded by"),
            ("Apple Inc", "Steve Jobs", "chief executive officer"),
            ("Apple Inc", "Tim Cook", "chief executive officer")
        }
 
    Returns:
        Set[Tuple[str, str, str, str]]: Set of annotated relationships in the format:
        {
            ("Apple Inc", "Steve Jobs", "founded by", "EPO"),
            ("Apple Inc", "Steve Jobs", "chief executive officer", "EPO"),
            ("Apple Inc", "Tim Cook", "chief executive officer", "SEO"),
        }
    """
    
    entity_pairs = defaultdict(int)
    individual_counts = defaultdict(int)
    overlap_annotations = set()
    
    if not isinstance(relations, set) or not relations:  # Ensure relations is a non-empty set
        return set()  # Return empty set for invalid input
    
    # First pass: collect counts for entity pairs and individual entities
    for entity1, entity2, relation in relations:
        # Normalize the entity pair to make order irrelevant
        pair = tuple(sorted([entity1, entity2]))
        
        # Increment count for the entity pair
        entity_pairs[pair] += 1
        
        # Increment count for each individual entity
        individual_counts[entity1] += 1
        individual_counts[entity2] += 1
    
    # Second pass: annotate each relationship with its overlap type
    for entity1, entity2, relation in relations:
        pair = tuple(sorted([entity1, entity2]))

        # Determine overlap types
        if entity_pairs[pair] >= 2:
            overlap_annotations.add((entity1, entity2, relation, "EPO"))
        elif individual_counts[entity1] >= 2 or individual_counts[entity2] >= 2:
            overlap_annotations.add((entity1, entity2, relation, "SEO"))
        else:
            overlap_annotations.add((entity1, entity2, relation, "NEO"))
    
    return overlap_annotations

# Apply the new function to create the overlap_annotations column
df["overlap_annotations"] = df["structured_annotations"].apply(classify_overlap_annotations)

# Create the new column 'multi_annotations' by counting the number of tuples in each entry
df["multi_annotations"] = df["structured_annotations"].apply(len)

df

Unnamed: 0,text,annotations,structured_annotations,overlap_annotations,multi_annotations
0,"NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.","[Apple Inc ; Steve Jobs ; founded by, Apple Inc ; Steve Jobs ; chief executive officer]","{(Apple Inc, Steve Jobs, founded by), (Apple Inc, Steve Jobs, chief executive officer)}","{(Apple Inc, Steve Jobs, founded by, EPO), (Apple Inc, Steve Jobs, chief executive officer, EPO)}",2
1,"Last week, Citigroup Inc's ( C.N ) Chief Executive Vikram Pandit said that he, Chairman Win Bischoff, and senior adviser Robert Rubin would not receive bonuses for 2008.","[Vikram Pandit ; Citigroup ; employer, Citigroup ; Vikram Pandit ; position held]","{(Vikram Pandit, Citigroup, employer), (Citigroup, Vikram Pandit, position held)}","{(Vikram Pandit, Citigroup, employer, EPO), (Citigroup, Vikram Pandit, position held, EPO)}",2
2,"Lehman Brothers LEH.N shares fell sharply on Monday on speculation that the investment bank could be bought for $15 a share, a price well below current market levels.","[Lehman Brothers ; investment bank ; product or material produced, Lehman Brothers ; investment ; industry, investment bank ; Lehman Brothers ; manufacturer]","{(Lehman Brothers, investment bank, product or material produced), (Lehman Brothers, investment, industry), (investment bank, Lehman Brothers, manufacturer)}","{(Lehman Brothers, investment bank, product or material produced, EPO), (Lehman Brothers, investment, industry, SEO), (investment bank, Lehman Brothers, manufacturer, EPO)}",3
3,Franz told Reuters that Fiat Chief Executive Sergio Marchionne had said at a meeting on Monday he foresaw closing Opel's Kaiserslautern engine plant in Germany and other Fiat and Opel manufacturing sites in England and Italy.,"[Sergio Marchionne ; Fiat ; employer, Fiat ; Sergio Marchionne ; position held]","{(Sergio Marchionne, Fiat, employer), (Fiat, Sergio Marchionne, position held)}","{(Sergio Marchionne, Fiat, employer, EPO), (Fiat, Sergio Marchionne, position held, EPO)}",2
4,"""In an industry that has a poor track record for M&A execution, Fiat CEO Sergio Marchionne has his work cut out for him,"" said Morgan Stanley analyst Adam Jonas.","[Sergio Marchionne ; Fiat ; employer, Fiat ; Sergio Marchionne ; position held]","{(Sergio Marchionne, Fiat, employer), (Fiat, Sergio Marchionne, position held)}","{(Sergio Marchionne, Fiat, employer, EPO), (Fiat, Sergio Marchionne, position held, EPO)}",2
...,...,...,...,...,...
5695,"In particular, he said leases of used A330-200 aircraft from Airbus Group SE (Xetra: A1XBMK - news ) would be five times cheaper than new ones.","[Airbus ; aircraft ; product or material produced, aircraft ; Airbus ; manufacturer]","{(Airbus, aircraft, product or material produced), (aircraft, Airbus, manufacturer)}","{(aircraft, Airbus, manufacturer, EPO), (Airbus, aircraft, product or material produced, EPO)}",2
5696,"The company is an omnipresent in households the world over thanks to its operations across multiple consumer goods markets -- from Nurofen pain suppressants and Dettol disinfectant through to French's mustard, Reckitt Benckiser has its fingers in many pies.","[Dettol ; Reckitt ; owned by, Reckitt ; Dettol ; owner of]","{(Reckitt, Dettol, owner of), (Dettol, Reckitt, owned by)}","{(Reckitt, Dettol, owner of, EPO), (Dettol, Reckitt, owned by, EPO)}",2
5697,Related articles Mark Carney is Governor of the Bank of England,"[Mark Carney ; Governor of the Bank of England ; position held, Governor of the Bank of England ; Mark Carney ; employer]","{(Mark Carney, Governor of the Bank of England, position held), (Governor of the Bank of England, Mark Carney, employer)}","{(Governor of the Bank of England, Mark Carney, employer, EPO), (Mark Carney, Governor of the Bank of England, position held, EPO)}",2
5698,"E-commerce company eBay Inc is expected to report third-quarter revenue below expectations, according to some analysts.",[eBay ; e-commerce ; industry],"{(eBay, e-commerce, industry)}","{(eBay, e-commerce, industry, NEO)}",1


In [9]:
df['multi_annotations'].value_counts()

multi_annotations
2     2422
1     2273
4      385
3      362
5       96
6       78
8       28
7       23
9       10
12       5
10       5
14       4
11       3
13       2
15       1
27       1
24       1
37       1
Name: count, dtype: int64

In [11]:
def convert_to_tuple_set(result_dict):
    """
    Convert the relation format from the API response to a set of tuples.
    Output format: {(entity1, entity2, relationship), ...}
    
    Parameters:
    - result_dict: The parsed JSON response from the API.
    
    Returns:
    - A set of tuples representing relationships.
    """
    relation_tuples = set()
    
    # Check if the response contains relationships
    if isinstance(result_dict, list):
        for relation in result_dict:
            # Ensure that we have the required fields
            entity1 = relation.get('entity1', '')
            entity2 = relation.get('entity2', '')
            rel = relation.get('relationship', '')
            
            # Only add if we have all three fields
            if entity1 and entity2 and rel:
                relation_tuples.add((entity1, entity2, rel))
    
    return relation_tuples


In [None]:
client = genai.Client(api_key=API_KEY)

class relationships(BaseModel):
    entity1: str
    entity2: str
    relationship: str

response = client.models.generate_content( 
    model='gemini-2.0-flash', 
    contents="""Give me relationship: Yum China will become a franchise of Yum Brands in Mainland China, the parent of KFC, Pizza Hut and Taco Bell chains said.
Warren Buffett's Berkshire Hathaway (Sao Paolo: BERK34F.SA - news ) this month also launched its first cyber policies through its specialty insurance division.
In the wake of last year's attack on Sony Pictures Entertainment, parent Sony Corp said its financial condition could suffer if it were attacked again, since current policies "might not cover all expenses and losses.""", 
    config={ 
        'response_mime_type': 'application/json',
        'response_schema': relationships,
    }, 
 )

response.parsed

relationships(entity1='Yum China', entity2='Yum Brands', relationship='franchise')

In [None]:
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents="""Extract the text below and provide me with the relationship triplets in this JSON format {(Apple Inc, Steve Jobs, chief_executive_officer), (Apple Inc, Steve Jobs, founded_by)}: Yum China will become a franchise of Yum Brands in Mainland China, the parent of KFC, Pizza Hut and Taco Bell chains said.
Warren Buffett's Berkshire Hathaway (Sao Paolo: BERK34F.SA - news ) this month also launched its first cyber policies through its specialty insurance division.
In the wake of last year's attack on Sony Pictures Entertainment, parent Sony Corp said its financial condition could suffer if it were attacked again, since current policies "might not cover all expenses and losses."""
)
print(response.text)

print(response.model_dump_json(
    exclude_none=True, indent=4))

```json
[
    {
        "subject": "Yum China",
        "relation": "franchise_of",
        "object": "Yum Brands"
    },
    {
        "subject": "Yum Brands",
        "relation": "parent_of",
        "object": "KFC"
    },
     {
        "subject": "Yum Brands",
        "relation": "parent_of",
        "object": "Pizza Hut"
    },
     {
        "subject": "Yum Brands",
        "relation": "parent_of",
        "object": "Taco Bell"
    },
    {
        "subject": "Berkshire Hathaway",
        "relation": "owned_by",
        "object": "Warren Buffett"
    },
    {
        "subject": "Sony Pictures Entertainment",
        "relation": "parent_of",
        "object": "Sony Corp"
    }
]
```
{
    "candidates": [
        {
            "content": {
                "parts": [
                    {
                        "text": "```json\n[\n    {\n        \"subject\": \"Yum China\",\n        \"relation\": \"franchise_of\",\n        \"object\": \"Yum Brands\"\n    },\n    {\n        \"subjec

In [None]:
# Define the data model for the response
class Relationships(BaseModel):
    entity1: str
    entity2: str
    relationship: str

def process_batch_with_genai(df, text_column, prompt, batch_size=10, model='gemini-2.0-flash', max_retries=2):
    """
    Process texts from a dataframe column in batches through the GenAI API
    
    Parameters:
    - df: pandas DataFrame containing the text data
    - text_column: name of the column containing text to process
    - prompt: your instruction prompt to prepend to each text
    - batch_size: number of items to process in each batch
    - model: model identifier for the GenAI API
    - max_retries: maximum number of retries for empty results
    
    Returns:
    - DataFrame with original data and results column added
    """
    # Create a copy of the dataframe to avoid modifying the original
    result_df = df.copy()
    
    # Add a column for results
    result_df['genai_results'] = None  # Change column name to reflect new API
    
    # Calculate number of batches
    num_batches = (len(df) + batch_size - 1) // batch_size
    
    # Process in batches
    for i in tqdm(range(num_batches), desc="Processing batches"):
        # Get the current batch
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        batch_df = df.iloc[start_idx:end_idx]
        
        batch_results = []
        
        # Process each item in the batch
        for idx, row in batch_df.iterrows():
            text = row[text_column]
            
            # Initialize retry counter and result
            retries = 0
            standardized_result = set()
            success = False
            
            while retries <= max_retries and not success:
                try:
                    # Create a more insistent prompt if this is a retry
                    retry_instruction = ""
                    if retries > 0:
                        retry_instruction = "IMPORTANT: The previous analysis found no relationships. Please try again."
                    
                    # Combine prompt with retry instruction if needed
                    current_prompt = f"{prompt}\n\n{retry_instruction}" if retry_instruction else prompt
                    contents = f"{current_prompt}\n\nText to analyze: {text}"
                    
                    # Prepare the request to the GenAI API
                    response = client.models.generate_content(
                        model=model,
                        contents=contents,
                        config={
                            'response_mime_type': 'application/json',
                            'response_schema': Relationships,
                        },
                    )
                    
                    # Access the parsed response
                    json_response = response.parsed
                    
                    # Convert JSON response to a set of tuples
                    standardized_result = convert_to_tuple_set(json_response)
                    
                    if standardized_result and len(standardized_result) > 0:
                        success = True
                    else:
                        print(f"Row {idx}: No relationships found on attempt {retries + 1}. Retrying...")
                        retries += 1
                    
                except Exception as e:
                    print(f"Error processing row {idx} (attempt {retries + 1}): {str(e)}")
                    retries += 1
                    
                    if retries > max_retries:
                        standardized_result = {("error", str(e), "")}
            
            # Add the final result (either successful or after max retries)
            batch_results.append(standardized_result)
        
        # Update the results in the dataframe
        for i, idx in enumerate(range(start_idx, end_idx)):
            if idx < len(result_df):
                result_df.at[idx, 'genai_results'] = batch_results[i]
    
    return result_df

# Define your prompt with stronger emphasis on finding relationships
prompt = f"""
Extract structured financial relationships from the given text.
Each relation in the array should be in the following format:
[entity1, entity2, relationship]

IMPORTANT: You MUST identify at least one relationship in the text.

For example, if Apple Inc was founded by Steve Jobs and Steve Jobs is the CEO:
{{
    "relations": [
        ["Apple Inc", "Steve Jobs", "founded by"],
        ["Apple Inc", "Steve Jobs", "chief executive officer"]
    ]
}}

Ensure that directional relationships are maintained:
- 'product or material produced' should be mapped to 'manufacturer' and vice versa.
- 'position held' should be mapped to 'employer' and vice versa.
- 'owned by' should be mapped to 'owner of' and vice versa.
- 'parent organization' should be mapped to 'subsidiary' and vice versa.

Look for both:
- Direct relationships (e.g., 'Company A owns Company B')
- Indirect relationships (e.g., 'Company A, a subsidiary of Company B, operates under its parent organization')
- Implied relationships (e.g., 'John, CEO of Company X, announced...' implies a 'chief executive officer' relationship)

Even if the relationship is subtle or implied, you must extract it.
"""

# Function to split the dataframe into chunks
def split_dataframe_into_chunks(df, chunk_size=50):
    """Split a dataframe into chunks of specified size"""
    chunks = []
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(df))
        chunks.append(df.iloc[start_idx:end_idx].copy())
    
    return chunks

# Main execution
if __name__ == "__main__":
    # Split the dataframe into chunks
    df_chunks = split_dataframe_into_chunks(df, chunk_size=6)
    
    # Process just the first chunk
    if df_chunks:
        first_chunk_df = df_chunks[0]
        result_df = process_batch_with_genai(first_chunk_df, 'text', prompt, batch_size=3, max_retries=2)


Processing batches:   0%|          | 0/2 [00:00<?, ?it/s]

Row 0: No relationships found on attempt 1. Retrying...
Row 0: No relationships found on attempt 2. Retrying...
Row 0: No relationships found on attempt 3. Retrying...
Row 1: No relationships found on attempt 1. Retrying...
Row 1: No relationships found on attempt 2. Retrying...
Row 1: No relationships found on attempt 3. Retrying...
Row 2: No relationships found on attempt 1. Retrying...
Row 2: No relationships found on attempt 2. Retrying...


Processing batches:  50%|█████     | 1/2 [00:06<00:06,  6.40s/it]

Row 2: No relationships found on attempt 3. Retrying...
Row 3: No relationships found on attempt 1. Retrying...
Row 3: No relationships found on attempt 2. Retrying...
Row 3: No relationships found on attempt 3. Retrying...
Row 4: No relationships found on attempt 1. Retrying...
Row 4: No relationships found on attempt 2. Retrying...
Row 4: No relationships found on attempt 3. Retrying...


Processing batches: 100%|██████████| 2/2 [00:11<00:00,  5.85s/it]

Row 5: No relationships found on attempt 1. Retrying...
Error processing row 5 (attempt 2): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.0-flash', 'location': 'global'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '31s'}]}}
Error processing row 5 (attempt 3): 429 RESOURCE_EXH


