In [None]:
import json
import csv
import pandas as pd
import random
import numpy as np
import google.generativeai as genai
import json
import time
import re

# JSON to CSV Conversion

In [2]:
def parse_edges_to_csv(input_file, output_file):
    """
    Parse edges.jsonl and extract subject, predicate, and object columns to CSV
    """
    with open(input_file, 'r') as jsonl_file, open(output_file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        
        # Write header
        writer.writerow(['subject', 'predicate', 'object'])
        
        # Process each line in the JSONL file
        for line in jsonl_file:
            try:
                data = json.loads(line.strip())
                subject = data.get('subject', '')
                predicate = data.get('predicate', '')
                obj = data.get('object', '')
                
                writer.writerow([subject, predicate, obj])
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line}")
                continue

In [3]:
input_file = "example_edges.jsonl"
output_file = "edges_output.csv"
    
parse_edges_to_csv(input_file, output_file)
print(f"CSV file created: {output_file}")

CSV file created: edges_output.csv


# Sub graph Preparation (random predicate removal)

In [4]:
def select_chunk_and_remove_predicates(input_csv, chunk_size=100, predicate_removal_percent=50, output_file='modified_chunk.csv'):
    """
    Select a random chunk from the CSV and remove a percentage of edges for each unique predicate.
    
    Args:
        input_csv: Path to the input CSV file
        chunk_size: Number of rows to select (default: 100)
        predicate_removal_percent: Percentage of edges to remove for each unique predicate (default: 50)
        output_file: Path to save the modified chunk
    
    Returns:
        tuple: (original_chunk_df, modified_chunk_df)
    """
    # Read the CSV
    df = pd.read_csv(input_csv)
    
    # Select a random chunk
    if chunk_size >= len(df):
        chunk_df = df.copy()
    else:
        start_idx = random.randint(0, len(df) - chunk_size)
        chunk_df = df.iloc[start_idx:start_idx + chunk_size].copy()
    
    # Store original chunk
    original_chunk = chunk_df.copy()
    modified_chunk = chunk_df.copy()
    
    # Get unique predicates in the chunk
    unique_predicates = modified_chunk['predicate'].unique()
    
    total_removed = 0
    
    # Remove specified percentage of edges for each unique predicate
    for predicate in unique_predicates:
        predicate_indices = modified_chunk[modified_chunk['predicate'] == predicate].index.tolist()
        num_to_remove_pred = int(len(predicate_indices) * (predicate_removal_percent / 100))
        
        if num_to_remove_pred > 0:
            indices_to_remove_pred = random.sample(predicate_indices, num_to_remove_pred)
            modified_chunk.loc[indices_to_remove_pred, 'predicate'] = ''
            total_removed += num_to_remove_pred
    
    # Save modified chunk to CSV
    modified_chunk.to_csv(output_file, index=False)
    
    print(f"Original chunk size: {len(original_chunk)}")
    print(f"Removed {predicate_removal_percent}% of edges for each unique predicate")
    print(f"Total edges with predicates removed: {total_removed}")
    print(f"Modified chunk size: {len(modified_chunk)}")
    print(f"Modified chunk saved to: {output_file}")
    
    return original_chunk, modified_chunk

In [5]:
original, modified = select_chunk_and_remove_predicates(
    'edges_output.csv',
    chunk_size=100,
    predicate_removal_percent=50,
    output_file='modified_chunk_50%_removed.csv'
)

Original chunk size: 100
Removed 50% of edges for each unique predicate
Total edges with predicates removed: 42
Modified chunk size: 100
Modified chunk saved to: modified_chunk_50%_removed.csv


In [6]:
print("Original Chunk:")
print(original.head())
print("\nModified Chunk:")
print(modified.head())
print("\nRemoved Rows:")


Original Chunk:
             subject          predicate         object
1322  UBERON:0003092   biolink:has_part   PR:000027222
1323     CHEBI:39867     biolink:causes  UMLS:C1868980
1324     CHEBI:39867     biolink:causes  UMLS:C1868980
1325      GO:0004714  biolink:regulates   NCBIGene:207
1326   UMLS:C0023775   biolink:disrupts     GO:0006811

Modified Chunk:
             subject         predicate         object
1322  UBERON:0003092  biolink:has_part   PR:000027222
1323     CHEBI:39867                    UMLS:C1868980
1324     CHEBI:39867                    UMLS:C1868980
1325      GO:0004714                     NCBIGene:207
1326   UMLS:C0023775  biolink:disrupts     GO:0006811

Removed Rows:


# Random Edge Assignment

In [7]:
# function to randomly assign edges to nodes from the list of unique predicates 
def randomly_assign_edges(input_csv, unique_predicates, output_file='randomly_assigned_edges.csv'):
    """
    Randomly assign edges to nodes from the list of unique predicates and save to new CSV.
    
    Args:
        input_csv: Path to the input CSV file
        unique_predicates: List of unique predicates
        output_file: Path to save the new CSV with randomly assigned edges
    """
    df = pd.read_csv(input_csv)
    
    # Fill empty predicates with random choices from unique_predicates
    for idx, row in df.iterrows():
        if pd.isna(row['predicate']) or row['predicate'] == '' or str(row['predicate']).strip() == '':
            df.at[idx, 'predicate'] = random.choice(unique_predicates)
    
    # Save to new CSV
    df.to_csv(output_file, index=False)
    print(f"Randomly assigned edges saved to: {output_file}")
    
    
    return df

In [8]:
# list of all unique predicates in the dataset which are not empty
unique_predicates = modified['predicate'].unique()
unique_predicates = unique_predicates[unique_predicates != '']
print("Unique predicates found:")
print(unique_predicates)
print(len(unique_predicates))

# Run the improved function
result_df = randomly_assign_edges('modified_chunk_50%_removed.csv', unique_predicates, output_file='randomly_assigned_edges.csv')



Unique predicates found:
['biolink:has_part' 'biolink:disrupts' 'biolink:correlated_with'
 'biolink:positively_correlated_with' 'biolink:causes' 'biolink:regulates'
 'biolink:coexists_with' 'biolink:located_in' 'biolink:contraindicated_in'
 'biolink:capable_of' 'biolink:negatively_correlated_with'
 'biolink:occurs_in' 'biolink:develops_from'
 'biolink:has_active_ingredient'
 'biolink:treats_or_applied_or_studied_to_treat' 'biolink:affects'
 'biolink:increases_response_to' 'biolink:genetically_associated_with'
 'biolink:overlaps' 'biolink:predisposes_to_condition'
 'biolink:expressed_in' 'biolink:ameliorates_condition'
 'biolink:contributes_to' 'biolink:decreases_response_to' 'biolink:treats'
 'biolink:subclass_of' 'biolink:similar_to' 'biolink:has_participant']
28
Randomly assigned edges saved to: randomly_assigned_edges.csv


# Gemini LLM Edge Assignment

In [9]:
api = "AIzaSyCQVqiw_JyVbMrko4TpplqS0bf2GJCtgr8"

def fill_missing_predicates_llm_base(input_df, unique_predicates, output_file='llm_filled_predicates.csv', 
                                    metrics_file='llm_metrics.json', responses_file='llm_responses.json'):
    """
    Use Gemini API to fill in missing predicates in the DataFrame using a single batch prompt.
    
    Args:
        input_df: DataFrame with potential missing predicates
        unique_predicates: List of unique predicates to choose from
        output_file: Path to save the new CSV with LLM filled predicates
        metrics_file: Path to save metrics about the LLM usage
        responses_file: Path to save all LLM responses for analysis
    
    Returns:
        tuple: (filled_df, metrics, responses)
    """
    # Configure Gemini API
    genai.configure(api_key=api)
    model = genai.GenerativeModel('gemini-2.5-flash')
    
    df = input_df.copy()
    start_time = time.time()
    
    # Find all empty predicate rows
    empty_mask = df['predicate'].isna() | (df['predicate'] == '') | (df['predicate'].str.strip() == '')
    empty_indices = df[empty_mask].index.tolist()
    empty_count = len(empty_indices)
    
    print(f"Found {empty_count} empty predicates to fill")
    
    if empty_count == 0:
        print("No empty predicates found!")
        return df, {}, []
    
    # Build single large prompt with all missing predicates
    predicate_list = ', '.join(unique_predicates)
    
    batch_prompt = f"""You are a biomedical knowledge graph expert. Complete the missing predicates for these triples.

Available predicates: {predicate_list}

Instructions: For each numbered triple, respond with ONLY the most appropriate predicate from the list above.

Triples to complete:
"""
    
    # Add all empty predicate cases to the prompt
    case_mapping = {}  # Maps case number to dataframe index
    for case_num, idx in enumerate(empty_indices, 1):
        row = df.iloc[idx]
        batch_prompt += f"{case_num}. Subject: {row['subject']} | Object: {row['object']}\n"
        case_mapping[case_num] = idx
    
    batch_prompt += f"""
Expected response format:
1. predicate_name
2. predicate_name
3. predicate_name
...

Respond with ONLY the numbered list of predicates, nothing else."""

    print(f"Sending batch request for {empty_count} predicates...")
    
    llm_filled_count = 0
    fallback_count = 0
    successful_requests = 0
    failed_requests = 0
    
    try:
        # Single API request for all missing predicates
        response = model.generate_content(
            batch_prompt,
            generation_config=genai.types.GenerationConfig(
                max_output_tokens=empty_count * 10,  # Adjust based on number of predicates
                temperature=0.3,
                candidate_count=1
            )
        )
        
        response_text = response.text.strip()
        print("✓ Batch API request successful")
        successful_requests = 1
        
        # Parse the response to extract individual predicates
        response_lines = response_text.split('\n')
        
        # Use regex to extract numbered responses
        predicate_suggestions = {}
        for line in response_lines:
            line = line.strip()
            if line:
                # Match patterns like "1. biolink:treats" or "1) biolink:treats" or "1 biolink:treats"
                match = re.match(r'^(\d+)[\.\)\s]+(.+)', line)
                if match:
                    case_num = int(match.group(1))
                    suggested_predicate = match.group(2).strip()
                    
                    # Clean up the suggestion (remove quotes, extra text)
                    suggested_predicate = suggested_predicate.replace('"', '').replace("'", "")
                    
                    # Try exact match first
                    if suggested_predicate in unique_predicates:
                        predicate_suggestions[case_num] = suggested_predicate
                    else:
                        # Try partial matching
                        for predicate in unique_predicates:
                            if predicate in suggested_predicate or suggested_predicate in predicate:
                                predicate_suggestions[case_num] = predicate
                                break
        
        print(f"✓ Successfully parsed {len(predicate_suggestions)} predicates from response")
        
        # Apply the suggestions to the dataframe
        for case_num, idx in case_mapping.items():
            if case_num in predicate_suggestions:
                suggested_predicate = predicate_suggestions[case_num]
                df.at[idx, 'predicate'] = suggested_predicate
                llm_filled_count += 1
                print(f"✓ Row {idx}: Filled with '{suggested_predicate}'")
            else:
                # Fallback to random selection
                fallback_predicate = random.choice(unique_predicates)
                df.at[idx, 'predicate'] = fallback_predicate
                fallback_count += 1
                print(f"⚠ Row {idx}: No suggestion found, used random '{fallback_predicate}'")
        
        # Estimate token usage
        input_tokens = len(batch_prompt.split()) * 1.3
        output_tokens = len(response_text.split()) * 1.3
        
        # Store response details
        responses = [{
            'batch_request': True,
            'total_cases': empty_count,
            'prompt': batch_prompt,
            'response_text': response_text,
            'parsed_suggestions': predicate_suggestions,
            'case_mapping': case_mapping,
            'success': True,
            'estimated_input_tokens': input_tokens,
            'estimated_output_tokens': output_tokens
        }]
        
    except Exception as e:
        print(f"✗ Batch API request failed: {e}")
        failed_requests = 1
        
        # Fallback: fill all with random predicates
        for idx in empty_indices:
            fallback_predicate = random.choice(unique_predicates)
            df.at[idx, 'predicate'] = fallback_predicate
            fallback_count += 1
        
        responses = [{
            'batch_request': True,
            'total_cases': empty_count,
            'prompt': batch_prompt,
            'error': str(e),
            'success': False,
            'fallback_used': True
        }]
        
        input_tokens = len(batch_prompt.split()) * 1.3
        output_tokens = 0
    
    end_time = time.time()
    total_time = end_time - start_time
    
    # Create metrics summary
    metrics = {
        'total_empty_predicates': empty_count,
        'llm_filled_count': llm_filled_count,
        'fallback_count': fallback_count,
        'successful_requests': successful_requests,
        'failed_requests': failed_requests,
        'total_requests': successful_requests + failed_requests,
        'success_rate': successful_requests / (successful_requests + failed_requests) if (successful_requests + failed_requests) > 0 else 0,
        'llm_success_rate': llm_filled_count / empty_count if empty_count > 0 else 0,
        'total_processing_time_seconds': total_time,
        'estimated_total_input_tokens': input_tokens,
        'estimated_total_output_tokens': output_tokens,
        'estimated_total_tokens': input_tokens + output_tokens,
        'batch_processing': True,
        'speed_improvement': f"~{empty_count}x faster than individual requests"
    }
    
    # Save files
    df.to_csv(output_file, index=False)
    
    with open(metrics_file, 'w') as f:
        json.dump(metrics, f, indent=2)
    
    with open(responses_file, 'w') as f:
        json.dump(responses, f, indent=2)
    
    # Print summary
    print(f"\n=== Batch LLM Processing Complete ===")
    print(f"LLM filled predicates saved to: {output_file}")
    print(f"Total predicates filled by LLM: {llm_filled_count}/{empty_count}")
    print(f"Fallback (random) assignments: {fallback_count}")
    print(f"LLM success rate: {metrics['llm_success_rate']:.2%}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Estimated tokens used: {int(input_tokens + output_tokens)}")
    print(f"Speed improvement: ~{empty_count}x faster than individual requests!")
    print(f"Metrics saved to: {metrics_file}")
    print(f"Responses saved to: {responses_file}")
    
    return df, metrics, responses

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Test the LLM function with a subset of data first - Fix the indexing issue
test_df = modified.copy().reset_index(drop=True)  # Reset index to 0, 1, 2, 3...

print(f"DataFrame shape: {test_df.shape}")
print(f"Index range: {test_df.index.min()} to {test_df.index.max()}")

filled_df, metrics, responses = fill_missing_predicates_llm_base(
    test_df,
    unique_predicates,
    output_file='gemini_filled_test.csv',
    metrics_file='gemini_metrics_test.json',
    responses_file='gemini_responses_test.json'
)

DataFrame shape: (100, 3)
Index range: 0 to 99
Found 42 empty predicates to fill
Sending batch request for 42 predicates...


E0000 00:00:1759347252.225361  433971 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


✓ Batch API request successful
✓ Successfully parsed 42 predicates from response
✓ Row 1: Filled with 'biolink:treats'
✓ Row 2: Filled with 'biolink:treats'
✓ Row 3: Filled with 'biolink:regulates'
✓ Row 6: Filled with 'biolink:subclass_of'
✓ Row 8: Filled with 'biolink:regulates'
✓ Row 9: Filled with 'biolink:subclass_of'
✓ Row 16: Filled with 'biolink:occurs_in'
✓ Row 18: Filled with 'biolink:affects'
✓ Row 22: Filled with 'biolink:subclass_of'
✓ Row 27: Filled with 'biolink:positively_correlated_with'
✓ Row 32: Filled with 'biolink:occurs_in'
✓ Row 37: Filled with 'biolink:occurs_in'
✓ Row 38: Filled with 'biolink:subclass_of'
✓ Row 40: Filled with 'biolink:occurs_in'
✓ Row 41: Filled with 'biolink:correlated_with'
✓ Row 43: Filled with 'biolink:has_part'
✓ Row 45: Filled with 'biolink:occurs_in'
✓ Row 46: Filled with 'biolink:affects'
✓ Row 47: Filled with 'biolink:contributes_to'
✓ Row 49: Filled with 'biolink:similar_to'
✓ Row 52: Filled with 'biolink:similar_to'
✓ Row 54: Filled

# in progress

In [None]:
import requests
import gzip
import chromadb
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# URLs for the .gz files (tool descriptions)
urls = [
    "https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3/bioconcepts2pubtator3.gz",
    "https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3/gene2pubtator3.gz"
]

local_files = ["bioconcepts2pubtator3.gz", "gene2pubtator3.gz"]

# Function to download the .gz files from URLs
def download_file(url, local_path):
    response = requests.get(url)
    with open(local_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded file: {local_path}")

# Download the files
# for url, local_file in zip(urls, local_files):
#     download_file(url, local_file)

# Initialize ChromaDB client (new method)
client = chromadb.Client()

# Create or get the collection
collection = client.create_collection("pubtator_data")

# Load the transformer model for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize text splitter (chunk text into smaller pieces)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

def embed_text(text):
    """Embed text into vector using pre-trained model."""
    embedding = model.encode(text)
    return embedding

# Function to read and process the .gz files
def process_gz_file(file_path):
    with gzip.open(file_path, 'rt') as f:
        return f.readlines()

# Process and insert the data into ChromaDB
for file in local_files:
    print(f"Processing file: {file}")
    lines = process_gz_file(file)
    
    # Split lines into smaller chunks
    for line in lines:
        chunks = splitter.split_text(line.strip())  # Split long descriptions into chunks
        
        # For each chunk, generate embedding and store in ChromaDB
        for chunk in chunks:
            embedding = embed_text(chunk)
            
            # Add document and metadata (such as source file and chunk position) to ChromaDB
            collection.add(
                documents=[chunk],
                metadatas=[{"source": file}],
                embeddings=[embedding]
            )

print("Data inserted into ChromaDB.")

# Query ChromaDB for relevant documents (example query)
query = "What is gene expression in biological research?"
query_embedding = embed_text(query)

# Perform the retrieval to get top-k similar documents
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5  # Retrieve top 5 similar documents
)

print("Retrieved documents:")
for doc, metadata in zip(results["documents"], results["metadatas"]):
    print(f"Document: {doc} (Source: {metadata['source']})")
