In [82]:
from groq import Groq
import openai
import os
import time
from tqdm import tqdm
import json
import re
from dotenv import load_dotenv
from pathlib import Path

# Read environment variables from the local .env file; GROQ_API_KEY is
# expected to be defined there (or already present in the environment).
load_dotenv("./.env")

# Groq is reached through its OpenAI-compatible endpoint, so the OpenAI SDK
# client is used here rather than the native `Groq` client.
client = openai.OpenAI(
    api_key=os.getenv("GROQ_API_KEY"),
    base_url="https://api.groq.com/openai/v1",
)

# Entity categories for the materials-science / solar-cell domain.
# The list is handed to generate_prompt() as its `requirements` argument.
entity_types = [
    # Materials and processing
    "Material", "ChemicalCompound", "DepositionMethod", "PassivationTechnique",
    # Device structure
    "SolarCellType", "Layer",
    # Performance and physical properties
    "PerformanceMetric", "EfficiencyMetric", "ElectricalProperty", "OpticalProperty",
    # Characterization, defects and degradation
    "CharacterizationTechnique", "DefectType", "DegradationMechanism", "EnvironmentalFactor",
    # Standards, production and organisations
    "IndustryStandard", "TestingProtocol", "ProductionEquipment", "ProductionScale",
    "ResearchInstitution",
]

def generate_prompt(text_chunk, requirements):
    """Build the user prompt that asks the model for entity relationships.

    The prompt consists of the extraction instructions, the expected JSON
    layout, the text to analyse (wrapped in double quotes) and a trailing
    "Output:" marker followed by "###".

    Args:
        text_chunk: Passage of text to analyse; embedded verbatim in the prompt.
        requirements: Accepted for backward compatibility with existing
            callers; it is currently not included in the prompt.

    Returns:
        The complete prompt as a single string.
    """
    # Each segment ends with an explicit space or blank line so that
    # consecutive sentences are not glued together once concatenated
    # (previously the prompt contained "representation.Provide" and
    # "JSON.Text:").
    instructions = (
        "Read the following text and extract the relationships between entities. "
        "Please use short-term verbs for relationship representation. "
    )
    output_format = (
        'Provide the output in JSON format. The JSON should have two main keys: '
        '"text_chunk" and "relationships". "text_chunk" is given text chunk. '
        '"relationships" is a list of dictionaries, each containing "source", '
        '"relationship", and "target" keys, representing the relationship '
        'between two entities.\n\n'
        "Please avoid additional explanations, just output the JSON.\n\n"
    )
    payload = f"Text: \"{text_chunk}\"\n\nOutput:\n###\n"

    return instructions + output_format + payload


def extract_relationships(text_chunk, index, max_retries=5, backoff_seconds=10):
    """Ask the chat model to extract entity relationships from one text chunk.

    Transient API failures (5xx responses such as "503 Service Unavailable",
    rate limiting, and connection/timeout errors) are retried with an
    exponentially growing pause, so a single hiccup does not abort a long
    batch run.

    Args:
        text_chunk: The passage of text to analyse.
        index: Position of the chunk as supplied by the caller; it is not
            used when building the request.
        max_retries: Number of additional attempts after the first failure.
        backoff_seconds: Pause before the first retry; doubled on every
            subsequent retry and capped at two minutes.

    Returns:
        The raw text content of the model's first completion choice.

    Raises:
        openai.InternalServerError, openai.RateLimitError,
        openai.APIConnectionError: When the request still fails after all
            retries have been used up.
    """
    prompt = generate_prompt(text_chunk, entity_types)
    transient_errors = (
        openai.InternalServerError,
        openai.RateLimitError,
        openai.APIConnectionError,  # also covers openai.APITimeoutError
    )
    attempts = max(max_retries, 0) + 1

    for attempt in range(attempts):
        try:
            response = client.chat.completions.create(
                model="llama-3.1-70b-versatile",  # or "gpt-3.5-turbo" depending on your access
                messages=[
                    {"role": "system", "content": "You are an assistant that extracts entities and relationships from text using graphRAG technologies, and professional in the field of material science and solar cell fabrication."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=7000,  # Adjust based on the expected length of response
                temperature=0.2
            )
        except transient_errors:
            # Out of attempts: let the caller see the original error.
            if attempt == attempts - 1:
                raise
            time.sleep(min(backoff_seconds * 2 ** attempt, 120))
        else:
            # Extract and return the response text
            return response.choices[0].message.content

# Test with a sample text chunk
# text_chunk = "p-type, n-type and upgraded metallurgical\ngrade crystalline silicon solar cells, respectively. Especially, the regeneration of boron-oxygen related defects,\nwhich cause carrier induced degradation, will be closely discussed since most of industrial solar cells are fabricated by boron-doped p-type silicon wafer. Moreover, laser-induced hydrogen passivation, which can locally\nrecover defective area on the solar cells, will be addressed. In the conclusion, proper conditions of advanced\nhydrogen passivation for the successful improvement of minority carrier lifetime will be summarized.\ncrystalline silicon [5,9–15]. Moreover, it has been reported that boronoxygen (B-O) complex, which leads to carrier-induced degradation\n(CID) in boron doped p-type silicon wafers, can be deactivated effectively by using hydrogen atoms in the silicon nitride layer [16–21].\nFor effective hydrogen passivation of defects in crystalline silicon,\nlocally stable charge states of interstitial hydrogen was"
# relationships = extract_relationships(text_chunk, 0)
# print(relationships)

def read_text_file(file_path):
    """Return the entire contents of a UTF-8 encoded text file as a string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def split_text_into_chunks(text, max_chunk_size=200, overlap_size=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap_size`` words. Chunking stops as soon
    as a chunk reaches the end of the text, so no trailing chunk is produced
    that merely repeats words already contained in the previous one.

    Args:
        text: The text to split; words are separated on any whitespace.
        max_chunk_size: Maximum number of words per chunk (must be > 0).
        overlap_size: Number of words shared by neighbouring chunks
            (must satisfy ``0 <= overlap_size < max_chunk_size``).

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If the sizes would make the window stand still, move
            backwards, or skip words.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if not 0 <= overlap_size < max_chunk_size:
        # overlap >= chunk size would never advance (infinite loop);
        # a negative overlap would silently drop words between chunks.
        raise ValueError(
            "overlap_size must satisfy 0 <= overlap_size < max_chunk_size"
        )

    words = text.split()
    step = max_chunk_size - overlap_size
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail of the text.
            break
    return chunks

def create_json_file(file_path):
    """Create (or reset) a JSON file containing an empty list.

    Missing parent directories are created first, so the call does not fail
    with FileNotFoundError when the output folder does not exist yet. Any
    existing file at ``file_path`` is overwritten.

    Args:
        file_path: Location of the JSON file, as a string or path object.
    """
    target = Path(file_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as file:
        json.dump([], file, indent=4)  # Initialize with an empty list

def extract_json_from_markdown(response):
    """Extract and parse the JSON payload from a model response.

    The first Markdown code fence in the response is used when there is one.
    A leading language tag on the fence's first line (for example ``json``)
    is skipped, since it is not part of the JSON document. When the response
    has no code fence at all, the whole response is tried as JSON.

    Args:
        response: Raw response text; ``None`` or an empty string is treated
            as "nothing to parse".

    Returns:
        The decoded JSON value, or ``None`` when nothing could be parsed.
    """
    if not response:
        print("No JSON code fence found in the response.")
        return None

    fence = re.search(r'```(.*?)```', response, re.DOTALL)
    if fence:
        fenced = fence.group(1).strip()
        candidates = [fenced]
        # "```json\n{...}```" captures "json\n{...}"; retry without the tag.
        first_line, newline, remainder = fenced.partition('\n')
        if newline and re.fullmatch(r'[\w+-]+', first_line.strip()):
            candidates.append(remainder.strip())
    else:
        # The model may answer with bare JSON when told to output only JSON.
        candidates = [response.strip()]

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    if not fence:
        print("No JSON code fence found in the response.")
    return None

def append_to_json_file(file_path, new_object):
    """Append the JSON parsed from a model response to a JSON list file.

    The file's current JSON content is loaded first, then ``new_object``
    (the raw response text) is run through extract_json_from_markdown().
    If that yields a truthy value it is appended and the file is rewritten
    in place; otherwise the file is left untouched.

    Always returns None.
    """
    with open(file_path, 'r+', encoding='utf-8') as handle:
        records = json.load(handle)

        parsed = extract_json_from_markdown(new_object)
        if not parsed:
            # Nothing usable in the response: keep the file as it is.
            return None

        records.append(parsed)

        # Overwrite from the start of the file, then cut off any bytes left
        # over from the previous content.
        handle.seek(0)
        json.dump(records, handle)
        handle.truncate()


def process_file(input_file_path, output_file_path, delay_seconds=7):
    """Extract relationships from a text file, chunk by chunk, into a JSON file.

    The input text is split into chunks, the output file is (re)initialised
    with an empty list, and each chunk's model response is appended to the
    output file as soon as it arrives.

    Args:
        input_file_path: Path of the UTF-8 text file to process.
        output_file_path: Path of the JSON file that collects the results;
            it is overwritten at the start of the run.
        delay_seconds: Pause between consecutive API requests
            (presumably to stay below the provider's rate limit).
    """
    text = read_text_file(input_file_path)
    chunks = split_text_into_chunks(text)

    create_json_file(output_file_path)

    # 1-based chunk counter; the previous `index += index` never left 0.
    for index, chunk in enumerate(tqdm(chunks, desc="processing"), start=1):
        result = extract_relationships(chunk, index)
        append_to_json_file(output_file_path, result)
        # No need to wait once the last chunk has been handled.
        if index < len(chunks):
            time.sleep(delay_seconds)
    print('done')



# Name of the paper's text file; the JSON output reuses it with a new extension
file_name = "10.1088@1361-6463@ac9066.txt"

# Paths are resolved relative to the current working directory
# (not necessarily the directory that contains this script).
script_directory = Path.cwd()
text_path = script_directory.parent.joinpath("input", file_name)
json_path = script_directory.parent.joinpath("json_output", file_name.replace('.txt', '.json'))

# Run the extraction pipeline on the selected file
process_file(text_path, json_path)


processing:  38%|███▊      | 72/188 [15:14<24:32, 12.70s/it]


InternalServerError: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}