# 1. Imports

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from openai import AsyncOpenAI

import pandas as pd
import os
import json
import asyncio
from aiolimiter import AsyncLimiter
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [None]:
# Path of the CSV file that is loaded with pd.read_csv in the data preparation step
csv_file = "../temp_data/extracted_text.csv"

# 2. Data Preparation

In [None]:
# Load the extracted pages; the columns used below are 'Text', 'FilePath' and 'PageNumber'
df = pd.read_csv(csv_file)

# Force 'Text' to str so every entry can be joined as a string later on
df['Text'] = df['Text'].astype(str)

# Derive 'Country' from the name of the folder that directly contains each file
df['Country'] = df['FilePath'].map(lambda path: os.path.basename(os.path.dirname(path)))

# Order the rows by document and then by page within each document
df = df.sort_values(by=['FilePath', 'PageNumber'])

# Concatenate a document's page texts in chunks of up to three pages and tag each chunk with its location
def concatenate_texts(group):
    """Merge the pages of one document into chunks of up to three pages each.

    Args:
        group: DataFrame holding the rows of a single document, with 'Text'
            and 'Country' columns. Rows are consumed in their existing order.

    Returns:
        A list of dicts, one per chunk, with the keys 'location' (the
        'Country' value of the first row) and 'text' (the chunk's page texts
        joined by single spaces, wrapped in a leading and a trailing
        'Location: <location>' tag).
    """
    pages = group['Text'].tolist()
    country = group['Country'].iloc[0]
    tag = 'Location: ' + country

    chunks = []
    for start in range(0, len(pages), 3):
        body = ' '.join(pages[start:start + 3])
        chunks.append({'location': country, 'text': tag + ' ' + body + ' ' + tag})
    return chunks

# Build the page chunks for every document (one list of dicts per 'FilePath')
per_document_chunks = df.groupby('FilePath').apply(concatenate_texts)
grouped = per_document_chunks.reset_index(level=0, drop=True)

# Collapse the per-document lists into one flat list of dictionaries
concatenated_texts_json = []
for document_chunks in grouped:
    concatenated_texts_json.extend(document_chunks)

# Serialize the chunks as a JSON string indented by four spaces
json_output = json.dumps(concatenated_texts_json, indent=4)

# Write the JSON string to disk
with open('../temp_data/concatenated_texts.json', 'w') as json_file:
    json_file.write(json_output)


In [None]:
SELECTED_DOCUMENTS_PATH = '../temp_data/selected_documents.json'


def sample_documents(documents_df):
    """Draw a reproducible random sample of documents for every location.

    Args:
        documents_df: DataFrame with a 'location' column, one row per document.

    Returns:
        A DataFrame with up to 5 rows for each location whose name contains
        "usa" and up to 25 rows for every other location. When there are no
        groups to sample from, ``documents_df`` is returned unchanged.
    """
    samples = []
    for location, group in documents_df.groupby("location"):
        target = 5 if "usa" in str(location) else 25
        # Cap at the group size: sampling without replacement raises when n
        # exceeds the number of rows in the group.
        samples.append(group.sample(n=min(target, len(group)), random_state=1))
    if not samples:
        return documents_df
    return pd.concat(samples, ignore_index=True)


# Sample only when no selection exists yet, so an existing selection is reused
# instead of being truncated or overwritten on every run of this cell.
if not os.path.exists(SELECTED_DOCUMENTS_PATH):
    documents_df = pd.read_json('../temp_data/concatenated_texts.json')
    sampled_documents_df = sample_documents(documents_df)

    # save the selected documents as a JSON file
    with open(SELECTED_DOCUMENTS_PATH, 'w') as json_file:
        sampled_documents_df.to_json(json_file, orient="records", indent=4)


# Load the selected documents
with open(SELECTED_DOCUMENTS_PATH) as f:
    selected_documents = json.load(f)

selected_documents



# 3. Question generation

In [None]:
# Read the API key from the OPENAI_API_KEY environment variable so that no secret
# is stored in the notebook; a missing variable fails immediately with KeyError.
# NOTE(review): the key that used to be hard-coded on this line has been exposed
# in source and should be revoked.
client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(20))
async def json_evaluation_multiple_seasons(text):
    """Ask the chat model for five farmer-oriented question/answer pairs about a document.

    A failing call is retried with random exponential backoff (waits between
    1 and 60 seconds) and abandoned after 20 attempts.

    Args:
        text: The document text as a string, or a document dict whose 'text'
            entry holds that string. Any other value is serialized to JSON.

    Returns:
        The message content of the first completion choice. The prompt asks
        for a JSON object with a "questions" list, or an empty object when the
        document contains no relevant information.
    """
    # Send the plain document text. Serializing with json.dumps and slicing off
    # the first and last character only strips quotes for string input (for a
    # dict it strips the braces) and escapes every non-ASCII character.
    if isinstance(text, dict) and isinstance(text.get("text"), str):
        user_message = text["text"]
    elif isinstance(text, str):
        user_message = text
    else:
        user_message = json.dumps(text, ensure_ascii=False)

    response = await client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": """Formulate questions of practical relevance to farmers using documents provided by the user to assess the practical knowledge of a student about agriculture-related topics.
    Add location (state, country, or other location information) and crop information to each question, as detailed as possible from the context.
    Please formulate 5 ENGLISH question answer pairs in json format to assess knowledge of the text in the user message. Translate all terms from the text into english. ONLY use information presented in the text! Be detailed in your answers. Try to generate questions of practical relevance to farmers. 
    Make sure to avoid ambiguous questions, be specific on what is expected in the answer. If possible, specify the unit expected in the answer, e.g. kg/ha, cm, etc. Note that the test taker can only see one question at a time and has no access to the document, so each question should be self-contained. NEVER mention the document directly!

    If the document includes no relevant information whatsoever, please return an EMPTY document!

    Output JSON format, 5 questions in total:
    {questions: [{"question": "your_question1", "answer": "your_answer1"}, {"question": "your_question2", "answer": "your_answer2"}, ...]}

    If no relevant information return as last resort:
    {}

    ALWAYS include location and crop information in your questions."""},
            {"role": "user", "content": user_message},
        ],
    )
    return response.choices[0].message.content

completion = await json_evaluation_multiple_seasons(selected_documents[20])
print(completion)

In [None]:
# The prompt allows an empty object ({}) as a reply, so fall back to an empty
# list instead of raising KeyError when "questions" is absent.
print(json.loads(completion).get("questions", []))

In [None]:
# Rate limiter for the API calls: AsyncLimiter(20, 1) allows at most 20 acquisitions per 1-second period.
limiter = AsyncLimiter(20, 1)


# Accumulates the question dicts produced for all processed documents
questions = []

async def process_document(document):
    """Generate questions for one document and tag each with the document's location.

    Args:
        document: Dict with at least a 'location' key; it is passed as-is to
            json_evaluation_multiple_seasons.

    Returns:
        The list of question dicts from the model reply, each with an added
        'location' key, or [] when the reply cannot be parsed into that shape.

    Raises:
        KeyError: If ``document`` has no 'location' key.
    """
    # Look the location up first so a malformed document fails before a request is spent on it
    location = document["location"]

    async with limiter:
        # Wait for rate-limit capacity before issuing the request
        raw_completion = await json_evaluation_multiple_seasons(document)

    try:
        parsed_questions = json.loads(raw_completion)["questions"]
        for subdoc in parsed_questions:
            subdoc['location'] = location
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: reply is not valid JSON; KeyError: no "questions" entry;
        # TypeError: reply is None or the parsed value does not have the expected shape.
        print("##############################################################################################################")
        print(f"Could not extract questions ({type(e).__name__}: {e})")
        # Always show the raw reply, even when the failure happened after parsing
        print(raw_completion)
        return []
    return parsed_questions

async def process_documents(documents):
    """Run process_document concurrently for every document and collect the results.

    Each successful result is appended to the module-level ``questions`` list.
    A document whose task raised an exception is reported on stdout and skipped.

    Args:
        documents: Iterable of documents, each passed to process_document.
    """
    outcomes = await asyncio.gather(
        *(process_document(doc) for doc in documents),
        return_exceptions=True,
    )
    for outcome in outcomes:
        if not isinstance(outcome, Exception):
            questions.extend(outcome)
            continue
        print(f"An error occurred: {outcome}")

loop = asyncio.get_event_loop()

# In case the loop is already running, avoid using loop.run_until_complete()
if not loop.is_running():
    loop.run_until_complete(process_documents(selected_documents))
else:
    await process_documents(selected_documents) 


with open('../temp_data/questions.json', 'w') as f:
    json.dump(questions, f, indent=4)

print(questions)

In [None]:
# Number of entries collected in `questions`
len(questions)