In [None]:
import json
import os
import re
import time

import jsonlines
import pandas as pd
from dotenv import load_dotenv
from mistralai import Mistral
load_dotenv() 


In [None]:
# 1. Let's get a sample of data

In [None]:
# Mistral API key, read from the MIXTRAL environment variable (raises KeyError if it is not set).
api_key = os.environ["MIXTRAL"]
# Name of the Mistral model to query.
model = "mistral-large-latest"

In [None]:
# Load the sample of publications; later cells read its index_new, Title and Abstract columns.
df = pd.read_excel("../data/sample_llm.xlsx", engine='openpyxl')

In [None]:
# Sanity check: (rows, columns) of the loaded sample.
df.shape

In [None]:
def _clean_prompt_field(value):
    """Return ``value`` as text that is safe to embed in the prompt.

    Missing values (None, NaN, pd.NA) become an empty string, non-string
    values are converted with ``str``, and double quotes are removed.
    """
    if isinstance(value, str):
        text = value
    elif value is None or pd.isna(value):
        # Empty spreadsheet cells arrive as NaN, which has no .replace method.
        text = ""
    else:
        text = str(value)
    return text.replace("\"", "")


def make_message_description_instrument(title, abstract):
    """Build the chat messages asking whether a publication is policy-relevant.

    Parameters
    ----------
    title : str or missing value
        Publication title. Double quotes are stripped; a missing value is
        treated as an empty title.
    abstract : str or missing value
        Publication abstract, cleaned the same way as ``title``.

    Returns
    -------
    list of dict
        Two messages: a system message setting the persona and a user message
        containing the instructions, the title and the abstract, ending with
        ``ANSWER:`` so the model replies with YES or NO.
    """
    title = _clean_prompt_field(title)
    abstract = _clean_prompt_field(abstract)
    return [
                {
                    "role": "system",
                    "content": "You are a policy researcher.",
                },
                {
                    "role": "user",
                    "content": f"""
                    You are asked to determined whether an academic publication, given its title and and abstract, is relevant  to the policy studies community. To assess this, please follow the following criteria:

                    1. The title is representative of the content of the abstract.
                    2. Its relevance to the policy studies community is defined as containing information if it advances theoretical, methodological, or empirical understanding of policy processes, instruments, actors, or institutions in the field.


                    You will receive the information in the following format:
                    ### Example input ###
                    TITLE: <title>
                    ABSTRACT: <abstract>
                    ### End example input ###

                    Your answer should be ONLY as "YES" or "NO".

                    ### Example positive answer ###
                    ANSWER: YES
                    ### End example positive answer ###

                    ### Example negative answer ###
                    ANSWER: NO
                    ### End example negative answer ###

                    Given the following abstract from and academic publication:
                     
                    TITLE: {title}
                    ABSTRACT: {abstract}

                    Is this abstract relevant to the policy studies community? 

                    ANSWER:""",
                },
            ]

In [None]:
def get_information_for_mixtral(id, title, abstract, output_file="../data/for_llm/data.jsonl", max_tokens=50):
    """Append one batch request for a publication to a JSONL request file.

    Parameters
    ----------
    id : any
        Identifier of the publication; stored as ``custom_id`` after ``str()``.
    title, abstract :
        Passed to ``make_message_description_instrument`` to build the messages.
    output_file : str, optional
        Path of the JSONL file to append to. Its parent directory is created
        if it does not exist yet.
    max_tokens : int, optional
        Value written to the ``max_tokens`` field of the request body.

    Returns
    -------
    None
        The function only appends a line to ``output_file``.
    """
    target_dir = os.path.dirname(output_file)
    if target_dir:
        # Opening in append mode fails when the directory is missing.
        os.makedirs(target_dir, exist_ok=True)

    request = {
        "custom_id": str(id),
        "body": {
            "max_tokens": max_tokens,
            "messages": make_message_description_instrument(title, abstract),
        },
    }
    with jsonlines.open(output_file, 'a') as writer:
        writer.write(request)

In [None]:
# get_information_for_mixtral appends to the request file, so start from an
# empty file: otherwise every re-run of this cell duplicates each custom_id.
request_path = "../data/for_llm/data.jsonl"
if os.path.exists(request_path):
    os.remove(request_path)

# One request line per publication, in DataFrame row order.
for row_id, row_title, row_abstract in zip(df["index_new"], df["Title"], df["Abstract"]):
    get_information_for_mixtral(row_id, row_title, row_abstract)

In [None]:
# Mistral client authenticated with the API key from the environment.
client = Mistral(api_key=api_key)

In [None]:
# Upload the JSONL request file for batch processing. The context manager
# closes the file handle as soon as the upload call returns.
with open("../data/for_llm/data.jsonl", "rb") as request_file:
    batch_data = client.files.upload(
        file={
            "file_name": "data_for_llm",
            "content": request_file,
        },
        purpose="batch",
    )

# Start a batch chat-completion job over the uploaded file, using the model
# named in the configuration cell so the two cannot drift apart.
created_job = client.batch.jobs.create(
    input_files=[batch_data.id],
    model=model,
    endpoint="/v1/chat/completions",
    metadata={"job_type": "response"},
)

In [None]:
# Id of the batch job created above (a job id, not a file id, despite the variable name).
file_id = created_job.id

In [None]:
# List the batch jobs returned for this client and display the first entry.
# NOTE(review): presumably the first entry is the most recent job — confirm the ordering.
list_job = client.batch.jobs.list()
list_job.data[0]

In [None]:
# Follow the job created above by its id rather than by its position in the
# job list, and wait until it is no longer queued or running: the output file
# id is only available once the job has finished.
batch_job = client.batch.jobs.get(job_id=created_job.id)
while batch_job.status in ("QUEUED", "RUNNING"):
    time.sleep(5)
    batch_job = client.batch.jobs.get(job_id=created_job.id)

# Fail here with a clear message instead of passing an empty id to the download call.
if not batch_job.output_file:
    raise RuntimeError(
        f"Batch job {created_job.id} finished with status {batch_job.status!r} but has no output file"
    )

output_file = batch_job.output_file
output_file

In [None]:
# Download the batch output file; its body is consumed via response.stream in the next cell.
response = client.files.download(file_id=output_file)

In [None]:
with open("../data/for_llm/results.jsonl", "w") as f:
    for chunk in response.stream:
        f.write(chunk.decode("utf-8"))

In [None]:
# "yes" as a standalone word, so that words such as "Bayesian" or "yesterday"
# in a reply are not mistaken for a positive answer.
_YES_WORD = re.compile(r"\byes\b", re.IGNORECASE)


def retrieve_json(file_path='../data/for_llm/results.jsonl'):
    """Turn a batch results file into ``{custom_id: label}``.

    Parameters
    ----------
    file_path : str
        Path to a JSONL file. Each non-blank line must be a JSON object with a
        ``custom_id`` key and the model reply under
        ``response.body.choices[0].message.content``.

    Returns
    -------
    dict
        Maps each ``custom_id`` to 1 when the reply contains the word "yes"
        (case-insensitive, whole word) and to 0 otherwise.

    Raises
    ------
    KeyError, TypeError
        If a line does not have the expected response structure.
    """
    labels = {}
    with open(file_path, 'r') as f:
        for line in f:
            if not line.strip():
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            reply = record["response"]["body"]["choices"][0]["message"]["content"]
            labels[record["custom_id"]] = 1 if _YES_WORD.search(reply) else 0
    return labels
    
# Parse the results file at the default path into a {custom_id: 0/1} lookup.
is_it_relevant = retrieve_json()

In [None]:
# Attach each row's label, looked up by its index_new rendered as a string
# (the same form used for custom_id when the requests were written).
df["is_relevant"] = df["index_new"].map(lambda row_id: is_it_relevant[str(row_id)])

In [None]:
# Distribution of the 0/1 relevance labels.
df["is_relevant"].value_counts()

In [None]:
# Save the sample together with its is_relevant column to a new workbook.
df.to_excel("../data/sample_llm_assessed.xlsx")