In [1]:
import os
import json
import jsonlines
from time import sleep
from openai import Client
from dotenv import load_dotenv

In [2]:
load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
OPENAI_ASSISTANT_ID = os.environ["OPENAI_ASSISTANT_ID"] = os.getenv("OPENAI_ASSISTANT_ID")

In [3]:
with open('instructions.txt', 'r') as f:
    instructions = f.read()

In [4]:
client = Client(api_key=OPENAI_API_KEY)

In [5]:
assistant = client.beta.assistants.retrieve(
    assistant_id=OPENAI_ASSISTANT_ID
)
assistant.instructions = instructions

assistant

Assistant(id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', created_at=1702528907, description=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nFor "antireq", the logic is the opposite of "prereq" and "coreq", you want to express as satisfiying "antireq" logic, in other words, you do not use "exclude_courses".\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid", "prereq", "antireq" and "coreq" fields in your output json. Presrve "null" values.\n\nCourse title and course code might be used interchangeably in the given text. Course code is always in the format of "ABCD".\n\nYou can use logical expressions "and" and "

## Conversion

In [6]:
with jsonlines.open("data/course-info.jsonl") as reader:
    course_info = list(reader)


In [7]:
thread = client.beta.threads.create()
thread

Thread(id='thread_nd818Oszpj7zxlhPG0adWAlQ', created_at=1703566280, metadata={}, object='thread')

In [8]:
# Prepare content

courses_list = []
courses_list = course_info[2911:2916]

content = "\n".join(map(lambda x: str(x), courses_list))
print(content)

{'cid': 39871, 'code': 'CORE', 'number': 597, 'topic': 'Practicum I in Community Rehabilitation for Distance Learners', 'description': 'Students will complete a project in the area of program or service development (e.g. needs analysis, developing funding proposals, program evaluation). In the seminars, students will be supported in the completion of agency-based program development. Content on professional ethics will also be covered.', 'sub_topics': None, 'units': 3.0, 'credits': None, 'hours': ['2T/4-10'], 'time_length': None, 'prereq': 'Admission to the BCR-C distance program.', 'coreq': None, 'antireq': 'Credit for <course cid="39871">CORE 597</course> and 589.06 (Block Practicum in Community Rehabilitation) will not be allowed.', 'notes': 'Course is normally taken in combination with <course cid="42570">Community Rehabilitation 598</course> in the same academic year.', 'aka': None, 'repeat': False, 'nogpa': True}
{'cid': 42570, 'code': 'CORE', 'number': 598, 'topic': 'Practicum I

In [9]:
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=content,
)

message

ThreadMessage(id='msg_nVSMHsscGIvsJw7n4IeXJ5uY', assistant_id=None, content=[MessageContentText(text=Text(annotations=[], value='{\'cid\': 39871, \'code\': \'CORE\', \'number\': 597, \'topic\': \'Practicum I in Community Rehabilitation for Distance Learners\', \'description\': \'Students will complete a project in the area of program or service development (e.g. needs analysis, developing funding proposals, program evaluation). In the seminars, students will be supported in the completion of agency-based program development. Content on professional ethics will also be covered.\', \'sub_topics\': None, \'units\': 3.0, \'credits\': None, \'hours\': [\'2T/4-10\'], \'time_length\': None, \'prereq\': \'Admission to the BCR-C distance program.\', \'coreq\': None, \'antireq\': \'Credit for <course cid="39871">CORE 597</course> and 589.06 (Block Practicum in Community Rehabilitation) will not be allowed.\', \'notes\': \'Course is normally taken in combination with <course cid="42570">Community

In [10]:
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id,
    instructions=instructions,
)

run

Run(id='run_7eU2uV3vBNXaJTg3Km9oP5a9', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', cancelled_at=None, completed_at=None, created_at=1703566280, expires_at=1703566880, failed_at=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nFor "antireq", the logic is the opposite of "prereq" and "coreq", you want to express as satisfiying "antireq" logic, in other words, you do not use "exclude_courses".\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid", "prereq", "antireq" and "coreq" fields in your output json. Presrve "null" values.\n\nCourse title and course code might be used interchangeably in the given t

### Result

In [11]:
is_complete = False

# Wait until complete
while not is_complete:
    run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id,
    )

    is_complete = run.status == "completed"

    if not is_complete:
        print("Waiting for completion...")
        sleep(10)

run

Waiting for completion...
Waiting for completion...


Run(id='run_7eU2uV3vBNXaJTg3Km9oP5a9', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', cancelled_at=None, completed_at=1703566294, created_at=1703566280, expires_at=None, failed_at=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nFor "antireq", the logic is the opposite of "prereq" and "coreq", you want to express as satisfiying "antireq" logic, in other words, you do not use "exclude_courses".\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid", "prereq", "antireq" and "coreq" fields in your output json. Presrve "null" values.\n\nCourse title and course code might be used interchangeably in the given t

In [12]:
messages = client.beta.threads.messages.list(
    thread_id=thread.id,
)

messages

SyncCursorPage[ThreadMessage](data=[ThreadMessage(id='msg_3ULGCtqvtxCrBBKcKpH3f1hg', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', content=[MessageContentText(text=Text(annotations=[], value='{"cid": 39871, "prereq": {"type": "admission", "conditions": "Admission to the BCR-C distance program."}, "antireq": {"type": "or","conditions":["CORE 589.06"]}, "coreq": null}\n{"cid": 42570, "prereq": {"type": "and", "conditions": ["CORE 596", {"type": "admission", "conditions": "Admission to the BCR-C distance program."}]}, "antireq": null, "coreq": null}\n{"cid": 32134, "prereq": {"type": "or", "conditions": [{"type": "admission", "conditions": "Admission to the Community Health Sciences"}, {"type": "admission", "conditions": "Admission to the Public Health and Preventative Medicine program"}, {"type": "admission", "conditions": "Admission to the Mathematics and Statistics graduate program (Biostatistics specialization)"}]}, "antireq": {"type": "or","conditions":["MDSC 644"]}, "coreq": null}\n

In [13]:
messages.data

[ThreadMessage(id='msg_3ULGCtqvtxCrBBKcKpH3f1hg', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', content=[MessageContentText(text=Text(annotations=[], value='{"cid": 39871, "prereq": {"type": "admission", "conditions": "Admission to the BCR-C distance program."}, "antireq": {"type": "or","conditions":["CORE 589.06"]}, "coreq": null}\n{"cid": 42570, "prereq": {"type": "and", "conditions": ["CORE 596", {"type": "admission", "conditions": "Admission to the BCR-C distance program."}]}, "antireq": null, "coreq": null}\n{"cid": 32134, "prereq": {"type": "or", "conditions": [{"type": "admission", "conditions": "Admission to the Community Health Sciences"}, {"type": "admission", "conditions": "Admission to the Public Health and Preventative Medicine program"}, {"type": "admission", "conditions": "Admission to the Mathematics and Statistics graduate program (Biostatistics specialization)"}]}, "antireq": {"type": "or","conditions":["MDSC 644"]}, "coreq": null}\n{"cid": 32137, "prereq": {"type": "

In [18]:
response_text = messages.data[0].content[0].text.value
print(response_text)

{"cid": 39871, "prereq": {"type": "admission", "conditions": "Admission to the BCR-C distance program."}, "antireq": {"type": "or","conditions":["CORE 589.06"]}, "coreq": null}
{"cid": 42570, "prereq": {"type": "and", "conditions": ["CORE 596", {"type": "admission", "conditions": "Admission to the BCR-C distance program."}]}, "antireq": null, "coreq": null}
{"cid": 32134, "prereq": {"type": "or", "conditions": [{"type": "admission", "conditions": "Admission to the Community Health Sciences"}, {"type": "admission", "conditions": "Admission to the Public Health and Preventative Medicine program"}, {"type": "admission", "conditions": "Admission to the Mathematics and Statistics graduate program (Biostatistics specialization)"}]}, "antireq": {"type": "or","conditions":["MDSC 644"]}, "coreq": null}
{"cid": 32137, "prereq": {"type": "or", "conditions": [ {"type": "admission", "conditions": "Admission to the Master of Community Medicine specialization"}, {"type": "admission", "conditions": "A

In [21]:
# Split into list of dicts
response_dicts = response_text.split("\n")

# Convert to list of dicts
response_dicts = list(map(lambda x: json.loads(x), response_dicts))

response_dicts

[{'cid': 39871,
  'prereq': {'type': 'admission',
   'conditions': 'Admission to the BCR-C distance program.'},
  'antireq': {'type': 'or', 'conditions': ['CORE 589.06']},
  'coreq': None},
 {'cid': 42570,
  'prereq': {'type': 'and',
   'conditions': ['CORE 596',
    {'type': 'admission',
     'conditions': 'Admission to the BCR-C distance program.'}]},
  'antireq': None,
  'coreq': None},
 {'cid': 32134,
  'prereq': {'type': 'or',
   'conditions': [{'type': 'admission',
     'conditions': 'Admission to the Community Health Sciences'},
    {'type': 'admission',
     'conditions': 'Admission to the Public Health and Preventative Medicine program'},
    {'type': 'admission',
     'conditions': 'Admission to the Mathematics and Statistics graduate program (Biostatistics specialization)'}]},
  'antireq': {'type': 'or', 'conditions': ['MDSC 644']},
  'coreq': None},
 {'cid': 32137,
  'prereq': {'type': 'or',
   'conditions': [{'type': 'admission',
     'conditions': 'Admission to the Master