In [1]:
import os
import json
import jsonlines
from time import sleep
from openai import Client
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Index os.environ directly: a missing variable then fails right here with a
# KeyError that names it, instead of the opaque TypeError produced by writing
# a None value back into os.environ.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
OPENAI_ASSISTANT_ID = os.environ["OPENAI_ASSISTANT_ID"]

In [3]:
# Read the assistant prompt from disk. The encoding is pinned to UTF-8 so the
# text decodes the same way regardless of the platform's default locale.
with open('instructions.txt', 'r', encoding='utf-8') as f:
    instructions = f.read()

In [4]:
# Build the OpenAI API client, authenticating with the key read from the environment.
client = Client(api_key=OPENAI_API_KEY)

In [5]:
# Fetch the existing assistant identified by OPENAI_ASSISTANT_ID.
assistant = client.beta.assistants.retrieve(assistant_id=OPENAI_ASSISTANT_ID)

# Overwrite the instructions on the retrieved object with the text loaded from
# instructions.txt. This only assigns an attribute on the local object; no
# update call is issued in this cell.
assistant.instructions = instructions

# Display the assistant object.
assistant

Assistant(id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', created_at=1702528907, description=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nFor "antireq", the logic is the opposite of "prereq" and "coreq". When it says "Credit for ENCI 300 and 337 will not be allowed.", your "antireq" field should be:\n```json\n{\n    "antireq": {\n        "type": "or",\n        "conditions": [\n            "Engineering 300",\n            "Engineering 337"\n        ]\n    }\n}\n```\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid", "prereq", "antireq" and "coreq" fields in your output json. Presrve "null" values.\n\nCourse ti

## Conversion

In [6]:
# Materialise every record of the JSON Lines course file into a list.
with jsonlines.open("data/course-info.jsonl") as jsonl_reader:
    course_info = [record for record in jsonl_reader]


In [7]:
# Create a new thread; the course data is posted to it as a user message in a later cell.
thread = client.beta.threads.create()
# Display the created thread object.
thread

Thread(id='thread_zgZn7J6GnS1O7ZIldzhMQRW5', created_at=1703958292, metadata={}, object='thread')

In [8]:
# Prepare content
#
# Keep only the CPSC courses, reduce each one to the four fields the assistant
# needs, and serialise each as a single JSON line. A list is built (rather
# than chained lazy filter/map objects) so `courses_list` is still usable
# after it has been joined below.
courses_list = [
    json.dumps({
        "cid": course["cid"],
        "prereq": course["prereq"],
        "antireq": course["antireq"],
        "coreq": course["coreq"],
    })
    for course in course_info
    if course["code"] == "CPSC"
]

# One JSON document per line; this is the message body sent to the assistant.
content = "\n".join(courses_list)
print(content)

{"cid": 3618, "prereq": null, "antireq": "Not open for registration to Computer Science majors.", "coreq": null}
{"cid": 3619, "prereq": null, "antireq": "Credit for <course cid=\"3619\">CPSC 217</course> and any of 215, <course cid=\"3621\">231</course>, <course cid=\"3623\">235</course>, <course cid=\"43036\">DATA 211</course>, ENCM 339, ENGG 233 or <course cid=\"47158\">ENDG 233</course> will not be allowed.", "coreq": null}
{"cid": 3620, "prereq": "<course cid=\"3619\">CPSC 217</course> or <course cid=\"43036\">DATA 211</course>.", "antireq": "Credit for <course cid=\"3620\">CPSC 219</course> and any of <course cid=\"3622\">233</course>, <course cid=\"3623\">235</course>, ENEL 497 or ENCM 493 will not be allowed.", "coreq": null}
{"cid": 3621, "prereq": "Admission to Computer Science, Bioinformatics, or Natural Science with a primary concentration in Computer Science.", "antireq": "Credit for <course cid=\"3621\">CPSC 231</course> and any of CPSC 215, <course cid=\"3619\">217</cour

In [9]:
# Post the prepared course lines to the thread as a single user message.
message = client.beta.threads.messages.create(thread_id=thread.id, role="user", content=content)

# Display the created message object.
message

ThreadMessage(id='msg_UEFaam4dbER8ZiA4BmvUpc6U', assistant_id=None, content=[MessageContentText(text=Text(annotations=[], value='{"cid": 3618, "prereq": null, "antireq": "Not open for registration to Computer Science majors.", "coreq": null}\n{"cid": 3619, "prereq": null, "antireq": "Credit for <course cid=\\"3619\\">CPSC 217</course> and any of 215, <course cid=\\"3621\\">231</course>, <course cid=\\"3623\\">235</course>, <course cid=\\"43036\\">DATA 211</course>, ENCM 339, ENGG 233 or <course cid=\\"47158\\">ENDG 233</course> will not be allowed.", "coreq": null}\n{"cid": 3620, "prereq": "<course cid=\\"3619\\">CPSC 217</course> or <course cid=\\"43036\\">DATA 211</course>.", "antireq": "Credit for <course cid=\\"3620\\">CPSC 219</course> and any of <course cid=\\"3622\\">233</course>, <course cid=\\"3623\\">235</course>, ENEL 497 or ENCM 493 will not be allowed.", "coreq": null}\n{"cid": 3621, "prereq": "Admission to Computer Science, Bioinformatics, or Natural Science with a primar

In [10]:
# Start a run of the assistant on the thread, passing the instructions text
# loaded from instructions.txt explicitly with the request.
run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant.id, instructions=instructions)

# Display the newly created run object.
run

Run(id='run_nJPPdnYY9NJMsQVWsbUVOlmR', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', cancelled_at=None, completed_at=None, created_at=1703958293, expires_at=1703958893, failed_at=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nFor "antireq", the logic is the opposite of "prereq" and "coreq". When it says "Credit for ENCI 300 and 337 will not be allowed.", your "antireq" field should be:\n```json\n{\n    "antireq": {\n        "type": "or",\n        "conditions": [\n            "Engineering 300",\n            "Engineering 337"\n        ]\n    }\n}\n```\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid

### Result

In [11]:
# Run statuses after which the run can never reach "completed". Without this
# check the loop below would poll forever once a run fails, is cancelled or expires.
FAILED_RUN_STATUSES = {"failed", "cancelled", "expired", "incomplete"}

is_complete = False

# Poll the run every 10 seconds until it completes, raising if it ends in a
# status that can no longer succeed.
while not is_complete:
    run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id,
    )

    if run.status in FAILED_RUN_STATUSES:
        raise RuntimeError(
            f"Run {run.id} ended with status {run.status!r}: "
            f"{getattr(run, 'last_error', None)}"
        )

    is_complete = run.status == "completed"

    if not is_complete:
        print("Waiting for completion...")
        sleep(10)

# Display the final run object.
run

Waiting for completion...
Waiting for completion...
Waiting for completion...
Waiting for completion...
Waiting for completion...
Waiting for completion...
Waiting for completion...
Waiting for completion...


Run(id='run_nJPPdnYY9NJMsQVWsbUVOlmR', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', cancelled_at=None, completed_at=1703958366, created_at=1703958293, expires_at=None, failed_at=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nFor "antireq", the logic is the opposite of "prereq" and "coreq". When it says "Credit for ENCI 300 and 337 will not be allowed.", your "antireq" field should be:\n```json\n{\n    "antireq": {\n        "type": "or",\n        "conditions": [\n            "Engineering 300",\n            "Engineering 337"\n        ]\n    }\n}\n```\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid

In [12]:
# Retrieve the page of messages currently on the thread.
messages = client.beta.threads.messages.list(thread_id=thread.id)

# Display the returned page object.
messages

SyncCursorPage[ThreadMessage](data=[ThreadMessage(id='msg_y3kl0du963sXNqSiyKnCJ1La', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', content=[MessageContentText(text=Text(annotations=[], value='{"cid": 3618, "prereq": null, "antireq": {"type": "exclude_courses", "courses": ["Computer Science"]}, "coreq": null}\n{"cid": 3619, "prereq": null, "antireq": {"type": "or", "conditions": ["CPSC 217", "CPSC 215", "CPSC 231", "CPSC 235", "DATA 211", "ENCM 339", "ENGG 233", "ENDG 233"]}, "coreq": null}\n{"cid": 3620, "prereq": {"type": "or", "conditions": ["CPSC 217", "DATA 211"]}, "antireq": {"type": "or",  "conditions": ["CPSC 219", "CPSC 233", "CPSC 235", "ENEL 497", "ENCM 493"]}, "coreq": null}\n{"cid": 3621, "prereq": {"type": "admission", "conditions": {"type": "or", "conditions": [{"type": "degree", "degree": "Computer Science"}, {"type": "degree", "degree": "Bioinformatics"}, {"type": "degree", "degree": "Natural Science with a primary concentration in Computer Science"}]}}, "antireq": {"ty

In [13]:
# Inspect the list of message objects held on the returned page.
messages.data

[ThreadMessage(id='msg_y3kl0du963sXNqSiyKnCJ1La', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', content=[MessageContentText(text=Text(annotations=[], value='{"cid": 3618, "prereq": null, "antireq": {"type": "exclude_courses", "courses": ["Computer Science"]}, "coreq": null}\n{"cid": 3619, "prereq": null, "antireq": {"type": "or", "conditions": ["CPSC 217", "CPSC 215", "CPSC 231", "CPSC 235", "DATA 211", "ENCM 339", "ENGG 233", "ENDG 233"]}, "coreq": null}\n{"cid": 3620, "prereq": {"type": "or", "conditions": ["CPSC 217", "DATA 211"]}, "antireq": {"type": "or",  "conditions": ["CPSC 219", "CPSC 233", "CPSC 235", "ENEL 497", "ENCM 493"]}, "coreq": null}\n{"cid": 3621, "prereq": {"type": "admission", "conditions": {"type": "or", "conditions": [{"type": "degree", "degree": "Computer Science"}, {"type": "degree", "degree": "Bioinformatics"}, {"type": "degree", "degree": "Natural Science with a primary concentration in Computer Science"}]}}, "antireq": {"type": "or", "conditions": ["CPSC 215

In [14]:
# Take the first message on the page and pull the text of its first content part.
# NOTE(review): assumes the first entry is the assistant's reply and that its
# first content part is text — confirm against the list ordering used.
first_message = messages.data[0]
response_text = first_message.content[0].text.value
print(response_text)

{"cid": 3618, "prereq": null, "antireq": {"type": "exclude_courses", "courses": ["Computer Science"]}, "coreq": null}
{"cid": 3619, "prereq": null, "antireq": {"type": "or", "conditions": ["CPSC 217", "CPSC 215", "CPSC 231", "CPSC 235", "DATA 211", "ENCM 339", "ENGG 233", "ENDG 233"]}, "coreq": null}
{"cid": 3620, "prereq": {"type": "or", "conditions": ["CPSC 217", "DATA 211"]}, "antireq": {"type": "or",  "conditions": ["CPSC 219", "CPSC 233", "CPSC 235", "ENEL 497", "ENCM 493"]}, "coreq": null}
{"cid": 3621, "prereq": {"type": "admission", "conditions": {"type": "or", "conditions": [{"type": "degree", "degree": "Computer Science"}, {"type": "degree", "degree": "Bioinformatics"}, {"type": "degree", "degree": "Natural Science with a primary concentration in Computer Science"}]}}, "antireq": {"type": "or", "conditions": ["CPSC 215", "CPSC 217", "CPSC 235", "DATA 211", "ENCM 339", "ENGG 233", "ENDG 233"]}, "coreq": null}
{"cid": 3622, "prereq": {"type": "and", "conditions": ["CPSC 231", {

In [15]:
# The response holds one JSON document per line. Parse each line into a dict,
# skipping blank lines (e.g. a trailing newline or an empty separator line),
# which would otherwise make json.loads raise on an empty string.
response_dicts = [
    json.loads(line)
    for line in response_text.split("\n")
    if line.strip()
]

# Display the parsed list of dicts.
response_dicts

[{'cid': 3618,
  'prereq': None,
  'antireq': {'type': 'exclude_courses', 'courses': ['Computer Science']},
  'coreq': None},
 {'cid': 3619,
  'prereq': None,
  'antireq': {'type': 'or',
   'conditions': ['CPSC 217',
    'CPSC 215',
    'CPSC 231',
    'CPSC 235',
    'DATA 211',
    'ENCM 339',
    'ENGG 233',
    'ENDG 233']},
  'coreq': None},
 {'cid': 3620,
  'prereq': {'type': 'or', 'conditions': ['CPSC 217', 'DATA 211']},
  'antireq': {'type': 'or',
   'conditions': ['CPSC 219', 'CPSC 233', 'CPSC 235', 'ENEL 497', 'ENCM 493']},
  'coreq': None},
 {'cid': 3621,
  'prereq': {'type': 'admission',
   'conditions': {'type': 'or',
    'conditions': [{'type': 'degree', 'degree': 'Computer Science'},
     {'type': 'degree', 'degree': 'Bioinformatics'},
     {'type': 'degree',
      'degree': 'Natural Science with a primary concentration in Computer Science'}]}},
  'antireq': {'type': 'or',
   'conditions': ['CPSC 215',
    'CPSC 217',
    'CPSC 235',
    'DATA 211',
    'ENCM 339',
    '