In [75]:
import os
import json
import jsonlines
from time import sleep
from openai import Client
from dotenv import load_dotenv

In [76]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, naming the missing
            variable so the problem is obvious.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "define it in the environment or in a .env file."
        )
    return value


# Credentials and the target assistant come from the environment, never from
# values hardcoded in the notebook.
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")
OPENAI_ASSISTANT_ID = _require_env("OPENAI_ASSISTANT_ID")

In [77]:
# Read the instruction text that is later passed to the assistant run.
# The encoding is pinned so the file decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open('instructions.txt', 'r', encoding='utf-8') as f:
    instructions = f.read()

In [78]:
# API client used by every request below, authenticated with the key read
# from the environment.
client = Client(
    api_key=OPENAI_API_KEY,
)

In [79]:
# Fetch the assistant identified by the configured ID.
assistant = client.beta.assistants.retrieve(assistant_id=OPENAI_ASSISTANT_ID)

# Set the instructions attribute on the retrieved Python object. This cell
# issues no update request; the same text is passed explicitly when the run
# is created.
assistant.instructions = instructions

assistant

Assistant(id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', created_at=1702528907, description=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid", "prereq", "antireq" and "coreq" fields in your output json. Presrve "null" values.\n\nCourse title and course code might be used interchangeably in the given text. Course code is always in the format of "ABCD".\n\nYou can use logical expressions "and" and "or" to help. For example:\n```json\n{\n    "type": "and",\n    "conditions": [\n        "Law 400",\n        {\n            "type": "or",\n            "conditions": [\n    

## Conversion

In [80]:
# Read every record of the JSON Lines course catalogue into memory.
with jsonlines.open("data/course-info.jsonl") as course_reader:
    course_info = [record for record in course_reader]


In [81]:
# Create a new thread (no initial messages are passed); the course data is
# posted to it in a later cell.
thread = client.beta.threads.create()
thread

Thread(id='thread_co21KFdFHdELvZmK8kNkQHws', created_at=1704002596, metadata={}, object='thread')

In [82]:
# Prepare content
#
# Build the user message: one JSON object per line for a small sample of
# courses from a single subject. Only the course id and the three requirement
# fields are sent.

SUBJECT_CODE = "CPSC"  # subject whose courses are sampled
MAX_COURSES = 10  # cap on the number of courses sent in one message
REQUIREMENT_FIELDS = ("cid", "prereq", "antireq", "coreq")

# Select and truncate first so only the courses actually sent are serialized.
selected_courses = [
    course for course in course_info if course["code"] == SUBJECT_CODE
][:MAX_COURSES]

# Each entry is the JSON text for one course, with keys in REQUIREMENT_FIELDS order.
courses_list = [
    json.dumps({field: course[field] for field in REQUIREMENT_FIELDS})
    for course in selected_courses
]

content = "\n".join(courses_list)
print(content)

{"cid": "107154", "prereq": null, "antireq": "Not open for registration to Computer Science majors.", "coreq": null}
{"cid": "160416", "prereq": null, "antireq": "Credit for CPSC 217 and any of 215, 231, 235, DATA 211, ENCM 339, ENGG 233 or ENDG 233 will not be allowed.", "coreq": null}
{"cid": "160417", "prereq": "CPSC 217 or DATA 211.", "antireq": "Credit for CPSC 219 and any of 233, 235, ENEL 497 or ENCM 493 will not be allowed.", "coreq": null}
{"cid": "107164", "prereq": "Admission to Computer Science, Bioinformatics, or Natural Science with a primary concentration in Computer Science.", "antireq": "Credit for CPSC 231 and any of CPSC 215, 217, 235, DATA 211, ENCM 339, ENGG 233, or ENDG 233 will not be allowed.", "coreq": null}
{"cid": "107165", "prereq": "CPSC 231 and admission to Computer Science, Bioinformatics, or Natural Science with a primary concentration in Computer Science.", "antireq": "Credit for CPSC 233 and any of 219, 235, ENEL 497 or ENCM 493 will not be allowed.", 

In [83]:
# Post the prepared course lines to the thread as a single user message.
message = client.beta.threads.messages.create(
    role="user",
    content=content,
    thread_id=thread.id,
)

message

ThreadMessage(id='msg_KWVyCixKV9QOT6vsFQrGpXDp', assistant_id=None, content=[MessageContentText(text=Text(annotations=[], value='{"cid": "107154", "prereq": null, "antireq": "Not open for registration to Computer Science majors.", "coreq": null}\n{"cid": "160416", "prereq": null, "antireq": "Credit for CPSC 217 and any of 215, 231, 235, DATA 211, ENCM 339, ENGG 233 or ENDG 233 will not be allowed.", "coreq": null}\n{"cid": "160417", "prereq": "CPSC 217 or DATA 211.", "antireq": "Credit for CPSC 219 and any of 233, 235, ENEL 497 or ENCM 493 will not be allowed.", "coreq": null}\n{"cid": "107164", "prereq": "Admission to Computer Science, Bioinformatics, or Natural Science with a primary concentration in Computer Science.", "antireq": "Credit for CPSC 231 and any of CPSC 215, 217, 235, DATA 211, ENCM 339, ENGG 233, or ENDG 233 will not be allowed.", "coreq": null}\n{"cid": "107165", "prereq": "CPSC 231 and admission to Computer Science, Bioinformatics, or Natural Science with a primary c

In [84]:
# Start a run of the assistant on the thread, passing the instruction text
# read from instructions.txt explicitly with the request.
run = client.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id,
    instructions=instructions,
)

run

Run(id='run_ySKWHqRSY2eRWatGJUvupAdz', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', cancelled_at=None, completed_at=None, created_at=1704002597, expires_at=1704003197, failed_at=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid", "prereq", "antireq" and "coreq" fields in your output json. Presrve "null" values.\n\nCourse title and course code might be used interchangeably in the given text. Course code is always in the format of "ABCD".\n\nYou can use logical expressions "and" and "or" to help. For example:\n```json\n{\n    "type": "and",\n    "condition

### Result

In [85]:
# Poll the run until it completes.
#
# A run can also stop in a state other than "completed". Those states are
# raised as errors; otherwise the loop would wait forever for a completion
# that never comes.
POLL_INTERVAL_SECONDS = 10

# "requires_action" is included because this notebook never submits tool
# outputs, so a run waiting on them cannot make progress here.
UNSUCCESSFUL_STATUSES = {
    "failed",
    "cancelled",
    "expired",
    "incomplete",
    "requires_action",
}

is_complete = False

# Wait until complete
while not is_complete:
    run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id,
    )

    if run.status in UNSUCCESSFUL_STATUSES:
        raise RuntimeError(
            f"Run {run.id} stopped with status {run.status!r} "
            f"(last_error={getattr(run, 'last_error', None)!r})"
        )

    is_complete = run.status == "completed"

    if not is_complete:
        print("Waiting for completion...")
        sleep(POLL_INTERVAL_SECONDS)

run

Waiting for completion...
Waiting for completion...
Waiting for completion...


Run(id='run_ySKWHqRSY2eRWatGJUvupAdz', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', cancelled_at=None, completed_at=1704002624, created_at=1704002597, expires_at=None, failed_at=None, file_ids=[], instructions='You are given a piece of json for an arbitrary course, you need to understand the course context, \nand then convert "prereq", "antireq" and "coreq" fields (which are pure text) into json.\n\nIf you are given multiple lines of input jsons, you need to output multiple lines of jsons, respectively.\n\nYour response must be in only one or more json, with no markdown and can be fully parsed with Python `json.loads` method.\n\nYou only need to include "cid", "prereq", "antireq" and "coreq" fields in your output json. Presrve "null" values.\n\nCourse title and course code might be used interchangeably in the given text. Course code is always in the format of "ABCD".\n\nYou can use logical expressions "and" and "or" to help. For example:\n```json\n{\n    "type": "and",\n    "condition

In [86]:
# List the messages now present on the thread.
messages = client.beta.threads.messages.list(thread_id=thread.id)

messages

SyncCursorPage[ThreadMessage](data=[ThreadMessage(id='msg_XlN2ydIHdaJOtK0ezV3ESYdF', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', content=[MessageContentText(text=Text(annotations=[], value='{"cid": "107154", "prereq": null, "antireq": "Error: Unknown expression.", "coreq": null}\n{"cid": "160416", "prereq": null, "antireq": {"type": "or", "conditions": ["Computer Science 217", {"type": "and", "conditions": ["Computer Science 215", "Computer Science 231", "Computer Science 235", "Data Science 211", "Engineering 233", "Digital Engineering 233"]}]}, "coreq": null}\n{"cid": "160417", "prereq": {"type": "or", "conditions": ["Computer Science 217", "Data Science 211"]}, "antireq": {"type": "or", "conditions": ["Computer Science 219", {"type": "and", "conditions": ["Computer Science 233", "Computer Science 235"]}]}, "coreq": null}\n{"cid": "107164", "prereq": {"type": "admission", "conditions": {"type": "or", "conditions": [{"type": "degree", "degree": "BA in Computer Science"}, {"type": "d

In [87]:
# Display the message objects held in the page's `data` attribute.
messages.data

[ThreadMessage(id='msg_XlN2ydIHdaJOtK0ezV3ESYdF', assistant_id='asst_FPCQ6UhNeRHdKcptjZjW6t4V', content=[MessageContentText(text=Text(annotations=[], value='{"cid": "107154", "prereq": null, "antireq": "Error: Unknown expression.", "coreq": null}\n{"cid": "160416", "prereq": null, "antireq": {"type": "or", "conditions": ["Computer Science 217", {"type": "and", "conditions": ["Computer Science 215", "Computer Science 231", "Computer Science 235", "Data Science 211", "Engineering 233", "Digital Engineering 233"]}]}, "coreq": null}\n{"cid": "160417", "prereq": {"type": "or", "conditions": ["Computer Science 217", "Data Science 211"]}, "antireq": {"type": "or", "conditions": ["Computer Science 219", {"type": "and", "conditions": ["Computer Science 233", "Computer Science 235"]}]}, "coreq": null}\n{"cid": "107164", "prereq": {"type": "admission", "conditions": {"type": "or", "conditions": [{"type": "degree", "degree": "BA in Computer Science"}, {"type": "degree", "degree": "BA in Bioinforma

In [88]:
# Read the reply from the first message in the listing, taking the text value
# of its first content part.
first_message = messages.data[0]
first_part = first_message.content[0]
response_text = first_part.text.value
print(response_text)

{"cid": "107154", "prereq": null, "antireq": "Error: Unknown expression.", "coreq": null}
{"cid": "160416", "prereq": null, "antireq": {"type": "or", "conditions": ["Computer Science 217", {"type": "and", "conditions": ["Computer Science 215", "Computer Science 231", "Computer Science 235", "Data Science 211", "Engineering 233", "Digital Engineering 233"]}]}, "coreq": null}
{"cid": "160417", "prereq": {"type": "or", "conditions": ["Computer Science 217", "Data Science 211"]}, "antireq": {"type": "or", "conditions": ["Computer Science 219", {"type": "and", "conditions": ["Computer Science 233", "Computer Science 235"]}]}, "coreq": null}
{"cid": "107164", "prereq": {"type": "admission", "conditions": {"type": "or", "conditions": [{"type": "degree", "degree": "BA in Computer Science"}, {"type": "degree", "degree": "BA in Bioinformatics"}, {"type": "degree", "degree": "BA in Natural Science", "concentration": "Computer Science"}]}}, "antireq": {"type": "or", "conditions": ["Computer Scienc

In [89]:
# Parse the reply as JSON Lines: one JSON object per line.
#
# Blank lines (for example a trailing newline at the end of the reply) are
# skipped, because json.loads raises on an empty string. Splitting on "\n"
# only, rather than splitlines(), avoids breaking on other separator
# characters that may legally appear inside a JSON string.
response_dicts = [
    json.loads(line)
    for line in response_text.split("\n")
    if line.strip()
]

response_dicts

[{'cid': '107154',
  'prereq': None,
  'antireq': 'Error: Unknown expression.',
  'coreq': None},
 {'cid': '160416',
  'prereq': None,
  'antireq': {'type': 'or',
   'conditions': ['Computer Science 217',
    {'type': 'and',
     'conditions': ['Computer Science 215',
      'Computer Science 231',
      'Computer Science 235',
      'Data Science 211',
      'Engineering 233',
      'Digital Engineering 233']}]},
  'coreq': None},
 {'cid': '160417',
  'prereq': {'type': 'or',
   'conditions': ['Computer Science 217', 'Data Science 211']},
  'antireq': {'type': 'or',
   'conditions': ['Computer Science 219',
    {'type': 'and',
     'conditions': ['Computer Science 233', 'Computer Science 235']}]},
  'coreq': None},
 {'cid': '107164',
  'prereq': {'type': 'admission',
   'conditions': {'type': 'or',
    'conditions': [{'type': 'degree', 'degree': 'BA in Computer Science'},
     {'type': 'degree', 'degree': 'BA in Bioinformatics'},
     {'type': 'degree',
      'degree': 'BA in Natural S