In [None]:
%load_ext dotenv
%dotenv

In [19]:
from openai import AsyncOpenAI
import os
# Async OpenAI client shared by the request helpers in this notebook. The API key is
# read from the OPENAI_API_KEY environment variable; os.environ[...] raises
# KeyError when it is unset.
# NOTE(review): presumably the %dotenv magic in the first cell populates it from a
# .env file — confirm.
client = AsyncOpenAI(api_key = os.environ['OPENAI_API_KEY'])

In [20]:
def get_prompt_1(data):
    """Build the first user prompt.

    The prompt is a fixed question about the user's search activity,
    followed by a blank line and then `data` appended verbatim.
    """
    question = (
        "Here is some recent google search activity. "
        "What is the user doing throughout the day?"
    )
    header = question + "\n\n"
    return header + data


def get_prompt_2():
    """Return the follow-up prompt asking which topics are proactive and which are reactive intents."""
    fragments = [
        "which of these topics can be classified as proactive intents",
        "(endogenous, proactive knowledge seeking, long term)",
        "and which as reactive intents",
        "(exogenous, reactive knowledge seeking, short term)?",
    ]
    # The fragments are joined with single spaces to form one sentence.
    return " ".join(fragments)


def get_prompt_3():
    """Return the final prompt asking for the previous answer as a JSON object.

    The prompt names the root fields (reactive, proactive) and the fields
    each array item should carry (title, description, time_start, time_end).
    """
    request = "Can you format the previous answer in a json object?"
    root_spec = "the root object should have the fields: reactive, proactive;"
    item_spec = (
        "and the type of these fields should be an array with items with fields:"
        " title, description, time_start, time_end"
    )
    return f"{request} {root_spec} {item_spec}"


async def get_clusters(raw_data):
    """Ask the chat model what the user was doing, given raw search activity.

    Sends a system message plus the prompt built by get_prompt_1(raw_data)
    and returns the message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": get_prompt_1(raw_data)},
    ]
    completion = await client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


async def classify_clusters(raw_data, cluster_data):
    """Ask the chat model to classify earlier topics as proactive or reactive.

    Replays the first exchange (the get_prompt_1 prompt and `cluster_data` as
    the assistant's reply), then sends get_prompt_2() as the new user turn.
    Returns the message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        # Earlier turn, replayed so the model sees its previous answer.
        {"role": "user", "content": get_prompt_1(raw_data)},
        {"role": "assistant", "content": cluster_data},
        # New question for this stage.
        {"role": "user", "content": get_prompt_2()},
    ]
    completion = await client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


async def get_json(raw_data, cluster_data, intent_data):
    """Ask the chat model to restate the classification as a JSON object.

    Replays both earlier exchanges (`cluster_data` and `intent_data` as the
    assistant's replies), then sends get_prompt_3() as the new user turn.
    The request sets response_format to {"type": "json_object"}. Returns the
    message content of the first choice in the response, without parsing it.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
        # First exchange: activity summary.
        {"role": "user", "content": get_prompt_1(raw_data)},
        {"role": "assistant", "content": cluster_data},
        # Second exchange: proactive/reactive classification.
        {"role": "user", "content": get_prompt_2()},
        {"role": "assistant", "content": intent_data},
        # New question for this stage.
        {"role": "user", "content": get_prompt_3()},
    ]
    completion = await client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=conversation,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [21]:
import asyncio

# get all file names in the data folder
file_names = os.listdir('raw_data')
raw_data = []

# for each file name, read the file and get the completion asynchronously
for file_name in file_names:
    with open('raw_data/' + file_name, 'r') as f:
        raw_data.append(f.read())

awaitables = []
for rd in raw_data:    
    awaitables.append(get_clusters(rd))

# wait for all completions to finish
cluster_data = await asyncio.gather(*awaitables)

for i in range(len(file_names)):
    with open('out/' + file_names[i] + '-clustered.txt', 'w') as f:
        f.write(cluster_data[i])

In [22]:
awaitables = []
for (rd, cd) in zip(raw_data, cluster_data):
    awaitables.append(classify_clusters(rd, cd))

intent_data = await asyncio.gather(*awaitables)

for i in range(len(file_names)):
    with open('out/' + file_names[i] + '-classified.txt', 'w') as f:
        f.write(intent_data[i])

In [23]:
awaitables = []
for (rd, cd, id) in zip(raw_data, cluster_data, intent_data):
    awaitables.append(get_json(rd, cd, id))

json_data = await asyncio.gather(*awaitables)

for i in range(len(file_names)):
    with open('out/' + file_names[i] + '-final.json', 'w') as f:
        f.write(json_data[i])

In [None]:
# validate each .json file against schema.json
from jsonschema import validate
import json

# Load the schema once, then check every generated "-final.json" file against it.
with open('schema.json', 'r') as schema_file:
    schema = json.load(schema_file)

for name in file_names:
    final_path = 'out/' + name + '-final.json'
    with open(final_path, 'r') as final_file:
        data = json.load(final_file)
    # The document is fully parsed at this point, so the handle can be closed first.
    validate(instance=data, schema=schema)

In [29]:
# remove ".csv" from all filenames inside the out folder
import os

file_names = os.listdir('out')

# Rename each entry of the out folder to the same name with every ".csv" removed.
for old_name in file_names:
    new_name = old_name.replace('.csv', '')
    os.rename('out/' + old_name, 'out/' + new_name)

In [2]:
import json,os 

# Merge the "reactive" and "proactive" sections of every JSON file in the out
# folder into two summary files, keyed by the file name without its suffix.
files = [f for f in os.listdir('out') if f.endswith('.json')]
reactive = {}
proactive = {}

for file_name in files:
    # The handle gets its own name so the loop variable is not shadowed.
    with open('out/' + file_name, 'r') as json_file:
        parsed = json.load(json_file)  # parse once, reuse for both sections
    # Drop the last 11 characters (the length of "-final.json") to get the key.
    # NOTE(review): presumably the remaining prefix is a date — confirm against
    # the input file names.
    date = file_name[:-11]
    reactive[date] = parsed["reactive"]
    proactive[date] = parsed["proactive"]

# sort the dictionaries by key
reactive = dict(sorted(reactive.items()))
proactive = dict(sorted(proactive.items()))

# The summaries folder may not exist yet; create it rather than failing on open().
os.makedirs('summaries', exist_ok=True)

with open('summaries/reactive.json', 'w') as f:
    json.dump(reactive, f)

with open('summaries/proactive.json', 'w') as f:
    json.dump(proactive, f)
