In [8]:
import os
import json
import openai
import time
from tqdm import tqdm
import polars as pl
from dotenv import dotenv_values

config = dotenv_values("../.env") 

In [18]:
def get_issue_reponses(title, body):
    cost = 0
    response = openai.ChatCompletion.create(
        engine=config["OPENAI_ENGINE"], 
        messages=[
            {"role": "system", "content": "You are a helpful assistant that is python master and have years of experience with github moderation."},
            {"role": "user", "content": f"Is the description of issue good? \ntitle: {title}\nbody: {body}\nAnswer with Yes/No as single word"},
        ]
    )
    description_quality = response['choices'][0]['message']['content']
    cost += response['usage']["total_tokens"]

    response = openai.ChatCompletion.create(
        engine=config["OPENAI_ENGINE"], 
        messages=[
            {"role": "system", "content": "You are a helpful assistant that is python master and have years of experience with github moderation."},
            {"role": "user", "content": f"Give me one sentence summary of following issue, use ten words or less: \ntitle: {title}\nbody: {body}"},
        ]
    )
    summary = response['choices'][0]['message']['content']
    cost += response['usage']["total_tokens"]

    response = openai.ChatCompletion.create(
        engine=config["OPENAI_ENGINE"], 
        messages=[
            {"role": "system", "content": "You are a helpful assistant that is python master and have years of experience with github moderation. You are maintainer of shap package"},
            {"role": "user", "content": f"Suggest short label for this issue. Label should be of maximum two words: \ntitle: {title}\nbody: {body}"},
        ]
    )
    short_label = response['choices'][0]['message']['content']
    cost += response['usage']["total_tokens"]

    issue_labels = [
        "bug",
        "documentation",
        "enhancement",
        "good first issue",
        "stale"
    ]
    response = openai.ChatCompletion.create(
        engine=config["OPENAI_ENGINE"], 
        messages=[
            {"role": "system", "content": "You are a helpful assistant that is python master and have years of experience with github moderation. You are maintainer of shap package"},
            {"role": "user", "content": f"Select label from {issue_labels}, where stale label is for issues that are too short and not clear: \ntitle: {title}\nbody: {body}\nAnswer with label name from list and nothing else"},
        ]
    )
    label = response['choices'][0]['message']['content']
    cost += response['usage']["total_tokens"]
    # response = openai.ChatCompletion.create(
    #     engine=config["OPENAI_ENGINE"], 
    #     messages=[
    #         {"role": "system", "content": "You are a helpful assistant that is python master and have years of experience with github moderation. You are maintainer of shap package"},
    #         {"role": "user", "content": f"Write first comment for: \ntitle: {title}\nbody: {body}\n"},
    #     ]
    # )
    # comment = response['choices'][0]['message']['content']

    return description_quality, summary, short_label, label, cost

In [21]:
openai.api_type = "azure"
openai.api_base = config["OPENAI_ENDPOINT"]
openai.api_version = "2023-05-15"
openai.api_key = config["OPENAI_KEY"]

data = pl.read_json("../data/issues_v1.json").filter(pl.col("number").is_not_null()).sort("number")
data = data.with_columns([
    pl.lit("").alias("description_quality"),
    pl.lit("").alias("summary"),
    pl.lit("").alias("short_label"),
    pl.lit("").alias("label"),
])

In [22]:
total_tokens = 0

In [45]:

for i in tqdm(range(len(data))):
    if data[i, "summary"] != "":
        continue
    row = data[i]
    title = row["title"]
    body = row["body"]
    passed = False
    while not passed:
        try:
            description_quality, summary, short_label, label, tokens = get_issue_reponses(title, body)
            data[i, "description_quality"] = description_quality
            data[i, "summary"] = summary
            data[i, "short_label"] = short_label
            data[i, "label"] = label
            total_tokens += tokens
            passed = True
        except (openai.error.RateLimitError, openai.error.APIConnectionError):
            time.sleep(30)
    # print(f"Total tokens: {total_tokens}")
    # print(f"Average tokens per issue: {total_tokens / (i + 1)}")
    print(f"Estimated cost: {total_tokens / 1000 * 0.028}")

 50%|█████     | 1492/2955 [00:02<00:02, 633.54it/s]

Estimated cost: 18.288619999999998
Estimated cost: 18.301332000000002


 51%|█████     | 1494/2955 [00:38<00:52, 27.88it/s] 

Estimated cost: 18.313708000000002


 51%|█████     | 1495/2955 [00:40<00:56, 25.73it/s]

Estimated cost: 18.326168


 51%|█████     | 1496/2955 [01:13<02:30,  9.71it/s]

Estimated cost: 18.338152


 51%|█████     | 1497/2955 [01:15<02:37,  9.26it/s]

Estimated cost: 18.350696


 51%|█████     | 1498/2955 [01:49<05:51,  4.15it/s]

Estimated cost: 18.363016000000002


 51%|█████     | 1499/2955 [01:51<06:05,  3.99it/s]

Estimated cost: 18.375392


 51%|█████     | 1500/2955 [02:24<12:28,  1.94it/s]

Estimated cost: 18.387544


 51%|█████     | 1501/2955 [02:26<12:55,  1.87it/s]

Estimated cost: 18.39978


 51%|█████     | 1502/2955 [03:01<25:48,  1.07s/it]

Estimated cost: 18.41196


 51%|█████     | 1503/2955 [03:03<26:22,  1.09s/it]

Estimated cost: 18.424056


 51%|█████     | 1504/2955 [03:36<50:09,  2.07s/it]

Estimated cost: 18.43618


 51%|█████     | 1505/2955 [03:39<50:16,  2.08s/it]

Estimated cost: 18.449088


 51%|█████     | 1506/2955 [04:12<1:33:29,  3.87s/it]

Estimated cost: 18.461184


 51%|█████     | 1507/2955 [04:15<1:30:30,  3.75s/it]

Estimated cost: 18.47342


 51%|█████     | 1508/2955 [04:48<2:39:34,  6.62s/it]

Estimated cost: 18.485824


 51%|█████     | 1509/2955 [04:51<2:27:09,  6.11s/it]

Estimated cost: 18.49806


 51%|█████     | 1510/2955 [05:24<4:04:43, 10.16s/it]

Estimated cost: 18.510268


 51%|█████     | 1511/2955 [05:27<3:33:18,  8.86s/it]

Estimated cost: 18.522336


 51%|█████     | 1512/2955 [06:01<5:34:41, 13.92s/it]

Estimated cost: 18.534460000000003


 51%|█████     | 1513/2955 [06:04<4:32:44, 11.35s/it]

Estimated cost: 18.546556


 51%|█████     | 1514/2955 [06:37<6:41:05, 16.70s/it]

Estimated cost: 18.558960000000003


 51%|█████▏    | 1515/2955 [06:40<5:12:35, 13.02s/it]

Estimated cost: 18.571168


 51%|█████▏    | 1516/2955 [07:13<7:22:57, 18.47s/it]

Estimated cost: 18.583712


 51%|█████▏    | 1517/2955 [07:15<5:36:21, 14.03s/it]

Estimated cost: 18.596172


 51%|█████▏    | 1518/2955 [07:52<8:10:13, 20.47s/it]

Estimated cost: 18.608352


 51%|█████▏    | 1519/2955 [07:54<6:03:09, 15.17s/it]

Estimated cost: 18.620672


 51%|█████▏    | 1520/2955 [08:28<8:10:41, 20.52s/it]

Estimated cost: 18.63288


 51%|█████▏    | 1521/2955 [08:30<6:01:37, 15.13s/it]

Estimated cost: 18.645256000000003


 52%|█████▏    | 1522/2955 [09:04<8:16:16, 20.78s/it]

Estimated cost: 18.65738


 52%|█████▏    | 1523/2955 [09:06<6:02:50, 15.20s/it]

Estimated cost: 18.66914


 52%|█████▏    | 1523/2955 [09:38<09:03,  2.63it/s]  


APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [44]:
data.write_json("../data/issues_v1_gpt.json", row_oriented=True, pretty=True)