In [1]:
!pip install openpipe==3.3.1



In [2]:
import polars as pl

hn = pl.read_parquet("data/hn.parquet")
stories = pl.read_parquet("data/stories-classified.parquet")

comments = hn.filter(
    (pl.col("type") == "comment")
    & pl.col("deleted").is_null()
    & pl.col("dead").is_null()
)

In [3]:
tag_cols = ["ai_ml", "crypto", "remote_work", "rust"]
num_stories = stories.describe().filter(pl.col("describe") == "mean")[tag_cols] * len(
    stories
)
num_stories

ai_ml,crypto,remote_work,rust
f64,f64,f64,f64
16593.0,8438.0,3132.0,4217.0


In [4]:
def applied_tag(tag_name):
    return (
        pl.when(
            pl.col("top_level_parent").is_in(stories.filter(pl.col(tag_name))["id"])
        )
        .then(True)
        .otherwise(False)
        .alias(tag_name)
    )


comments = comments.with_columns([applied_tag(tag) for tag in tag_cols])

num_comments = comments.describe().filter(pl.col("describe") == "mean")[tag_cols] * len(
    comments
)
num_comments

ai_ml,crypto,remote_work,rust
f64,f64,f64,f64
1277261.0,817174.0,379192.0,369599.0


In [5]:
relevant_comments = comments.filter(
    pl.col("ai_ml") | pl.col("crypto") | pl.col("remote_work") | pl.col("rust")
)

relevant_comments["text"].str.len_chars().describe()

statistic,value
str,f64
"""count""",2817047.0
"""null_count""",21283.0
"""mean""",392.896374
"""std""",449.83422
"""min""",0.0
"""25%""",131.0
"""50%""",260.0
"""75%""",489.0
"""max""",19371.0


In [6]:
id_to_story_title = (
    stories.filter(
        pl.col("id").is_in(relevant_comments["top_level_parent"])
        # pl.id("ai_ml") | pl.col("crypto") | pl.col("remote_work") | pl.col("rust")
    )[["id", "title"]]
    .to_pandas()
    .set_index("id", drop=True)
)["title"].to_dict()

id_to_text = (
    relevant_comments[["id", "text"]]
    .to_pandas()
    .set_index("id", drop=True)["text"]
    .to_dict()
)

In [7]:
relevant_comments = relevant_comments.with_columns(
    [
        pl.col("top_level_parent").map_dict(id_to_story_title).alias("story_title"),
        pl.col("parent")
        .cast(pl.Int64)
        .map_dict(id_to_text, return_dtype=str)
        .alias("parent_comment_text"),
    ]
)

relevant_comments

id,type,by,time,title,text,url,score,parent,top_level_parent,descendants,kids,deleted,dead,ai_ml,crypto,remote_work,rust,story_title,parent_comment_text
i64,str,str,datetime[μs],str,str,str,f64,f64,i64,f64,list[i64],bool,bool,bool,bool,bool,bool,str,str
1026300,"""comment""","""lrm242""",2010-01-01 21:49:27,,"""This is a grea…",,,1.026228e6,1026228,,"[1026331, 1026782]",,,true,false,false,false,"""Beyond PageRan…",
1026331,"""comment""","""bradfordcross""",2010-01-01 22:09:10,,"""Thanks! Likewi…",,,1.0263e6,1026228,,[1026389],,,true,false,false,false,"""Beyond PageRan…","""This is a grea…"
1026389,"""comment""","""lrm242""",2010-01-01 22:49:18,,"""Indeed. From t…",,,1.026331e6,1026228,,,,,true,false,false,false,"""Beyond PageRan…","""Thanks! Likewi…"
1026416,"""comment""","""felicisvc""",2010-01-01 23:06:55,,"""Thanks for thi…",,,1.026228e6,1026228,,,,,true,false,false,false,"""Beyond PageRan…",
1026428,"""comment""","""ramanujan""",2010-01-01 23:15:02,,"""bradfordcross:…",,,1.026228e6,1026228,,[1026543],,,true,false,false,false,"""Beyond PageRan…",
1026543,"""comment""","""bradfordcross""",2010-01-02 01:21:21,,"""This is the sa…",,,1.026428e6,1026228,,[1026733],,,true,false,false,false,"""Beyond PageRan…","""bradfordcross:…"
1026562,"""comment""","""jonmc12""",2010-01-02 01:41:23,,"""""I want to swi…",,,1.026228e6,1026228,,[1026739],,,true,false,false,false,"""Beyond PageRan…",
1026733,"""comment""","""yannis""",2010-01-02 03:53:37,,"""It is a good a…",,,1.026543e6,1026228,,,,,true,false,false,false,"""Beyond PageRan…","""This is the sa…"
1026739,"""comment""","""sdrinf""",2010-01-02 03:58:53,,"""Um, no.<p>The …",,,1.026562e6,1026228,,[1026793],,,true,false,false,false,"""Beyond PageRan…","""""I want to swi…"
1026782,"""comment""","""johnl""",2010-01-02 04:39:59,,"""Yep, good plac…",,,1.0263e6,1026228,,,,,true,false,false,false,"""Beyond PageRan…","""This is a grea…"


In [8]:
def get_completion_inputs(row, topic):
    expanded_topic = {
        "ai_ml": "AI and ML",
        "crypto": "blockchain/crypto",
        "remote_work": "remote work",
        "rust": "Rustlang",
    }[topic]

    parent_text = (
        f"PARENT COMMENT:\n{row['parent_comment_text']}"
        if row["parent_comment_text"]
        else f"PARENT STORY:\n{row['story_title']}"
    )

    messages = [
        {
            "role": "system",
            "content": f"You will be given an HN child comment and its parent. Do your best to determine the sentiment of the CHILD COMMENT towards <<{expanded_topic}>>.\n\nIf you are unsure or the CHILD COMMENT doesn't express an opinion on {expanded_topic} assume 'neutral' by default.",
        },
        {
            "role": "user",
            "content": f"{parent_text}\n---\nCHILD COMMENT:\n{row['text']}",
        },
    ]
    functions = [
        {
            "name": "classify",
            "parameters": {
                "type": "object",
                "properties": {
                    "sentiment": {
                        "type": "string",
                        "enum": ["positive", "neutral", "negative"],
                    },
                },
                "required": ["sentiment"],
            },
        }
    ]
    function_call = {"name": "classify"}

    return {
        "messages": messages,
        "functions": functions,
        "function_call": function_call,
    }

In [9]:
from shared import serialize_input
import os

comments_to_classify = None

path = "data/comments-to-classify.ndjson"

if os.path.exists(path):
    comments_to_classify = pl.read_ndjson(path)
else:
    for tag in tag_cols:
        print(f"Processing {tag}")
        tag_comments = (
            relevant_comments.filter(pl.col(tag)).with_columns(
                pl.struct(pl.all())
                .map_elements(lambda row: get_completion_inputs(row, tag))
                .alias("input"),
                pl.lit(tag).alias("tag"),
            )
        )[["id", "tag", "input"]]
        if comments_to_classify is None:
            comments_to_classify = tag_comments
        else:
            comments_to_classify = comments_to_classify.vstack(tag_comments)

    comments_to_classify.write_ndjson(path)
comments_to_classify.head(1)

Processing ai_ml


Processing crypto
Processing remote_work
Processing rust


id,tag,input
i64,str,struct[3]
1026300,"""ai_ml""","{[{""system"",""You will be given an HN child comment and its parent. Do your best to determine the sentiment of the CHILD COMMENT towards <<AI and ML>>. If you are unsure or the CHILD COMMENT doesn't express an opinion on AI and ML assume 'neutral' by default.""}, {""user"",""PARENT STORY: Beyond PageRank: Learning with Content and Networks --- CHILD COMMENT: This is a great article. I posted on the same topic but different perspective today as well: <a href=""http://fitnr.com/filtering-the-web-of-noise/"" rel=""nofollow"">http://fitnr.com/filtering-the-web-of-noise/</a>""}],[{""classify"",{""object"",{{""string"",[""positive"", ""neutral"", ""negative""]}},[""sentiment""]}}],{""classify""}}"


In [10]:
training_comments = comments_to_classify.sample(20000, seed=42)

In [11]:
from joblib import Memory
import openpipe
import os
from dotenv import load_dotenv
import json

load_dotenv()

openpipe.configure_openpipe(api_key=os.getenv("OPENPIPE_API_KEY"))

openpipe.openai.api_key = os.getenv("OPENAI_API_KEY")

memory = Memory("/workspace/cache/hn_comment_sentiment_analysis", verbose=0)


@memory.cache
def classify_comment(row):
    resp = openpipe.openai.ChatCompletion.create(
        model="gpt-4",
        **row["input"],
        openpipe={"tags": {"prompt_id": "classify_hn_comment_v10"}},
    )

    json.loads(resp.choices[0].message.function_call.arguments)
    return resp


classify_comment(training_comments[0].to_dicts()[0])

# print(training_comments[0].to_dicts()[0])

[2023-11-03 00:35:49] HTTP Request: POST https://app.openpipe.ai/api/v1/report "HTTP/1.1 200 OK"


<OpenAIObject chat.completion id=chatcmpl-8GcS2LCBLDNMkytaJmC4XOuheiLUM at 0x7f242a9f80e0> JSON: {
  "id": "chatcmpl-8GcS2LCBLDNMkytaJmC4XOuheiLUM",
  "object": "chat.completion",
  "created": 1698971746,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": null,
        "function_call": {
          "name": "classify",
          "arguments": "{\n\"sentiment\": \"positive\"\n}"
        }
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 236,
    "completion_tokens": 9,
    "total_tokens": 245
  },
  "openpipe": {
    "cache_status": "SKIP"
  }
}

In [12]:
import tqdm
import json
from joblib import Parallel, delayed


training_data = []


def process_input(row):
    for tag in tag_cols:
        row[f"sentiment_{tag}"] = None
        if row[tag]:
            try:
                output = classify_comment(row, tag)
                sentiment = json.loads(
                    output.choices[0].message["function_call"]["arguments"]
                )["sentiment"]
                row[f"sentiment_{tag}"] = sentiment
            except Exception as e:
                print(e)
    return row


gpt4_labeled_comments = Parallel(n_jobs=20)(
    delayed(process_input)(row)
    for row in tqdm.tqdm(training_comments.head(20000).rows(named=True))
)
gpt4_labeled_comments = pl.DataFrame(gpt4_labeled_comments)
gpt4_labeled_comments

  0%|          | 1/20000 [00:00<1:26:37,  3.85it/s]Exception ignored in: <function _releaseLock at 0x7f44648b81f0>
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 
  0%|          | 20/20000 [00:05<1:28:52,  3.75it/s]

KeyError: 'ai_ml'

In [None]:
gpt4_labeled_comments.write_parquet("data/labeled-comments.parquet")