In [48]:
import requests as r
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd

'''
All HN API requests are constructed using the following reference: https://github.com/HackerNews/API
'''

base_url = "https://hacker-news.firebaseio.com/v0/"
human_display_item_url = "https://news.ycombinator.com/item?id="
new_stories_slug = "newstories"
item_slug = "item/"
json_append_slug = ".json"

In [49]:
### Get Story IDs for the newest stories from the HN New Stories endpoint
new_stories_response = r.get(base_url + new_stories_slug + json_append_slug)
new_stories_json = new_stories_response.json()

In [50]:
# returns datetime.timedelta object representing time between execution and story posting
def get_td_from_story(story):
    story_unix_time = story['time']
    story_datetime = datetime.fromtimestamp(story_unix_time)
    timedelta_since_post = datetime.now() - story_datetime
    return timedelta_since_post

def get_comments_per_minute_from_story(timedelta_since_post, num_comments):
    minutes_since_post = timedelta_since_post.seconds / 60
    comments_per_minute = (num_comments * 1.0) / minutes_since_post
    return comments_per_minute

def format_timedelta(timedelta_since_post):
    hours_since_post, remainder = divmod(timedelta_since_post.seconds, 3600)
    minutes_since_post, seconds = divmod(remainder, 60)
    time_since_post = str(hours_since_post) + 'h' + str(minutes_since_post) + 'm'
    return time_since_post

def get_comment_timestamps(story_json, delay_in_seconds):
    comments = story_json['kids']
    comment_timestamps = []
    for comment_id in comments:
        comment_response = r.get(base_url + item_slug + str(comment_id) + json_append_slug)
        comment = comment_response.json()
        comment_timestamps.append(comment['time'])
        time.sleep(delay_in_seconds)
    return comment_timestamps

def get_item_timestamps_for_all_kids(item_json, delay_in_seconds):
    if 'kids' not in item_json:
        return []
    kids = item_json['kids']
    print("Pulling timestamp info of kids for item with " + str(len(kids)) + " kids")
    kid_timestamps = []
    for kid_id in kids:
        kid_response = r.get(base_url + item_slug + str(kid_id) + json_append_slug)
        kid_json = kid_response.json()
        kid_timestamps.append(kid_json['time'])
        kid_timestamps = kid_timestamps + get_item_timestamps_for_all_kids(kid_json, delay_in_seconds)
        time.sleep(delay_in_seconds)
    return kid_timestamps

# Structure: story_id: {num_comments, }
story_info_tracker = {}
num_stories = len(new_stories_json)
comment_timeseries_records = []
for i, story_id in enumerate(new_stories_json):
    story_response = r.get(base_url + item_slug + str(story_id) + json_append_slug)
    story = story_response.json()
    if 'descendants' in story:
        num_comments = story['descendants']
    elif ('dead' in story and story['dead']):
        continue
    elif ('deleted' in story and story['deleted']):
        continue
    else:
        raise Exception("Couldn't find descendants in story: " + str(story))
    title = story['title']

    story_unix_time = story['time']
    story_datetime = datetime.fromtimestamp(story_unix_time)
    
    timedelta_since_post = get_td_from_story(story)
    comments_per_minute = get_comments_per_minute_from_story(timedelta_since_post, num_comments)
    time_since_post = format_timedelta(timedelta_since_post)
    if num_comments > 0:
        item_timestamps = get_item_timestamps_for_all_kids(story, 0.1)
    else:
        item_timestamps = []
    for timestamp in item_timestamps:
        comment_timeseries_records.append({'story_id': story_id, 
                                           'story_title': title,
                                           'comment_timestamp': datetime.fromtimestamp(timestamp)})
    story_info_tracker[story_id] = {
        'story_id': story_id,
        'num_comments': num_comments,
        'title': title,
        'timedelta_since_post': timedelta_since_post,
        'time_since_post': time_since_post,
        'comments_per_minute': comments_per_minute,
        'story_datetime': story_datetime,
        'url': human_display_item_url + str(story_id),
        'comment_timestamps': item_timestamps,
    }
    print("Pulled info for story " + str(i) + "/" + str(num_stories))

Pulled info for story 0/500
Pulled info for story 1/500
Pulled info for story 2/500
Pulled info for story 3/500
Pulled info for story 4/500
Pulled info for story 5/500
Pulled info for story 6/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 7/500
Pulled info for story 8/500
Pulled info for story 9/500
Pulled info for story 10/500
Pulled info for story 11/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 12/500
Pulled info for story 13/500
Pulled info for story 14/500
Pulled info for story 15/500
Pulled info for story 16/500
Pulled info for story 17/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 18/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 19/500
Pulled info for story 20/500
Pulled info for story 21/500
Pulled info for story 22/500
Pulled info for story 23/500
Pulled info for story 24/500
Pulled info for story 25/500
Pulled info for story 26/500
Pulled info for sto

In [51]:
comment_timeseries_df = pd.DataFrame.from_records(comment_timeseries_records)

comment_timeseries_df.to_csv('comment_timeseries_df.csv')

In [52]:
# Gets comments per minute from an array of unix timestamps representing comment times, and minutes in the past to consider recent.
def get_comments_per_minute_from_recent_comments(comment_unix_timestamp_array, minutes_to_consider_recent):
    recency_datetime = datetime.now() - timedelta(minutes=minutes_to_consider_recent)
    filtered_comments = [comment for comment in comment_unix_timestamp_array if comment >= recency_datetime.timestamp()]
    filtered_comments_sorted = sorted(filtered_comments)
    if len(filtered_comments_sorted) <= 1:
        return 0
    first_comment_timestamp = datetime.fromtimestamp(filtered_comments_sorted[0])
    last_comment_timestamp = datetime.fromtimestamp(filtered_comments_sorted[-1])
    timedelta_between_first_last_comment = last_comment_timestamp - first_comment_timestamp
    minutes_between_first_last_comment = (timedelta_between_first_last_comment.seconds * 1.0) / 60
    return (len(comment_unix_timestamp_array) * 1.0) / minutes_between_first_last_comment

def get_stories_with_recent_comment_velocity(stories, recency_in_minutes):
    stories_with_recent_comment_velocity = {}
    for story_id, story in stories.items():
        recent_comment_velocity = get_comments_per_minute_from_recent_comments(story['comment_timestamps'], recency_in_minutes)
        story['recent_comments_per_minute'] = recent_comment_velocity
        stories_with_recent_comment_velocity[story_id] = story
    return stories_with_recent_comment_velocity

def convert_sorted_stories_to_display_list(sorted_stories):
    original_list = [sorted_story[1] for sorted_story in sorted_stories]
    display_list = []
    for story in original_list:
        display_story = story.copy()
        if 'comment_timestamps' in display_story:
            del display_story['comment_timestamps']
        display_list.append(display_story)
    return display_list

def get_recent(sorted_stories_display_list, minutes_to_consider_recent):
    return [sorted_story for sorted_story in sorted_stories_display_list \
            if (sorted_story['timedelta_since_post'].seconds / 60) < minutes_to_consider_recent]


###
###
###

## EDIT THIS TO CHANGE DEFINITION OF "RECENT COMMENT"
MINUTES_TO_CONSIDER_RECENT = 120


###
###
###

## SORT AND FORMAT STORIES BY RELEVANT METRICS

story_info_tracker = get_stories_with_recent_comment_velocity(story_info_tracker, MINUTES_TO_CONSIDER_RECENT)

num_comments_sorted_stories = sorted(story_info_tracker.items(), key=lambda x: x[1]['num_comments'], reverse=True)
comment_velocity_sorted_stories = sorted(story_info_tracker.items(), key=lambda x: x[1]['comments_per_minute'], reverse=True)
recent_comment_velocity_sorted_stories = sorted(story_info_tracker.items(), key=lambda x: x[1]['recent_comments_per_minute'], reverse=True)

num_comments_sorted_stories_display = convert_sorted_stories_to_display_list(num_comments_sorted_stories)
comment_velocity_sorted_stories_display = convert_sorted_stories_to_display_list(comment_velocity_sorted_stories)
recent_comment_velocity_sorted_stories_display = convert_sorted_stories_to_display_list(recent_comment_velocity_sorted_stories)

In [53]:
# get_recent(comment_velocity_sorted_stories_display, (3 * 60))
# get_recent(num_comments_sorted_stories_display, (3*60))

# recent_comment_velocity_sorted_stories_display

get_recent(comment_velocity_sorted_stories_display, (6 * 60))

[{'story_id': 37379534,
  'num_comments': 56,
  'title': 'Reddit faces content quality concerns after its Great Mod Purge',
  'timedelta_since_post': datetime.timedelta(seconds=6760, microseconds=436283),
  'time_since_post': '1h52m',
  'comments_per_minute': 0.4970414201183432,
  'story_datetime': datetime.datetime(2023, 9, 4, 5, 41, 30),
  'url': 'https://news.ycombinator.com/item?id=37379534',
  'recent_comments_per_minute': 0.6939003917179631},
 {'story_id': 37379801,
  'num_comments': 32,
  'title': 'I built a Plane Spotter in 120 secs with ChatGPT',
  'timedelta_since_post': datetime.timedelta(seconds=4219, microseconds=7076),
  'time_since_post': '1h10m',
  'comments_per_minute': 0.4550841431618867,
  'story_datetime': datetime.datetime(2023, 9, 4, 6, 23, 38),
  'url': 'https://news.ycombinator.com/item?id=37379801',
  'recent_comments_per_minute': 1.1592505854800939},
 {'story_id': 37379272,
  'num_comments': 52,
  'title': 'TV Museum Will Die in 48 Hours Unless Sony Retracts Y

In [54]:
comment_velocity_df = pd.DataFrame.from_records(comment_velocity_sorted_stories_display)
recent_comment_velocity_df = pd.DataFrame.from_records(comment_velocity_sorted_stories_display)

In [55]:
comment_velocity_df

Unnamed: 0,story_id,num_comments,title,timedelta_since_post,time_since_post,comments_per_minute,story_datetime,url,recent_comments_per_minute
0,37379534,56,Reddit faces content quality concerns after it...,0 days 01:52:40.436283,1h52m,0.497041,2023-09-04 05:41:30,https://news.ycombinator.com/item?id=37379534,0.693900
1,37379801,32,I built a Plane Spotter in 120 secs with ChatGPT,0 days 01:10:19.007076,1h10m,0.455084,2023-09-04 06:23:38,https://news.ycombinator.com/item?id=37379801,1.159251
2,37379272,52,TV Museum Will Die in 48 Hours Unless Sony Ret...,0 days 02:32:23.449544,2h32m,0.341245,2023-09-04 05:02:11,https://news.ycombinator.com/item?id=37379272,0.564952
3,37378669,80,The curl-wget Venn diagram,0 days 04:13:24.294482,4h13m,0.315706,2023-09-04 03:21:38,https://news.ycombinator.com/item?id=37378669,0.896940
4,37376686,150,Glove80 Ergonomic Keyboard,0 days 10:47:10.732236,10h47m,0.231780,2023-09-03 20:52:04,https://news.ycombinator.com/item?id=37376686,1.538202
...,...,...,...,...,...,...,...,...,...
495,37373237,0,Court Rules on Civilian Drones Used to Record ...,0 days 18:51:34.285422,18h51m,0.000000,2023-09-03 12:59:14,https://news.ycombinator.com/item?id=37373237,0.000000
496,37373180,0,Using ZRAM Instead of Swap,0 days 18:58:40.026572,18h58m,0.000000,2023-09-03 12:52:53,https://news.ycombinator.com/item?id=37373180,0.000000
497,37373177,0,Russia Linked Hackers Attack UK Ministry of De...,0 days 18:58:58.170254,18h58m,0.000000,2023-09-03 12:52:35,https://news.ycombinator.com/item?id=37373177,0.000000
498,37373150,0,Ask HN: Where do I get bank card BINs?,0 days 19:01:01.317053,19h1m,0.000000,2023-09-03 12:50:32,https://news.ycombinator.com/item?id=37373150,0.000000


In [None]:
c = (alt.Chart(comment_velocity_df)
        .mark_area(opacity=0.3)
        .encode(
            x='timedelta_since_post',
            y='comments_per_minute',
            color='title',
    ))
st.altair_chart(c, use_container_width=True)

In [None]:
st.dataframe(comment_velocity_df)

In [39]:
recent_comment_velocity_df.to_csv('recent_comment_velocity_df.csv')

In [19]:
time.max

datetime.time(23, 59, 59, 999999)

In [101]:
comment_timeseries_csv_filename = 'comment_timeseries_0.1.csv'
stories_csv_filename = 'stories_0.1.csv'

story_parse_dates = ['story_datetime']
comment_parse_dates = ['comment_timestamp']
story_df = pd.read_csv(stories_csv_filename, parse_dates=story_parse_dates)
comment_df = pd.read_csv(comment_timeseries_csv_filename, parse_dates=comment_parse_dates)

recent_comment_metrics = comment_df.groupby(
        ['story_id']
    ).agg(
        {
            'comment_timestamp': ['min', 'count'],
        }
    )

# groupby / agg with multiple aggs gives a multindex, need to drop a level to join
recent_comment_metrics.columns = recent_comment_metrics.columns.droplevel(0)
recent_comment_metrics = recent_comment_metrics.rename(
    {'min': 'earliest_recent_comment_timestamp', 'count': 'num_recent_comments'},
    axis=1
)

recent_comment_metrics
#joined_df = story_df.merge(recent_comment_metrics, on=['story_id'])

#joined_df


Unnamed: 0_level_0,earliest_recent_comment_timestamp,num_recent_comments
story_id,Unnamed: 1_level_1,Unnamed: 2_level_1
37373410,2023-09-03 14:03:49,10
37373503,2023-09-03 13:58:36,2
37373560,2023-09-03 13:43:13,6
37373635,2023-09-03 14:05:29,61
37373637,2023-09-03 13:40:23,1
...,...,...
37380303,2023-09-04 07:34:33,1
37380343,2023-09-04 07:37:50,1
37380353,2023-09-04 07:47:05,3
37380464,2023-09-04 07:53:57,1
