In [23]:
import requests as r
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd

'''
All HN API requests are constructed using the following reference: https://github.com/HackerNews/API
'''

base_url = "https://hacker-news.firebaseio.com/v0/"
human_display_item_url = "https://news.ycombinator.com/item?id="
new_stories_slug = "newstories"
item_slug = "item/"
json_append_slug = ".json"

In [24]:
### Get Story IDs for the newest stories from the HN New Stories endpoint
new_stories_response = r.get(base_url + new_stories_slug + json_append_slug)
new_stories_json = new_stories_response.json()

In [32]:
# returns datetime.timedelta object representing time between execution and story posting
def get_td_from_story(story):
    story_unix_time = story['time']
    story_datetime = datetime.fromtimestamp(story_unix_time)
    timedelta_since_post = datetime.now() - story_datetime
    return timedelta_since_post

def get_comments_per_minute_from_story(timedelta_since_post, num_comments):
    minutes_since_post = timedelta_since_post.seconds / 60
    comments_per_minute = (num_comments * 1.0) / minutes_since_post
    return comments_per_minute

def format_timedelta(timedelta_since_post):
    hours_since_post, remainder = divmod(timedelta_since_post.seconds, 3600)
    minutes_since_post, seconds = divmod(remainder, 60)
    time_since_post = str(hours_since_post) + 'h' + str(minutes_since_post) + 'm'
    return time_since_post

def get_comment_timestamps(story_json, delay_in_seconds):
    comments = story_json['kids']
    comment_timestamps = []
    for comment_id in comments:
        comment_response = r.get(base_url + item_slug + str(comment_id) + json_append_slug)
        comment = comment_response.json()
        comment_timestamps.append(comment['time'])
        time.sleep(delay_in_seconds)
    return comment_timestamps

def get_item_timestamps_for_all_kids(item_json, delay_in_seconds):
    if 'kids' not in item_json:
        return []
    kids = item_json['kids']
    print("Pulling timestamp info of kids for item with " + str(len(kids)) + " kids")
    kid_timestamps = []
    for kid_id in kids:
        kid_response = r.get(base_url + item_slug + str(kid_id) + json_append_slug)
        kid_json = kid_response.json()
        kid_timestamps.append(kid_json['time'])
        kid_timestamps = kid_timestamps + get_item_timestamps_for_all_kids(kid_json, delay_in_seconds)
        time.sleep(delay_in_seconds)
    return kid_timestamps

# Structure: story_id: {num_comments, }
story_info_tracker = {}
num_stories = len(new_stories_json)
comment_timeseries_records = []
for i, story_id in enumerate(new_stories_json):
    story_response = r.get(base_url + item_slug + str(story_id) + json_append_slug)
    story = story_response.json()
    if 'descendants' in story:
        num_comments = story['descendants']
    elif ('dead' in story and story['dead']):
        continue
    elif ('deleted' in story and story['deleted']):
        continue
    else:
        raise Exception("Couldn't find descendants in story: " + str(story))
    title = story['title']

    story_unix_time = story['time']
    story_datetime = datetime.fromtimestamp(story_unix_time)
    
    timedelta_since_post = get_td_from_story(story)
    comments_per_minute = get_comments_per_minute_from_story(timedelta_since_post, num_comments)
    time_since_post = format_timedelta(timedelta_since_post)
    if num_comments > 0:
        item_timestamps = get_item_timestamps_for_all_kids(story, 0.1)
    else:
        item_timestamps = []
    for timestamp in item_timestamps:
        comment_timeseries_records.append({'story_id': story_id, 
                                           'story_title': title, 
                                           'comment_timestamp': datetime.fromtimestamp(timestamp)})
    story_info_tracker[story_id] = {
        'story_id': story_id,
        'num_comments': num_comments,
        'title': title,
        'timedelta_since_post': timedelta_since_post,
        'time_since_post': time_since_post,
        'comments_per_minute': comments_per_minute,
        'story_datetime': story_datetime,
        'url': human_display_item_url + str(story_id),
        'comment_timestamps': item_timestamps,
    }
    print("Pulled info for story " + str(i) + "/" + str(num_stories))

Pulled info for story 0/500
Pulled info for story 1/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 2/500
Pulled info for story 3/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 4/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 5/500
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 6/500
Pulled info for story 7/500
Pulling timestamp info of kids for item with 3 kids
Pulling timestamp info of kids for item with 1 kids
Pulled info for story 8/500
Pulled info for story 9/500
Pulled info for story 10/500
Pulled info for story 11/500
Pulled info for story 12/500
Pulled info for story 13/500
Pulled info for story 14/500
Pulled info for story 15/500
Pulled info for story 16/500
Pulling timestamp info of kids for item with 7 kids
Pulling timestamp info of kids for item with 2 kids
Pulling timestamp info of kids for item with 5 kids
Pulling timestamp info of kids for item with 3 ki

In [34]:
comment_timeseries_df = pd.DataFrame.from_records(comment_timeseries_records)

comment_timeseries_df.to_csv('comment_timeseries_df.csv')

In [35]:
# Gets comments per minute from an array of unix timestamps representing comment times, and minutes in the past to consider recent.
def get_comments_per_minute_from_recent_comments(comment_unix_timestamp_array, minutes_to_consider_recent):
    recency_datetime = datetime.now() - timedelta(minutes=minutes_to_consider_recent)
    filtered_comments = [comment for comment in comment_unix_timestamp_array if comment >= recency_datetime.timestamp()]
    filtered_comments_sorted = sorted(filtered_comments)
    if len(filtered_comments_sorted) <= 1:
        return 0
    first_comment_timestamp = datetime.fromtimestamp(filtered_comments_sorted[0])
    last_comment_timestamp = datetime.fromtimestamp(filtered_comments_sorted[-1])
    timedelta_between_first_last_comment = last_comment_timestamp - first_comment_timestamp
    minutes_between_first_last_comment = (timedelta_between_first_last_comment.seconds * 1.0) / 60
    return (len(comment_unix_timestamp_array) * 1.0) / minutes_between_first_last_comment

def get_stories_with_recent_comment_velocity(stories, recency_in_minutes):
    stories_with_recent_comment_velocity = {}
    for story_id, story in stories.items():
        recent_comment_velocity = get_comments_per_minute_from_recent_comments(story['comment_timestamps'], recency_in_minutes)
        story['recent_comments_per_minute'] = recent_comment_velocity
        stories_with_recent_comment_velocity[story_id] = story
    return stories_with_recent_comment_velocity

def convert_sorted_stories_to_display_list(sorted_stories):
    original_list = [sorted_story[1] for sorted_story in sorted_stories]
    display_list = []
    for story in original_list:
        display_story = story.copy()
        if 'comment_timestamps' in display_story:
            del display_story['comment_timestamps']
        display_list.append(display_story)
    return display_list

def get_recent(sorted_stories_display_list, minutes_to_consider_recent):
    return [sorted_story for sorted_story in sorted_stories_display_list \
            if (sorted_story['timedelta_since_post'].seconds / 60) < minutes_to_consider_recent]


###
###
###

## EDIT THIS TO CHANGE DEFINITION OF "RECENT COMMENT"
MINUTES_TO_CONSIDER_RECENT = 120


###
###
###

## SORT AND FORMAT STORIES BY RELEVANT METRICS

story_info_tracker = get_stories_with_recent_comment_velocity(story_info_tracker, MINUTES_TO_CONSIDER_RECENT)

num_comments_sorted_stories = sorted(story_info_tracker.items(), key=lambda x: x[1]['num_comments'], reverse=True)
comment_velocity_sorted_stories = sorted(story_info_tracker.items(), key=lambda x: x[1]['comments_per_minute'], reverse=True)
recent_comment_velocity_sorted_stories = sorted(story_info_tracker.items(), key=lambda x: x[1]['recent_comments_per_minute'], reverse=True)

num_comments_sorted_stories_display = convert_sorted_stories_to_display_list(num_comments_sorted_stories)
comment_velocity_sorted_stories_display = convert_sorted_stories_to_display_list(comment_velocity_sorted_stories)
recent_comment_velocity_sorted_stories_display = convert_sorted_stories_to_display_list(recent_comment_velocity_sorted_stories)

In [36]:
# get_recent(comment_velocity_sorted_stories_display, (3 * 60))
# get_recent(num_comments_sorted_stories_display, (3*60))

# recent_comment_velocity_sorted_stories_display

get_recent(comment_velocity_sorted_stories_display, (6 * 60))

[{'num_comments': 68,
  'title': "Foxconn's promise to invest $10B in Wisconsin is now a distant memory",
  'timedelta_since_post': datetime.timedelta(seconds=10673, microseconds=202265),
  'time_since_post': '2h57m',
  'comments_per_minute': 0.382273025391174,
  'story_datetime': datetime.datetime(2023, 9, 3, 15, 54, 20),
  'url': 'https://news.ycombinator.com/item?id=37374966',
  'recent_comments_per_minute': 0.807633327653774},
 {'num_comments': 101,
  'title': 'Shorts risks cannibalising core YouTube business, say senior staff',
  'timedelta_since_post': datetime.timedelta(seconds=18510, microseconds=181266),
  'time_since_post': '5h8m',
  'comments_per_minute': 0.3273905996758509,
  'story_datetime': datetime.datetime(2023, 9, 3, 13, 45, 23),
  'url': 'https://news.ycombinator.com/item?id=37373710',
  'recent_comments_per_minute': 1.0790990047145101},
 {'num_comments': 75,
  'title': "Google preemptively banned hundreds of millions of 'pirate' URLs last year",
  'timedelta_since_p

In [37]:
comment_velocity_df = pd.DataFrame.from_records(comment_velocity_sorted_stories_display)
recent_comment_velocity_df = pd.DataFrame.from_records(comment_velocity_sorted_stories_display)

In [38]:
comment_velocity_df

Unnamed: 0,num_comments,title,timedelta_since_post,time_since_post,comments_per_minute,story_datetime,url,recent_comments_per_minute
0,297,Climate Change Tracker,0 days 10:09:29.194326,10h9m,0.487298,2023-09-03 08:53:21,https://news.ycombinator.com/item?id=37370900,3.997404
1,405,The boiling frog of digital freedom,0 days 16:19:38.451169,16h19m,0.413420,2023-09-03 02:49:37,https://news.ycombinator.com/item?id=37368824,4.729960
2,139,Tesla FSD Beta tried to kill me last night,0 days 06:02:50.303049,6h2m,0.383096,2023-09-03 12:52:58,https://news.ycombinator.com/item?id=37373182,1.593417
3,68,Foxconn's promise to invest $10B in Wisconsin ...,0 days 02:57:53.202265,2h57m,0.382273,2023-09-03 15:54:20,https://news.ycombinator.com/item?id=37374966,0.807633
4,101,Shorts risks cannibalising core YouTube busine...,0 days 05:08:30.181266,5h8m,0.327391,2023-09-03 13:45:23,https://news.ycombinator.com/item?id=37373710,1.079099
...,...,...,...,...,...,...,...,...
495,0,Ostriches Are So Horny for Humans It Interfere...,0 days 15:46:57.446114,15h46m,0.000000,2023-09-03 03:21:46,https://news.ycombinator.com/item?id=37368940,0.000000
496,0,OWASP Kubernetes Top Ten,0 days 16:06:33.483866,16h6m,0.000000,2023-09-03 03:02:39,https://news.ycombinator.com/item?id=37368877,0.000000
497,0,Event-Driven Architecture: A Beginner's Guide ...,0 days 16:08:55.626303,16h8m,0.000000,2023-09-03 03:00:17,https://news.ycombinator.com/item?id=37368869,0.000000
498,0,"Show HN: FastMJPG, low latency capture, transf...",0 days 16:15:55.163912,16h15m,0.000000,2023-09-03 02:53:20,https://news.ycombinator.com/item?id=37368840,0.000000


In [None]:
c = (alt.Chart(comment_velocity_df)
        .mark_area(opacity=0.3)
        .encode(
            x='timedelta_since_post',
            y='comments_per_minute',
            color='title',
    ))
st.altair_chart(c, use_container_width=True)

In [None]:
st.dataframe(comment_velocity_df)

In [39]:
recent_comment_velocity_df.to_csv('recent_comment_velocity_df.csv')

In [19]:
time.max

datetime.time(23, 59, 59, 999999)

In [47]:
experiment = comment_timeseries_df.groupby(
    ['story_id']
).agg(
    {
        'comment_timestamp': ['min', 'count'],
    }
)

experiment

Unnamed: 0_level_0,comment_timestamp,comment_timestamp
Unnamed: 0_level_1,min,count
story_id,Unnamed: 1_level_2,Unnamed: 2_level_2
37368824,2023-09-03 03:54:36,416
37368841,2023-09-03 05:39:52,9
37368913,2023-09-03 03:16:34,1
37368929,2023-09-03 03:52:38,110
37368977,2023-09-03 04:56:02,56
...,...,...
37375775,2023-09-03 17:56:28,4
37375822,2023-09-03 18:01:39,1
37375847,2023-09-03 18:25:04,1
37375868,2023-09-03 18:04:43,1
