Analyzing Hacker News Posts
========

This project analyzes the posts in Hacker News posts based on points, comments and titles.

In [159]:
from csv import reader
opened_file = open("hacker_news.csv")
read_file = reader(opened_file)
hn = list(read_file)

In [160]:
print(hn[:5])

[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]


In [161]:
headers = hn[0]

In [162]:
hn = hn[1:]

In [163]:
print(headers)

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']


In [164]:
print(hn[:5])

[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]


In [165]:
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    title = row[1]
    if title.lower().startswith("ask hn"):
        ask_posts.append(row)
    elif title.lower().startswith("show hn"):
        show_posts.append(row)
    else:
        other_posts.append(row)

In [166]:
len(ask_posts)

1744

In [167]:
len(show_posts)

1162

In [168]:
len(other_posts)

17194

In [169]:
print(ask_posts[:5])

[['12296411', 'Ask HN: How to improve my personal website?', '', '2', '6', 'ahmedbaracat', '8/16/2016 9:55'], ['10610020', 'Ask HN: Am I the only one outraged by Twitter shutting down share counts?', '', '28', '29', 'tkfx', '11/22/2015 13:43'], ['11610310', 'Ask HN: Aby recent changes to CSS that broke mobile?', '', '1', '1', 'polskibus', '5/2/2016 10:14'], ['12210105', 'Ask HN: Looking for Employee #3 How do I do it?', '', '1', '3', 'sph130', '8/2/2016 14:20'], ['10394168', 'Ask HN: Someone offered to buy my browser extension from me. What now?', '', '28', '17', 'roykolak', '10/15/2015 16:38']]


In [170]:
print(show_posts[:5])

[['10627194', 'Show HN: Wio Link  ESP8266 Based Web of Things Hardware Development Platform', 'https://iot.seeed.cc', '26', '22', 'kfihihc', '11/25/2015 14:03'], ['10646440', 'Show HN: Something pointless I made', 'http://dn.ht/picklecat/', '747', '102', 'dhotson', '11/29/2015 22:46'], ['11590768', 'Show HN: Shanhu.io, a programming playground powered by e8vm', 'https://shanhu.io', '1', '1', 'h8liu', '4/28/2016 18:05'], ['12178806', 'Show HN: Webscope  Easy way for web developers to communicate with Clients', 'http://webscopeapp.com', '3', '3', 'fastbrick', '7/28/2016 7:11'], ['10872799', 'Show HN: GeoScreenshot  Easily test Geo-IP based web pages', 'https://www.geoscreenshot.com/', '1', '9', 'kpsychwave', '1/9/2016 20:45']]


We want to determine which type of posts receives more comments on average. Below we calculated the average number of comments for ask posts and show posts respectively.

In [171]:
total_ask_comments = 0

for row in ask_posts:
    total_ask_comments += int(row[4])

In [172]:
total_ask_comments

24483

In [173]:
avg_ask_comments = total_ask_comments/len(ask_posts)

In [174]:
print(avg_ask_comments)

14.038417431192661


Now, lets do the same for show posts.

In [175]:
total_show_comments = 0

for row in show_posts:
    total_show_comments += int(row[4])
    
avg_show_comments = total_show_comments/len(show_posts)

In [176]:
print(avg_show_comments)

10.31669535283993


According the the results, ask posts receive more comments on average. However, the average number of comments does not have huge difference with each other.

Since ask posts receive more comments on average, we'll focus our analysis just on ask posts.

Next, we want to analyze the amount of ask posts created in each hour of the day, along with the number of comments received.

In [177]:
import datetime as dt

In [178]:
result_list = []

for row in ask_posts:
    created_at = row[6]
    comments = int(row[4])
    result_list.append([created_at,comments])
    

In [179]:
print(result_list[:4])

[['8/16/2016 9:55', 6], ['11/22/2015 13:43', 29], ['5/2/2016 10:14', 1], ['8/2/2016 14:20', 3]]


In [180]:
counts_by_hour = {}
comments_by_hour = {}

for row in result_list:
    date = row[0]
    n_comment = row[1]   
    time = dt.datetime.strptime(date, "%m/%d/%Y %H:%M")
    time = dt.datetime.strftime(time, "%H")     
    if time not in counts_by_hour:
        counts_by_hour[time] = 1
        comments_by_hour[time] = n_comment
    else:
        counts_by_hour[time] += 1
        comments_by_hour[time] += n_comment

In [181]:
print(comments_by_hour)

{'00': 447, '13': 1253, '17': 1146, '04': 337, '22': 479, '02': 1381, '07': 267, '21': 1745, '03': 421, '14': 1416, '15': 4477, '06': 397, '08': 492, '23': 543, '12': 687, '10': 793, '16': 1814, '05': 464, '19': 1188, '09': 251, '18': 1439, '11': 641, '01': 683, '20': 1722}


Next, we will calculate the average number of comments per post for posts created during each hour of the day.

In [182]:
avg_by_hour = []

for hour in comments_by_hour:
    avg_by_hour.append([hour, comments_by_hour[hour]/counts_by_hour[hour]])

In [183]:
avg_by_hour

[['00', 8.127272727272727],
 ['13', 14.741176470588234],
 ['17', 11.46],
 ['04', 7.170212765957447],
 ['22', 6.746478873239437],
 ['02', 23.810344827586206],
 ['07', 7.852941176470588],
 ['21', 16.009174311926607],
 ['03', 7.796296296296297],
 ['14', 13.233644859813085],
 ['15', 38.5948275862069],
 ['06', 9.022727272727273],
 ['08', 10.25],
 ['23', 7.985294117647059],
 ['12', 9.41095890410959],
 ['10', 13.440677966101696],
 ['16', 16.796296296296298],
 ['05', 10.08695652173913],
 ['19', 10.8],
 ['09', 5.5777777777777775],
 ['18', 13.20183486238532],
 ['11', 11.051724137931034],
 ['01', 11.383333333333333],
 ['20', 21.525]]

We found the average number of comments for posts created during each hour of the day, next we will sort the list and print the five highest values in a more easy to read way.

In [184]:
swap_avg_by_hour = []

for row in avg_by_hour:
    swap_avg_by_hour.append([row[1], row[0]])
    
print(swap_avg_by_hour)

[[8.127272727272727, '00'], [14.741176470588234, '13'], [11.46, '17'], [7.170212765957447, '04'], [6.746478873239437, '22'], [23.810344827586206, '02'], [7.852941176470588, '07'], [16.009174311926607, '21'], [7.796296296296297, '03'], [13.233644859813085, '14'], [38.5948275862069, '15'], [9.022727272727273, '06'], [10.25, '08'], [7.985294117647059, '23'], [9.41095890410959, '12'], [13.440677966101696, '10'], [16.796296296296298, '16'], [10.08695652173913, '05'], [10.8, '19'], [5.5777777777777775, '09'], [13.20183486238532, '18'], [11.051724137931034, '11'], [11.383333333333333, '01'], [21.525, '20']]


In [185]:
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

In [186]:
sorted_swap[:5]

[[38.5948275862069, '15'],
 [23.810344827586206, '02'],
 [21.525, '20'],
 [16.796296296296298, '16'],
 [16.009174311926607, '21']]

In [187]:
print("Top 5 Hours for Ask Posts Comments")

Top 5 Hours for Ask Posts Comments


In [188]:
for i in sorted_swap[:5]:
    comments = float(i[0])
    hour = dt.datetime.strptime(str(i[1]), "%H")
    hour = dt.date.strftime(hour, "%H:%M")
    output="{}: {:.2f} average comments per post".format(hour,comments)
    print(output)

15:00: 38.59 average comments per post
02:00: 23.81 average comments per post
20:00: 21.52 average comments per post
16:00: 16.80 average comments per post
21:00: 16.01 average comments per post


The result shows us that most comments are received at 15 followed by 2. It looks like it's best to creat a post around 14-15. The average number of comments is way more higher than other hours of the day. Evening is also would be my second choice to create a post.

In [189]:
total_ask_points = 0

for row in ask_posts:
    total_ask_points += int(row[3])

In [190]:
avg_ask_points = total_ask_points/len(ask_posts)

In [191]:
print(avg_ask_points)

15.061926605504587


In [192]:
total_show_points = 0

for row in show_posts:
    total_show_points += int(row[3])
    
avg_show_points = total_show_points / len(show_posts)

In [193]:
print(avg_show_points)

27.555077452667813


It looks like show posts receive much higher points on average than ask posts.

Next, we will determine if posts created at a certain time are more likely to receive more points for show posts.

In [194]:
show_time_points_list = []

for row in show_posts:
    created = row[6]
    points = int(row[3])
    show_time_points_list.append([created, points])

In [195]:
print(show_time_points_list[:4])

[['11/25/2015 14:03', 26], ['11/29/2015 22:46', 747], ['4/28/2016 18:05', 1], ['7/28/2016 7:11', 3]]


In [196]:
points_by_hour_show = {}

for i in show_time_points_list:
    date = i[0]
    points = i[1]
    time = dt.datetime.strptime(date, "%m/%d/%Y %H:%M")
    time = dt.datetime.strftime(time, "%H")
    if time in points_by_hour_show:
        points_by_hour_show[time] += points
    else:
        points_by_hour_show[time] = points

In [197]:
print(points_by_hour_show)

{'00': 1173, '06': 375, '17': 2521, '04': 386, '12': 2543, '02': 340, '07': 494, '21': 866, '03': 679, '14': 2187, '15': 2228, '13': 2438, '08': 519, '23': 1526, '22': 1856, '10': 681, '16': 2634, '05': 104, '19': 1702, '09': 553, '18': 2215, '11': 1480, '01': 700, '20': 1819}


In [198]:
rev_points = []

for hour in points_by_hour_show:
    rev_points.append([points_by_hour_show[hour], hour])

In [199]:
print(rev_points)

[[1173, '00'], [375, '06'], [2521, '17'], [386, '04'], [2543, '12'], [340, '02'], [494, '07'], [866, '21'], [679, '03'], [2187, '14'], [2228, '15'], [2438, '13'], [519, '08'], [1526, '23'], [1856, '22'], [681, '10'], [2634, '16'], [104, '05'], [1702, '19'], [553, '09'], [2215, '18'], [1480, '11'], [700, '01'], [1819, '20']]


In [200]:
points_list = sorted(rev_points, reverse=True)

In [201]:
print(points_list[:5])

[[2634, '16'], [2543, '12'], [2521, '17'], [2438, '13'], [2228, '15']]


In [202]:
print("Top 5 Hours for Show Posts Points")

Top 5 Hours for Show Posts Points


In [203]:
for i in points_list[:5]:
    points = int(i[0])
    time = i[1]
    hour = dt.datetime.strptime(time, "%H")
    hour = dt.datetime.strftime(hour, "%H:%M")
    print("{}: {} average points received.".format(hour,points))

16:00: 2634 average points received.
12:00: 2543 average points received.
17:00: 2521 average points received.
13:00: 2438 average points received.
15:00: 2228 average points received.


We can see that the posts created afternoon receives the highest points on average. If you are going to create a show post you should post it arounda 14 or 15 to receive the highest points. The must be the time when the site has the highest number of online readers.

Next, we will do the same analysis for ask posts.

In [204]:
ask_time_points_list = []

for row in ask_posts:
    created = row[6]
    points = int(row[3])
    ask_time_points_list.append([created, points])

In [205]:
ask_time_points_list[:4]

[['8/16/2016 9:55', 2],
 ['11/22/2015 13:43', 28],
 ['5/2/2016 10:14', 1],
 ['8/2/2016 14:20', 1]]

In [206]:
points_by_hour_ask = {}

for i in ask_time_points_list:
    date = i[0]
    points = i[1]
    time = dt.datetime.strptime(date, "%m/%d/%Y %H:%M")
    time = dt.datetime.strftime(time, "%H")
    if time in points_by_hour_ask:
        points_by_hour_ask[time] += 1
    else:
        points_by_hour_ask[time] = 1

In [207]:
points_by_hour_ask

{'00': 55,
 '01': 60,
 '02': 58,
 '03': 54,
 '04': 47,
 '05': 46,
 '06': 44,
 '07': 34,
 '08': 48,
 '09': 45,
 '10': 59,
 '11': 58,
 '12': 73,
 '13': 85,
 '14': 107,
 '15': 116,
 '16': 108,
 '17': 100,
 '18': 109,
 '19': 110,
 '20': 80,
 '21': 109,
 '22': 71,
 '23': 68}

In [208]:
rev_ask_list = []

for hour in points_by_hour_ask:
    rev_ask_list.append([points_by_hour_ask[hour],hour])
    
rev_ask_list[:5]

[[55, '00'], [85, '13'], [100, '17'], [47, '04'], [71, '22']]

In [209]:
sor_rev_ask_list = sorted(rev_ask_list, reverse=True)

In [210]:
sor_rev_ask_list[:5]

[[116, '15'], [110, '19'], [109, '21'], [109, '18'], [108, '16']]

In [211]:
print("Top 5 Hours for Ask Posts Points")

Top 5 Hours for Ask Posts Points


In [213]:
for i in sor_rev_ask_list[:5]:
    points = int(i[0])
    time = i[1]
    hour = dt.datetime.strptime(time, "%H")
    hour = dt.datetime.strftime(hour, "%H:%M")
    print("{}: {} points received on average.".format(hour, points))

15:00: 116 points received on average.
19:00: 110 points received on average.
21:00: 109 points received on average.
18:00: 109 points received on average.
16:00: 108 points received on average.


It looks like ask posts receive the most points around evening time. Also the average points received is much lower than show posts.