In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Read the raw r/news export and discard the leftover index columns
# ('Unnamed: 0', 'level_0', 'index') in a single step.

data = pd.read_csv('r_news_data.csv').drop(columns=['Unnamed: 0', 'level_0', 'index'])
data.head()

Unnamed: 0,Post ID,Title,Url,Author,Score,Publish Date,Total No. of Comments,Permalink,Flair
0,ko10lt,Brexit Becomes Reality As UK Leaves European U...,https://popularnews.in/brexit-becomes-reality-...,popularnewsindia,1,2020-12-31 16:00:01,0,/r/news/comments/ko10lt/brexit_becomes_reality...,
1,ko10oh,COVID-19 UPDATE: Take-out and Delivery OPEN. N...,https://siliconeer.com/current/raj-palace-rest...,siliconeer,1,2020-12-31 16:00:06,0,/r/news/comments/ko10oh/covid19_update_takeout...,
2,ko10sx,Lamborghini Huracan STO Teased Ahead Of Debut,https://popularnews.in/lamborghini-huracan-sto...,popularnewsindia,1,2020-12-31 16:00:15,0,/r/news/comments/ko10sx/lamborghini_huracan_st...,
3,ko11ms,"""It's just utter chaos"": California becomes th...",https://www.cbsnews.com/news/california-become...,Bonboniru,1,2020-12-31 16:01:21,269,/r/news/comments/ko11ms/its_just_utter_chaos_c...,
4,ko11u9,The Number Theory| A decade of rightward shift...,https://popularnews.in/the-number-theory-a-dec...,popularnewsindia,1,2020-12-31 16:01:38,0,/r/news/comments/ko11u9/the_number_theory_a_de...,


In [3]:
# Split data into posts whose URL appears more than once and posts with a unique URL.
# Decision: scores and comments of duplicate URLs are summed (next cells), so that
# separate engagements with the same link still count as measures of virality.

# Work on an explicit copy: pd.DataFrame(data) does not copy a DataFrame input,
# so adding the flag column through it could also modify `data`.
data_dup = data.copy()
# keep=False flags every member of a duplicate group, not just the repeats.
data_dup['duplicate'] = data_dup.duplicated(subset=['Url'], keep=False)
data_dedup = data_dup[data_dup['duplicate']]
data_solo = data_dup[~data_dup['duplicate']]
data_dedup

Unnamed: 0,Post ID,Title,Url,Author,Score,Publish Date,Total No. of Comments,Permalink,Flair,duplicate
1,ko10oh,COVID-19 UPDATE: Take-out and Delivery OPEN. N...,https://siliconeer.com/current/raj-palace-rest...,siliconeer,1,2020-12-31 16:00:06,0,/r/news/comments/ko10oh/covid19_update_takeout...,,True
7,ko12s6,The Mutated Virus Is a Ticking Time Bomb,https://www.theatlantic.com/science/archive/20...,Cagey898,1,2020-12-31 16:03:13,0,/r/news/comments/ko12s6/the_mutated_virus_is_a...,,True
10,ko14yu,COVID-19 UPDATE: We are open with limited hour...,https://siliconeer.com/current/farm-fresh-prod...,siliconeer,1,2020-12-31 16:06:49,0,/r/news/comments/ko14yu/covid19_update_we_are_...,,True
12,ko15dt,COVID-19 UPDATE: Take-out and Delivery OPEN. N...,https://siliconeer.com/current/chaat-house-del...,siliconeer,1,2020-12-31 16:07:30,0,/r/news/comments/ko15dt/covid19_update_takeout...,,True
28,ko1emx,Employees at suburban Milwaukee clinic unknowi...,https://www.chicagotribune.com/coronavirus/ct-...,mykl66,1,2020-12-31 16:23:02,0,/r/news/comments/ko1emx/employees_at_suburban_...,,True
...,...,...,...,...,...,...,...,...,...,...
230966,ncg3gl,E-2 Visa Businesses for sale in Florida,https://www.homemaxrealtyinternational.com/new...,homemaxrealty,1,2021-05-14 11:56:25,0,/r/news/comments/ncg3gl/e2_visa_businesses_for...,,True
230984,ncgcl7,"Over 1,000 people a day are moving to Florida",https://www.homemaxrealtyinternational.com/new...,homemaxrealty,1,2021-05-14 12:07:22,0,/r/news/comments/ncgcl7/over_1000_people_a_day...,,True
231760,ncu4b6,24 Nostalgia,https://youtube.com/watch?v=ahZKG08eVvA&amp;fe...,andressh,1,2021-05-15 01:32:53,2,/r/news/comments/ncu4b6/24_nostalgia/,,True
231768,ncu6ru,42 Alucinante 20 08 2018,https://youtube.com/watch?v=8dzTWsXd4F4&amp;fe...,andressh,1,2021-05-15 01:37:39,2,/r/news/comments/ncu6ru/42_alucinante_20_08_2018/,,True


In [4]:
# Sum score and comments for duplicate URLs.
# The boolean 'duplicate' flag is summed as well, which yields the number of
# posts per URL (the next cell expects this column to be present).
# The columns are selected explicitly: a bare groupby().sum() only works when
# pandas silently discards the text columns, which it no longer does in 2.0+.

data_dedup_urls = (
    data_dedup
    .groupby(by=['Url'])[['Score', 'Total No. of Comments', 'duplicate']]
    .sum()
)
data_dedup_urls

Unnamed: 0_level_0,Score,Total No. of Comments,duplicate
Url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
http://amingoapp.site/?p=31,2,0,2
http://beonebiz.com/covid-19/joe-biden-1-9-trillion-covid-19-relief-bill/,2,0,2
http://decoratiuneagoogle.demo.ro,4,0,4
http://dlsharefile.com/file/MTkxNGRkNTkt,2,0,2
http://fvdigital.do/2021/02/10/podran-recibir-7-mil-dolares-reembolsos-familias-en-ny-perdieran-un-pariente-por-covid-19/,2,0,2
...,...,...,...
https://youtube.com/watch?v=oXEIzGjJWKo&amp;feature=share,3,6,3
https://youtube.com/watch?v=pxfgj7_GisQ&amp;feature=share,2,0,2
https://youtube.com/watch?v=wJyWWwEQOy0&amp;feature=share,1,1,2
https://youtube.com/watch?v=wm1_UBreElc&amp;feature=share,2,4,2


In [5]:
# Rejoin the per-URL sums with the duplicated posts.
# Final result is a deduplicated data set (one row per URL) with the aggregated
# Score and number of comments.
# Publish Date is kept as first posted, not most recent: rows are sorted by date
# and only the earliest post per URL is kept.
# Most of these duplicates appear to have been ads.
#
# The duplicated posts are re-derived from data_dup instead of being read from
# data_dedup, because data_dedup is overwritten below; this keeps the cell
# re-runnable.

data_dedup = data_dup[data_dup['duplicate']].join(
    data_dedup_urls, on=['Url'], how='right', rsuffix='_dup'
)
# Keep the summed '*_dup' columns in place of the per-post values; the explicit
# column list also discards the duplicate flag and its per-URL count.
data_dedup = data_dedup[
    ['Post ID', 'Title', 'Url', 'Author', 'Score_dup', 'Publish Date',
     'Total No. of Comments_dup', 'Permalink', 'Flair']
].rename(columns={'Score_dup': 'Score', 'Total No. of Comments_dup': 'Total No. of Comments'})
data_dedup['Publish Date'] = pd.to_datetime(data_dedup['Publish Date'])
data_dedup = (
    data_dedup
    .sort_values(by='Publish Date')
    .drop_duplicates(subset='Url', keep='first')
)
data_dedup

Unnamed: 0,Post ID,Title,Url,Author,Score,Publish Date,Total No. of Comments,Permalink,Flair
1,ko10oh,COVID-19 UPDATE: Take-out and Delivery OPEN. N...,https://siliconeer.com/current/raj-palace-rest...,siliconeer,20,2020-12-31 16:00:06,0,/r/news/comments/ko10oh/covid19_update_takeout...,
7,ko12s6,The Mutated Virus Is a Ticking Time Bomb,https://www.theatlantic.com/science/archive/20...,Cagey898,2,2020-12-31 16:03:13,0,/r/news/comments/ko12s6/the_mutated_virus_is_a...,
10,ko14yu,COVID-19 UPDATE: We are open with limited hour...,https://siliconeer.com/current/farm-fresh-prod...,siliconeer,24,2020-12-31 16:06:49,0,/r/news/comments/ko14yu/covid19_update_we_are_...,
12,ko15dt,COVID-19 UPDATE: Take-out and Delivery OPEN. N...,https://siliconeer.com/current/chaat-house-del...,siliconeer,22,2020-12-31 16:07:30,0,/r/news/comments/ko15dt/covid19_update_takeout...,
28,ko1emx,Employees at suburban Milwaukee clinic unknowi...,https://www.chicagotribune.com/coronavirus/ct-...,mykl66,2,2020-12-31 16:23:02,0,/r/news/comments/ko1emx/employees_at_suburban_...,
...,...,...,...,...,...,...,...,...,...
205675,n2mad1,UV Sanitizer Light - Portable Ultraviolet Ligh...,https://runshinemall.com/collections/uv-saniti...,Status_Pound_9962,2,2021-05-01 09:54:04,0,/r/news/comments/n2mad1/uv_sanitizer_light_por...,
207611,n3a82s,Cities and states across the US are promising ...,https://www.yahoo.com/news/cities-states-acros...,shallah,2,2021-05-02 09:36:48,0,/r/news/comments/n3a82s/cities_and_states_acro...,
210166,n48ptn,"Pete Lammons, Who Helped the Jets Win ’69 Trem...",https://newsnationglobal.com/2021/05/03/pete-l...,newsnationglobal,2,2021-05-03 15:04:43,0,/r/news/comments/n48ptn/pete_lammons_who_helpe...,
214253,n5lg39,National Guard soldier charged for storming th...,https://www.cnn.com/2021/05/05/politics/wiscon...,7MCMXC,2,2021-05-05 10:26:45,3348,/r/news/comments/n5lg39/national_guard_soldier...,


In [6]:
# Append the deduplicated rows back to the rows with unique URLs.
# The duplicate boolean is kept in case it's interesting: data_solo rows carry
# False, and the deduplicated rows are marked True here explicitly (instead of
# back-filling NaN after the fact), so the column stays a clean boolean.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): 'Publish Date' was converted to datetime only for the
# deduplicated rows, so the combined column mixes strings and Timestamps --
# confirm whether downstream code needs a uniform dtype.

data = pd.concat(
    [data_solo, data_dedup.assign(duplicate=True)],
    ignore_index=True,
)

In [7]:
# Summary statistics of the recombined data set (Score and Total No. of Comments).
data.describe()

Unnamed: 0,Score,Total No. of Comments
count,225583.0,225583.0
mean,18.351334,15.837607
std,864.779084,262.998069
min,0.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,103302.0,41107.0


In [8]:
# Engagement = Score + number of comments, one value per post;
# used to see which posts have effectively zero engagement.
data['Engagement'] = data['Score'].add(data['Total No. of Comments'])

In [9]:
# Order posts from most to least engaged and renumber the rows from 0.
data = data.sort_values(by='Engagement', ascending=False).reset_index(drop=True)

In [10]:
# Keep only posts whose Engagement reaches the cut point.
# Engagement is a sum of whole-number counts, so ">= 10" selects exactly the
# rows that "> 9" did, while matching the stated cut point.

MIN_ENGAGEMENT = 10

data_top_engagement = data[data['Engagement'] >= MIN_ENGAGEMENT]
data_top_engagement

Unnamed: 0,Post ID,Title,Url,Author,Score,Publish Date,Total No. of Comments,Permalink,Flair,duplicate,Engagement
0,lr3xap,Man dies after police kneel on his neck for ne...,https://amp.cnn.com/cnn/2021/02/23/us/angelo-q...,monaleeparis,100226,2021-02-23 20:43:30,17754,/r/news/comments/lr3xap/man_dies_after_police_...,,False,117980
1,lsmh36,Texan files $1 billion class-action lawsuit af...,https://abcnews.go.com/US/texan-files-billion-...,ACABBLM2020,97551,2021-02-25 17:30:11,14574,/r/news/comments/lsmh36/texan_files_1_billion_...,,False,112125
2,ls7xmj,Trump tax returns are now in the hands of the ...,https://www.cnbc.com/2021/02/25/trump-tax-retu...,[deleted],98318,2021-02-25 06:38:18,12719,/r/news/comments/ls7xmj/trump_tax_returns_are_...,,True,111037
3,laun1r,Minneapolis police officers must keep body cam...,https://www.cnn.com/2021/02/02/us/minneapolis-...,dlkapt3,103302,2021-02-02 04:42:35,7312,/r/news/comments/laun1r/minneapolis_police_off...,,False,110614
4,lsbgop,Costco lifts minimum wage above Amazon or Targ...,https://www.reuters.com/article/us-costco-whol...,candordirect,99299,2021-02-25 09:11:08,8124,/r/news/comments/lsbgop/costco_lifts_minimum_w...,,False,107423
...,...,...,...,...,...,...,...,...,...,...,...
11235,mxf36e,Bones of Black children killed in police bombi...,https://www.theguardian.com/us-news/2021/apr/2...,mod_89,1,2021-04-24 00:35:28,9,/r/news/comments/mxf36e/bones_of_black_childre...,,False,10
11236,mjb8qw,A juvenile has been arrested following a video...,https://www.cnn.com/2021/04/03/us/asian-couple...,Piranha_ChuckNorris,5,2021-04-03 08:57:30,5,/r/news/comments/mjb8qw/a_juvenile_has_been_ar...,,False,10
11237,n3oquf,(Australia) 80yo fisher finds himself trapped ...,https://www.abc.net.au/news/2021-05-03/fisherm...,LuckyBdx4,1,2021-05-02 22:29:06,9,/r/news/comments/n3oquf/australia_80yo_fisher_...,,False,10
11238,n545c8,Minorities underrepresented in vaccinated popu...,https://www.kxly.com/minorities-underrepresent...,MasterRazz,1,2021-05-04 18:40:19,9,/r/news/comments/n545c8/minorities_underrepres...,,False,10


In [11]:
# Write the high-engagement posts to disk. The row index is written too (no
# index=False), so the file gains an unnamed leading column when read back.
data_top_engagement.to_csv('r_news_top_jan_to_may_2021.csv')