In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()
from collections import defaultdict

In [2]:
sommoth_count = 1.0

In [3]:
technologies = ['kubernetes','linux','windows','solarwinds','garmin','aws','docker','github','wordpress','rundeck']

In [4]:
df = pd.read_json("data/hacker_news_data.json")
df = df.where(pd.notnull(df), None)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118030 entries, 0 to 118029
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   by           112685 non-null  object 
 1   descendants  86344 non-null   float64
 2   id           118030 non-null  int64  
 3   kids         33282 non-null   object 
 4   score        112685 non-null  float64
 5   time         118030 non-null  int64  
 6   title        112440 non-null  object 
 7   type         118030 non-null  object 
 8   url          106144 non-null  object 
 9   dead         27189 non-null   float64
 10  text         5821 non-null    object 
 11  deleted      5343 non-null    float64
 12  poll         245 non-null     float64
 13  parts        20 non-null      object 
dtypes: float64(5), int64(2), object(7)
memory usage: 12.6+ MB


In [8]:
df['datetime'] = df['time'].progress_apply(datetime.utcfromtimestamp) 

100%|█| 118030/118030 [00:00<00:00, 5589


In [9]:
df['month'] = df['datetime'].progress_apply(lambda article_datetime: article_datetime.month)

100%|█| 118030/118030 [00:00<00:00, 2009


In [11]:
df['datetime'].min(), df['datetime'].max()

(Timestamp('2021-01-01 03:10:08'), Timestamp('2021-04-12 14:05:21'))

In [None]:
#month 4 is partial so I am not using it.

In [13]:
df = df[df['month'].isin({1,2,3})]
len(df)

105845

In [14]:
def normalize_text(text) -> str:
    if text is None:
        return ''
    
    return text.lower()

def is_text_contains_technologies(text) -> pd.Series:
    is_contained = [technology in text for technology in technologies]
    
    return pd.Series(is_contained)

In [15]:
df['text'] = df['text'].progress_apply(normalize_text)

100%|█| 105845/105845 [00:00<00:00, 7416


In [16]:
df['title'] = df['title'].progress_apply(normalize_text)

100%|█| 105845/105845 [00:00<00:00, 6973


In [17]:
df['all_text'] = df.progress_apply(lambda r: r['title'] +' '+ r['text'], axis=1)

100%|█| 105845/105845 [00:01<00:00, 6445


In [18]:
df[technologies] = df['all_text'].progress_apply(is_text_contains_technologies)

100%|█| 105845/105845 [00:19<00:00, 5485


In [19]:
month_to_number_of_rows = df.groupby("month").size().to_dict()
month_to_number_of_rows

{1: 36571, 2: 33322, 3: 35952}

In [20]:
df.sample(5)

Unnamed: 0,by,descendants,id,kids,score,time,title,type,url,dead,...,kubernetes,linux,windows,solarwinds,garmin,aws,docker,github,wordpress,rundeck
83600,ddtaylor,0.0,26432760,,1.0,1615530625,linting html using css (2017),story,https://bitsofco.de/linting-html-using-css/,,...,False,False,False,False,False,False,False,False,False,False
23034,vinnyglennon,0.0,25846680,,3.0,1611152326,mutations have allowed sarscov2 to become resi...,story,https://www.nicd.ac.za/can-i-be-re-infected-wi...,,...,False,False,False,False,False,False,False,False,False,False
73468,,,26331107,,,1614790614,,story,,1.0,...,False,False,False,False,False,False,False,False,False,False
54542,gshubert17,0.0,26145934,,1.0,1613415801,texas ercot: rotating outages in progress,story,http://www.ercot.com/eea_info/show/26464,,...,False,False,False,False,False,False,False,False,False,False
62818,wilsocr88,0.0,26228491,,1.0,1614019236,"usable, or pretty?",story,https://uxdesign.cc/usable-or-pretty-e95455eb0132,,...,False,False,False,False,False,False,False,False,False,False


In [21]:
technology_prob = []

for technology in technologies:
    
    technology_month_count = df.groupby("month")[technology].apply(np.sum).to_dict()
    
    technology_month_count = {month: count + sommoth_count for (month, count) in technology_month_count.items()}
    technology_month_count = defaultdict(lambda :  sommoth_count,
                                        technology_month_count)
    
    probs = [technology_month_count[month]/total_count for (month, total_count) in month_to_number_of_rows.items()]
    
    technology_prob.append((technology, np.mean(probs)))

In [22]:
technology_prob_df = pd.DataFrame(technology_prob, columns =['technology', 'monthly_prob'])
technology_prob_df

Unnamed: 0,technology,monthly_prob
0,kubernetes,0.003671
1,linux,0.008317
2,windows,0.004583
3,solarwinds,0.001391
4,garmin,5.6e-05
5,aws,0.008233
6,docker,0.00256
7,github,0.007205
8,wordpress,0.001588
9,rundeck,2.8e-05


In [23]:
technology_prob_df.to_parquet("data/technology_prob.parquet")