In [1]:
# Shell escape: list the installed pip packages and keep only the lines
# containing "soup", to show which BeautifulSoup version is installed.
!pip list | grep soup

beautifulsoup4                4.6.3


In [50]:
import requests
import pandas as pd
import dateutil

from tqdm import tqdm
from time import sleep
from bs4 import BeautifulSoup

In [51]:
def scrape_creation_date(subreddit, timeout=30):
  """
  Fetch a subreddit's front page and scrape the date it was created.

  Parameters
  ----------
  subreddit : str
      Subreddit name without the leading 'r/'.
  timeout : float, optional
      Seconds to wait for the server before requests gives up
      (default 30). Without it a stalled connection would block forever.

  Returns
  -------
  str
      The creation date formatted as yyyy-mm-dd, or the marker string
      'Unable to locate cake div' when no element with the expected
      CSS class is present on the page.

  Raises
  ------
  ValueError
      If the response status code is not 200.
  """
  headers = {
      'User-Agent': 'Mozilla/5.0'
  }

  url = 'https://www.reddit.com/r/' + subreddit

  r = requests.get(url, headers=headers, timeout=timeout)

  if r.status_code != 200:
    # status_code is an int: concatenating it to a str directly raised
    # TypeError, so callers catching ValueError never saw this failure.
    raise ValueError('status code is ' + str(r.status_code))

  # Name the parser explicitly so the result does not depend on which
  # optional parsers (lxml, html5lib) happen to be installed.
  soup = BeautifulSoup(r.text, 'html.parser')

  # This is the class of the div that holds the creation date
  # as of September 26, 2021
  cake_attr = {'class': '_2QZ7T4uAFMs_N83BZcN-Em'}
  cake_div = soup.find_all(attrs=cake_attr)
  if not cake_div:
    # Callers count this marker string, so return it rather than raise.
    return 'Unable to locate cake div'

  # The element text looks like 'Created <date>'; strip the prefix.
  cake_txt = cake_div[0].getText().replace('Created ', '')

  # Import the submodule explicitly: a bare `import dateutil` is not
  # guaranteed to load `dateutil.parser` on older python-dateutil releases.
  import dateutil.parser
  return dateutil.parser.parse(cake_txt).strftime('%Y-%m-%d')






In [57]:
# Load the subreddit list; the 'subreddit' column drives the scrape.
df = pd.read_csv('extra_clean.csv')
subs = df['subreddit']

# Maps subreddit name -> creation date string, or an error marker string.
cakes = {}

for sub in tqdm(subs, ascii=True):
  try:
    try:
      cakes[sub] = scrape_creation_date(sub)
    except ValueError:
      # ValueError is how scrape_creation_date is meant to report a bad
      # status code: back off, then retry once. The retry sits inside the
      # outer try so a second failure is recorded as an error marker
      # instead of escaping the handler and aborting the whole run.
      sleep(2)
      cakes[sub] = scrape_creation_date(sub)
  except Exception:
    # Exception rather than a bare except, so KeyboardInterrupt can
    # still stop this long-running loop.
    cakes[sub] = 'Some other error'
  # Pause after every subreddit (not only after a success) so failures
  # do not cause back-to-back requests.
  sleep(1)

100%|##########| 3679/3679 [3:38:11<00:00,  3.56s/it]


In [58]:
# Number of subreddits that have an entry in `cakes`.
len(cakes)

3679

In [63]:
# Count the entries whose value is one of the two failure marker strings
# rather than a scraped date.
error_markers = ('Some other error', 'Unable to locate cake div')
errors = sum(1 for outcome in cakes.values() if outcome in error_markers)
errors

31

In [77]:
# Turn the {subreddit: creation_date} dict into a two-column frame.
# The dict keys become the index, which is named 'subreddit' and moved
# into a regular column; columns are ordered creation_date, subreddit.
new_df = (
    pd.DataFrame.from_dict(cakes, orient='index', columns=['creation_date'])
    .rename_axis('subreddit')
    .reset_index()
    .loc[:, ['creation_date', 'subreddit']]
)
new_df.head()

Unnamed: 0,creation_date,subreddit
0,2013-01-16,changemyview
1,2012-06-05,Terraform
2,2015-04-07,lostpause
3,2009-07-12,USPS
4,2016-01-04,MaliciousCompliance


In [78]:
# Inner-join the scraped creation dates onto the original frame by
# subreddit name.
join_df = df.merge(new_df, how='inner', on='subreddit')
join_df.head()

Unnamed: 0,subreddit,subscribers,current_users,accessed,creation_date
0,changemyview,1349762,1272,1632083000.0,2013-01-16
1,Terraform,19623,38,1632083000.0,2012-06-05
2,lostpause,78056,255,1632083000.0,2015-04-07
3,USPS,69497,568,1632083000.0,2009-07-12
4,MaliciousCompliance,1664837,5484,1632083000.0,2016-01-04


In [79]:
# Persist the joined frame. With pandas' default index=True the row index
# is written as an unnamed first column.
join_df.to_csv(path_or_buf='plus_creation_date.csv')