In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import os

In [None]:
# Archive-page URL templates keyed by publication name.
# Each template takes three positional format arguments: year, month, day
# (month and day are zero-padded to two digits by the format spec).
urls = dict([
    ('Towards Data Science', 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}'),
    ('Towards AI', 'https://pub.towardsai.net/archive/{0}/{1:02d}/{2:02d}'),
    ('The Startup', 'https://medium.com/swlh/archive/{0}/{1:02d}/{2:02d}'),
    ('Analytics Vidhya', 'https://medium.com/analytics-vidhya/archive/{0}/{1:02d}/{2:02d}'),
    ('Level Up Coding', 'https://levelup.gitconnected.com/archive/{0}/{1:02d}/{2:02d}'),
    ('MLearning.ai', 'https://medium.com/mlearning-ai/archive/{0}/{1:02d}/{2:02d}'),
])

In [None]:
def is_leap(year):
    """Return True when `year` is a leap year under the Gregorian rules.

    A year divisible by 4 is a leap year, except century years (divisible by
    100), which are leap years only when they are also divisible by 400.
    """
    divisible_by_4 = year % 4 == 0
    is_century = year % 100 == 0
    divisible_by_400 = year % 400 == 0
    return divisible_by_4 and (not is_century or divisible_by_400)
    
def convert_day(day, year):
    """Convert a 1-based day-of-year into a ``(month, day_of_month)`` tuple.

    Args:
        day: Ordinal day within `year`; 1 is January 1st and 365 (366 in a
            leap year) is December 31st.
        year: Calendar year, used to decide whether February has 29 days.

    Returns:
        tuple: ``(month, day_of_month)``, both 1-based integers.

    Raises:
        ValueError: If `day` is outside ``1..365`` (``1..366`` for a leap
            year), or if `year` is outside the range supported by
            ``datetime.date``.
    """
    # Imported locally so the helper does not depend on notebook-level names;
    # in particular the scraping cell rebinds the global name `date` to a string.
    import calendar
    from datetime import date, timedelta

    days_in_year = 366 if calendar.isleap(year) else 365
    # Reject out-of-range input explicitly instead of returning a bogus
    # (0, 0) or running off the end of a month table.
    if not 1 <= day <= days_in_year:
        raise ValueError(
            f'day must be between 1 and {days_in_year} for year {year}, got {day}'
        )

    target = date(year, 1, 1) + timedelta(days=day - 1)
    return (target.month, target.day)

def get_claps(claps_str):
    """Convert a clap-counter label such as ``'288'`` or ``'1.2K'`` to an int.

    Args:
        claps_str: Text of the clap counter. ``None``, an empty or
            whitespace-only string, and any non-string object (for example a
            BeautifulSoup ``Tag`` found where text was expected) count as
            zero claps.

    Returns:
        int: The clap count; a trailing ``K`` multiplies the number by 1000.

    Raises:
        ValueError: If the numeric part of the label cannot be parsed.
    """
    # NavigableString is a str subclass, so scraped text passes this check,
    # while None and Tag objects fall through to 0.
    if not isinstance(claps_str, str):
        return 0
    label = claps_str.strip()
    if not label:
        return 0

    in_thousands = label.endswith('K')
    number = float(label[:-1] if in_thousands else label)
    if in_thousands:
        # round() rather than int(): binary floats make e.g. 32.3 * 1000
        # evaluate to 32299.999999999996, which int() would truncate to 32299.
        return round(number * 1000)
    return int(number)

def get_img(img_url, dest_folder, dest_filename, timeout=30):
    """Download `img_url` into `dest_folder` and return the stored file name.

    Args:
        img_url: URL of the image to fetch.
        dest_folder: Existing directory the file is written to.
        dest_filename: File name without extension; the extension is taken
            from the URL.
        timeout: Seconds to wait for the server before giving up, so one
            stalled download cannot block the caller forever.

    Returns:
        str: `dest_filename` with the chosen extension appended (no folder).

    Raises:
        requests.RequestException: If the download fails or times out.
        OSError: If the destination file cannot be written.

    Redirects are not followed (``allow_redirects=False``) and the response
    body is written as-is, whatever the HTTP status.
    """
    # Use the text after the last dot as the extension; fall back to 'jpg'
    # when it is not a plausible one (too long, empty, or containing
    # characters such as '/' or '?' that would corrupt the file path).
    ext = img_url.rsplit('.', 1)[-1]
    if len(ext) > 4 or not ext.isalnum():
        ext = 'jpg'
    dest_filename = f'{dest_filename}.{ext}'

    # Download before opening the file so a failed request does not leave an
    # empty file behind.
    response = requests.get(img_url, allow_redirects=False, timeout=timeout)
    with open(os.path.join(dest_folder, dest_filename), 'wb') as f:
        f.write(response.content)
    return dest_filename

In [None]:
# Year whose archive pages are scraped.
year = 2021

# Draw the days from a dedicated, seeded generator so that a fresh
# "Restart & Run All" scrapes the same days; change SEED for another sample.
SEED = 42
N_DAYS = 15
days_in_year = 366 if is_leap(year) else 365
# Days are 1-based ordinals within `year` (1 = January 1st).
selected_days = random.Random(SEED).sample(range(1, days_in_year + 1), N_DAYS)

# Directory that receives the downloaded article images; exist_ok makes the
# cell safe to re-run.
img_dir = 'images'
os.makedirs(img_dir, exist_ok=True)

In [None]:
# Scrape the archive page of every publication for every selected day and
# collect one row per article in `data`.
REQUEST_TIMEOUT = 30  # seconds; without it a stalled request blocks the loop forever

data = []
article_id = 0
n = len(selected_days)
for i, d in enumerate(selected_days, start=1):
    month, day = convert_day(d, year)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        archive_url = url.format(year, month, day)
        response = requests.get(archive_url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        # Skip the publication when the request ended up somewhere other than
        # the requested daily page (presumably a redirect because no archive
        # exists for that day -- TODO confirm).
        if not response.url.startswith(archive_url):
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            # str() detaches the text from the parse tree; a NavigableString
            # kept in `data` would keep the whole parsed page alive in memory.
            title = str(title.contents[0])
            article_id += 1

            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = str(subtitle.contents[0]) if subtitle is not None else ''

            image = article.find("img", class_="graf-image")
            image = '' if image is None else get_img(image['src'], img_dir, f'{article_id}')

            # The card layout is addressed by position: the 4th link is used
            # as the article URL and the 2nd button as the clap counter.
            links = article.find_all("a")
            article_url = links[3]['href'].split('?')[0]
            claps = get_claps(article.find_all("button")[1].contents[0])

            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])

            # A 7th link, when present, is read as the responses link; its
            # first word is taken as the count. Always store an int so the
            # column has a single type.
            responses = 0
            if len(links) == 7:
                first_word = links[6].contents[0].split(' ')[0]
                if first_word.isdigit():
                    responses = int(first_word)

            data.append([article_id, article_url, title, subtitle, image, claps, responses, reading_time, publication, date])

1 / 15 ; 2021-04-21
2 / 15 ; 2021-04-09
3 / 15 ; 2021-07-04
4 / 15 ; 2021-04-08
5 / 15 ; 2021-03-12
6 / 15 ; 2021-11-07
7 / 15 ; 2021-03-30
8 / 15 ; 2021-06-23
9 / 15 ; 2021-03-05
10 / 15 ; 2021-01-11
11 / 15 ; 2021-02-12
12 / 15 ; 2021-05-25
13 / 15 ; 2021-10-15
14 / 15 ; 2021-04-12
15 / 15 ; 2021-12-03


In [None]:
# Turn the collected rows into a table; the column order mirrors the order of
# the values in each row appended to `data`.
medium_columns = [
    'id', 'url', 'title', 'subtitle', 'image',
    'claps', 'responses', 'reading_time', 'publication', 'date',
]
medium_df = pd.DataFrame(data, columns=medium_columns)

In [None]:
# Display the scraped table as the cell output to check the result.
medium_df

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/freelancing-sel...,"Freelancing, Self-Learning, and the Importance...",,1.png,288,1,6,Towards Data Science,2021-04-21
1,2,https://towardsdatascience.com/build-and-run-a...,Build and Run a Docker Container for your Mach...,A quick and easy build of a Docker…,2.jpeg,189,2,4,Towards Data Science,2021-04-21
2,3,https://towardsdatascience.com/best-library-to...,Best Library To Simplify Math For Machine Lear...,Using the sympy Python library to simplify…,3.jpeg,310,1,7,Towards Data Science,2021-04-21
3,4,https://towardsdatascience.com/dont-waste-time...,Don’t Waste Time Building Your Data Science Ne...,Focus On What Matters,4.jpg,137,4,5,Towards Data Science,2021-04-21
4,5,https://towardsdatascience.com/the-highest-dat...,The Highest Data Science Salaries,A deep dive into Data Science employment level...,5.jpeg,296,2,5,Towards Data Science,2021-04-21
...,...,...,...,...,...,...,...,...,...,...
941,942,https://levelup.gitconnected.com/build-ai-for-...,Build AI for Generating Quant Trading Strategi...,,942.png,146,0,8,Level Up Coding,2021-04-12
942,943,https://levelup.gitconnected.com/deploy-your-r...,Deploy Your React Application to AWS Using a S...,,943.png,39,0,6,Level Up Coding,2021-04-12
943,944,https://medium.com/mlearning-ai/machine-learni...,Machine Learning/AI Bias,,944.png,9,0,5,MLearning.ai,2021-04-12
944,945,https://medium.com/mlearning-ai/the-heart-of-l...,The heart of logistic regression,,945.jpeg,6,0,9,MLearning.ai,2021-04-12


In [None]:
# Persist the scraped table without the DataFrame index. The file name is
# derived from `year` so that scraping another year neither mislabels the
# output nor overwrites an earlier year's file.
medium_df.to_csv(f'medium_data{year}.csv', index=False)