Scrape the website Towards Data Science using Beautiful Soup https://dorianlazar.medium.com/scraping-medium-with-python-beautiful-soup-3314f898bbf5

In [3]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import random

In [7]:
urls = {
    'Towards Data Science': 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}'
}

In [8]:
def convert_day(day):
    month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    m = 0
    d = 0
    while day > 0:
        d = day
        day -= month_days[m]
        m += 1
    return (m, d)

In [9]:
def get_claps(claps_str):
    if (claps_str is None) or (claps_str == '') or (claps_str.split is None):
        return 0
    split = claps_str.split('K')
    claps = float(split[0])
    claps = int(claps*1000) if len(split) == 2 else int(claps)
    return claps

In [35]:
#https://hackernoon.com/how-to-scrape-a-medium-publication-a-python-tutorial-for-beginners-o8u3t69
def get_article_text(story_url):
    
    story_page = requests.get(story_url)
    story_soup = BeautifulSoup(story_page.text, 'html.parser')

    sections = story_soup.find_all('section')
    story_paragraphs = []
    section_titles = []
    
    for section in sections:
        paragraphs = section.find_all('p')
        for paragraph in paragraphs:
            story_paragraphs.append(paragraph.text)

        subs = section.find_all('h1')
        for sub in subs:
            section_titles.append(sub.text)

    number_sections = len(section_titles)
    number_paragraphs = len(story_paragraphs)
    section_title_text = " ".join(section_titles)
    story_text = " ".join(story_paragraphs)
    
    return number_sections, number_paragraphs, section_titles, story_text

In [20]:
selected_days = random.sample([i for i in range(1, 366)], 5)

In [21]:
data = []
article_id = 0
years = range(2015,2023)
i = 0
n = len(selected_days)
for year in years:
    for d in selected_days:
        i += 1
        month, day = convert_day(d)
        date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
        print(f'{i} / {n} ; {date}')
        for publication, url in urls.items():
            response = requests.get(url.format(year, month, day), allow_redirects=True)
            if not response.url.startswith(url.format(year, month, day)):
                continue
            page = response.content
            soup = BeautifulSoup(page, 'html.parser')
            articles = soup.find_all(
                "div",
                class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
            for article in articles:
                title = article.find("h3", class_="graf--title")
                if title is None:
                    continue
                title = title.contents[0]
                article_id += 1
                subtitle = article.find("h4", class_="graf--subtitle")
                subtitle = subtitle.contents[0] if subtitle is not None else ''
                #image = article.find("img", class_="graf-image")
                #image = '' if image is None else get_img(image['src'], 'images', f'{article_id}')
                article_url = article.find_all("a")[3]['href'].split('?')[0]
                number_sections, number_paragraphs, section_titles, story_text = get_article_text(article_url)
                buttons = article.find_all("button")
                claps = get_claps(buttons[1].contents[0]) if len(buttons) > 0 else None
                reading_time = article.find("span", class_="readingTime")
                reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])
                responses = article.find_all("a")
                if len(responses) == 7:
                    responses = responses[6].contents[0].split(' ')
                    if len(responses) == 0:
                        responses = 0
                    else:
                        responses = responses[0]
                else:
                    responses = 0

                data.append([article_id, article_url, title, subtitle,
                             number_sections, number_paragraphs, section_titles, story_text,
                             claps, responses,
                             reading_time, publication, date,year])

1 / 5 ; 2015-06-28
2 / 5 ; 2015-10-01
3 / 5 ; 2015-06-10
4 / 5 ; 2015-01-09
5 / 5 ; 2015-02-01
6 / 5 ; 2016-06-28
7 / 5 ; 2016-10-01
8 / 5 ; 2016-06-10
9 / 5 ; 2016-01-09
10 / 5 ; 2016-02-01
11 / 5 ; 2017-06-28
12 / 5 ; 2017-10-01
13 / 5 ; 2017-06-10
14 / 5 ; 2017-01-09
15 / 5 ; 2017-02-01
16 / 5 ; 2018-06-28
17 / 5 ; 2018-10-01
18 / 5 ; 2018-06-10
19 / 5 ; 2018-01-09
20 / 5 ; 2018-02-01
21 / 5 ; 2019-06-28
22 / 5 ; 2019-10-01
23 / 5 ; 2019-06-10
24 / 5 ; 2019-01-09
25 / 5 ; 2019-02-01
26 / 5 ; 2020-06-28
27 / 5 ; 2020-10-01
28 / 5 ; 2020-06-10
29 / 5 ; 2020-01-09
30 / 5 ; 2020-02-01
31 / 5 ; 2021-06-28
32 / 5 ; 2021-10-01
33 / 5 ; 2021-06-10
34 / 5 ; 2021-01-09
35 / 5 ; 2021-02-01
36 / 5 ; 2022-06-28
37 / 5 ; 2022-10-01
38 / 5 ; 2022-06-10
39 / 5 ; 2022-01-09
40 / 5 ; 2022-02-01


In [22]:
medium_df = pd.DataFrame(data, columns=[
    'id', 'url', 'title', 'subtitle',
    'n_sections', 'n_paragraphs', 'section_titles', 'story_text',
    'claps', 'responses',
    'reading_time', 'publication', 'date','year'])

In [23]:
medium_df

Unnamed: 0,id,url,title,subtitle,claps,responses,reading_time,publication,date,year
0,1,https://towardsdatascience.com/batch-normaliza...,Batch Normalization,,162.0,1,3,Towards Data Science,2017-06-28,2017
1,2,https://towardsdatascience.com/gans-part2-dcga...,GANS — PART2: DCGANs (deep convolution GANS) f...,,,0,2,Towards Data Science,2017-06-28,2017
2,3,https://towardsdatascience.com/identifying-tra...,Identifying Traffic Signs with Deep Learning,,111.0,1,4,Towards Data Science,2017-06-28,2017
3,4,https://towardsdatascience.com/hacking-data-ar...,Hacking Data Art at an AI Genomic Hackathon,,,0,7,Towards Data Science,2017-06-28,2017
4,5,https://towardsdatascience.com/common-sense-an...,Common Sense Analytics,,6.0,0,3,Towards Data Science,2017-06-28,2017
...,...,...,...,...,...,...,...,...,...,...
790,791,https://towardsdatascience.com/what-you-love-t...,What You Love to Ignore in Your Data Science P...,The only truly secure system is one that is…,102.0,1,5,Towards Data Science,2022-02-01,2022
791,792,https://towardsdatascience.com/double-blind-co...,Double-Blind Coffee Studies,A discussion on why I don’t do them,4.0,0,6,Towards Data Science,2022-02-01,2022
792,793,https://towardsdatascience.com/why-is-automati...,Why is Automatically Generating Code Hard?,Challenges in automatically generating code wi...,44.0,1,7,Towards Data Science,2022-02-01,2022
793,794,https://towardsdatascience.com/essential-guide...,Essential Guide to R&D in Machine Learning: Mo...,Essential lessons for each step of an ML…,25.0,1,6,Towards Data Science,2022-02-01,2022


In [24]:
medium_df.to_csv("tds.csv",index=True)

In [25]:
requests.get(article_url)

<Response [200]>

In [26]:
page = response.content
soup = BeautifulSoup(page, 'html.parser')

In [27]:
page

b'<!DOCTYPE html><html xmlns:cc="http://creativecommons.org/ns#"><head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# medium-com: http://ogp.me/ns/fb/medium-com#"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=contain"><title>All stories published by Towards Data Science on February 01, 2022</title><link rel="canonical" href="https://towardsdatascience.com/archive/2022/02/01"><meta name="robots" content="index,follow"><meta name="title" content="All stories published by Towards Data Science on February 01, 2022"><meta name="referrer" content="unsafe-url"><meta name="description" content="Read all stories published by Towards Data Science on February 01, 2022. Your home for data science. A Medium publication sharing concepts, ideas and codes."><meta name="theme-color" content="#000000"><meta property="og:title" content="All stories published by Towards Data Science on February 

In [33]:
get_article_text("https://towardsdatascience.com/web-scraping-with-python-beautifulsoup-40d2ce4b6252")

(4,
 34,
 ['Web scraping with Python & BeautifulSoup',
  'Installing the libraries',
  'Using requests & beautiful soup to extract data',
  'Web scraping example: get top 10 linux distros'],
 "The web contains lots of data. The ability to extract the information you need from it is, with no doubt, a useful one, even necessary. Of course, there are still lots of datasets already available for you to download, on places like Kaggle, but in many cases, you won’t find the exact data that you need for your particular problem. However, chances are you’ll find what you need somewhere on the web and you’ll need to extract it from there. Web scraping is the process of doing this, of extracting data from web pages. In this article, we’ll see how to do web scraping in python. For this task, there are several libraries that you can use. Among these, here we will use Beautiful Soup 4. This library takes care of extracting data from a HTML document, not downloading it. For downloading web pages, we 