- https://selenium-python.readthedocs.io/locating-elements.html
- https://medium.com/tag/data-science/archive

- Sentiment score of headline
- Number of capitals in headline
- use Vader sentiment

In [1471]:
import calendar
import getpass
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def is_English(s):
    """
    Report whether ``s`` consists solely of ASCII characters.

    The string is encoded as UTF-8 and the bytes are decoded as ASCII;
    a decode failure means at least one non-ASCII character is present.

    Returns:
        True if every character is ASCII (including the empty string),
        False otherwise.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True

def clean_string(a_string):
    """
    Normalise a scraped text fragment.

    Curly/straight double quotes, curly single quotes, em dashes and commas
    are deleted; en dashes are replaced by a single space. Surrounding
    whitespace is stripped *after* both substitutions so that a leading or
    trailing en dash does not leave a stray space behind.

    Args:
        a_string: the raw text.

    Returns:
        The cleaned, stripped string.
    """
    new_string = re.sub(r'‘|’|"|—|“|”|,', '', a_string)
    new_string = re.sub(r'–', ' ', new_string)
    return new_string.strip()

def filter_string(a_string):
    """
    Decide whether a text fragment is worth keeping.

    Returns:
        False when ``a_string`` is exactly ``'.'`` or the empty string,
        True for anything else.
    """
    is_lone_period = a_string == '.'
    is_empty = a_string == ''
    return not (is_lone_period or is_empty)

# Article scraper

To scrape text from individual articles.

In [222]:
# Sample article URLs used while developing the article scraper.
urls = ['https://towardsdatascience.com/why-so-many-data-scientists-are-leaving-their-jobs-a1f0329d7ea4?source=tag_archive---------0-----------------------',
       'https://towardsdatascience.com/how-to-build-your-own-neural-network-from-scratch-in-python-68998a08e4f6',
       'https://towardsdatascience.com/time-series-analysis-in-python-an-introduction-70d5a5b1d52a',
       'https://towardsdatascience.com/use-scikit-learn-pipelines-to-clean-data-and-train-models-faster-82a5171f50dc']
# Pick the third sample (index 2) as the page to load.
page_url = urls[2]
# Opens a new Chrome window; this driver is never quit in this cell.
driver = webdriver.Chrome()
driver.get(page_url)
# Fixed pause before reading the page source — presumably to let the page finish rendering.
time.sleep(3)
# Parse the browser's current HTML with the lxml parser.
soup = BeautifulSoup(driver.page_source, 'lxml')

In [1320]:
import re

class ArticleScraper():
    """
    Extracts the content and layout statistics of a single article page.

    Every ``get_*`` / ``count_*`` method takes the parsed page (an object
    exposing BeautifulSoup's ``find`` / ``find_all``) and is best-effort:
    when extraction fails it prints a short message and returns ``None``
    (``get_subtitle`` and ``get_blockquotes`` return the string ``'None'``
    instead). Several getters also return the string ``'None'`` when nothing
    was found; that sentinel is kept so previously saved data stays comparable.

    Only ``Exception`` is caught, so Ctrl-C (``KeyboardInterrupt``) still
    stops a long scrape.

    ``scrape`` bundles all results into one plain ``dict``.
    """

    def __init__(self):
        # Read by the module-level scrape() helper to pick how the DataFrame is built.
        self.scraper_class = 'ArticleScraper'

    @staticmethod
    def _article(soup):
        """Return the page's first <article> element (None if there is none)."""
        return soup.find('article')

    def _count_in_article(self, soup, *find_all_args):
        """Count the elements matched by ``find_all(*find_all_args)`` inside the article."""
        return len(self._article(soup).find_all(*find_all_args))

    def _count_figures_with_iframe(self, soup, **iframe_filter):
        """Count <figure> elements in the article that contain a matching <iframe>."""
        figures = self._article(soup).find_all('figure')
        return sum(1 for fig in figures if fig.find('iframe', **iframe_filter) is not None)

    @staticmethod
    def _without_subtitle(h2_headers, subtitle):
        """
        Return ``h2_headers`` with the first occurrence of ``subtitle`` removed.

        ``h2_headers`` is returned untouched when there is no subtitle
        (``'None'`` / ``None``) or when it is not a list (the ``'None'``
        sentinel, or ``None`` after a failed extraction). A subtitle that is
        not in the list is ignored instead of raising. If removing the
        subtitle empties the list, the ``'None'`` sentinel is returned.
        """
        if subtitle is None or subtitle == 'None':
            return h2_headers
        if not isinstance(h2_headers, list):
            return h2_headers

        remaining = list(h2_headers)
        if subtitle in remaining:
            remaining.remove(subtitle)
        return remaining if remaining else 'None'

    def get_title(self, soup):
        """Return the text of the first <h1> on the page, or None on failure."""
        try:
            return soup.find('h1').text
        except Exception:
            print("Couldn't get title from article.")
            return None

    def get_subtitle(self, soup):
        """
        Return the cleaned subtitle, or the string 'None' if it cannot be found.

        The subtitle is looked up as the first <h2> inside the element that
        follows the grandparent of the first <h1>.
        """
        try:
            subtitle_soup = soup.find('h1').parent.parent.next_sibling.find('h2')
            return clean_string(subtitle_soup.text)
        except Exception:
            print("Couldn't (or didn't) get subtitle from article.")
            return 'None'

    def get_tags(self, soup):
        """
        Return the text of every <a> whose href contains "tag".

        Returns 'None' (string) when no such link exists, None on failure.
        """
        try:
            tags = [x.text for x in soup.find_all("a", href=re.compile(".*tag.*"))]
            return tags if tags else 'None'
        except Exception:
            print("Couldn't get article tags.")
            return None

    def get_author(self, soup):
        """
        Return the text of the first <a> inside the first <div> whose inline
        style contains "flex:", or None on failure.
        """
        try:
            return soup.find('div', style=re.compile(r'flex:.*')).find('a').text
        except Exception:
            print("Couldn't get author from article.")
            return None

    def get_h1_headers(self, soup):
        """
        Return the cleaned text of every <h1> in the article except the first.

        Returns 'None' (string) when the article holds exactly one <h1>,
        None on failure.
        """
        try:
            h1_header_soups = self._article(soup).find_all('h1')
            if len(h1_header_soups) == 1:
                return 'None'
            # the first <h1> is skipped (the same tag get_title reads when the title sits in the article)
            return [clean_string(x.text) for x in h1_header_soups[1:]]
        except Exception:
            print("Couldn't get h1 headers.")
            return None

    def get_h2_headers(self, soup):
        """
        Return the cleaned text of the article's <h2> elements.

        Headers whose text is exactly "Dive in. We'll learn what you like
        along the way." are skipped (presumably site boilerplate — confirm).
        Returns 'None' (string) when nothing is left, None on failure.
        """
        try:
            h2_headers = [
                clean_string(header_soup.text)
                for header_soup in self._article(soup).find_all('h2')
                if header_soup.text != "Dive in. We'll learn what you like along the way."
            ]
            return h2_headers if h2_headers else 'None'
        except Exception:
            print("Couldn't get h2 headers.")
            return None

    def get_paragraphs(self, soup):
        """Return the cleaned text of every <p> in the article, or None on failure."""
        try:
            return [clean_string(x.text) for x in self._article(soup).find_all('p')]
        except Exception:
            print("Couldn't scrape paragraphs.")
            return None

    def get_blockquotes(self, soup):
        """Return the raw text of every <blockquote> in the article, or 'None' on failure."""
        try:
            return [x.text for x in self._article(soup).find_all('blockquote')]
        except Exception:
            print("Couldn't (or didn't) find blockquotes.")
            return 'None'

    def get_bolded(self, soup):
        """
        Return the cleaned text of the article's <strong> elements.

        Fragments that are empty or a lone '.' are dropped, as are fragments
        that become empty after cleaning. Returns 'None' (string) when
        nothing is left, None on failure.
        """
        try:
            bolded = [x.text.strip() for x in self._article(soup).find_all('strong')]
            bolded = [clean_string(x) for x in bolded if filter_string(x)]
            bolded = [x for x in bolded if x]
            return bolded if bolded else 'None'
        except Exception:
            print("Couldn't get bolded text.")
            return None

    def get_italics(self, soup):
        """
        Return the cleaned text of the article's <em> elements.

        Fragments that are empty or a lone '.' are dropped. Returns 'None'
        (string) when nothing is left, None on failure.
        """
        try:
            italics = [x.text.strip() for x in self._article(soup).find_all('em')]
            italics = [clean_string(x) for x in italics if filter_string(x)]
            return italics if italics else 'None'
        except Exception:
            print("Couldnt get italicized text.")
            return None

    def count_bullet_lists(self, soup):
        """Return the number of <ul> elements in the article, or None on failure."""
        try:
            return self._count_in_article(soup, 'ul')
        except Exception:
            print("Couldn't count bullet lists.")
            return None

    def count_numbered_lists(self, soup):
        """Return the number of <ol> elements in the article, or None on failure."""
        try:
            return self._count_in_article(soup, 'ol')
        except Exception:
            print("Couldn't count numbered lists.")
            return None

    def count_figures(self, soup):
        """Return the number of <figure> elements in the article, or None on failure."""
        try:
            return self._count_in_article(soup, 'figure')
        except Exception:
            print("Couldn't count images.")
            return None

    def count_gists(self, soup):
        """
        Return the number of figures embedding an <iframe> whose title
        contains ".py", or None on failure.
        """
        try:
            # raw string: same pattern as before, without the invalid "\." escape warning
            return self._count_figures_with_iframe(soup, title=re.compile(r'.*\.py'))
        except Exception:
            print("Couldn't count gists.")
            return None

    def count_code_chunks(self, soup):
        """Return the number of <pre> elements in the article, or None on failure."""
        try:
            return self._count_in_article(soup, 'pre')
        except Exception:
            print("Couldn't count code chunks.")
            return None

    def count_vids(self, soup):
        """
        Return the number of figures embedding an <iframe> whose src contains
        "youtube", or None on failure.
        """
        try:
            return self._count_figures_with_iframe(soup, src=re.compile('.*youtube.*'))
        except Exception:
            print("Couldn't get YouTube videos.")
            return None

    def count_links(self, soup):
        """Return the number of <a target="_blank"> links in the article, or None on failure."""
        try:
            return self._count_in_article(soup, 'a', {'target': '_blank'})
        except Exception:
            print("Couldn't count links.")
            return None

    def scrape(self, soup):
        """
        Run every extractor on ``soup`` and return the results as a dict.

        Keys: title, subtitle, tags, author, h1_headers, h2_headers,
        paragraphs, blockquotes, bold_text, italic_text, n_figures,
        n_bullet_lists, n_numbered_lists, n_gists, n_code_chunks, n_vids,
        n_links. A found subtitle is removed from ``h2_headers`` so it is not
        counted as a section header as well.
        """
        article_data = {
            "title": self.get_title(soup),
            "subtitle": self.get_subtitle(soup),
            "tags": self.get_tags(soup),
            "author": self.get_author(soup),
            "h1_headers": self.get_h1_headers(soup),
            "h2_headers": self.get_h2_headers(soup),
            "paragraphs": self.get_paragraphs(soup),
            "blockquotes": self.get_blockquotes(soup),
            "bold_text": self.get_bolded(soup),
            "italic_text": self.get_italics(soup),
            "n_figures": self.count_figures(soup),
            "n_bullet_lists": self.count_bullet_lists(soup),
            "n_numbered_lists": self.count_numbered_lists(soup),
            "n_gists": self.count_gists(soup),
            "n_code_chunks": self.count_code_chunks(soup),
            "n_vids": self.count_vids(soup),
            "n_links": self.count_links(soup),
        }

        # Guarded removal: h2_headers may be 'None', None, or a list that
        # does not contain the subtitle; none of those may abort the scrape.
        article_data['h2_headers'] = self._without_subtitle(
            article_data['h2_headers'], article_data['subtitle'])

        return article_data

# Archive scraper

In [674]:
# Load one archive page (a single day) to develop the archive scraper against.
#page_url = 'https://medium.com/tag/data-science/archive/2018/01'
page_url = 'https://medium.com/tag/data-science/archive/2018/01/01'
# These names are already imported in the first cell; repeated here so the cell can be run after it on its own.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# Opens another Chrome window; this driver is never quit in this cell.
driver = webdriver.Chrome()
driver.get(page_url)
# Fixed pause before reading the page source — presumably to let the page finish rendering.
time.sleep(3)
soup = BeautifulSoup(driver.page_source,'lxml')

In [1364]:
class ArchiveScraper():
    """
    Extracts per-article metadata from one archive listing page.

    Each article on the page is a ``<div class="postArticle">`` card. Every
    getter returns one list with an entry per card (``get_read_times`` is the
    exception: it reads all reading-time spans of the whole page). Getters are
    best-effort: on failure they print a message and return ``None``.

    Only ``Exception`` is caught, so Ctrl-C (``KeyboardInterrupt``) still
    stops a long scrape.
    """

    def __init__(self):
        # Read by the module-level scrape() helper to pick how the DataFrame is built.
        self.scraper_class = 'ArchiveScraper'

    @staticmethod
    def _article_cards(soup):
        """Return every <div class="postArticle"> card on the page."""
        return soup.find_all('div', {'class': 'postArticle'})

    def get_authors(self, soup):
        """Return the text of each card's first ``a.ds-link``, or None on failure."""
        try:
            return [card.select('a.ds-link')[0].text for card in self._article_cards(soup)]
        except Exception:
            print("Couldn't get authors.")
            return None

    def get_publications(self, soup):
        """
        Return the text of each card's second ``a.ds-link``.

        A card with exactly one such link contributes the string 'None'.
        Returns None on failure.
        """
        try:
            publications = []
            for card in self._article_cards(soup):
                byline_links = card.select('a.ds-link')
                if len(byline_links) == 1:
                    # only one link on the card: no second (publication) link to read
                    publications.append('None')
                else:
                    publications.append(byline_links[1].text)
            return publications
        except Exception:
            print("Couldn't get publications.")
            return None

    def get_titles(self, soup):
        """
        Return each card's title: the text of its first <h3>, or of its first
        <p> when it has no <h3>.

        Non-breaking spaces become plain spaces; a hair-space-wrapped em dash
        and any '--' become ' — '. Returns None on failure.
        """
        try:
            titles = []
            for card in self._article_cards(soup):
                heading = card.find('h3')
                title_node = card.find('p') if heading is None else heading
                titles.append(title_node.text)

            return [
                x.replace('\xa0',' ').replace('\u200a—\u200a','--').replace('--', ' — ')
                for x in titles
            ]
        except Exception:
            print("Couldn't get titles.")
            return None

    def get_read_times(self, soup):
        """
        Return the reading time in minutes of every ``span.readingTime`` on
        the page, parsed from its ``title`` attribute ("N min read").

        NOTE(review): spans are collected page-wide rather than per card, so
        the list can be shorter than (and misaligned with) the other columns
        if a card has no such span — confirm against real pages.
        Returns None on failure.
        """
        try:
            read_time_soups = soup.find_all('span', {'class': 'readingTime'})
            return [int(x['title'].replace(" min read","")) for x in read_time_soups]
        except Exception:
            print("Couldn't get read times.")
            return None

    def count_responses(self, soup):
        """
        Return the number of responses per card, read from the text of its
        first ``div.buttonSet.u-floatRight`` ("N response(s)"; empty text
        counts as 0). Returns None on failure.
        """
        try:
            responses = []
            for card in self._article_cards(soup):
                responses_text = card.select('div.buttonSet.u-floatRight')[0].text
                if responses_text == '':
                    responses.append(0)
                else:
                    # ' responses' is stripped first so the singular replace cannot leave a dangling 's'
                    responses.append(int(responses_text.replace(' responses','').replace(' response','')))
            return responses
        except Exception:
            print("Couldn't get responses.")
            return None

    def count_claps(self, soup):
        """
        Return the clap count per card, read from its
        ``button[data-action=show-recommends]``.

        No button counts as 0; text containing 'K' (e.g. "1.5K") is scaled by
        1000. Returns None on failure.
        """
        try:
            claps = []
            for card in self._article_cards(soup):
                clap_soup = card.find('button',{'data-action':'show-recommends'})

                if clap_soup is None:
                    # no claps
                    claps.append(0)
                elif 'K' in clap_soup.text:
                    # >1k claps
                    claps.append(int(float(clap_soup.text.replace('K',''))*1000))
                else:
                    # normal case
                    claps.append(int(clap_soup.text))
            return claps
        except Exception:
            print("Couldn't get claps.")
            return None

    def get_dates(self, soup):
        """
        Return, per card, the node immediately preceding its
        ``span.middotDivider`` (presumably the date text — confirm).

        Unlike the other getters this one recovers per card: a card that
        fails contributes the string 'skip', keeping the list aligned.
        """
        dates = []
        for card in self._article_cards(soup):
            try:
                dates.append(card.find('span',{'class':'middotDivider'}).previous_element)
            except Exception:
                print("Couldn't get this date.")
                dates.append('skip')
        return dates

    def get_article_links(self, soup):
        """
        Return, per card, the ``href`` of the element that wraps its
        ``div.postArticle-content``. Returns None on failure.
        """
        try:
            return [
                card.find('div', {'class': 'postArticle-content'}).parent['href']
                for card in self._article_cards(soup)
            ]
        except Exception:
            print("Couldn't get article links.")
            return None

    def scrape(self, soup):
        """
        Run every extractor on ``soup`` and return a dict of column lists.

        Keys: authors, publications, titles, read_times, n_responses,
        n_claps, dates, article_links.
        """
        archive_data = {
            'authors': self.get_authors(soup),
            'publications': self.get_publications(soup),
            'titles': self.get_titles(soup),
            'read_times': self.get_read_times(soup),
            'n_responses': self.count_responses(soup),
            'n_claps': self.count_claps(soup),
            'dates': self.get_dates(soup),
            'article_links': self.get_article_links(soup)
        }

        return archive_data

##  ---

##  ---

# Log in to Medium

In [1460]:
# Already imported in the first cell; repeated so this section can be run after it on its own.
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

# Start the browser session that the login cells below and scrape_day() use via the global `driver`.
driver = webdriver.Chrome()
start_url = 'https://medium.com/tag/data-science/archive/2018/01/01'
driver.get(start_url)

In [1461]:
# click sign in
# find_element(By.…) replaces find_element_by_css_selector, which Selenium 4 removed.
driver.find_element(By.CSS_SELECTOR, 'a.button.js-signInButton').click()
#time.sleep(np.random.normal(2,.3))

In [1462]:
# sign in with Google account
# find_element(By.…) replaces find_element_by_css_selector, which Selenium 4 removed.
driver.find_element(By.CSS_SELECTOR, 'button.js-googleButton').click()
# randomised pause before the next step
time.sleep(np.random.normal(4,.3))

In [1463]:
# enter email
# find_element(By.NAME, …) replaces find_element_by_name, which Selenium 4 removed.
username_form = driver.find_element(By.NAME, 'identifier')
# Taken from the MEDIUM_EMAIL environment variable, or prompted for, so the
# address is never saved inside the notebook.
email = os.environ.get('MEDIUM_EMAIL') or input('Medium (Google) email: ')
username_form.send_keys(email)
time.sleep(np.random.normal(1.35, .17))
username_form.send_keys(Keys.RETURN)

In [1464]:
# enter password
time.sleep(np.random.normal(1.7, .3))
# find_element(By.NAME, …) replaces find_element_by_name, which Selenium 4 removed.
password_form = driver.find_element(By.NAME, 'password')
# Taken from the MEDIUM_PASSWORD environment variable, or prompted for without
# echo, so the password is never saved inside the notebook or its outputs.
password = os.environ.get('MEDIUM_PASSWORD') or getpass.getpass('Medium (Google) password: ')
password_form.send_keys(password)
password_form.send_keys(Keys.RETURN)

# Define scraper functions and DataFrame for saving data

In [1454]:
def scrape(soup, scraper):
    """
    Run ``scraper.scrape(soup)`` and return the result as a DataFrame.

    Args:
        soup: the parsed page handed to the scraper.
        scraper: an object with a ``scraper_class`` attribute and a
            ``scrape(soup)`` method returning a dict.

    Returns:
        * ``scraper_class == 'ArchiveScraper'``: the dict maps column names to
          lists; the result has one row per list position, with shorter
          columns padded by missing values. A column whose extraction failed
          (value ``None``) becomes an all-missing column instead of raising.
        * otherwise: a one-row DataFrame whose columns are the dict keys (list
          values are stored whole in a single cell).
    """
    data = scraper.scrape(soup)

    if scraper.scraper_class == 'ArchiveScraper':
        # from_dict(orient='index') needs a sequence for every key
        columns = {key: ([] if value is None else value) for key, value in data.items()}
        return pd.DataFrame.from_dict(columns, orient='index').transpose()

    # DataFrame.append was removed in pandas 2.0; build the single row directly.
    return pd.DataFrame([data], columns=list(data.keys()))

def scrape_day(soup):
    """
    Scrape one archive page and every article it links to.

    Uses the module-level ``driver`` (the browser session) to open each
    article, so that session must already exist.

    Args:
        soup: the parsed archive page for one day.

    Returns:
        ``(archive_day_df, article_day_df)``: the archive metadata (columns
        renamed to singular: author, publication, title, read_time,
        n_responses, n_claps, date, article_link) and one row per article
        that was opened.
    """
    archive_columns = ['authors', 'publications', 'titles', 'read_times', 'n_responses', 'n_claps', 'dates', 'article_links']
    article_columns = ['title', 'subtitle','tags', 'author', 'h1_headers', 'h2_headers', 'paragraphs', 'blockquotes', 'bold_text', 'italic_text', 'n_figures', 'n_bullet_lists', 'n_numbered_lists', 'n_gists', 'n_code_chunks', 'n_vids', 'n_links']

    archive_scraper = ArchiveScraper()
    article_scraper = ArticleScraper()

    # scrape archive first
    # (the original appended to `archive_data_df`, a name this function never
    # defines; reindex guarantees the expected columns without that global)
    archive_day_df = (
        scrape(soup, archive_scraper)
        .reindex(columns=archive_columns)
        .rename(columns={
            'authors':'author',
            'publications':'publication',
            'titles':'title',
            'read_times':'read_time',
            'article_links':'article_link',
            'dates':'date'
        })
    )

    # then scrape every article listed on the archive page
    article_frames = []
    for link in archive_day_df['article_link'].dropna():  # rows without a link cannot be opened
        driver.get(link)
        time.sleep(.081)

        article_soup = BeautifulSoup(driver.page_source, 'lxml')
        article_frames.append(scrape(article_soup, article_scraper))
        # randomised pause between articles; clamped because time.sleep rejects negatives
        time.sleep(max(0, np.random.normal(2,.4)))

    if article_frames:
        # one concat at the end instead of the removed DataFrame.append in the loop
        article_day_df = pd.concat(article_frames, ignore_index=True).reindex(columns=article_columns)
    else:
        article_day_df = pd.DataFrame(columns=article_columns)

    print("All done!")
    return archive_day_df, article_day_df

# --- 

# Scrape a day's data

In [1069]:
# Load one archive day in the logged-in browser and parse the rendered page.
driver.get('https://medium.com/tag/data-science/archive/2018/01/01')
time.sleep(np.random.normal(3.5,.5))
soup = BeautifulSoup(driver.page_source, 'lxml')

try:
    archive_day_df, article_day_df = scrape_day(soup)
except Exception as err:
    # Best-effort, but show the cause; Ctrl-C is no longer swallowed.
    print("Couldn't scrape this day.")
    print(repr(err))

In [1072]:
# Write the day's two frames to CSV in the working directory ('utf-8-sig' = UTF-8 with a BOM).
archive_day_df.to_csv('archive_test_day.csv', index=False, encoding='utf-8-sig')
article_day_df.to_csv('article_test_day.csv', index=False, encoding='utf-8-sig')

In [1074]:
# Inner-join article content with the archive metadata on the article title.
# The merge result is kept in `joined`; chaining .to_csv() onto the assignment
# bound `joined` to None, because to_csv returns None when given a path.
joined = pd.merge(
    article_day_df,
    archive_day_df[['title','publication', 'read_time', 'n_responses', 'n_claps', 'date']],
    how='inner', on='title')
joined.to_csv('joined.csv',index=False,encoding='utf-8-sig')

# ---

# Scrape a month's data 

Need a function to generate archive URLs.

In [1444]:
def generate_dates(year, month, end_day):
    """
    Build the 'YYYY/MM/DD' path fragments for days 1..end_day of a month.

    Month and day are zero-padded to two digits; the year is used as given.

    Args:
        year: year, converted with ``str``.
        month: month number.
        end_day: last day to include (inclusive).

    Returns:
        A list of date strings in ascending day order (empty if end_day < 1).
    """
    month_prefix = str(year) + '/' + str(month).zfill(2) + '/'
    return [month_prefix + str(day).zfill(2) for day in np.arange(1, end_day+1)]

def generate_urls(base_url, dates):
    """
    Prefix every entry of ``dates`` with ``base_url``.

    Returns:
        A new list of URLs, in the same order as ``dates``.
    """
    archive_urls = []
    for date_path in dates:
        archive_urls.append(base_url + date_path)
    return archive_urls
    

In [1076]:
urls_2018 = []
base_url = 'https://medium.com/tag/data-science/archive/'
# Number of days in each month of 2018, taken from the calendar module.
# (The hand-written list had 30/29/28 days for March/June/September, so the
# last days of those months were never scraped.)
end_days = [calendar.monthrange(2018, month)[1] for month in range(1, 13)]

# urls_2018[i] holds the archive URLs for month i + 1
for i, month_end in enumerate(end_days):
    month = i + 1
    month_dates = generate_dates(2018, month, month_end)
    month_urls = generate_urls(base_url, month_dates)
    urls_2018.append(month_urls)
    
urls_2018;

`urls_2018[0]` contains all dates in January. `urls_2018[1]` contains all dates in February.

#### Scrape a month's worth of data

In [1445]:
# Sanity check: urls_2018 holds one list of URLs per month.
len(urls_2018)

12

In [1474]:
# ONLY RUN THIS IF YOU'RE OKAY RESETTING THE MONTH'S DATA!
# DataFrame to save to for the month
# Index 4 selects the fifth month's URL list.
month_urls = urls_2018[4]
# Empty accumulators; the scraping loop below appends each day's rows to them.
archive_month_df = pd.DataFrame(columns = ['author', 'publication', 'title', 'read_time', 'n_responses', 'n_claps', 'date', 'article_link'])
article_month_df = pd.DataFrame(columns = ['title', 'subtitle','tags', 'author', 'h1_headers', 'h2_headers', 'paragraphs', 'blockquotes', 'bold_text', 'italic_text', 'n_figures', 'n_bullet_lists', 'n_numbered_lists', 'n_gists', 'n_code_chunks', 'n_vids', 'n_links'])

In [1475]:
# Show the first URL of the selected month to confirm the right month is queued.
month_urls[0]

'https://medium.com/tag/data-science/archive/2018/05/01'

In [1476]:
# Scrape every day of the selected month, adding each day's rows to the month
# frames as soon as the day finishes so an interrupted run keeps its progress.
for month_url in month_urls[:]:
    print(month_url)
    driver.get(month_url)
    time.sleep(np.random.normal(2,.3))
    soup = BeautifulSoup(driver.page_source, 'lxml')

    try:
        archive_day_df, article_day_df = scrape_day(soup)
    except Exception as err:
        print(f"Couldn't scrape for this day: {month_url}.")
        print(repr(err))
        # Skip the accumulation below: archive_day_df / article_day_df still
        # hold the previous day's rows (or do not exist yet on the first day).
        continue

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    archive_month_df = pd.concat([archive_month_df, archive_day_df], ignore_index=True, sort=False)
    article_month_df = pd.concat([article_month_df, article_day_df], ignore_index=True, sort=False)

print('Finished scraping the month!')

https://medium.com/tag/data-science/archive/2018/05/01
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.


Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
All done!
https://medium.com/tag/data-science/archive/2018/05/05
Couldn't get this date.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.

Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from a

Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
All done!
https://medium.com/tag/data-science/archive/2018/05/12
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from

Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
All done!
https://medium.com/tag/data-science/archive/2018/05/16
Couldn't get this date.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.

Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn'

Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn'

Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) g

Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't get title from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
Couldn't (or didn't) get subtitle from article.
All done!
https://medium.com/tag/data-science/archive/2018/05/31
Couldn't get this date.
Couldn't (or didn't) get subtitle from artic

In [1477]:
# Drop rows that repeat the same (title, author) pair in either frame.
article_month_df_unique = article_month_df.drop_duplicates(['title','author'])
archive_month_df_unique = archive_month_df.drop_duplicates(['title','author'])
# Attach the archive metadata to each article via an inner join on the title.
# NOTE(review): de-duplication uses (title, author) but the join key is the title alone,
# so two different authors using the same title would produce extra cross-matched rows — confirm.
month_df = pd.merge(article_month_df_unique, 
                    archive_month_df_unique[['title','publication', 'read_time', 'n_responses', 'n_claps', 'date']], 
                    how='inner', on = 'title')

#### Save!!

In [1478]:
# (rows, columns) of the merged month frame, checked before saving.
month_df.shape

(1389, 22)

In [1479]:
# Merged frame goes to the working directory ('utf-8-sig' = UTF-8 with a BOM).
month_df.to_csv("may_2018_data.csv",index=False,encoding='utf-8-sig')
# Raw (un-deduplicated) frames go under Data/ — the directory must already exist.
archive_month_df.to_csv('Data/archives_may_2018.csv', index=False)
article_month_df.to_csv('Data/articles_may_2018.csv', index=False)

# ------------------------------------------ 

# ------------------------------------------ 