# Implementation

In [5]:
import requests
import json
import urllib

In [25]:
def get_obj(response):
    """
    Decode the JSON document embedded in a response body.

    Everything in ``response.text`` that precedes the first '{' is discarded
    and the remainder is parsed with ``json.loads``.

    Raises ValueError when the body contains no '{' or when the remainder is
    not valid JSON.
    """
    body = response.text
    json_start = body.index('{')
    return json.loads(body[json_start:])

In [62]:
def retrieve_article_content(domain, slug):
    """
    Download one article as JSON and return the text of its paragraphs.

    The article is requested from ``https://<domain>/<slug>?format=json`` and
    decoded with ``get_obj``.

    Returns a list of strings: the 'text' entry of every paragraph that has
    one, in document order. When the payload carries no 'value' entry an empty
    list is returned, so the result is always a list and callers can iterate
    or extend it without checking its type.
    """
    url = 'https://' + domain + '/' + slug + '?format=json'
    response = requests.get(url=url)
    obj = get_obj(response)

    # No 'value' in the payload means there is no article body to extract.
    if 'value' not in obj['payload']:
        return []

    paragraphs_list = obj['payload']['value']['content']['bodyModel']['paragraphs']

    # Some paragraph entries have no 'text' key; skip them.
    return [paragraph['text'] for paragraph in paragraphs_list if 'text' in paragraph]

In [58]:
def perform_first_query(query):
    """
    Run the initial Medium search request and download every article it lists.

    ``query`` is concatenated into the search URL as given.

    Returns a 4-tuple ``(texts, next_path, next_params, next_cookies)``:
    the content of each listed article (one ``retrieve_article_content``
    result per post), the 'path' and 'next' entries of the payload's paging
    section, and the cookies of the search response.
    """
    search_url = 'https://medium.com/search?q='+ query +'&format=json'
    response = requests.get(url=search_url)
    payload = get_obj(response)['payload']

    articles = []
    for post in payload['value']['posts']:
        if 'homeCollection' not in post or 'domain' not in post['homeCollection']:
            # No collection domain available: address the post via its author.
            host = 'medium.com/@'+ post['creator']['username']
        else:
            host = post['homeCollection']['domain']
        articles.append(retrieve_article_content(host, post['uniqueSlug']))

    paging = payload['paging']
    return articles, paging['path'], paging['next'], response.cookies

In [59]:
def perform_followup_query(path, params, cookies):
    """
    Request a further page of search results and download its articles.

    ``path`` is appended to ``https://medium.com`` (followed by
    ``&format=json``); ``params`` and ``cookies`` are forwarded to the request.
    They are expected to be the paging values and cookies returned by the
    previous query.

    Returns a 4-tuple ``(texts, next_path, next_params, next_cookies)`` with
    the same layout as ``perform_first_query``.
    """
    page_url = 'https://medium.com' + path + '&format=json'
    response = requests.get(url=page_url, params=params, cookies=cookies)
    payload = get_obj(response)['payload']

    articles = []
    # Unlike the first query, the posts are taken directly from payload['value'].
    for post in payload['value']:
        if 'homeCollection' not in post or 'domain' not in post['homeCollection']:
            # No collection domain available: address the post via its author.
            host = 'medium.com/@'+ post['creator']['username']
        else:
            host = post['homeCollection']['domain']
        articles.append(retrieve_article_content(host, post['uniqueSlug']))

    paging = payload['paging']
    return articles, paging['path'], paging['next'], response.cookies

In [60]:
def query(query, n_pages=1):
    """
    Retrieve article texts from Medium for a search term.

    Parameters:
        query: the search text; it is percent-encoded before being sent.
        n_pages: number of result pages to fetch (10 articles per page).

    Returns a list with one entry per article, each entry being the result of
    ``retrieve_article_content`` for that article. An empty list is returned
    when ``n_pages`` is smaller than 1, without performing any request.
    """
    # Imported explicitly: a bare `import urllib` does not load the `parse`
    # submodule, so `urllib.parse.quote` only works if something else already
    # imported it.
    from urllib.parse import quote

    # Return an empty array if no page is requested
    if n_pages < 1:
        return []

    encoded_query = quote(query)

    # Perform first request for the first page of items
    texts, next_path, next_params, next_cookies = perform_first_query(encoded_query)

    # Each follow-up request is driven by the paging data and cookies of the
    # previous one.
    for _ in range(1, n_pages):
        new_texts, next_path, next_params, next_cookies = perform_followup_query(
            next_path, next_params, next_cookies)
        texts.extend(new_texts)

    return texts

In [92]:
# Scrape 90 result pages of Medium's search for "insurance"
res = query('insurance', n_pages=90)

In [93]:
# Dump the corpus as UTF-8: one paragraph per line, a blank line after each article
with open('data2.txt', 'wb') as f:
    for article in res:
        for paragraph in article:
            f.write((paragraph + '\n').encode('utf8'))
        f.write(b'\n')

In [94]:
import codecs

In [95]:
# Read the whole corpus back as text. The built-in open() replaces the legacy
# codecs.open(); newline='' keeps line endings untranslated, which matches how
# codecs.open() returned the file content.
with open('data2.txt', encoding='utf-8', newline='') as f:
    text = f.read()

In [99]:
# Size of the corpus read back from disk, in characters
len(text)

3138662

In [97]:
# Rough corpus size in megabits, counting every character as 8 bits
# (an approximation: non-ASCII characters take more than one byte in UTF-8)
len(text)*8 / 1000000

25.109296