# This is one of the notebooks I ran in parallel to get the data

## API Key

In [1]:
import os

# Prefer the NYT_API_KEY environment variable so the real key does not have to be
# saved inside the notebook; the inline value is kept as a backward-compatible fallback.
api_key = os.environ.get('NYT_API_KEY', '...')

## Imports

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import json
import time
import datetime
import requests
from bs4 import BeautifulSoup

## Set Begin and End Date

In [3]:
# Date window as YYYYMMDD integers; the webpage layout is different before the begin date.
begindate, enddate = 19810101, 20180301

## Get Metadata through API, add to Articles Dict, and Export to .json

In [5]:
def _dig(obj, *path):
    """Follow successive keys/indexes into nested JSON; return None if any step is missing."""
    try:
        for step in path:
            obj = obj[step]
    except (KeyError, IndexError, TypeError):
        return None
    return obj


def _article_record(item):
    """Flatten one API doc into the metadata dict that is stored under the article URL."""
    first = _dig(item, 'byline', 'person', 0, 'firstname')
    last = _dig(item, 'byline', 'person', 0, 'lastname')
    if isinstance(first, str) and isinstance(last, str):
        writer_name = first + ' ' + last
    else:
        writer_name = None

    raw_date = _dig(item, 'pub_date')
    # Only the first 10 characters (the YYYY-MM-DD part) of the timestamp are kept.
    pub_date = raw_date[:10] if isinstance(raw_date, str) else None

    record = {'writer_name': writer_name, 'pub_date': pub_date}
    for field in ('snippet', 'word_count', 'score', 'source',
                  'section_name', 'type_of_material', 'document_type'):
        record[field] = _dig(item, field)
    record['main_headline'] = _dig(item, 'headline', 'main')
    record['print_headline'] = _dig(item, 'headline', 'print_headline')
    return record


def _request_docs(params, pause, max_retries):
    """Fetch one result page; return its list of docs, or None if every attempt failed."""
    url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
    for attempt in range(max_retries + 1):
        if attempt > 0:
            print('trying again...')
            time.sleep(3)
        try:
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException as exc:
            # Connection problems and timeouts are retried like a non-200 answer.
            print('request failed: {}'.format(exc))
            continue
        if response.status_code == 200:
            time.sleep(pause)  # pacing between consecutive API calls
            return json.loads(response.text)['response']['docs']
        print('status code: {}'.format(response.status_code))
    return None


def _export_metadata(articles, export_dir, exportno):
    """Dump the collected articles to <export_dir>/metadeta_export<exportno>.json."""
    with open('{}/metadeta_export{}.json'.format(export_dir, exportno), 'w') as fp:
        json.dump(articles, fp)


def get_metadata(begin_date=19810101, end_date=20160812, exportno=3,
                 max_exportno=8, query='mental health', key=None,
                 export_dir='exports', pause=0.6, max_retries=20):
    """Collect article metadata from the NYT Article Search API and export it as JSON.

    Results are requested newest-first, page by page. Each doc is stored in a dict
    keyed by its ``web_url``. Whenever 1000 or more articles have accumulated they are
    written to ``<export_dir>/metadeta_export<exportno>.json`` and ``exportno`` is
    incremented. After 200 pages the window's end date is moved to the publication
    date of the last article seen and paging restarts at page 0. Whatever is still in
    memory when the loop ends is written to one final file.

    The loop ends when ``exportno`` reaches ``max_exportno``, when a page comes back
    empty (no more results), when a page cannot be fetched after ``max_retries``
    retries, or when the date window can no longer be advanced.

    The defaults reproduce the values this notebook was originally run with, except
    that the query is now passed unencoded: ``requests`` URL-encodes ``params``
    itself, so the former ``'mental%20health'`` was sent double-encoded.

    Parameters:
        begin_date: earliest publication date, YYYYMMDD.
        end_date: latest publication date, YYYYMMDD.
        exportno: number used for the first export file.
        max_exportno: paging stops once ``exportno`` reaches this value.
        query: search term, given as plain (not URL-encoded) text.
        key: API key; when None the module-level ``api_key`` is used.
        export_dir: directory for the export files (created if missing).
        pause: seconds to sleep after each successful request.
        max_retries: retries per page before giving up.

    Returns:
        None. The results are only written to disk.
    """
    if key is None:
        key = api_key  # module-level value defined in the "API Key" cell
    os.makedirs(export_dir, exist_ok=True)

    articles = {}
    page = 0
    last_pub_date = None  # YYYYMMDD string of the most recently parsed article

    while exportno < max_exportno:
        if len(articles) >= 1000:
            print('Exporting results till page {} to json {}'.format(page-1, exportno))
            _export_metadata(articles, export_dir, exportno)
            articles = {}
            exportno += 1

        params = {'api-key': key,
                  'q': query,
                  'begin_date': begin_date,
                  'end_date': end_date,
                  'sort': 'newest',
                  'page': page}
        docs = _request_docs(params, pause, max_retries)
        if docs is None:
            print('ERROR: giving up on page {} after {} retries'.format(page, max_retries))
            break
        if not docs:
            # An empty page means the result set is exhausted; continuing would
            # only re-request the same window forever.
            break

        for item in docs:
            web_url = _dig(item, 'web_url')
            if web_url is None:
                # Skip the one bad doc instead of discarding everything collected so far.
                print('Skipping a doc without web_url on page {}'.format(page))
                continue
            record = _article_record(item)
            articles[web_url] = record
            if record['pub_date'] is not None:
                last_pub_date = record['pub_date'].replace('-', '')

        page += 1
        if page % 10 == 0:
            print('Get articles page {} success'.format(page))

        if page == 200:
            # Continue from the oldest date reached so far and start paging again.
            if last_pub_date is None or int(last_pub_date) >= int(end_date):
                print('ERROR: cannot advance the date window past {}'.format(end_date))
                break
            end_date = int(last_pub_date)
            page = 0

    print('Exporting remainder including page {} to json {}'.format(page, exportno))
    _export_metadata(articles, export_dir, exportno)

## Get Body Text by Scraping, add to Articles Dict, and Export Full Set to .json

In [6]:
def _extract_body_text(html):
    """Return the article paragraphs found in html, each prefixed with ' **********'.

    The attribute filters are tried in order and the first one that matches anything
    wins; an empty string is returned when none of them match.
    """
    soup = BeautifulSoup(html, 'lxml')
    selectors = ({'class': 'story-body-text story-content'},
                 {'class': 'story-body-text'},
                 {'itemprop': 'articleBody'},
                 {'itemprop': 'reviewBody'})
    paragraphs = []
    for attrs in selectors:
        paragraphs = soup.find_all(attrs=attrs)
        if paragraphs:
            break
    return ''.join(' **********' + paragraph.get_text() for paragraph in paragraphs)


def get_bodytext(export_numbers=range(1, 9), export_dir='exports', timeout=30):
    """Scrape the body text of every exported article and write the enriched set to JSON.

    For each number ``i`` in ``export_numbers`` this loads
    ``<export_dir>/metadeta_export<i>.json`` (a dict keyed by article URL), downloads
    each URL, stores the extracted text under ``'body_text'`` and writes the result to
    ``<export_dir>/bodytext_export<i>.json``.

    A URL that cannot be downloaded gets an empty ``'body_text'`` and an error line is
    printed, so one failed request no longer discards the rest of the batch.

    Parameters:
        export_numbers: numbers of the metadata export files to process
            (default 1 through 8).
        export_dir: directory holding the metadata files and receiving the output.
        timeout: seconds to wait for each article request.

    Returns:
        None. The results are only written to disk.

    Raises:
        FileNotFoundError: if a requested metadata export file does not exist.
    """
    for i in export_numbers:
        print('{}: importing metadeta_export{}.json'.format(datetime.datetime.now(),i))
        with open('{}/metadeta_export{}.json'.format(export_dir, i)) as fp:
            articles = json.load(fp)

        for url, record in articles.items():
            try:
                page = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                print('ERROR: could not fetch {}: {}'.format(url, exc))
                record['body_text'] = ''
                continue
            record['body_text'] = _extract_body_text(page.text)

        # Write to .json
        print('{}: exporting to bodytext_export{}.json'.format(datetime.datetime.now(),i))
        with open('{}/bodytext_export{}.json'.format(export_dir, i), 'w') as fp:
            json.dump(articles, fp)