# Data Gathering for Question 2 

#### Fetching data and creating the folder structure where the data is stored

In [1]:
# Import statements
import json, glob, os, re, requests, time
from datetime import date

In [22]:
# NYTimes developer console API key.
# It is looked up in the environment so the key itself is never written
# into the notebook; the value is None when the variable is not set.
nyt_archive_key = os.environ.get("nyt_key_1")

# An alternative key can be read from the "nyt_key_2" environment variable
# by swapping the name above.

In [3]:
# Names of the two NYTimes apis that data is fetched from
apis = ['articlesearch', 'archive']

# Starts empty; gets one data-folder path per api appended further down
data_folders_directory = list()

In [4]:
# Functions Definitions:
## Function to create folder structure to store data for different apis in json format for question 2
def create_directory_for_data(api):
    """Create the question 2 data folder for ``api`` and return its path.

    The folder is data/question2/<api>, built by
    create_subfolders_for_data.
    """
    # dirname of the literal string '__file__' is '', so the folders are
    # created relative to the current working directory.
    base_dir = os.path.dirname('__file__')
    return create_subfolders_for_data(base_dir, 'data', 'question2', api)

## Function to create subfolder as per the path specified and api names
def create_subfolders_for_data(data_folder, data, question, api):
    """Make sure the folder data_folder/data/question/api exists and return it.

    Parameters:
        data_folder: base folder the structure is created under
        data, question, api: successive sub-folder names

    Returns:
        The joined path of the (now existing) folder.

    Raises:
        FileExistsError: if the path already exists but is not a directory.
    """
    directory = os.path.join(data_folder, data, question, api)
    # exist_ok=True makes repeated calls harmless and avoids the race between
    # checking for the folder and creating it.
    os.makedirs(directory, exist_ok=True)
    return directory
    
#Function to write data to json file at respective location
def write_to_json_file(file_path, json_data):
    """Serialise ``json_data`` as JSON (2-space indent) into ``file_path``.

    The file is opened in write mode, so any previous content is replaced.
    """
    with open(file_path, mode='w') as sink:
        json.dump(obj=json_data, fp=sink, indent=2)

## Creates a list containing folder paths for both apis (the folders are
## created on disk if they are missing).
## The list is rebuilt from scratch instead of appended to, so re-running this
## cell cannot leave duplicate paths that would make the later cells process
## the same api more than once.
data_folders_directory = [create_directory_for_data(api) for api in apis]

In [9]:
# Function to get the last 6 months with year and month.
# Fetching data for 6 previous months 
def get_year_and_month_range_for_archives(months=6, today=None):
    """Return the most recent ``months`` calendar months, newest first.

    The current month is included. Each entry is a string of the form
    'YYYY,M' (month without zero padding), e.g. '2018,3'.

    Parameters:
        months: how many months to return (default 6).
        today: date to count back from; defaults to date.today().

    Returns:
        A list of ``months`` 'YYYY,M' strings (empty when months <= 0).
    """
    if today is None:
        today = date.today()

    year, month = today.year, today.month
    range_of_years = []
    for _ in range(months):
        range_of_years.append(str(year) + ',' + str(month))
        month -= 1
        # Stepping back from January moves to December of the previous year
        if month == 0:
            month = 12
            year -= 1
    return range_of_years

In [6]:
#Function to fetch response from api provided
## As both apis have different calling parameters, using if to differentiate between the calls
def fetch_response_from_api(page_count, api, year, month, timeout=60):
    """Send one GET request to the given NYTimes api and return the response.

    Parameters:
        page_count: page number to request; only sent for 'articlesearch'.
        api: 'articlesearch' or 'archive'.
        year, month: period to request; only used for 'archive'
            (strings or ints are both accepted).
        timeout: seconds passed to requests.get so a stalled connection
            raises instead of blocking the notebook forever.

    Returns:
        The requests response object, or None when ``api`` is not one of the
        two supported names.
    """
    if api == 'articlesearch':
        # Article search is paged; year and month are ignored
        url = 'https://api.nytimes.com/svc/search/v2/'+api+'.json'
        payload = {'api-key': nyt_archive_key, 'page': page_count}
    elif api == 'archive':
        # The archive is addressed by year/month in the URL; page_count is ignored
        url = 'https://api.nytimes.com/svc/'+api+'/v1/'+str(year)+'/'+str(month)+'.json'
        payload = {'api-key': nyt_archive_key}
    else:
        # Unsupported api name: no request is made
        return None

    return requests.get(url, params=payload, timeout=timeout)

#Returns response object

In [7]:
# Function to extract only articles from the response file
## Also checking if the article is already present in file or not. If article is present already, don't append it to list of articles
def process_response_from_service(response, api, file_path, already_present_file, page_count):
    """Store the articles of a successful response in the json file at ``file_path``.

    On status 200 the list under response.json()['response']['docs'] is taken:
      * if ``file_path`` already exists, its content is loaded and only the
        articles whose '_id' is not stored yet are appended to it;
      * otherwise the articles are appended to ``already_present_file`` as is
        (that list is modified in place).
    The resulting list is then written back to ``file_path``.

    On any other status nothing is written and a failure message is printed
    for the 'articlesearch' and 'archive' apis.

    Parameters:
        response: object with ``status_code`` and ``json()``.
        api: name of the api the response came from.
        file_path: json file the articles are accumulated in.
        already_present_file: list used as the starting content when the file
            does not exist yet.
        page_count: page number, only used in the article search error message.
    """
    if response.status_code != 200:
        #Show error messages in case an API fails
        if api == 'articlesearch':
            print('Request Failed for article search with status code',response.status_code, 'at Page', page_count)
        elif api == 'archive':
            print('Request Failed for archives with status code',response.status_code)
        return

    fetched_articles = response.json()['response']['docs']

    if os.path.exists(file_path):
        # Continue from what earlier calls already stored in the file
        with open(file_path) as fil:
            already_present_file = json.load(fil)

        # Collect the stored ids once in a set so each incoming article is
        # checked in constant time instead of rescanning the whole file content.
        known_ids = {article['_id'] for article in already_present_file}
        already_present_file.extend(
            article for article in fetched_articles if article['_id'] not in known_ids
        )
    else:
        # If the file is not present, dont check for duplicates
        already_present_file.extend(fetched_articles)

    #Write output to json format
    write_to_json_file(file_path, already_present_file)

In [20]:
for data_folder_dir in data_folders_directory:
    page_count = 0

    # The api name is the last component of its data folder path
    # (.../question2/<api>), so take it from there instead of a fixed offset.
    api = os.path.basename(data_folder_dir)
    print('API to hit -> ',api)

    # Create the name of json file with folder path
    file_name = api+'_response_pages'
    file_path = os.path.join(data_folder_dir, file_name)
    file_path+='.json'

    already_present_file = []

    if api == 'archive':
        # One request per 'YYYY,M' entry; page count is not used by this api
        for time_to_consider in get_year_and_month_range_for_archives():
            year, month = time_to_consider.split(',')

            # Add time delay between 2 api calls to fetch response without interruption
            time.sleep(1)

            # Fetch response for this year and month
            response = fetch_response_from_api(page_count, api, year, month)

            # save the articles only from response into json file
            process_response_from_service(response, api, file_path, already_present_file, page_count)

            # Stop at the first failed request
            if response.status_code != 200:
                break
    elif api == 'articlesearch':
        # year and month are not used but page count is
        year = 0
        month = 0

        # Fetch page after page till we get error from response
        status_from_json = 200
        while status_from_json == 200:
            # Add time delay between 2 api calls to fetch response without interruption
            time.sleep(1)

            # Fetch response for each page
            response = fetch_response_from_api(page_count, api, year, month)

            # save the articles only from response into json file
            process_response_from_service(response, api, file_path, already_present_file, page_count)
            if response.status_code == 200:
                page_count += 1
            status_from_json = response.status_code

API to hit ->  articlesearch
Request Failed for article search with status code 400 at Page 121
API to hit ->  archive


In [23]:
# Code to check how much data is present in each file.
#### No need to run. It will just print the number of articles in each response
for data_folder_dir in data_folders_directory:
    # The api name is the last component of its data folder path
    api = os.path.basename(data_folder_dir)
    print('API - ',api)

    # Same file location the fetching cell writes to
    file_name = api+'_response_pages'
    file_path = os.path.join(data_folder_dir, file_name)
    file_path+='.json'
    with open(file_path) as file_to_read:
        present = json.load(file_to_read)

    # Number of articles stored for this api
    print('Data Count - ', len(present))

API -  articlesearch
Data Count -  3028
API -  archive
Data Count -  27260
