# APOD Image Crawler
This is a web crawler built with Scrapy. It queries the NASA APOD (Astronomy Picture of the Day) API once for every date in a given year and collects each entry's date, explanation, and media type. The output is saved in JSON format for further analysis.

In [1]:
import calendar
import json

import scrapy
from scrapy.crawler import CrawlerProcess

from config import api_key

# Year to crawl. Kept as a string because it is interpolated directly into
# the request date.
year = '2018'

# Month number -> number of days in that month for `year`.
# Derived with calendar.monthrange so February has 29 days when `year` is a
# leap year, instead of a fixed 28.
year_dict = {
    month: calendar.monthrange(int(year), month)[1]
    for month in range(1, 13)
}

# Module-level placeholders, initialised to empty strings.
date_string = ''
url_string = ''
    
class APODSpider(scrapy.Spider):
    """Spider that calls the APOD API once for every day of ``year``.

    ``start_urls`` is built when the class is defined, from the module-level
    ``year_dict`` (month number -> days in that month), ``year`` and
    ``api_key``.
    """

    name = "APOD"

    # One API URL per calendar day. Dates are formatted as YYYY-MM-DD, so
    # month and day are zero-padded to two digits.
    start_urls = [
        f'https://api.nasa.gov/planetary/apod?api_key={api_key}'
        f'&date={year}-{month:02d}-{day:02d}'
        for month, days in year_dict.items()
        for day in range(1, days + 1)
    ]

    def parse(self, response):
        """Yield the fields of interest from one API response.

        The response body is decoded as JSON directly rather than being
        routed through an HTML XPath query, so text containing ``<`` or
        ``&`` cannot be split or altered by an HTML parser.

        Args:
            response: The Scrapy response for a single API request.

        Yields:
            dict: The ``date``, ``explanation`` and ``media_type`` values of
            the decoded entry.

        Raises:
            json.JSONDecodeError: If the body is not valid JSON.
            KeyError: If one of the three expected keys is missing.
        """
        # Decode once and reuse the result for all three fields.
        entry = json.loads(response.text)
        yield {
            'date': entry['date'],
            'explanation': entry['explanation'],
            'media_type': entry['media_type'],
        }
        
# Configure crawler
process = CrawlerProcess({
    # Export scraped items as JSON to ./data/APOD.json.
    'FEED_FORMAT': 'json',
    'FEED_URI': './data/APOD.json',
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'DavidClarkAPICrawler [dmclark5@gmail.com]',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    # Logging is switched off, so request and parse errors are not shown.
    'LOG_ENABLED': False,
    # Close the spider once one response per start URL has been received.
    # Derived from the URL list so it cannot drift from the date table.
    'CLOSESPIDER_PAGECOUNT': len(APODSpider.start_urls),
})

# Start crawler. process.start() blocks until the crawl has finished.
process.crawl(APODSpider)
process.start()