# APOD Image Crawler
This is a web crawler developed with scrapy. It queries all APOD image explanations and dates for all the dates in a specific year. The output is saved in JSON format for further analysis.

In [1]:
import json
import scrapy
from scrapy.crawler import CrawlerProcess

from config import api_key

year_dict = {
    1: 31,
    2: 28,
    3: 31,
    4: 30,
    5: 31,
    6: 30,
    7: 31,
    8: 31,
    9: 30,
    10: 31,
    11: 30,
    12: 31
}

year = '2018'
date_string = ''
url_string = ''
    
class APODSpider(scrapy.Spider):
    name = "APOD"
    
    start_urls = []
    
    # Loop through each day of the year
    for month, days in year_dict.items():
        m_string = f'{month}'
        if len(m_string) == 1:
            m_string = f'0{m_string}'
        for d in range(1, days+1):
            d_string = f'{d}'
            if len(d_string) == 1:
                d_string = f'0{d_string}'
            date_string = f'{year}-{m_string}-{d_string}'
            
            # API call to APOD.
            url_string = f'https://api.nasa.gov/planetary/apod?api_key={api_key}&date={date_string}'
            
            # Append URLs list list
            start_urls.append(url_string)
        
    # Identifying the information we want from the query response and extracting it using xpath.
    def parse(self, response):
        
        for item in response.xpath('//p/text()'):
            yield {
                'date': json.loads(item.get())['date'],
                'explanation': json.loads(item.get())['explanation'],
                'media_type': json.loads(item.get())['media_type']
#                 'text': json.loads(item.get())
            }
        
# Configure crawler
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': './data/APOD.json',
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'DavidClarkAPICrawler [dmclark5@gmail.com]',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,   
    'CLOSESPIDER_PAGECOUNT' : 365
})
    
# Start crawler.
process.crawl(APODSpider)
process.start()

In [3]:
import requests

In [4]:
from config import api_key

In [7]:
requests.get(f'https://api.nasa.gov/planetary/apod?api_key={api_key}&date=2019-11-19').json()['explanation']

'Can a lighthouse illuminate a galaxy? No, but in the featured image, gaps in light emanating from the Jose Ignacio Lighthouse in Uruguay appear to match up nicely, although only momentarily and coincidently, with dark dust lanes of our Milky Way Galaxy. The bright dot on the right is the planet Jupiter.  The central band of the Milky Way Galaxy is actually the central spiral disk seen from within the disk. The Milky Way band is not easily visible through city lights but can be quite spectacular to see in dark skies.  The featured picture is actually the addition of ten consecutive images taken by the same camera from the same location.  The images were well planned to exclude direct light from the famous lighthouse.'

In [8]:
data = requests.get('http://quotes.toscrape.com/')

In [11]:
from bs4 import BeautifulSoup

In [12]:
soup = BeautifulSoup(data.text, 'html.parser')

In [18]:
soup.find('span', class_ = "text")

<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>

In [17]:
for quote in soup.find_all('span', class_ = "text"):
    print(quote.text)

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


In [10]:
data.text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>\n        <sp