# Scraping

### NASA Mars News

* [NASA Mars News Site](https://mars.nasa.gov/news/)

* Collect the latest News Title and Paragraph Text

### JPL Mars Space Images - Featured Image

* [JPL Featured Space Image](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars)

* Find the image url for the current Featured Mars Image and assign the url string to a variable called `featured_image_url`. Make sure to find the url of the full-size `.jpg` image and to build the complete url string for this image.

### Mars Facts

* [Mars Facts webpage](https://space-facts.com/mars/)

* Find the table containing facts about the planet including Diameter, Mass, etc.

### Mars Hemispheres

* [USGS Astrogeology site](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars)

* Obtain high resolution images for each of Mars's hemispheres - find the image url to the full resolution image

In [111]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
import requests
import time

In [6]:
# Initialize browser
def init_browser():
    """Start a Chrome browser session through splinter.

    Returns:
        The splinter ``Browser`` created for ``"chrome"``, launched with
        ``headless=False`` and the chromedriver binary located at
        ``/usr/local/bin/chromedriver``.
    """
    driver_options = {"executable_path": "/usr/local/bin/chromedriver"}
    chrome = Browser("chrome", headless=False, **driver_options)
    return chrome

## NASA Mars News

In [56]:
def scrape_news():
    """Scrape the latest headline and teaser from the NASA Mars News site.

    Returns:
        dict: ``{'title': <text of the first link>, 'text': <teaser text>}``
        taken from the first ``div.list_text`` element on the page.

    Raises:
        AttributeError: If the page does not contain the expected
            ``div.list_text`` / ``div.article_teaser_body`` elements.
    """
    url = "https://mars.nasa.gov/news/"

    browser = init_browser()
    try:
        browser.visit(url)

        # Pause before reading the HTML (presumably so the page can finish
        # loading its article list).
        time.sleep(2)

        html = browser.html
    finally:
        # Always release the browser, even if the visit fails, so no
        # Chrome process is left behind.
        browser.quit()

    # Scrape page into Soup
    soup = bs(html, "html.parser")

    # Get the latest news title and paragraph text
    article = soup.find('div', class_='list_text')
    news_title = article.find('a').text
    news_p = article.find('div', class_='article_teaser_body').text

    return {'title': news_title, 'text': news_p}

## JPL Mars Space Images - Featured Image

In [62]:
def scrape_JPL_image():
    """Scrape the URL of the current JPL Featured Mars Image.

    Returns:
        str: ``https://www.jpl.nasa.gov`` joined with the image path taken
        from the first carousel article's ``style`` attribute.

    Raises:
        AttributeError: If the carousel or its article element is missing.
        KeyError: If the article has no ``style`` attribute.
        IndexError: If the ``style`` value contains no single-quoted part.
    """
    nasa_url = "https://www.jpl.nasa.gov"
    jpl_query = "/spaceimages/?search=&category=Mars"

    browser = init_browser()
    try:
        # Visit the url for JPL Featured Space Image
        browser.visit(nasa_url + jpl_query)

        time.sleep(1)

        html = browser.html
    finally:
        # Always release the browser, even if the visit fails.
        browser.quit()

    # Scrape page into Soup
    soup = bs(html, "html.parser")

    # Find the image url for the current Featured Mars Image. The path is the
    # text between the first pair of single quotes in the inline style
    # (presumably a CSS ``url('...')`` background declaration).
    article = soup.find('div', class_='carousel_items').find('article')
    featured_image_url = article['style'].split("'")[1]

    return nasa_url + featured_image_url

## Mars Facts

In [73]:
def scrape_mars_facts():
    """Scrape the Mars facts table and return it rendered as HTML.

    Returns:
        str: HTML for the first table found on the page, rendered without
        index or header row, with ``border=1`` and the CSS classes
        ``table table-responsive table-striped``.

    Raises:
        ValueError: Raised by ``pandas.read_html`` when the page contains
            no tables.
    """
    # Local import keeps this block self-contained; pandas deprecates passing
    # literal HTML strings to read_html, so the markup is wrapped in StringIO.
    from io import StringIO

    url = "https://space-facts.com/mars/"

    browser = init_browser()
    try:
        # Visit the Mars Facts webpage
        browser.visit(url)

        time.sleep(1)

        html = browser.html
    finally:
        # Always release the browser, even if the visit fails.
        browser.quit()

    # Read html using pandas
    tables = pd.read_html(StringIO(html))

    # Use the first table on the page, which the original notes describe as
    # the one with facts such as Diameter and Mass.
    facts = tables[0]
    table_html = facts.to_html(
        index=False,
        border=1,
        header=False,
        classes=["table", "table-responsive", "table-striped"],
        justify='center',
    )

    return table_html

## Mars Hemispheres

In [109]:
def scrape_mars_hemispheres():
    """Scrape the title and image URL of each Mars hemisphere from USGS.

    For every ``div.item`` on the search results page, the item's link is
    clicked, the first link in the detail page's ``div.downloads`` section
    is recorded, and the browser navigates back to the results.

    Returns:
        list[dict]: One ``{'title': ..., 'img_url': ...}`` dictionary per
        item, in page order.

    Raises:
        AttributeError: If an item has no ``h3`` or a detail page has no
            ``div.downloads`` element.
        IndexError: If the downloads section contains no links.
    """
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    hemisphere_image_urls = []

    browser = init_browser()
    try:
        # Visit the USGS Astrogeology site
        browser.visit(url)

        time.sleep(1)

        # Scrape page into Soup
        soup = bs(browser.html, "html.parser")

        # Find the titles and image urls for the Hemispheres
        for item in soup.find_all('div', class_='item'):
            title = item.find('h3').text

            # Click the link of each item to open its detail page
            browser.find_by_text(title).click()
            detail_soup = bs(browser.html, "html.parser")

            # Find the link for the full size image - the first link is for
            # jpg, and the 2nd is tif (full size)
            links = detail_soup.find('div', class_="downloads").find_all('a')
            jpg_url = links[0]['href']

            hemisphere_image_urls.append({'title': title, 'img_url': jpg_url})

            # Back to the USGS Astrogeology results page for the next item
            browser.back()
    finally:
        # Always release the browser, even if an item page fails to parse.
        browser.quit()

    return hemisphere_image_urls

In [110]:
# Run the hemisphere scraper; the returned list of title/img_url dictionaries
# is shown as this cell's output.
scrape_mars_hemispheres()

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]