In [1]:
# import dependencies

from splinter import Browser
from bs4 import BeautifulSoup
import mars_urls as mars

def init_browser(executable_path=r"C:\chromedrv\chromedriver.exe", headless=False):
    """
    Create and return a splinter Chrome ``Browser``.

    Args:
        executable_path: Path to the chromedriver binary.  The default is the
            location this notebook was written against; pass your own path
            (or edit the default) to match your machine.
        headless: Forwarded unchanged to ``Browser``.  Defaults to ``False``,
            the value that was previously hard-coded.

    Returns:
        The ``Browser`` instance.  This function does not close it.
    """
    # @NOTE: the default path is a raw string so the Windows backslashes stay
    # literal; the earlier plain string only worked because "\c" is not a
    # recognised escape sequence, which Python reports with a warning.
    return Browser("chrome", executable_path=executable_path, headless=headless)

In [128]:
# Scrape the latest news headline from the NASA Mars Mission
# Path to results working as of May 15, 2019

def scrape_news_article(nasa_url=mars.NASA_URL, nasa_news_url=mars.NASA_NEWS_URL):
    """
    Scrape the most recent article from the NASA Mars news listing.

    Args:
        nasa_url: Base URL of the NASA Mars site.  The article's href taken
            from the listing page is appended to it.
        nasa_news_url: URL of the news listing page.

    Returns:
        A 3-tuple ``(title, teaser, first_paragraph)``:
        the text of the first ``div.content_title`` and of the first
        ``div.article_teaser_body`` on the listing page, followed by the text
        of the first ``<p>`` on the article page itself.

    Raises:
        IndexError: if the listing page has no ``content_title`` or
            ``article_teaser_body`` div, or the article page has no ``<p>``.
    """
    browser = init_browser()
    try:
        # Load the listing page and hand its HTML to Beautiful Soup.
        browser.visit(nasa_news_url)
        listing_soup = BeautifulSoup(browser.html, "html.parser")

        # The first content_title div holds the newest headline and its link.
        first_story = listing_soup.find_all('div', class_="content_title")[0]
        first_story_title = first_story.get_text()
        first_story_href = first_story.find('a', target="_self").get('href')

        # Short teaser shown next to the headline on the listing page.
        first_body = listing_soup.find_all('div', class_='article_teaser_body')[0]
        first_story_oneliner = first_body.get_text()

        # Now move the browser to the full story page and grab the first paragraph.
        browser.visit(nasa_url + first_story_href)
        story_soup = BeautifulSoup(browser.html, "html.parser")
        first_paragraph = story_soup.find_all('p')[0].get_text()
    finally:
        # Always shut the browser down, even when scraping fails, so repeated
        # calls do not leave browser sessions running.
        browser.quit()

    return first_story_title, first_story_oneliner, first_paragraph

In [129]:
# Run the news scraper once with its default URLs; the cell output is the returned tuple.
scrape_news_article()

('Why This Martian Full Moon Looks Like Candy',
 "For the first time, NASA's Mars Odyssey orbiter has caught the Martian moon Phobos during a full moon phase. Each color in this new image represents a temperature range detected by Odyssey's infrared camera.",
 "For the first time, NASA's Mars Odyssey orbiter has caught the Martian moon Phobos during a full moon phase. Each color in this new image represents a temperature range detected by Odyssey's infrared camera, which has been studying the Martian moon since September of 2017. Looking like a rainbow-colored jawbreaker, these latest observations could help scientists understand what materials make up Phobos, the larger of Mars' two moons.")

In [130]:
# Scrape the latest featured image from the JPL Mars page
# Path to results working as of May 15, 2019

def scrape_jpl_featured_image(jpl_url=mars.JPL_URL, jpl_mars_url=mars.JPL_MARS_URL):
    """
    Return the URL of the current featured Mars image on the JPL site.

    Args:
        jpl_url: Base URL of the JPL site.  It is prefixed to both the story
            link and the image ``src`` scraped from the pages.
        jpl_mars_url: URL of JPL's featured Mars image page.

    Returns:
        ``jpl_url`` concatenated with the ``src`` of the first
        ``img.main_image`` found on the featured image's story page.

    Raises:
        IndexError: if the featured page has no ``a.button.fancybox`` anchor
            or the story page has no ``img.main_image``.
    """
    browser = init_browser()
    try:
        # Visit the main Mars featured image page and parse it.
        browser.visit(jpl_mars_url)
        jpl_soup = BeautifulSoup(browser.html, "html.parser")

        # As of May 14, 2019, the featured image's button anchor carries the
        # link to its story page in the data-link attribute.
        story_anchor = jpl_soup.find_all('a', class_="button fancybox")[0]
        story_url = jpl_url + story_anchor.get('data-link')

        # Move the browser to the story page and parse it.
        browser.visit(story_url)
        story_soup = BeautifulSoup(browser.html, "html.parser")

        # On the story page, the main_image tag points at the full-size image.
        full_image_tag = story_soup.find_all('img', class_="main_image")[0]
        full_image_url = full_image_tag.get('src')
    finally:
        # Always shut the browser down, even when scraping fails, so repeated
        # calls do not leave browser sessions running.
        browser.quit()

    return jpl_url + full_image_url

In [131]:
# Run the image scraper once with its default URLs; the cell output is the returned image URL.
scrape_jpl_featured_image()

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16105_hires.jpg'

In [135]:
# Set (or override) the facts-page URL on the imported `mars` module so that
# scrape_mars_facts can use it as its default argument.
mars.MARS_FACTS_URL = "https://space-facts.com/mars/"
import pandas as pd

def scrape_mars_facts(mars_facts_url = mars.MARS_FACTS_URL):
    """
    Read the first HTML table found at ``mars_facts_url`` into a DataFrame.

    The table's two columns are relabelled "description" and "value", and
    "description" is used as the index of the returned frame.
    """
    # read_html returns one DataFrame per table on the page; only the first is used.
    facts = pd.read_html(mars_facts_url)[0]
    facts.columns = ["description", "value"]
    return facts.set_index("description")

In [136]:
# Run the facts scraper once; the cell output is the table indexed by description.
scrape_mars_facts()

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [2]:
# Open a browser session that the exploratory cells below share.
browser=init_browser()

In [3]:
# USGS Astrogeology search-results URL; the query asks for "hemisphere enhanced" with target Mars.
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

In [4]:
# Load the search-results page in the shared browser session.
browser.visit(url)

In [177]:
# html = browser.html

<div class="item"><a href="/search/map/Mars/Viking/cerberus_enhanced" class="itemLink product-item"><img class="thumb" src="/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png" alt="Cerberus Hemisphere Enhanced thumbnail"></a><div class="description"><a href="/search/map/Mars/Viking/cerberus_enhanced" class="itemLink product-item"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div> <!-- end description --></div>

In [178]:
# browser.find_by_tag('a') - too many anchor tags

In [179]:
# browser.find_by_css('a.itemLink.product-item') -- this finds the <a> anchors

In [43]:
# Select every <div class="item"> on the results page; the output lists the four matches.
links = browser.find_by_css('div.item')
links

[<splinter.driver.webdriver.WebDriverElement at 0x2a87390c828>,
 <splinter.driver.webdriver.WebDriverElement at 0x2a87390cdd8>,
 <splinter.driver.webdriver.WebDriverElement at 0x2a87390c978>,
 <splinter.driver.webdriver.WebDriverElement at 0x2a87390c550>]

In [44]:
# Take the first matched div.item element.
link1 = links[0]

In [45]:
# First <a> tag nested inside that div.
link2 = link1.find_by_tag('a').first

In [48]:
# This generates an error message
# link2.click() 

In [53]:
# Attempts at following a result by clicking its containing div.item.
# Original note: "This just does a mouse-over on the link - doesn't follow it".
# NOTE(review): it is unclear whether that note describes double_click() or click() — confirm.
# browser.find_by_css('div.item').first.double_click()
browser.find_by_css('div.item').first.click()

In [92]:
# So this works - click on each link and then go back...
# For each hemisphere: print the link text, open its page, print the absolute
# URL of the wide image found there, then navigate back.

# A list of hemisphere names
hemispheres = ["Cerberus", "Schiaparelli", "Syrtis Major", "Valles Marineris"]

for hemisphere in hemispheres:
    # Full text of the first link whose text contains the hemisphere name.
    print(browser.find_link_by_partial_text(hemisphere).first.text)
    # Click the link matching the name to open the hemisphere's page.
    browser.click_link_by_partial_text(hemisphere)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    # src of the first <img class="wide-image">; it is a site-relative path.
    suburl = soup.find_all('img', class_="wide-image")[0].get('src')
    # Prefix the site origin to make the image URL absolute.
    iurl = "https://astrogeology.usgs.gov" + suburl
    print(iurl)
    # Go back so the next hemisphere's link can be looked up.
    browser.back()
    

Cerberus Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg
Schiaparelli Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg
Syrtis Major Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg
Valles Marineris Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg


In [60]:
# Open the page for the first hemisphere in the list ("Cerberus").
browser.click_link_by_partial_text(hemispheres[0])

In [63]:
# Clicking the "Original" link actually downloads the image to the downloads
# folder, so it is not a way to obtain the image URL.
browser.click_link_by_partial_text('Original')

In [64]:
# Capture the HTML of the page the browser is currently on.
html = browser.html

In [65]:
# Parse the captured HTML with Beautiful Soup's html.parser backend.
soup = BeautifulSoup(html, "html.parser")

In [73]:
# src of the first <img class="wide-image">; the output shows it is a site-relative path.
suburl = soup.find_all('img', class_="wide-image")[0].get('src')
suburl

'/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'

In [74]:
# Open the image directly by prefixing the site origin to the relative path.
browser.visit("https://astrogeology.usgs.gov" + suburl)

In [80]:
# Display the absolute image URL built from the site origin and the relative path.
"https://astrogeology.usgs.gov" + suburl

'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'

In [79]:
# Either get the title from this page or get it before clicking.
# The <title> tag here contains the hemisphere name followed by a site suffix.
soup.find_all('title')[0]

<title>Cerberus Hemisphere Enhanced | USGS Astrogeology Science Center</title>

In [83]:
# This also works: look the link up by partial text first, then click the first match.
browser.find_link_by_partial_text("Cerberus").first.click()

In [85]:
# Title of the page the browser is currently on.
browser.title

'Cerberus Hemisphere Enhanced | USGS Astrogeology Science Center'

In [86]:
# Navigate back to the previous page.
browser.back()

In [90]:
# This gets what we are looking for as the image title: the link text itself,
# which has no site suffix (compare with browser.title).
browser.find_link_by_partial_text("Cerberus").first.text

'Cerberus Hemisphere Enhanced'