In [1]:
# Homework 13 - Web Scraping

# Sites to Scrape:
# 1) NASA Mars News Site: https://mars.nasa.gov/news/
# 2) PL Mars Space Images - Featured Image: https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
# 3) Mars Weather (Twitter): https://twitter.com/marswxreport?lang=en
# 4) Mars Facts: https://space-facts.com/mars/
# 5) Mars Hemispheres: https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars

In [2]:
# Import Dependencies
import time
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
#from selenium.webdriver.common.keys import Keys

In [3]:
def init_browser():
    executable_path = {"executable_path": "C:/Users/chris/Desktop/UCI Apps/chromedriver"}
    return Browser("chrome", **executable_path, headless=True)

In [4]:
# 1) NASA Mars News Site: https://mars.nasa.gov/news/
# -----------------------------------------------------------------------------------------------------------
# Scrape the NASA Mars News Site and collect the latest News Title and Paragragh Text.
# Assign the text to variables that you can reference later.
# Ex: news_title = "NASA's Next Mars Mission to Investigate Interior of Red Planet"
# Ex: news_p = "Preparation of NASA's next spacecraft to Mars, InSight, has ramped up this summer, on course for launch next May from Vandenberg Air Force Base in central California -- the first interplanetary launch in history from America's West Coast."
# -----------------------------------------------------------------------------------------------------------
def news_scrape():
    browser = init_browser()

    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(1)

    #requests = requests.get(url)
    
    html = browser.html
    news_soup = BeautifulSoup(html, "html.parser")

    news_title = news_soup.find("div", class_="content_title").get_text()
    news_par = news_soup.find("div", class_="article_teaser_body").get_text()
    
    news_dict = {"title": news_title,
                "par": news_par}
    
    return news_dict

In [15]:
# 2) PL Mars Space Images - Featured Image: https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
# -----------------------------------------------------------------------------------------------------------
# Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign 
# the url string to a variable called featured_image_url.
# Make sure to find the image url to the full size .jpg image.
# Make sure to save a complete url string for this image.
# Ex: featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'
# -----------------------------------------------------------------------------------------------------------
def img_scrape():    
    browser = init_browser()

    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(1)

    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(5)
    browser.click_link_by_partial_text("more info")

    html = browser.html
    img_soup = BeautifulSoup(html, "html.parser")

    partial_url = img_soup.find("figure").find("img")["src"]
    img_url = str("https://www.jpl.nasa.gov" + partial_url)
    
    return img_url

In [16]:
# 3) Mars Weather (Twitter): https://twitter.com/marswxreport?lang=en
# -----------------------------------------------------------------------------------------------------------
# Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page.
# Save the tweet text for the weather report as a variable called mars_weather.
# Ex: mars_weather = 'Sol 1801 (Aug 30, 2017), Sunny, high -21C/-5F, low -80C/-112F, pressure at 8.82 hPa, daylight 06:09-17:55'
# -----------------------------------------------------------------------------------------------------------
def twitter_scrape():    
    browser = init_browser()

    url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url)
    time.sleep(1)

    html = browser.html
    twitter_soup = BeautifulSoup(html, "html.parser")
    
    mars_weather = twitter_soup.find("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").get_text()    
    
    return mars_weather

In [7]:
# 4) Mars Facts: https://space-facts.com/mars/
# -----------------------------------------------------------------------------------------------------------
# Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet 
# including Diameter, Mass, etc.
# Use Pandas to convert the data to an HTML table string.
# -----------------------------------------------------------------------------------------------------------
def facts_scrape():    
    browser = init_browser()

    url = "https://space-facts.com/mars/"
    browser.visit(url)
    time.sleep(1)

    html = browser.html
    facts_soup = BeautifulSoup(html, "html.parser")
    
    #Find all table headers (a.k.a. column 1):
    col1 = facts_soup.find_all("td", class_="column-1")

    tbl_headers = []
    for i in range(0, len(col1)-1):
        text = col1[i].get_text()
        tbl_headers.append(text)
    
    #Find all table data (a.k.a. column 2):
    col2 = facts_soup.find_all("td", class_="column-2")

    tbl_data = []
    for i in range(0, len(col2)-1):
        text = col1[i].get_text()
        tbl_data.append(text)

    #Merge lists into a to pandas dataframe
    df = pd.DataFrame({"Fact": tbl_headers,
                               "Value": tbl_data})
    
    #Convert dataframe to HTML table; align text to the left
    mars_facts = df.to_html().replace('<tr style="text-align: right;">','<tr style="text-align: left;">')

    return mars_facts


In [8]:
# 5) Mars Hemispheres: https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars
# -----------------------------------------------------------------------------------------------------------
# Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
# You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
# Save both the image url string for the full resolution hemipshere image, and the Hemisphere title containing the hemisphere name.
# Use a Python dictionary to store the data using the keys img_url and title.
# Append the dictionary with the image url string and the hemisphere title to a list.
# This list will contain one dictionary for each hemisphere.
# Example:
# hemisphere_image_urls = [
#    {"title": "Valles Marineris Hemisphere", "img_url": "..."},
#    {"title": "Cerberus Hemisphere", "img_url": "..."},
#    {"title": "Schiaparelli Hemisphere", "img_url": "..."},
#    {"title": "Syrtis Major Hemisphere", "img_url": "..."},
#]
# -----------------------------------------------------------------------------------------------------------
def hemi_scrape():    
    browser = init_browser()

    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(1)

    html1 = browser.html
    
    #Find names of hemispheres and put them in a list
    hemi_soup = BeautifulSoup(html1, "html.parser")
    headers = hemi_soup.find_all("h3")

    hemisphere_image_urls = []

    for i in range(0, (len(headers))):        
        #Get hemisphere names
        hemi = str(headers[i]).split("<h3>")[1]
        name = hemi[:(len(hemi)-14)]
        
        #Use hemisphere name to click on link and get image URL
        browser.click_link_by_partial_text(name)
        img_soup = BeautifulSoup(broswer.html, "html.parser")
        hemi_url = img_soup.find("div", class_="downloads").find("a")["href"]
        
        #Add the hemisphere name and img url to a dictionary
        temp_dict = {"title": name, "img_url": hemi_url}
        
        #Append the temp_dict to the list
        hemisphere_image_urls.append(temp_dict)
                
        #tell the browser to hit the back button so it can get the next image URL
        browser.back()

    return hemisphere_image_urls


In [10]:
news_dict = news_scrape()
news_dict

{'par': 'Project could help spacecraft keep time more efficiently and allow ground stations to better track multiple satellites at once near crowded areas like Mars.',
 'title': 'NASA Tests Atomic Clock for Deep Space Navigation'}

In [19]:
#featured_img_dict = {"featured_image": img_scrape()}
#featured_img_dict

In [17]:
weather_dict = {"weather": twitter_scrape()}
weather_dict

{'weather': 'Sol 1955 (Feb 04, 2018), Sunny, high -21C/-5F, low -77C/-106F, pressure at 7.45 hPa, daylight 05:41-17:27'}

In [21]:
facts_dict = facts_scrape()
facts_dict

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: left;">\n      <th></th>\n      <th>Fact</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>Equatorial Diameter:</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>Polar Diameter:</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>Mass:</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>Moons:</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>Orbit Distance:</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>Orbit Period:</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>Surface Temperature:</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>First Record:</td>\n    </tr>\n  </tbody>\n</table>'

In [22]:
hemispheres = hemi_scrape()
hemispheres

NameError: name 'browers' is not defined