In [1]:
################################################
#00    I/O                                     #
#   a- import libraries.                       #
#   b- s/u chrome driver & splinter browser.   #
#   c- documentation of later I/O urls.        #
################################################

#a
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
w = 3   # wait time in seconds, variable for time.sleep()
import warnings; warnings.simplefilter('ignore')

#b
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to point at a different install; when it is unset the original
# hard-coded location is used, so existing setups keep working unchanged.
executable_path = {
    'executable_path': os.environ.get('CHROMEDRIVER_PATH', 'c:/chromedriver.exe')
}
# One shared browser session; every later cell drives this same object.
browser = Browser('chrome', **executable_path)

#c
# 01- Latest News : url01 = "https://mars.nasa.gov/news/"
# 02- Featured Image : url02 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
# 03- Mars Facts : url03 = "https://space-facts.com/mars/"
# 04- Hemispheres : url04 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

In [2]:
################################################################################
#01      Latest News                                                           #
#   a- visit url, dump html, and parse with lxml into soup.                    #
#   b- get first occurrence of list_text division (most recent news article).  #
#       (one per each li which holds each article data.)                       #
#   c- extract header, date, and summary text and print.                       #
################################################################################

#a
url01 = "https://mars.nasa.gov/news/"
browser.visit(url01)
time.sleep(w)   # pause w seconds before reading the html (presumably so the page can finish loading)
html01 = browser.html
soup01 = bs(html01, "lxml")

#b
# find() gives back None when the page has no matching div.
result = soup01.find('div', class_="list_text")

#c
# Reset the three outputs first, so a failed extraction can never leave values
# from an earlier run in place (or leave the names undefined on a fresh kernel).
header_text = None
summary_text = None
date_text = None
try:
    header = result.find('div', class_='content_title')
    header_text = header.a.text
    summary = result.find('div', class_='article_teaser_body')
    summary_text = summary.text
    raw_date = result.find("div", class_ = "list_date")
    date_text = raw_date.text
except AttributeError:
    # A missing tag comes back as None, and calling .find / .a / .text on None
    # raises AttributeError. Anything else (e.g. KeyboardInterrupt) propagates.
    print("error extracting one of the fields")

# Fields that could not be extracted are shown as None.
print(f"Latest News Title --->  {header_text}  ({date_text})")
print(f"Article Summary   --->  {summary_text}")

Latest News Title --->  NASA Engineers Checking InSight's Weather Sensors  (August 24, 2020)
Article Summary   --->  An electronics issue is suspected to be preventing the sensors from sharing their data about Mars weather with the spacecraft.


In [3]:
#########################################################################
#02    Featured Image                                                   #
#   a- visit images url.                                                #
#   b- click buttons to get full address of featured image, largesize.  #
#   c- dump html and parse with lxml into soup.                         #
#   d- extract image url.                                               #
#########################################################################

#a
url02 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url02)
time.sleep(w)

#b
# Two clicks are needed to reach the page that carries the large-size image link.
# NOTE(review): newer splinter releases appear to replace click_link_by_partial_text
# with browser.links.find_by_partial_text(...).click() -- confirm against the
# installed splinter version before upgrading.
browser.click_link_by_partial_text("FULL IMAGE")
time.sleep(w)
browser.click_link_by_partial_text('more info')
time.sleep(w)

#c
html02 = browser.html
soup02 = bs(html02, "lxml")

#d
# Start from None so a failed lookup cannot leave a url from an earlier run in
# place (or leave the name undefined on a fresh kernel).
featured_image_url = None
try:
    image_raw = soup02.find('figure', class_='lede')
    image_url = image_raw.a['href']
    # The href is joined onto the site root to build the full address.
    featured_image_url = "https://www.jpl.nasa.gov" + image_url
except (AttributeError, TypeError, KeyError):
    # AttributeError: no <figure class="lede"> was found (find() returned None).
    # TypeError:      the figure has no <a> child (None['href']).
    # KeyError:       the <a> tag has no href attribute.
    print("unable to find image")

# Shown as None when the image could not be located.
print(f" Featured Image url --->   {featured_image_url}")

 Featured Image url --->   https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA20318_hires.jpg


In [4]:
######################################################
#03      Mars Facts                                  #
#   a- visit url, dump html.                         #
#   b- read url tables into list.                    #
#   c- take first table and change column headers.   #
#   d- create html code to display table.            #
######################################################

#a
url03 = "https://space-facts.com/mars/"
browser.visit(url03)
time.sleep(w)
html03 = browser.html

#b
# read_html returns a list with one DataFrame per <table> it finds. The html is
# wrapped in StringIO because passing a literal html string directly is
# deprecated in recent pandas; a file-like object is accepted by all versions.
fact_tables = pd.read_html(StringIO(html03))

#c
# The first table on the page is the two-column facts table.
facts_df = fact_tables[0]
facts_df.columns=['inquiry', 'fact']

#d
# html markup for the table, without the numeric index column.
facts_html_table = facts_df.to_html(justify="center", index=False)

facts_df

Unnamed: 0,inquiry,fact
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [5]:
################################################################
#04     Hemispheres                                            #
#   a- visit url, dump html and parse with lxml into soup.     #
#   b- get list of the four hemispheres' html data.            #
#   c- create list of dictionaries for the four hemispheres.   #
################################################################

#a
url04 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url04)
time.sleep(w)
html04 = browser.html
soup04 = bs(html04, "lxml")

#b
# One "item" div per search result on the listing page.
results04 = soup04.find_all("div", class_="item")

#c
# Build one {"title", "img_url"} record per search result.
image_dicts = []
for item in results04:
    # The <h3> text is both the record title and the link text to click.
    image_title = item.find("h3").text

    # Follow the link to the detail page, then wait before reading its html.
    browser.click_link_by_partial_text(image_title)
    time.sleep(w)

    # The image url is the href of the first <a> inside the "downloads" div.
    detail_soup = bs(browser.html, "lxml")
    downloads = detail_soup.find("div", class_="downloads")
    image_dicts.append({"title": image_title, "img_url": downloads.a["href"]})

    # Go back to the listing page so the next title can be clicked.
    browser.visit(url04)
    time.sleep(w)
image_dicts

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [6]:
########################
#99  Housekeeping      #
#   - close browser.   #
########################

# End the browser session that was opened in the first cell.
browser.quit()