###### Import Dependencies & Chrome Driver

In [1]:
# import dependencies
import os
import time
from pprint import pprint

import pandas as pd
import pymongo
from bs4 import BeautifulSoup as soup
from flask import Flask, render_template
from splinter import Browser

In [2]:
# Locate the chromedriver binary. When the CHROMEDRIVER_PATH environment
# variable is set it overrides the original hard-coded Windows location, so
# the notebook can run on another machine without editing this cell.
executable_path = {
    'executable_path': os.environ.get('CHROMEDRIVER_PATH', 'C:/bin/chromedriver')
}

# launch a visible (non-headless) Chrome session through splinter
browser = Browser(
    'chrome',
    **executable_path,
    headless = False
)

# pause after launching the browser (delay kept from the original cell)
time.sleep(5)

###### Scraping Web Text

In [3]:
# navigate browser to url
browser.visit('https://mars.nasa.gov/news/')

# Wait BEFORE taking the HTML snapshot: the soup is a static copy of
# browser.html, so a delay placed after parsing cannot pick up content that
# the page renders late.
time.sleep(5)

# create beautifulsoup object from the rendered page
article_soup = soup(browser.html, 'html.parser')

In [4]:
# take the first <article> tag in the parsed news page
article_result = article_soup.article

# dump its markup, indented, for inspection
print(article_result.prettify())

<article>
 <header id="page_header">
 </header>
 <div class="react_grid_list grid_list_container" data-react-cache-id="GridListPage-0" data-react-class="GridListPage" data-react-props='{"left_column":false,"class_name":"","default_view":"list_view","model":"news_items","view_toggle":false,"search":"true","list_item":"News","title":"News","categories":["19,165,184,204"],"order":"publish_date desc,created_at desc","no_items_text":"There are no items matching these criteria.","site_title":"NASA’s Mars Exploration Program ","short_title":"Mars","site_share_image":"/system/site_config_values/meta_share_images/1_mars-nasa-gov.jpg","per_page":null,"filters":"[ [ \"date\", [ [ \"2021\", \"2021\" ], [ \"2020\", \"2020\" ], [ \"2019\", \"2019\" ], [ \"2018\", \"2018\" ], [ \"2017\", \"2017\" ], [ \"2016\", \"2016\" ], [ \"2015\", \"2015\" ], [ \"2014\", \"2014\" ], [ \"2013\", \"2013\" ], [ \"2012\", \"2012\" ], [ \"2011\", \"2011\" ], [ \"2010\", \"2010\" ], [ \"2009\", \"2009\" ], [ \"2008\", 

In [5]:
# Pull the date, title and teaser paragraph of the first matching <div> for
# each CSS class inside the article, stripping surrounding whitespace.
news_date, news_title, news_p = (
    article_result.find('div', class_ = css_class).text.strip()
    for css_class in ('list_date', 'content_title', 'article_teaser_body')
)

In [6]:
# print "<title>, <date>", a separator line, then the teaser paragraph
print(f'{news_title}, {news_date}', '---', news_p, sep = '\n')

NASA's InSight Mars Lander Gets a Power Boost, June  3, 2021
---
The spacecraft successfully cleared some dust off its solar panels, helping to raise its energy and delay when it will need to switch off its science instruments.


###### Scraping Web Image

In [7]:
# navigate browser to url, click full-res image
browser.visit('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html')
browser.links.find_by_partial_text('FULL IMAGE').click()

# Wait BEFORE taking the HTML snapshot so the markup shown after the click
# is present in browser.html; a delay after parsing would have no effect on
# the already-captured soup.
time.sleep(5)

# create beautifulsoup object from the post-click page
image_soup = soup(browser.html, 'html.parser')

In [8]:
# read the src attribute of the first element carrying the fancybox-image class
image_result = image_soup.find(attrs = {'class': 'fancybox-image'})['src']

In [9]:
# prefix the relative image path with the site's base url
featured_image_url = ''.join(
    ('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/', image_result)
)

# show featured image url
print(featured_image_url)

https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars1.jpg


###### Converting HTML Table to DataFrame and Back to HTML

In [10]:
# navigate browser to url
browser.visit('https://space-facts.com/mars/')

# Wait BEFORE taking the HTML snapshot: the soup is a static copy of
# browser.html, so sleeping after parsing cannot change what was captured.
time.sleep(5)

# create beautifulsoup object from the loaded page
fact_soup = soup(browser.html, 'html.parser')

In [11]:
# fetch the page by url with pandas and keep the first table it parses
fact_table = pd.read_html(io = 'https://space-facts.com/mars/')[0]

# display the resulting dataframe
fact_table

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [12]:
# render the dataframe as an HTML <table>, omitting the header row and the index column
fact_html = fact_table.to_html(header = False, index = False)

# display the generated markup
fact_html

'<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

###### Mars Hemispheres

In [13]:
# navigate browser to url
browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')

# Wait BEFORE taking the HTML snapshot: the soup is a static copy of
# browser.html, so sleeping after parsing cannot change what was captured.
time.sleep(5)

# create beautifulsoup object from the loaded search-results page
hemi_soup = soup(browser.html, 'html.parser')

In [14]:
# NOTE: find() is called with no filter, so nothing is narrowed here; as the
# printed output shows, it yields the document's root <html> element.
hemisphere_result = hemi_soup.find()

# print the indented markup for inspection
print(hemisphere_result.prettify())

<html lang="en">
 <head>
  <link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.3/themes/smoothness/jquery-ui.css" rel="stylesheet" type="text/css"/>
  <title>
   Astropedia Search Results | USGS Astrogeology Science Center
  </title>
  <meta content="USGS Astrogeology Science Center Astropedia search results." name="description"/>
  <meta content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM" name="google-site-verification"/>
  <!--<link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Open+Sans:400italic,400,bold"/>-->
  <link href="/css/main.css" media="screen" rel="stylesheet"/>
  <link href="/css/print.css" media="print" rel="styles

In [15]:
# collect every <div> whose class list includes "item" from the parsed results page
hemisphere_content = hemi_soup.find_all(name = 'div', attrs = {'class': 'item'})

In [16]:
# create list for hemisphere info (rebuilt from scratch on every run of this cell)
hemisphere_images = []

# retrieve hemisphere info: one dict per result item, with its title and,
# when available, the url of the 'Sample' download link
for hemisphere in hemisphere_content:
    title = hemisphere.find('div', class_='description').h3.text
    hemi_url = {'title': title}

    # open the hemisphere's detail page, then wait so the snapshot taken
    # below reflects the page reached by the click
    browser.find_by_text(title).click()
    time.sleep(1)

    # Parse into its own name: reassigning hemi_soup here would overwrite the
    # search-results soup, so re-running the find_all cell afterwards would
    # search a detail page instead of the results list.
    detail_soup = soup(browser.html, 'html.parser')

    # img_url is recorded only when the first link in the downloads block is
    # the 'Sample' link; a missing block or link leaves the entry title-only
    # instead of raising AttributeError on None
    download = detail_soup.find('div', class_ = "downloads")
    download_link = download.find('a') if download is not None else None
    if download_link is not None and download_link.text == 'Sample':
        hemi_url['img_url'] = download_link['href']

    # append urls to hemisphere list
    hemisphere_images.append(hemi_url)

    # return to the results page and give it time to load before the next click
    browser.back()
    time.sleep(1)

In [17]:
# show the list of hemisphere dictionaries built by the loop above
# (each entry has a 'title' and, when a 'Sample' link was found, an 'img_url')
hemisphere_images

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [18]:
# close the splinter browser session created in the setup cell
browser.quit()