###### Import Dependencies & Chrome Driver

In [1]:
# import dependencies
from bs4 import BeautifulSoup as soup
import pandas as pd
from pprint import pprint
import pymongo
from splinter import Browser
import time

In [2]:
# choose path to driver (Chrome v91)
executable_path = {'executable_path' : 'chromedriver'}
browser = Browser(
    'chrome', 
    **executable_path, 
    headless = False
)

# delay to allow time for page to load
time.sleep(5)

###### Scraping Web Text

In [3]:
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client.mars_db
collection = db.mars_data

In [4]:
# navigate browser to url
browser.visit('https://mars.nasa.gov/news/')

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
article_soup = soup(browser.html, 'html.parser')

In [5]:
# narrow HTML elements by 'article'
article_result = article_soup.find('article')

# retrieve most recent article date, title, paragraph
news_date = article_result.find('div', class_ = 'list_date').text.strip()
news_title = article_result.find('div', class_ = 'content_title').text.strip()
news_p = article_result.find('div', class_ = 'article_teaser_body').text.strip()

In [6]:
# # create dictionary for mongodb
# recent_news = {
#     'date' : news_date,
#     'title' : news_title,
#     'teaser' : news_p
# }

###### Scraping Web Image

In [7]:
# navigate browser to url, click full-res image
browser.visit('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html')
browser.links.find_by_partial_text('FULL IMAGE').click()

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
image_soup = soup(browser.html, 'html.parser')

In [8]:
# extract image source url
image_result = image_soup.find(class_ = 'fancybox-image')['src']

In [9]:
# append urls
featured_image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/' + image_result

In [10]:
# # create dictionary for mongodb
# featured_image = {
#     'image_url' : featured_image_url
# }

###### Converting HTML table to DataFrame, convert back to HTML

In [36]:
# navigate browser to url
browser.visit('https://space-facts.com/mars/')

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
fact_soup = soup(browser.html, 'html.parser')

In [37]:
# convert html table to dataframe
fact_df = pd.read_html('https://space-facts.com/mars/')[0]

In [38]:
fact_df.columns = ['Description', 'Mars']
fact_df.set_index('Description', inplace = True)

In [39]:
fact_df

Unnamed: 0_level_0,Mars
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [40]:
# convert dataframe to html
fact_html = fact_df.to_html(index = False, header = False)

In [41]:
fact_html.replace('\n', '')

'<table border="1" class="dataframe">  <tbody>    <tr>      <td>6,792 km</td>    </tr>    <tr>      <td>6,752 km</td>    </tr>    <tr>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>-87 to -5 °C</td>    </tr>    <tr>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [45]:
# # create dictionary for mongodb
# facts = {
#     'html_table' : fact_html
# }

###### Mars Hemispheres

In [46]:
# navigate browser to url
browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
hemi_soup = soup(browser.html, 'html.parser')

In [47]:
# grab hemisphere HTML elements
hemisphere_content = hemi_soup.find_all('div', class_ = 'item')

In [48]:
hemisphere_content

[<div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/></a><div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div> <!-- end description --></div>,
 <div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/08eac6e22c07fb1fe72223a79252de20_schiapa

In [None]:
# create list for hemisphere info
hemispheres = {}

# retrieve hemisphere info
for hemisphere in hemisphere_content:
    # assign image titles for hemispheres
    hemi_name = hemisphere.find('div', class_='description').h3.text
    # delay to allow time for page to load
    time.sleep(1)
    browser.find_by_text(hemi_name).click()
    hemi_soup = soup(browser.html, 'html.parser')
    # find, append img href
    download = hemi_soup.find('div', class_ = "downloads")
    src = download.find('a')
    if src.text == 'Sample':
        hemi_url = src['href']
    hemispheres[hemi_name] = hemi_url
    time.sleep(1)
    browser.back()

In [None]:
hemispheres = []

for hemisphere in hemisphere_content:
    hemi_dict = {}
    title = hemisphere.find('div', class_='description').h3.text
    browser.find_by_text(title).click()
    time.sleep(1)
    soup = soup(browser.html, 'html.parser')
    download = soup.find('div', class_ = "downloads")
    src = download.find('a')
    if src.text == 'Sample':
        img_url = src['href']
    hemi_dict[title] = img_url
    hemispheres.append(hemi_dict)
    browser.back()
    time.sleep(1)

In [None]:
# close browser
browser.quit()

In [None]:
# clear collection
collection.delete_many({})

# insert collection to mongodb
collection.insert_many([recent_news, featured_image, facts, hemispheres])

In [None]:
# query all documents from collection
for document in collection.find():
    # print each document
    pprint(document)