###### Import Dependencies & Chrome Driver

In [1]:
# import dependencies
from bs4 import BeautifulSoup as soup
import pandas as pd
from pprint import pprint
import pymongo
from splinter import Browser
import time

In [2]:
# driver location handed to splinter (Chrome v91); 'chromedriver' is resolved
# by name rather than by an absolute path
executable_path = dict(executable_path='chromedriver')

# open a visible (non-headless) Chrome window controlled by splinter
browser = Browser('chrome', headless=False, **executable_path)

# pause for five seconds before the first page visit
time.sleep(5)

###### Scraping Web Text

In [3]:
# connect to the local MongoDB server, then pick the database and collection
# that will hold the scraped documents
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['mars_db']
collection = db['mars_data']

In [4]:
# navigate browser to url
browser.visit('https://mars.nasa.gov/news/')

# delay to allow time for page to load; this has to happen BEFORE browser.html
# is read, otherwise the snapshot below is taken without waiting
time.sleep(5)

# create beautifulsoup object from the loaded page
article_soup = soup(browser.html, 'html.parser')

In [5]:
# the first <article> element is treated as the most recent news item
article_result = article_soup.find('article')

# pull the date, title and teaser text (whitespace-trimmed) out of the <div>
# carrying the matching CSS class, in that order
news_date, news_title, news_p = (
    article_result.find('div', class_=css_class).text.strip()
    for css_class in ('list_date', 'content_title', 'article_teaser_body')
)

In [6]:
# bundle the latest article's fields into one document for mongodb
recent_news = dict(
    date=news_date,
    title=news_title,
    teaser=news_p,
)

###### Scraping Web Image

In [7]:
# navigate browser to url, click full-res image
browser.visit('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html')
browser.links.find_by_partial_text('FULL IMAGE').click()

# delay to allow time for the page to update after the click; this has to
# happen BEFORE browser.html is read, otherwise the snapshot is taken
# without waiting
time.sleep(5)

# create beautifulsoup object from the updated page
image_soup = soup(browser.html, 'html.parser')

In [8]:
# locate the element carrying the 'fancybox-image' class and read its src attribute
image_tag = image_soup.find(attrs={'class': 'fancybox-image'})
image_result = image_tag['src']

In [9]:
# prefix the scraped image path with the site's base url
featured_image_url = ''.join((
    'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/',
    image_result,
))

In [10]:
# wrap the featured image url in a document for mongodb
featured_image = dict(featured_image_url=featured_image_url)

###### Converting HTML Table to DataFrame and Back to HTML

In [11]:
# navigate browser to url
browser.visit('https://space-facts.com/mars/')

# delay to allow time for page to load; this has to happen BEFORE browser.html
# is read, otherwise the snapshot below is taken without waiting
time.sleep(5)

# create beautifulsoup object from the loaded page
fact_soup = soup(browser.html, 'html.parser')

In [12]:
# parse the tables found at the url into dataframes and keep the first one
fact_tables = pd.read_html('https://space-facts.com/mars/')
fact_df = fact_tables[0]

In [13]:
# render the facts dataframe back to an html table, leaving out both the
# header row and the index column
fact_html = fact_df.to_html(header=False, index=False)

In [14]:
# wrap the rendered facts table in a document for mongodb
facts = dict(html_table=fact_html)

###### Mars Hemispheres

In [15]:
# navigate browser to url
browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')

# delay to allow time for page to load; this has to happen BEFORE browser.html
# is read, otherwise the snapshot below is taken without waiting
time.sleep(5)

# create beautifulsoup object from the loaded search-results page
hemi_soup = soup(browser.html, 'html.parser')

In [16]:
# collect every <div> with the 'item' class from the parsed search results
hemisphere_content = hemi_soup.find_all(name='div', attrs={'class': 'item'})

In [19]:
# create list for hemisphere info; each entry is a dict with a 'title' and,
# when a 'Sample' download link is found, an 'img_url'
hemisphere_images = []

# retrieve hemisphere info
for hemisphere in hemisphere_content:
    # create empty dictionary for hemisphere img urls
    hemi_url = {}

    # assign image titles for hemispheres
    title = hemisphere.find('div', class_='description').h3.text
    hemi_url['title'] = title

    # open the hemisphere's detail page, then give it time to load BEFORE
    # the html is read
    browser.find_by_text(title).click()
    time.sleep(1)

    # parse the detail page under its own name so the search-results soup
    # (hemi_soup) is not overwritten and the earlier cells stay re-runnable
    detail_soup = soup(browser.html, 'html.parser')

    # find the 'Sample' link among all download links (not just the first),
    # tolerating a page without a downloads section
    download = detail_soup.find('div', class_='downloads')
    src = None
    if download is not None:
        src = next(
            (link for link in download.find_all('a') if link.text.strip() == 'Sample'),
            None
        )

    if src is not None:
        img_url = src['href']
        hemi_url['img_url'] = img_url
    else:
        # keep the title-only entry, but make the gap visible
        print("no 'Sample' image link found for", title)

    # append urls to hemisphere list
    hemisphere_images.append(hemi_url)

    # return to the search results and let them reload before the next click
    browser.back()
    time.sleep(1)

In [20]:
# create dictionary for mongodb
# hemispheres = {
#     'hemispheres' : hemisphere_images
# }

In [21]:
# close browser: scraping is finished, so end the session opened at the top
# of the notebook
browser.quit()

In [28]:
# empty the collection so it only ever holds the latest scrape
collection.delete_many({})

# store the scraped documents; the last insert stays as the cell's final
# expression so its result is still displayed
for scraped_doc in (recent_news, featured_image):
    collection.insert_one(scraped_doc)
collection.insert_one(facts)
#collection.insert_one(hemisphere_images)

<pymongo.results.InsertOneResult at 0x220c0688dc0>

In [29]:
# query every document in the collection and print each one
for stored_doc in collection.find({}):
    print(stored_doc)

{'_id': ObjectId('60bd48976c76e0e9b14d75ec'), 'date': 'June  3, 2021', 'title': "NASA's InSight Mars Lander Gets a Power Boost", 'teaser': 'The spacecraft successfully cleared some dust off its solar panels, helping to raise its energy and delay when it will need to switch off its science instruments.'}
{'_id': ObjectId('60bd48976c76e0e9b14d75ed'), 'featured_image_url': 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars3.jpg'}
{'_id': ObjectId('60bd48976c76e0e9b14d75ee'), 'html_table': '<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <