###### Import Dependencies & Chrome Driver

In [1]:
# import dependencies
from bs4 import BeautifulSoup
import pandas as pd
from pprint import pprint
import pymongo
from splinter import Browser
import time

In [2]:
# choose path to driver (Chrome v91)
executable_path = {'executable_path' : 'chromedriver'}
browser = Browser(
    'chrome', 
    **executable_path, 
    headless = False
)

# delay to allow time for page to load
time.sleep(5)

###### Scraping Web Text

In [3]:
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client.mars_db
collection = db.mars_data

In [4]:
# navigate browser to url
browser.visit('https://mars.nasa.gov/news/')

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
article_soup = BeautifulSoup(browser.html, 'html.parser')

In [5]:
# narrow HTML elements by 'article'
article_result = article_soup.find('article')

# retrieve most recent article date, title, paragraph
news_date = article_result.find('div', class_ = 'list_date').text.strip()
news_title = article_result.find('div', class_ = 'content_title').text.strip()
news_p = article_result.find('div', class_ = 'article_teaser_body').text.strip()

In [6]:
# # create dictionary for mongodb
# recent_news = {
#     'date' : news_date,
#     'title' : news_title,
#     'teaser' : news_p
# }

###### Scraping Web Image

In [7]:
# navigate browser to url, click full-res image
browser.visit('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html')
browser.links.find_by_partial_text('FULL IMAGE').click()

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
image_soup = BeautifulSoup(browser.html, 'html.parser')

In [8]:
# extract image source url
image_result = image_soup.find(class_ = 'fancybox-image')['src']

In [9]:
# append urls
featured_image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/' + image_result

In [10]:
# # create dictionary for mongodb
# featured_image = {
#     'image_url' : featured_image_url
# }

###### Converting HTML table to DataFrame, convert back to HTML

In [11]:
# navigate browser to url
browser.visit('https://space-facts.com/mars/')

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
fact_soup = BeautifulSoup(browser.html, 'html.parser')

In [12]:
# convert html table to dataframe
fact_df = pd.read_html('https://space-facts.com/mars/')[0]

In [13]:
fact_df.columns = ['Description', 'Mars']
fact_df.set_index('Description', inplace = True)

In [14]:
fact_df

Unnamed: 0_level_0,Mars
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [15]:
# convert dataframe to html
fact_html = fact_df.to_html(index = False, header = False)

In [16]:
fact_html.replace('\n', '')

'<table border="1" class="dataframe">  <tbody>    <tr>      <td>6,792 km</td>    </tr>    <tr>      <td>6,752 km</td>    </tr>    <tr>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>-87 to -5 °C</td>    </tr>    <tr>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [17]:
# # create dictionary for mongodb
# facts = {
#     'html_table' : fact_html
# }

###### Mars Hemispheres

In [18]:
# navigate browser to url
browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')

# delay to allow time for page to load
time.sleep(5)

# create beautifulsoup object
soup = BeautifulSoup(browser.html, 'html.parser')

In [19]:
# grab hemisphere HTML elements
results = soup.find_all('div', class_ = 'item')

In [20]:
core_url='https://astrogeology.usgs.gov'
hemisphere_image_urls=[]
# Loop through each hemisphere item
for item in results:
    # Error handling
    try:
        # Extract title
        hemi = item.find('div', class_ = 'description')
        title = hemi.h3.text
        # Extract image url
        hemi_url = hemi.a['href']
        browser.visit(core_url + hemi_url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        image_src = soup.find('li').a['href']
        # Create dictionary for title and url
        hemi_dict = {'title' : title, 'image_url' : image_src}
        hemisphere_image_urls.append(hemi_dict)
    except Exception as e:
        print(e)

In [21]:
# close browser
browser.quit()

In [22]:
mars_data = {}

mars_data['article_date'] = news_date
mars_data['article_title'] = news_title
mars_data['article_teaser'] = news_p
mars_data['featured_image'] = featured_image_url
mars_data['fact_table'] = fact_html
mars_data['hemispheres'] = hemisphere_image_urls

In [23]:
mars_data

{'article_date': 'June  3, 2021',
 'article_title': "NASA's InSight Mars Lander Gets a Power Boost",
 'article_teaser': 'The spacecraft successfully cleared some dust off its solar panels, helping to raise its energy and delay when it will need to switch off its science instruments.',
 'featured_image': 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars1.jpg',
 'fact_table': '<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>',
 'hemispheres': [{'t

In [24]:
# # clear collection
# collection.delete_many({})

# # insert collection to mongodb
# collection.insert_many(mars_data)

In [25]:
# # query all documents from collection
# for document in collection.find():
#     # print each document
#     pprint(document)