# Mission to Mars - Web Scraping

In [24]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

In [25]:
# Set the executable path and initialize the chrome browser in SPLINTER
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path)

In [26]:
# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Search for elements with a specific combination of tag (ul and li) and attribute (item_list and slide, respectively). 
# Tell browser to wait one second before searching for components. The optional delay is useful because sometimes dynamic pages take a little while to load, especially if they are image-heavy.
# Optional delay for loading the page
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [4]:
html = browser.html
news_soup = BeautifulSoup(html, 'html.parser')
# Get our parent element
#  . is used for selecting classes, such as item_list, so 'ul.item_list li.slide' pinpoints the <li /> tag with the class of slide 
# and the <ul /> tag with a class of item_list. 
# CSS works from right to left, such as returning the last item on the list instead of the first
# Because of this, when using select_one, the first matching element returned will be a <li /> element with a class of slide and all nested elements within it
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [5]:
# IMPORTANT
# There are two methods used to find tags and attributes with BeautifulSoup:
# .find() is used when we want only the first class and attribute we’ve specified.
# .find_all() is used when we want to retrieve all of the tags and attributes.
slide_elem.find("div", class_='content_title')

<div class="content_title"><a href="/news/8724/nasa-ula-launch-mars-2020-perseverance-rover-mission-to-red-planet/" target="_self">NASA, ULA Launch Mars 2020 Perseverance Rover Mission to Red Planet</a></div>

In [6]:
# Use the parent element to find the first `a` tag and save it as `news_title`
news_title = slide_elem.find("div", class_='content_title').get_text()
news_title

'NASA, ULA Launch Mars 2020 Perseverance Rover Mission to Red Planet'

In [7]:
slide_elem.find("div", class_='article_teaser_body')

<div class="article_teaser_body">The agency's Mars 2020 mission is on its way. It will land at Jezero Crater in about seven months, on Feb. 18, 2021. </div>

In [8]:
news_summary = slide_elem.find("div", class_='article_teaser_body').get_text()
news_summary

"The agency's Mars 2020 mission is on its way. It will land at Jezero Crater in about seven months, on Feb. 18, 2021. "

### Featured Images

In [9]:
# Visit URL
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [10]:
# id=“full_image”, in HTML an id is a completely unique identifier
# Find and click the full image button
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [11]:
# Find the more info button and click that
browser.is_element_present_by_text('more info', wait_time=1)
more_info_elem = browser.links.find_by_partial_text('more info')
more_info_elem.click()

In [14]:
# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')
print(img_soup)

.placeholder = ''" placeholder="enter email address" type="email" value=""/>
<input class="email_submit" type="submit" value=""/>
</form>
</div>
<div class="gradient_line_divider"></div>
<div class="share">
<h2>Follow JPL</h2>
<div class="social_icons">
<!-- AddThis Button BEGIN -->
<div class="addthis_toolbox addthis_default_style addthis_32x32_style">
<a addthis:userid="NASAJPL" class="addthis_button_facebook_follow icon at300b" href="http://www.facebook.com/NASAJPL" target="_blank" title="Follow on Facebook"><span class="at-icon-wrapper" style="background-color: rgb(59, 89, 152); line-height: 32px; height: 32px; width: 32px;"><svg alt="Facebook" aria-labelledby="at-svg-facebook-5" class="at-icon at-icon-facebook" role="img" style="width: 32px; height: 32px;" title="Facebook" version="1.1" viewbox="0 0 32 32" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><title id="at-svg-facebook-5">Facebook</title><g><path d="M22 5.16c-.406-.054-1.806-.16-3.43-.16-3.

In [15]:
# Find the relative image url
# figure.lede references the <figure /> tag and its class, lede
# a is the next tag nested inside the <figure /> tag
# An img tag is also nested within this HTML, so we’ve included that as well
# .get("src") pulls the link to the image.
img_url_rel = img_soup.select_one('figure.lede a img').get("src")
img_url_rel

'/spaceimages/images/largesize/PIA16153_hires.jpg'

In [16]:
# Use the base URL to create an absolute URL
img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
print (f'{img_url},   {img_url_rel}')

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16153_hires.jpg,   /spaceimages/images/largesize/PIA16153_hires.jpg


In [17]:
# Create a new DataFrame from the HTML table - a tidy, Pandas-friendly representation of the HTML table we were just viewing on the website
# - The Pandas function read_html() specifically searches for and returns a list of tables found in the HTML
# index 0 - pull only the first table (or the first item in the list)
# Turn the table into a DataFrame
# Assign columns to the new DataFrame for additional clarity
# Turn the Description column into the DataFrame’s index. inplace=True - updated index remains in place (does NOT reassign the DataFrame to a new variable)

df = pd.read_html('http://space-facts.com/mars/')[0]
df.columns=['description', 'value']
df.set_index('description', inplace=True)
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

### Hemisphere images

In [27]:
# Visit URL
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [28]:
# Parse the html with soup
html = browser.html
results_soup = soup(html, 'html.parser')
# Find the 'a' tags for the result items
results_items = results_soup.find_all('a', class_="itemLink")

In [29]:
# Get the hrefs of each result link
results_links = []
for item in results_items:
    print (item.get('href'))
    results_links.append(item.get('href'))

# Remove duplicate hrefs
results_links = list(dict.fromkeys(results_links))
results_links

/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced


['/search/map/Mars/Viking/cerberus_enhanced',
 '/search/map/Mars/Viking/schiaparelli_enhanced',
 '/search/map/Mars/Viking/syrtis_major_enhanced',
 '/search/map/Mars/Viking/valles_marineris_enhanced']

In [30]:
# Find the thumnails sources
thumbs_items = results_soup.find_all('img', class_="thumb")
thumbs_links = []
for thumb in thumbs_items:
    #print ( thumb.get('src') )
    thumbs_links.append( thumb.get('src') )
thumbs_links

['/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png',
 '/cache/images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png',
 '/cache/images/55a0a1e2796313fdeafb17c35925e8ac_syrtis_major_enhanced.tif_thumb.png',
 '/cache/images/4e59980c1c57f89c680c0e1ccabbeff1_valles_marineris_enhanced.tif_thumb.png']

In [31]:
# Visit the links
# EXAMPLE - https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
# The article page hrefs will be relative, we need the base URL
base_url = 'https://astrogeology.usgs.gov'

# Initialize list to hold the links & related titles
full_res_list = []

for idx, link in enumerate(results_links):
    print ( f'Visiting .... {base_url}{link}' )
    browser.visit( f'{base_url}{link}' )
    # Parse the html with soup
    html = browser.html
    article_soup = soup(html, 'html.parser')

    #print ( article_soup.find('ul').find_all('a')[1].get('href') )
    #print ( article_soup.find('h2').text )
    # Create a dictionary for each title & url, then add to the list
    # Get the first 'a' tag, the image with text SAMPLE, it is sufficiently large for this purpose
    # The other image is FULL SIZE and multiple megabytes in size, unnecessarily large for this purpose
    full_res_list.append( 
        { 
            'title':article_soup.find('h2').text,
            #'img_url':article_soup.find('ul').find_all('a')[0].get('href')
            'img_url':article_soup.find('ul').find('a').get('href'),
            'thumb_url':thumbs_links[idx]
        }
    )   
    
print ( full_res_list )

Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced
Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced
[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'thumb_url': '/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'thumb_url': '/cache/images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced

In [22]:
# End the browser session
browser.quit()

In [1]:
# Use PyMongo to interact with our Mongo database
from flask import Flask, render_template
from flask_pymongo import PyMongo
import scraping
# Connect to Mongo using PyMongo
app = Flask(__name__)
# Use flask_pymongo to set up mongo connection
app.config["MONGO_URI"] = "mongodb://localhost:27017/mars_app"
mongo = PyMongo(app)
mars = mongo.db.mars.find_one()
mars = mongo.db.mars
mars_data = scraping.scrape_all()
print ( mars_data )
mars.update({}, mars_data, upsert=True)


/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced
Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced
Visiting .... https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced
[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_e

{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}