# Mission to Mars: A Web Scraping Challenge

# I. NASA Mars News (latest article)

In [7]:
# Step 1 - BeautifulSoup/Splinter Scraping
# NASA Mars News

# Importing dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pymongo

from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager


In [8]:
# Launch a visible Chrome window through Splinter; ChromeDriverManager
# installs (or reuses a cached) chromedriver and returns its path.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# NASA Mars News listing, ordered by publish date (newest first)
url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"

try:
    browser.visit(url)

    # Give the page a few seconds to render the article list before taking
    # the HTML snapshot; the check returns as soon as the element shows up.
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=5)

    html = browser.html
finally:
    # Only the HTML snapshot is needed from here on, so close this Chrome
    # window instead of leaving it open for the rest of the notebook.
    browser.quit()

# Use the BeautifulSoup class directly rather than the `soup` alias, so this
# cell does not depend on the name `soup` still pointing at the class.
new_soup = BeautifulSoup(html, 'html.parser')

# The first slide in the list is the latest article
slide_element = new_soup.select_one("ul.item_list li.slide")
if slide_element is None:
    raise RuntimeError("No news article found on the NASA Mars News page")

news_title = slide_element.find('div', class_="content_title").get_text()
print(news_title)

new_paragraph = slide_element.find('div', class_="article_teaser_body").get_text()
print(new_paragraph)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\gabri\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache




NASA's Perseverance Mars Rover Mission Honors Navajo Language
Working with the Navajo Nation, the rover team has named features on Mars with words from the Navajo language.


# II. JPL Mars Space Images

In [9]:
# Splinter: JPL Mars Space Images - Featured Image

from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager

# Launch a Splinter-driven Chrome window via ChromeDriverManager.
# headless=False keeps the window visible so its actions can be watched.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# JPL Mars Space Images page. base_url is kept separately because the
# featured-image href found on the page is relative to it.
base_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'

try:
    browser.visit(url)
    # Snapshot the rendered HTML
    html = browser.html
finally:
    # Only the HTML snapshot is needed from here on, so close this window
    # rather than leaving it open when the next browser is launched.
    browser.quit()

# Parse under a dedicated name so the `soup` alias imported at the top of the
# notebook (BeautifulSoup as soup) is not overwritten by a parsed document.
jpl_soup = BeautifulSoup(html, 'lxml')

# Retrieve all elements that contain header information
images = jpl_soup.find_all('div', class_="header")
if not images:
    raise RuntimeError("No <div class='header'> found on the JPL Space Images page")

# First header contains the featured image we want; display it for inspection
full_header = images[0]
full_header



[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\gabri\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache






<div class="header">
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="#"><img id="logo" src="image/nasa.png"/><span class="logo">Jet Propulsion Laboratory</span>
<span class="logo1">California Institute of Technology</span></a>
<button aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation" class="navbar-toggler" data-target="#navbarNav" data-toggle="collapse" type="button">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse justify-content-end" id="navbarNav">
<ul class="navbar-nav">
<li class="nav-item active">
<a class="nav-link" href="#"><i aria-hidden="true" class="fa fa-bars"></i>   MENU   <i aria-hidden="true" class="fa fa-search"></i></a>
</li>
</ul>
</div>
</nav>
<div class="floating_text_area">
<h2 class="brand_title">FEATURED IMAGE</h2>
<h1 class="media_feature_title">Dusty Space Cloud</h1>
<br/>
<a class="showimg fancybox-thumbs" href="image/featured/mars3.jpg" target="_blank"> <button class="bt

In [10]:
# Locate the anchor inside the header that links to the featured image
featured_anchor = full_header.find('a', class_="showimg fancybox-thumbs")
if featured_anchor is None:
    raise RuntimeError("Featured image link not found in the page header")

# The href is relative, so prefix it with the site's base URL
featured_image_url = base_url + featured_anchor['href']
print(featured_image_url)


https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars3.jpg


# III. Mars Facts

In [11]:
# Pandas Scraping: Mars Facts
# space-facts.com

import pandas as pd
import requests

In [12]:
# Download the Mars fact sheet and let pandas parse the HTML tables on the page
url = 'https://space-facts.com/mars/'

response = requests.get(url, timeout=30)
# Fail fast on an HTTP error instead of handing an error page to the table parser
response.raise_for_status()
tables = pd.read_html(response.text)

# The first table holds the Mars profile (equatorial diameter, polar diameter,
# mass, moons, orbit distance, orbit period, surface temperature, first record,
# recorded by); give its two positional columns readable names.
mars_facts_df = tables[0].rename(columns={0: "Property", 1: "Observation"})
mars_facts_df

Unnamed: 0,Property,Observation
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [13]:
# Render the Mars facts DataFrame as an HTML <table> string
# (DataFrame.to_html called with its default options).
html_mars_facts = mars_facts_df.to_html()

# IV. Astrogeology USGS: Mars Hemisphere Images

In [14]:
# Mars Hemisphere Images
# Scraping with splinter

from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager

In [15]:
# Start a visible Chrome session for the USGS pages; ChromeDriverManager
# supplies the path to the chromedriver binary.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\gabri\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache






In [16]:
# Open the USGS Astrogeology search results page in the browser.
# The query string carries q=hemisphere+enhanced with k1=target and v1=Mars.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [17]:
# Snapshot the search-results page currently loaded in the browser
html = browser.html

# Parse under a dedicated name so the `soup` alias imported at the top of the
# notebook (BeautifulSoup as soup) is not overwritten by a parsed document.
results_soup = BeautifulSoup(html, 'html.parser')

# Result links are site-relative, so each one is prefixed with the USGS origin
usgs_base_url = 'https://astrogeology.usgs.gov'

hemisphere_titles = []
hemisphere_page_hrefs = []

# Each search result is a <div class="description"> holding an <h3> title and
# a link to that hemisphere's detail page
hemispheres = results_soup.find_all('div', class_='description')

for hemisphere in hemispheres:
    heading = hemisphere.find('h3')
    anchor = hemisphere.find('a')
    if heading is None or anchor is None:
        # Skip entries missing a title or link so both lists stay index-aligned
        continue
    hemisphere_titles.append(heading.text)
    hemisphere_page_hrefs.append(usgs_base_url + anchor['href'])

print(hemisphere_titles)
print(hemisphere_page_hrefs)

['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']
['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']


In [5]:
# Reuse the browser session that is already open on the USGS site instead of
# launching another Chrome window, which would orphan the running one.

# Empty list to hold image links
image_hrefs = []

# Visit each hemisphere's detail page and pull out its full-size image link
for page in hemisphere_page_hrefs:
    browser.visit(page)

    # Parse under a dedicated name so the `soup` alias imported at the top of
    # the notebook is not overwritten by a parsed document.
    page_soup = BeautifulSoup(browser.html, 'html.parser')

    # First link inside the "downloads" box (div.downloads > ul > li > a)
    link = page_soup.select_one('div.downloads ul li a')
    if link is None:
        raise RuntimeError(f"No download link found on {page}")

    # Appending the href of the full-size image to the list
    image_hrefs.append(link['href'])

print(image_hrefs)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324






[WDM] - Driver [C:\Users\gabri\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


['https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg']


In [18]:
# Storing the full-size image URLs with the hemisphere titles in a list of dictionaries

# Both lists are filled in the same page order, so they must be the same length;
# check explicitly because zip() would otherwise silently drop the extras.
assert len(hemisphere_titles) == len(image_hrefs), "titles and image links are out of step"

# One dictionary per hemisphere: its name and the link to its image.
# zip() pairs the lists element by element, so this handles however many
# hemispheres were scraped rather than assuming exactly four.
hemisphere_image_urls = [
    {'title': title, 'img_url': img_url}
    for title, img_url in zip(hemisphere_titles, image_hrefs)
]

# Print 'em out to check!
print(hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]


In [None]:
# Shut down the Splinter browser now that all scraping is done
browser.quit()