## Setup

In [1]:
#Import Dependencies
import os
import re
import time
from pprint import pprint

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

In [2]:
# Initialize Splinter for Mac users. Update the path to chromedriver as needed
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
# !which chromedriver

#executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# browser = Browser('chrome', **executable_path, headless=False)

In [3]:
# Initialize Splinter with the Chrome driver.
# The chromedriver location is machine-specific, so it is taken from the
# CHROMEDRIVER_PATH environment variable when that is set; otherwise the
# original Windows path below is used, so existing setups keep working.
# On Mac, point CHROMEDRIVER_PATH at e.g. '/usr/local/bin/chromedriver'.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    "c:\\Users\\dkloe\\OneDrive\\Documents\\Data Viz Bootcamp\\chromedrv\\chromedriver.exe",
)
executable_path = {'executable_path': chromedriver_path}
# headless=False opens a visible Chrome window while scraping.
browser = Browser('chrome', **executable_path, headless=False)

## Scrape Mars News

In [4]:
# Point the Splinter browser at the NASA Mars news page.
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Fixed one-second pause after navigating (presumably to give the page time
# to finish loading — TODO confirm this is long enough on slow connections).
time.sleep(1)

In [5]:
# Take the HTML the browser currently holds and parse it into a
# BeautifulSoup object using the 'html.parser' backend.
html = browser.html
mars_news_soup = BeautifulSoup(html, 'html.parser')

In [6]:
# Scrape the first article title.
# The HTML parsed earlier may have been captured before the article list was
# present. Reload and re-parse until the element shows up: the soup has to be
# rebuilt after every reload, otherwise the check keeps inspecting the same
# stale snapshot and the loop can never finish. Reloads are capped so that a
# changed page layout raises a clear error instead of hanging the notebook.
max_reloads = 10
reloads = 0
title_tag = mars_news_soup.find('div', class_='content_title')
while title_tag is None:
    if reloads == max_reloads:
        raise RuntimeError(
            f"No <div class='content_title'> found after {max_reloads} reloads"
        )
    browser.reload()
    time.sleep(1)
    mars_news_soup = BeautifulSoup(browser.html, 'html.parser')
    title_tag = mars_news_soup.find('div', class_='content_title')
    reloads += 1
first_title = title_tag.text
first_title

"NASA's MRO Completes 60,000 Trips Around Mars"

In [7]:
# Scrape the first article teaser paragraph.
# As with any element that may not be in the initially captured HTML, reload
# and re-parse until it appears. The soup is rebuilt after each reload (the
# stale snapshot would otherwise never change and the loop would not end),
# a short pause separates reloads, and the number of reloads is capped so a
# missing element raises instead of looping forever.
max_reloads = 10
reloads = 0
teaser_tag = mars_news_soup.find('div', class_='article_teaser_body')
while teaser_tag is None:
    if reloads == max_reloads:
        raise RuntimeError(
            f"No <div class='article_teaser_body'> found after {max_reloads} reloads"
        )
    browser.reload()
    time.sleep(1)
    mars_news_soup = BeautifulSoup(browser.html, 'html.parser')
    teaser_tag = mars_news_soup.find('div', class_='article_teaser_body')
    reloads += 1
first_paragraph = teaser_tag.text
first_paragraph

'The orbiting spacecraft is also about to set a record for data relayed from the Martian surface.'

## Scrape JPL Mars Space Featured Image

In [8]:
# Point the browser at the JPL space-images page, filtered to the Mars category.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
# Fixed one-second pause after navigating (presumably to let the page load — TODO confirm).
time.sleep(1)

In [9]:
# Parse the browser's current HTML into a BeautifulSoup object
# using the 'html.parser' backend.
html = browser.html
image_soup = BeautifulSoup(html, 'html.parser')

In [10]:
# Locate the featured-image button; its 'data-fancybox-href' attribute holds
# the image path, which is turned into an absolute URL by prefixing the JPL host.
featured_button = image_soup.find('a', class_='button fancybox')
feat_img_url = featured_button['data-fancybox-href']
feat_img_full_url = f'https://www.jpl.nasa.gov{feat_img_url}'
feat_img_full_url

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA09113_ip.jpg'

## Scrape Mars Weather Tweet

In [11]:
# Point the browser at the @marswxreport Twitter timeline (English locale).
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
# Fixed one-second pause after navigating (presumably to let the timeline load — TODO confirm).
time.sleep(1)

In [12]:
# Parse the browser's current HTML into a BeautifulSoup object
# using the 'html.parser' backend.
html = browser.html
tweet_soup = BeautifulSoup(html, 'html.parser')

In [13]:
# Scrape the first tweet that matches the weather data. Use regex to trim unwanted text
all_tweets = tweet_soup.find_all('p', class_='TweetTextSize')

# Compile the marker once rather than on every iteration.
insight_pattern = re.compile("InSight")

# Take the text of the first tweet containing "InSight". Defaulting to None
# (instead of leaving the name unassigned) means a timeline without a matching
# tweet is reported explicitly rather than raising NameError or silently
# reusing a value left over from an earlier run of this cell.
weather_tweet = next(
    (tweet.text for tweet in all_tweets if tweet.find(text=insight_pattern)),
    None,
)
if weather_tweet is None:
    raise ValueError(
        f"No tweet containing 'InSight' among the {len(all_tweets)} tweets scraped"
    )

# Drop everything from the first "pic" to the end of that line ('.' does not
# match newlines), which removes the trailing picture link text.
weather_tweet = re.sub("pic.*", "", weather_tweet)
weather_tweet


'InSight sol 167 (2019-05-17) low -100.5ºC (-148.9ºF) high -20.4ºC (-4.6ºF)\nwinds from the SW at 4.7 m/s (10.6 mph) gusting to 13.5 m/s (30.3 mph)\npressure at 7.50 hPa'

## Scrape Mars Facts Table (Pandas)

In [14]:
# Scrape the table of Mars facts
# pd.read_html fetches the page and returns a list of DataFrames,
# one for each HTML table it finds.
url = 'https://space-facts.com/mars/'
tables = pd.read_html(url)
# Use the first table on the page and label its two columns.
# Note: df is the same object as tables[0], so the rename applies to both.
df = tables[0]
df.columns = ['Description', 'Value']
df

Unnamed: 0,Description,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [15]:
# Render the facts DataFrame as an HTML <table> string.
# The string is only displayed here; it is not stored in a variable.
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Description</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium

## Scrape Mars Hemisphere Images

In [16]:
# Point the browser at the USGS Astrogeology search results for
# "hemisphere enhanced" with target Mars.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
# Fixed one-second pause after navigating (presumably to let the results load — TODO confirm).
time.sleep(1)

In [17]:
# Parse the browser's current HTML into a BeautifulSoup object
# using the 'html.parser' backend.
html = browser.html
hemi_soup = BeautifulSoup(html, 'html.parser')

In [18]:
# Collect the relative links to the individual hemisphere pages.
# Candidate anchors are those with class 'itemLink product-item' and an href;
# only anchors containing text that matches 'Enhanced' are kept.
enhanced_pattern = re.compile('.*Enhanced')
links = hemi_soup.find_all('a', {'class':'itemLink product-item','href':True})
hemi_links = [anchor['href'] for anchor in links if anchor.find(text=enhanced_pattern)]

# Echo each collected link on its own line, then show the list itself.
for href in hemi_links:
    print(href)

hemi_links

/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced


['/search/map/Mars/Viking/cerberus_enhanced',
 '/search/map/Mars/Viking/schiaparelli_enhanced',
 '/search/map/Mars/Viking/syrtis_major_enhanced',
 '/search/map/Mars/Viking/valles_marineris_enhanced']

In [19]:
# Build a list of {"title", "img_url"} records, one per hemisphere page.
base_url = "https://astrogeology.usgs.gov"
hemisphere_image_urls = []

for hemi in hemi_links:
    # Open this hemisphere's page (links are relative to base_url) and parse it
    browser.visit(base_url + hemi)
    html = browser.html
    hemipic_soup = BeautifulSoup(html, 'html.parser')

    # Title comes from the <h2 class="title"> heading; the image path comes
    # from the <img class="wide-image"> tag and is prefixed with base_url
    title = hemipic_soup.find('h2', class_='title').text
    wide_image = hemipic_soup.find('img', class_='wide-image')
    img_url = base_url + wide_image['src']

    # Store the pair as a single record
    hemi_dict = {"title": title, "img_url": img_url}
    hemisphere_image_urls.append(hemi_dict)

pprint(hemisphere_image_urls)

[{'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]


In [20]:
# Close the Splinter-driven browser now that all scraping steps have run.
browser.quit()