# 12-Web-Scraping-and-Document-Databases
Eric Nordstrom

### Setup

In [1]:
# dependencies
from selenium import webdriver
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd

# set up selenium driver
driver = webdriver.Firefox()

### NASA Mars News

In [2]:
# get html to parse
url = "https://mars.nasa.gov/news"
driver.get(url)
soup = BS(driver.page_source, "html.parser")

# parse html
item = soup.find('li', class_="slide")
date = item.find('div', class_="list_date").text
title_a = item.find('div', class_="content_title").a
title = title_a.text
href = title_a['href']
para = item.find('div', class_="article_teaser_body").text

# display results
print(date)
print(title)
print()
print(para)
print("\nMore:", "https://mars.nasa.gov" + href)

February 27, 2020
The MarCO Mission Comes to an End

The pair of briefcase-sized satellites made history when they sailed past Mars in 2019.

More: https://mars.nasa.gov/news/8408/the-marco-mission-comes-to-an-end/


### JPL Mars Space Images - Featured Image

In [3]:
# get html to parse
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
r = requests.get(url)
assert(r.status_code == 200)
soup = BS(r.text, "html.parser")

# parse html
button_a = soup.find('a', id="full_image")
featured_image_url = "https://www.jpl.nasa.gov" + button_a['data-fancybox-href']
title = button_a['data-title']
desc = button_a['data-description']

# display results
print(title, desc, "Image:", sep="\n\n", end=" ")
print(featured_image_url)

Neptune - True Color of Clouds

This image of the blue-hued Neptune was taken by NASA's Voyager 2; small trails of similar clouds trending east to west and large scale structure east of the Great Dark Spot all suggest that waves are present in the atmosphere and play a large role.

Image: https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA00063_ip.jpg


### Mars Weather

In [10]:
# get html to parse
url = "https://twitter.com/marswxreport?lang=en"
r = requests.get(url)
assert(r.status_code == 200)
soup = BS(r.text, "html.parser")

# parse html
# for some reason this shows up as a `span` via the inspector, but something
# goes wrong via requests and even selenium. the 'p' tag below was found via
# <str.find> on the request html but does not appear via the inspector.
p = soup.find('p', class_="tweet-text")
mars_weather = p.text.split("pic.twitter.com/")[0]

# display results
print(mars_weather)

InSight sol 447 (2020-02-28) low -92.6ºC (-134.6ºF) high -11.1ºC (12.0ºF)
winds from the SSW at 5.8 m/s (13.1 mph) gusting to 20.2 m/s (45.2 mph)
pressure at 6.30 hPa


### Mars Facts

In [5]:
#get html to parse
url = "http://space-facts.com/mars/"
r = requests.get(url)
assert(r.status_code == 200)

# parse html
# "HTML table string"? i think just a data frame makes sense?
tables = pd.read_html(r.text)

# display results
tables[0]

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [6]:
tables[1]

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-153 to 20 °C,-88 to 58°C


In [7]:
# assign variables
mars_facts = tables[0].rename(columns={0: "Property", 1: "Value"}).set_index("Property").to_html()
earth_comparison = tables[1].rename(columns={"Mars - Earth Comparison": "Property"}).set_index("Property").to_html()

# display results
print(mars_facts)
print()
print(earth_comparison)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Value</th>
    </tr>
    <tr>
      <th>Property</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>

<table border="1" class="dataf

### Mars Hemispheres

In [9]:
# get html to parse
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
driver.get(url)

# parse html on main page
imgs = {}  # initially the urls of the images pages, then replaced with actual image urls
for a in driver.find_elements_by_tag_name('a'):
    if a.get_attribute('class') == "itemLink product-item" and a.find_elements_by_tag_name('h3'):
        imgs[a.text] = a.get_attribute('href')
        
# parse html on each image page
for key, value in imgs.items():
    driver.get(value)
    img = driver.find_element_by_tag_name('img')
    imgs[key] = img.get_attribute('src')

# display results
imgs

{'Cerberus Hemisphere Enhanced': 'https://astrogeology.usgs.gov/images/usgs_logo_main_2x.png',
 'Schiaparelli Hemisphere Enhanced': 'https://astrogeology.usgs.gov/images/usgs_logo_main_2x.png',
 'Syrtis Major Hemisphere Enhanced': 'https://astrogeology.usgs.gov/images/usgs_logo_main_2x.png',
 'Valles Marineris Hemisphere Enhanced': 'https://astrogeology.usgs.gov/images/usgs_logo_main_2x.png'}
