In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
from urllib.request import urlopen
import re
from pprint import pprint
import pymongo
import pandas as pd
from splinter import Browser

# Step 1 - Scraping
----------------------------------------------------------------
## 01 Mars News

Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) 
and collect the latest News Title and Paragraph Text. Assign the 
text to variables that you can reference later.

In [None]:
# Abandoned attempts, kept for reference only. Everything between the triple
# quotes below is a plain string literal, so none of it runs as code.
#   1. requests + BeautifulSoup: noted by the author as not working for this
#      page (presumably the news list is rendered client-side -- TODO confirm).
#   2. Selenium driving Chrome, then parsing driver.page_source with bs4.
# The Splinter-based cell that follows is the approach actually used.
'''
url = 'https://mars.nasa.gov/news/'
response = requests.get(url)
# param = {'page':'0', 'per_page':'40'}
# response = requests.get(url, params = param)
print(response.url)

soup = bs(response.text, 'html5lib')
print(soup.prettify())

use Selenium
from selenium import webdriver
driver = webdriver.Chrome(executable_path = "/anaconda3/envs/wudata/bin/chromedriver")
url ='https://mars.nasa.gov/news/'
driver.get(url)
html = driver.page_source
driver.quit()
# print(html)
soup = bs(html, 'lxml')
# print(soup.prettify())
'''

In [3]:
# Load the NASA Mars News page in headless Chrome via Splinter and parse the
# resulting HTML into `soup` for the extraction cell below.
# NOTE(review): the chromedriver path is machine-specific; adjust it locally.
browser = Browser("chrome", executable_path = "/anaconda3/envs/wudata/bin/chromedriver", headless=True)
try:
    url ='https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
finally:
    # Always shut the browser down, even if the visit fails, so no
    # Chrome/chromedriver process is left running.
    browser.quit()
soup = bs(html, 'lxml')

In [4]:
# Pull the first headline and its teaser paragraph out of the parsed news page.
# The original passed 'a' as a third positional argument to find(); that slot
# is `recursive`, not a child selector, so it is dropped here. The lookup still
# targets the first <div class="content_title">.
title_div = soup.find('div', class_='content_title')
teaser_div = soup.find('div', class_='rollover_description_inner')

# Fail with a descriptive message instead of an AttributeError on None when
# the page layout changes or the content did not load.
if title_div is None or teaser_div is None:
    raise ValueError("Could not locate the latest news title/teaser in the scraped page")

news_title = title_div.text
news_p = teaser_div.text

print(news_title)
print(news_p)

NASA Seeking Partner in Contest to Name Next Mars Rover
NASA has a class assignment for corporations, nonprofits and educational organizations involved in science and space exploration: partner with the agency to inspire future engineers and scientists by sponsoring a contest to name the next rover to venture to the Red Planet.


## 02 Mars Featured Space Image
* Visit the url for JPL Featured Space Image [here](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars).
* Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called `featured_image_url`.
* Make sure to find the image url to the full size `.jpg` image.
* Make sure to save a complete url string for this image.

In [7]:
# Load the JPL space-images page (Mars category) in Chrome via Splinter and
# parse the resulting HTML into `soup` for the image-URL cell below.
# NOTE(review): the chromedriver path is machine-specific; adjust it locally.
executable_path = {"executable_path": "/anaconda3/envs/wudata/bin/chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)

try:
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    html = browser.html
finally:
    # Always close the browser window, even if the visit fails, so no
    # Chrome/chromedriver process is left running.
    browser.quit()
soup = bs(html, "lxml")

In [8]:
# Collect the lightbox targets of every <a class="fancybox"> anchor, keep the
# ones pointing at a "largesize" image, and build an absolute URL from the
# first of them.
image_url = soup.find_all('a', class_="fancybox")

# Skip anchors that do not carry the data attribute instead of raising KeyError.
addresslist = [
    address['data-fancybox-href']
    for address in image_url
    if address.has_attr('data-fancybox-href')
]

matching = [s for s in addresslist if "largesize" in s]
if not matching:
    # Replace a bare IndexError with a message that says what went wrong.
    raise ValueError("No full-size ('largesize') featured image link found on the page")
print(matching[0])

# The scraped paths are site-relative (they start with '/'), so prefix the host.
featured_image_url = "https://www.jpl.nasa.gov" + matching[0]
print(featured_image_url)

/spaceimages/images/largesize/PIA22540_hires.jpg
https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA22540_hires.jpg


## 03 Mars Weather
* Visit the Mars Weather Twitter account [here](https://twitter.com/marswxreport?lang=en) and scrape the latest 
Mars weather tweet from the page. Save the tweet text for the weather 
report as a variable called `mars_weather`.

In [9]:
# Download the Mars Weather Twitter timeline and take the text of the first
# tweet paragraph as the latest weather report.
url = 'https://twitter.com/marswxreport?lang=en'
# A timeout keeps the notebook from hanging indefinitely on a stalled connection;
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs(response.text, 'lxml')

tweet = soup.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')
if tweet is None:
    # Fail with a descriptive message instead of an AttributeError on None.
    raise ValueError("No tweet text found on the Mars Weather page; the markup may have changed")

mars_weather = tweet.text
print(mars_weather)

Sol 2171 (2018-09-14), high -12C/10F, low -65C/-84F, pressure at 8.79 hPa, daylight 05:43-17:59


## 04 Mars Facts
* Visit the Mars Facts webpage [here](http://space-facts.com/mars/) and use Pandas to scrape the table containing facts about the planet 
including Diameter, Mass, etc.
* Use Pandas to convert the data to an HTML table string.

In [10]:
# Read the first HTML table on the Mars facts page into a DataFrame, label its
# two columns, and index it by the fact description.
url = 'http://space-facts.com/mars/'
tables = pd.read_html(url)[0]
tables.columns = ['description', 'value']
# Rebind instead of using inplace=True so the step reads as a plain transformation.
tables = tables.set_index('description')

# HTML table string requested by this section's instructions.
mars_facts_html = tables.to_html()

tables

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


## 05 Hemisphere Image
* Visit the USGS Astrogeology site [here](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) 
to obtain high resolution images for each of Mars's hemispheres.
* You will need to click each of the links to the hemispheres in order 
to find the image url to the full resolution image.

* Save both the image url string for the full resolution hemisphere image, 
and the Hemisphere title containing the hemisphere name. Use a Python 
dictionary to store the data using the keys `img_url` and `title`.

* Append the dictionary with the image url string and the hemisphere title 
to a list. This list will contain one dictionary for each hemisphere.

In [11]:
# Download the USGS Astrogeology search results for the enhanced Mars
# hemisphere products and parse them into `soup`.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
# A timeout keeps the notebook from hanging on a stalled connection;
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs(response.text, 'lxml')

In [12]:
# From the search results, keep only the product links that contain an <h3>
# title, then derive the hemisphere names and their detail-page URLs from that
# single filtered list so the two lists stay index-aligned.
product_links = [
    a for a in soup.find_all('a', class_= 'itemLink product-item')
    if a.find('h3') is not None
]

imgname = [a.h3.text for a in product_links]
# hrefs on the results page are site-relative, so prefix the host.
pageurls = ['https://astrogeology.usgs.gov' + a['href'] for a in product_links]

print(imgname)
print(pageurls)

['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']
['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']


In [13]:
# Visit each hemisphere detail page and record the href of its "Original"
# download link. Exactly one URL is appended per page so `imgurls` stays
# index-aligned with `pageurls` (and with the titles collected alongside them).
imgurls = []
for page_url in pageurls:
    # Page-specific names are used so the search-results `soup`, `url` and
    # `response` from the earlier cells are not overwritten by this loop.
    page_response = requests.get(page_url, timeout=30)
    page_response.raise_for_status()
    page_soup = bs(page_response.text, 'lxml')

    # First anchor that opens in a new tab and is labelled 'Original'.
    original_link = next(
        (a for a in page_soup.find_all('a', {'target': '_blank'}, href =True)
         if a.text == 'Original'),
        None,
    )
    if original_link is None:
        # Skipping silently would shift every later URL onto the wrong title.
        raise ValueError("No 'Original' download link found on " + page_url)
    imgurls.append(original_link['href'])
print(imgurls)

['http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif', 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif', 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif', 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif']


In [14]:
# Pair each hemisphere title with its full-resolution image URL, producing one
# {"title", "img_url"} dictionary per hemisphere.
if len(imgname) != len(imgurls):
    # The two lists are matched by position; differing lengths mean at least
    # one title would be paired with the wrong image (or none at all).
    raise ValueError(
        "Title/image count mismatch: {} titles vs {} image urls".format(
            len(imgname), len(imgurls)
        )
    )

hemisphere_image_urls = [
    {"title": title, "img_url": img_url}
    for title, img_url in zip(imgname, imgurls)
]
print(hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]
