******************************************************************************
# Homework Assignment:
# 12-Web Scraping and Document Databases - Mission to Mars

@Author: Jeffery Brown (daddyjab)
@Date: 2/19/19

******************************************************************************


# Dependencies

In [1]:
# Standard library
# StringIO lets pandas.read_html receive page HTML as a file-like object
from io import StringIO

# Pretty Print to help with debugging
from pprint import pprint

# Json - IF JSON FILE EXPORT/IMPORT IS NEEDED
# import json

# Time - IF SLEEP OR OTHER TIME FUNCTIONS NEEDED
# import time

# Pandas for DataFrames
import pandas as pd

# Web Requests
import requests

# Splinter and BeautifulSoup for Web Scraping (+ Pandas)
from bs4 import BeautifulSoup
from splinter import Browser

# SQLAlchemy and PyMongo for MongoDB operations
import pymongo
from sqlalchemy import create_engine


# Scraping

## NASA Mars News

In [2]:
# Base address of the NASA Mars site, plus the news listing page beneath it.
# The base is kept separately because scraped links are relative to it.
url_nasa_mars_base = 'https://mars.nasa.gov'
url_nasa_mars_news = ''.join((url_nasa_mars_base, '/news'))
url_nasa_mars_news

'https://mars.nasa.gov/news'

In [3]:
# Start a Splinter-controlled Chrome session with a visible window
# (headless=False), using the chromedriver binary named below.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', headless=False, **executable_path)

In [4]:
# Load the Mars news listing in the browser...
browser.visit(url_nasa_mars_news)

# ...and parse the rendered page source with the lxml parser
soup = BeautifulSoup(browser.html, features='lxml')

In [5]:
# Each article is an <li> with class 'slide'; find() returns the first match
# in document order, which is treated here as the most recent article.
news_info = soup.find('li', attrs={'class': 'slide'})

In [6]:
# The first anchor inside the article holds a site-relative link, so prefix
# it with the site base to get an absolute URL.
news_info_url = url_nasa_mars_base + news_info.a['href']
news_info_url

'https://mars.nasa.gov/news/8415/insight-is-the-newest-mars-weather-service/'

In [7]:
# Publication date text shown for the article
news_info_date = news_info.find('div', attrs={'class': 'list_date'}).get_text()
news_info_date

'February 19, 2019'

In [8]:
# Headline text of the article
news_info_title = news_info.find('div', attrs={'class': 'content_title'}).get_text()
news_info_title

'InSight Is the Newest Mars Weather Service'

In [9]:
# Short teaser paragraph shown under the headline
news_info_teaser = news_info.find('div', attrs={'class': 'article_teaser_body'}).get_text()
news_info_teaser

"By collecting data around the clock, NASA's lander will provide unique science about the Martian surface."

In [10]:
# The thumbnail <img> sits inside the 'list_image' div; its src is
# site-relative, so prefix it with the site base.
news_info_image_url = url_nasa_mars_base + news_info.find('div', attrs={'class': 'list_image'}).img['src']
news_info_image_url

'https://mars.nasa.gov/system/news_items/list_view_images/8415_PIA22876-th.jpg'

In [11]:
# Bundle every scraped Mars News field into one dictionary
news_info_dict = dict(
    news_info_date=news_info_date,
    news_info_title=news_info_title,
    news_info_teaser=news_info_teaser,
    news_info_url=news_info_url,
    news_info_image_url=news_info_image_url,
)
news_info_dict

{'news_info_date': 'February 19, 2019',
 'news_info_title': 'InSight Is the Newest Mars Weather Service',
 'news_info_teaser': "By collecting data around the clock, NASA's lander will provide unique science about the Martian surface.",
 'news_info_url': 'https://mars.nasa.gov/news/8415/insight-is-the-newest-mars-weather-service/',
 'news_info_image_url': 'https://mars.nasa.gov/system/news_items/list_view_images/8415_PIA22876-th.jpg'}

## JPL Mars Space Images - Featured Image

In [12]:
# Base address of the NASA JPL site, plus the Mars category of its
# space-images gallery. Scraped links on this site are relative to the base.
url_nasa_jpl_base = 'https://www.jpl.nasa.gov'
url_nasa_jpl_mars = ''.join((url_nasa_jpl_base, '/spaceimages/?search=&category=Mars'))
url_nasa_jpl_mars

'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

In [13]:
# Load the JPL Mars image gallery in the browser...
browser.visit(url_nasa_jpl_mars)

# ...and parse the rendered page source with the lxml parser
soup = BeautifulSoup(browser.html, features='lxml')

In [14]:
# The featured image is reached through the anchor whose id is 'full_image';
# that element carries the link to the image's details page.
featured_image_info = soup.find('a', attrs={'id': 'full_image'})

In [15]:
# 'data-link' holds the site-relative address of the details page
featured_image_details_url = url_nasa_jpl_base + featured_image_info.attrs['data-link']
featured_image_details_url

'https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA16815'

In [16]:
# Navigate the browser directly to the details page (no click is simulated),
# so the high resolution featured picture can be read from that page
browser.visit(featured_image_details_url)

In [17]:
# Parse the details page currently loaded in the browser (lxml parser)
soup = BeautifulSoup(browser.html, features='lxml')

In [18]:
# The high resolution picture is the <img> with class 'main_image'
featured_image_details_info = soup.find('img', attrs={'class': 'main_image'})

# Its src is site-relative, so prefix it with the JPL base address
featured_image_url = url_nasa_jpl_base + featured_image_details_info.attrs['src']
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16815_hires.jpg'

In [19]:
# The same <img> element carries the caption in its 'title' attribute
featured_image_title = featured_image_details_info.attrs['title']
featured_image_title

"This image shows the first holes into rock drilled by NASA's Mars rover Curiosity, with drill tailings around the holes plus piles of powdered rock collected from the deeper hole and later discarded."

In [20]:
# Bundle the JPL featured image caption and URL into one dictionary
featured_image_dict = dict(
    featured_image_title=featured_image_title,
    featured_image_url=featured_image_url,
)
featured_image_dict

{'featured_image_title': "This image shows the first holes into rock drilled by NASA's Mars rover Curiosity, with drill tailings around the holes plus piles of powdered rock collected from the deeper hole and later discarded.",
 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16815_hires.jpg'}

## Mars Weather

In [21]:
# Twitter timeline of the 'marswxreport' account (English-language view).
# There is no separate base/path split here, so both names hold the same URL.
url_twitter_mars = url_twitter_mars_base = 'https://twitter.com/marswxreport?lang=en'

In [22]:
# Load the Mars Weather Twitter timeline in the browser...
browser.visit(url_twitter_mars)

# ...and parse the rendered page source with the lxml parser
soup = BeautifulSoup(browser.html, features='lxml')

In [23]:
# Tweets on the timeline are <div> elements with class 'tweet'.
# Walk them in page order and stop at the first one whose author name is
# exactly 'Mars Weather' (presumably the timeline can also contain tweets
# from other accounts - confirm against the live page).
mars_weather_info = soup.find_all('div', class_='tweet')
for mwi in mars_weather_info:
    mwi_item = mwi.find('strong', class_='fullname', string='Mars Weather')
    if mwi_item:
        print(mwi_item)
        break
else:
    # The loop finished without a match. Fail here rather than letting the
    # following cells read `mwi`, which would be the last unrelated tweet
    # (or undefined when the page contained no tweets at all).
    raise ValueError("No tweet authored by 'Mars Weather' was found on the page")


<strong class="fullname show-popup-with-id u-textTruncate " data-aria-label-part="">Mars Weather</strong>


In [24]:
# Text of the selected tweet, with leading/trailing whitespace removed
mars_weather = mwi.find('div', attrs={'class': 'js-tweet-text-container'}).get_text().strip()
mars_weather

'InSight sol 81 (2019-02-17), high -17/2F, low -95/-138F, pressure at 7.23hPa, winds from the WNW at 12 mph gusting to 37.8 mph\n\nWelcome to the Mars Weather team @NASAInSight!\nhttps://mars.nasa.gov/insight/weather/\xa0…pic.twitter.com/SH12FvcMfv'

In [25]:
# First link embedded in the selected tweet
mars_weather_url = mwi.find('a', attrs={'class': 'twitter-timeline-link'}).attrs['href']
mars_weather_url

'https://t.co/2EDVfFcJhp'

In [26]:
# Bundle the tweet text and its link into one dictionary
mars_weather_dict = dict(
    mars_weather=mars_weather,
    mars_weather_url=mars_weather_url,
)

## Mars Facts

In [27]:
# Space Facts page for Mars: http://space-facts.com/mars/
# There is no separate base/path split here, so both names hold the same URL.
url_space_facts = url_space_facts_base = 'http://space-facts.com/mars'

In [28]:
# Use Splinter to navigate to the Space Facts page for Mars.
# The page source is read from the browser in the next cell.
browser.visit( url_space_facts )

In [29]:
# Passing literal HTML text to pandas.read_html is deprecated (pandas 2.1+),
# so hand it the page source as a file-like object instead.
# Only tables whose id is 'tablepress-mars' are parsed.
space_facts_tables_df = pd.read_html(StringIO(str(browser.html)), attrs={'id': 'tablepress-mars'})

# read_html returns a list of DataFrames; take the first matching table
mars_facts_df = space_facts_tables_df[0]

In [30]:
# Give the two positional columns readable labels. rename() returns a new
# DataFrame, so the object still held in space_facts_tables_df is not
# modified and re-running this cell gives the same result.
mars_facts_df = mars_facts_df.rename(columns={0: 'Fact', 1: 'Mars'})
mars_facts_df

Unnamed: 0,Fact,Mars
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [31]:
# Render the facts DataFrame as an HTML table string: missing values are
# written as empty strings (na_rep='') and the row index is left out.
mars_facts_table = mars_facts_df.to_html( na_rep='', index = False )


## Mars Hemispheres

In [32]:
# USGS Astrogeology search results for enhanced Mars hemisphere images:
# https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars
# The base is kept separately because scraped links are relative to it.
url_usgs_astro_base = 'https://astrogeology.usgs.gov'
url_usgs_astro = ''.join((url_usgs_astro_base, '/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'))
url_usgs_astro

'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [33]:
# Load the USGS hemisphere search results in the browser...
browser.visit(url_usgs_astro)

# ...and parse the rendered page source with the lxml parser
soup = BeautifulSoup(browser.html, features='lxml')

In [34]:
# One <div class="description"> per hemisphere listed in the search results
hemiphere_image_info = soup.find_all('div', attrs={'class': 'description'})


In [35]:
# Initialize a list of hemisphere image information
hemiphere_image_list = []

# Loop through all of the hemispheres listed on the search-results page
for h in hemiphere_image_info:

    # Get the image title
    h_title = h.find('h3').text

    # Build the absolute URL of the details page (where the full resolution
    # image can be found); the scraped href is site-relative
    h_details_url = url_usgs_astro_base + h.find('a', class_='itemLink product-item')['href']

    print(h_title)
    print(h_details_url)

    # Navigate the browser to the details page
    browser.visit(h_details_url)

    # Parse the details page into its own variable. Rebinding `soup` here
    # would discard the parsed search-results page, so the cell that builds
    # hemiphere_image_info could no longer be re-run on its own.
    h_details_soup = BeautifulSoup(browser.html, 'lxml')

    # Get the link to the full resolution image: the first anchor inside the
    # 'downloads' block
    h_full_image_url = h_details_soup.find('div', class_='downloads').find('a')['href']
    print(h_full_image_url)

    # Get the description: the first paragraph inside the 'content' block
    h_full_image_desc = h_details_soup.find('div', class_='content').find('p').text
    print(h_full_image_desc)
    print("-" * 40)

    # Add a dictionary of this hemisphere info to the list
    h_dict = {
        'h_title': h_title,
        'h_full_image_url': h_full_image_url,
        'h_full_image_desc': h_full_image_desc
    }

    hemiphere_image_list.append(h_dict)


Cerberus Hemisphere Enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired on February 11, 1980. At that time, it was early northern summer on Mars. The center of the image is at latitude 3 degrees, longitude 185 degrees.
----------------------------------------
Schiaparelli Hemisphere Enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg
Mosaic of the Schiaparelli hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. The images were acquired in 1980 during early northern summer on Mars. The

## Consolidate the Gathered Information

In [36]:
# Combine the result of every scraping section into one document-shaped
# dictionary; each key is named after the variable it stores.
mars_info_dict = dict(
    news_info_dict=news_info_dict,
    featured_image_dict=featured_image_dict,
    mars_weather_dict=mars_weather_dict,
    mars_facts_table=mars_facts_table,
    hemiphere_image_list=hemiphere_image_list,
)

# Store in MongoDB

In [37]:
# Connect PyMongo to the MongoDB server at the address below
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [38]:
# Select the 'mars_info' database; the collection of the same name is
# addressed through it in the cells that follow
db = client['mars_info']

In [39]:
# Remove the 'mars_info' collection so the insert below starts from empty
db['mars_info'].drop()

In [40]:
# Store the consolidated scrape results as a single document
result = db['mars_info'].insert_one(mars_info_dict)

In [41]:
# Sanity check: query the 'mars_info' collection with no filter.
# find() returns a cursor over the stored documents.
m_info = db['mars_info'].find()

In [42]:
# Pretty-print the first document returned by the cursor to confirm what was stored
pprint(m_info[0])

{'_id': ObjectId('5c6ce240aaa45b99818eef5a'),
 'featured_image_dict': {'featured_image_title': 'This image shows the first '
                                                 'holes into rock drilled by '
                                                 "NASA's Mars rover Curiosity, "
                                                 'with drill tailings around '
                                                 'the holes plus piles of '
                                                 'powdered rock collected from '
                                                 'the deeper hole and later '
                                                 'discarded.',
                         'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16815_hires.jpg'},
 'hemiphere_image_list': [{'h_full_image_desc': 'Mosaic of the Cerberus '
                                                'hemisphere of Mars projected '
                                                'into point 