# Mission to Mars (Data Scraping)

### Running this notebook will also initialize the database for the first visit to the web site

In [1]:
# Dependencies
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import pymongo
import requests
import time
from pprint import pprint

In [2]:
# url list for scraping
url_mars_news = 'https://mars.nasa.gov/news/'
url_jpl_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
url_mars_weather = 'https://twitter.com/marswxreport?lang=en'
url_mars_facts = 'https://space-facts.com/mars/'
url_hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# result dictionary; will be saved to MongoDB
planet_data = {}

# Setup connection to MongoDB
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(conn)

# Select database and collection to use
db = client.planet_db
planet_coll = db.planet_data

# clear collection documents 
planet_coll.drop()

## Mars News

In [3]:
# Initialize browser; visit appropriate url; pause for 1 second
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

browser.visit(url_mars_news)

time.sleep(1)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Extract title and teaser from html
news_title = soup.find('div', class_="list_text").find('div', class_="content_title").get_text(strip=True)
news_teaser = soup.find('div', class_="article_teaser_body").get_text(strip=True)

# Save to dictionary
planet_data['news_title'] = news_title
planet_data['news_teaser'] = news_teaser

browser.quit()

## JPL Featured Space Image

In [4]:
# Root web site for image search
image_root = 'https://www.jpl.nasa.gov'

# Initialize browser; visit appropriate url; pause for 1 second
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

browser.visit(url_jpl_image)

time.sleep(1)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Extract 2nd half of image path using string search for "url" and "jpg"
jpg_path = soup.find('article', class_="carousel_item")["style"]
featured_image_url = image_root + jpg_path[jpg_path.find('url')+5:jpg_path.find('jpg')+3]

# Save to dictionary
planet_data['featured_image_url'] = featured_image_url

browser.quit()

## Weather

In [5]:
# Initialize browser; visit appropriate url; pause for 1 second
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

browser.visit(url_mars_weather)

time.sleep(1)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Extract weather tweet
weather_tweet = soup.find('div', attrs = {'class': 'css-901oao', 'lang': 'en', 'dir':'auto'}).get_text(strip=True)

# Save to dictionary
planet_data['weather_tweet'] = weather_tweet

browser.quit()



## Facts

In [6]:
# use Pandas to scrape Mars fact table from web site url
tables = pd.read_html(url_mars_facts)

# Build and format DataFrame
mars_facts_df = tables[0]
mars_facts_df.columns = ['attribute', 'value']
mars_facts_df.set_index('attribute', inplace=True)

# Convert df to html table
facts_html = mars_facts_df.to_html()

# Save to dictionary
planet_data['planet_facts_html'] = facts_html


## Hemisphere Photos

In [7]:
# Initialize browser; visit appropriate url; pause for 1 second
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

browser.visit(url_hemispheres)

image_base_url = 'https://astrogeology.usgs.gov'

time.sleep(1)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# find all anchors that contain images
images_html = soup.find_all('a', class_='itemLink product-item')

# get url list for all hemispheres
hemisphere_urls = []
for element in images_html:
    if element.find('h3'):
        # print(element.prettify())
        hemisphere_urls.append(image_base_url + element['href'])


# Browse to hemisphere url and get full image path and description
hemisphere_image_urls = []
for h_url in hemisphere_urls:
    # Visit url and sleep; this site loaded quicker
    browser.visit(h_url)
    time.sleep(0.5)
    
    h_html = browser.html
    soup = BeautifulSoup(h_html, 'html.parser')

    # Extract full res title and url
    full_res_title = soup.find('h2', class_="title").get_text(strip=True)
    full_res_url = soup.find('a', text='Sample')['href']

    # Save as dictionary
    hemisphere_image_urls.append({
        'title': full_res_title,
        'img_url': full_res_url
    })


browser.quit()

# Save dictionary to main dictionary
planet_data['hemisphere_image_urls'] = hemisphere_image_urls

In [8]:
# Add final result dictionary as document to MongoDB
planet_coll.insert_one(planet_data)

<pymongo.results.InsertOneResult at 0x186682205c8>

In [9]:
# View final result
pprint(planet_data)

{'_id': ObjectId('5f3564f453960d5eee2b2649'),
 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA19920-1920x1200.jpg',
 'hemisphere_image_urls': [{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
                            'title': 'Cerberus Hemisphere Enhanced'},
                           {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
                            'title': 'Schiaparelli Hemisphere Enhanced'},
                           {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
                            'title': 'Syrtis Major Hemisphere Enhanced'},
                           {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
                            'title': 'Valles Marineris Hemisphere Enhanced'}]