# Mission to Mars (Data Scraping)

### Running this notebook also initializes the database for the first visit to the website

In [1]:
# All scraping and analysis
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import pymongo
import requests
import time

In [2]:
# Accumulates every scraped value; it is written to MongoDB at the end of the notebook
planet_data = {}

# Pages scraped by the cells below
url_mars_news = 'https://mars.nasa.gov/news/'
url_jpl_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
url_mars_weather = 'https://twitter.com/marswxreport?lang=en'
url_mars_facts = 'https://space-facts.com/mars/'
url_hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# MongoDB client pointed at a server on localhost:27017
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(conn)

# Handles for the "planet_db" database and its "planet_data" collection
db = client["planet_db"]
planet_coll = db["planet_data"]


## Mars News

In [3]:
# Scrape the first headline and teaser paragraph from the Mars news page
# and store them in planet_data under 'news_title' / 'news_teaser'.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

try:
    browser.visit(url_mars_news)

    # Fixed pause before reading the page source
    # (presumably to let the page finish rendering)
    time.sleep(1)

    html = browser.html
finally:
    # Close the headless browser even if the visit fails, so a failed run
    # does not leave a Chrome process behind
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')

# The title is looked up inside the first "list_text" block;
# the teaser is the first "article_teaser_body" anywhere on the page
news_title = soup.find('div', class_="list_text").find('div', class_="content_title").get_text(strip=True)
news_teaser = soup.find('div', class_="article_teaser_body").get_text(strip=True)

planet_data['news_title'] = news_title
planet_data['news_teaser'] = news_teaser

NASA's MAVEN Observes Martian Night Sky Pulsing in Ultraviolet Light
-------------------
Vast areas of the Martian night sky pulse in ultraviolet light, according to images from NASA’s MAVEN spacecraft. The results are being used to illuminate complex circulation patterns in the Martian atmosphere.


## JPL Featured Space Image

In [4]:
# Scrape the featured image from the JPL space-images page and store its
# absolute URL in planet_data under 'featured_image_url'.
image_root = 'https://www.jpl.nasa.gov'

executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

try:
    browser.visit(url_jpl_image)

    # Fixed pause before reading the page source
    # (presumably to let the page finish rendering)
    time.sleep(1)

    html = browser.html
finally:
    # Close the headless browser even if the visit fails
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')

# Inline style attribute of the first carousel item; the image path is
# expected inside a CSS url(...) expression
jpg_path = soup.find('article', class_="carousel_item")["style"]

# Take whatever sits between "url(" and the next ")" and drop optional quotes.
# Unlike fixed-offset slicing, this fails loudly when the marker is missing
# and does not depend on the file extension being "jpg".
_, url_marker, after_marker = jpg_path.partition('url(')
if not url_marker:
    raise ValueError("No url(...) found in carousel style: " + repr(jpg_path))
image_path = after_marker.partition(')')[0].strip().strip('\'"')

featured_image_url = image_root + image_path

planet_data['featured_image_url'] = featured_image_url

https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA16217-1920x1200.jpg


## Weather

In [5]:
# Scrape the text of the first matching tweet from the Mars weather Twitter
# page and store it in planet_data under 'weather_tweet'.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

try:
    browser.visit(url_mars_weather)

    # Fixed pause before reading the page source
    # (presumably to let the page finish rendering)
    time.sleep(1)

    html = browser.html
finally:
    # Close the headless browser even if the visit fails
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')

# First <div> carrying the tweet-text class with lang="en" and dir="auto"
tweet_div = soup.find('div', attrs={'class': 'css-901oao', 'lang': 'en', 'dir': 'auto'})
if tweet_div is None:
    # Fail with a readable message instead of an AttributeError on None
    raise RuntimeError(
        "No tweet text found on the Mars weather page; "
        "the page may not have finished loading or its markup has changed"
    )

weather_tweet = tweet_div.get_text(strip=True)

planet_data['weather_tweet'] = weather_tweet



## Facts

In [6]:

# Read the first HTML table on the Mars facts page, label its two columns,
# index it by attribute name, and store the rendered HTML in planet_data
# under 'planet_facts_html'.
tables = pd.read_html(url_mars_facts)

# Work on a copy so the frame held in `tables` is left as parsed
mars_facts_df = tables[0].copy()
mars_facts_df.columns = ['attribute', 'value']

# Rebind instead of using inplace=True
mars_facts_df = mars_facts_df.set_index('attribute')

facts_html = mars_facts_df.to_html()
planet_data['planet_facts_html'] = facts_html


'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>attribute</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\n 

## Hemisphere Photos

In [7]:
# Collect the title and "Sample" image link for every hemisphere listed on
# the USGS search-results page and store them, as a list of
# {'title', 'img_url'} dicts, in planet_data under 'hemisphere_image_urls'.
image_base_url = 'https://astrogeology.usgs.gov'

executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

hemisphere_image_urls = []
try:
    browser.visit(url_hemispheres)

    # Fixed pause before reading the page source
    # (presumably to let the page finish rendering)
    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    images_html = soup.find_all('a', class_='itemLink product-item')

    # Get the url list for all hemispheres: keep only the links that contain
    # an <h3>, and make their relative hrefs absolute
    hemisphere_urls = [
        image_base_url + element['href']
        for element in images_html
        if element.find('h3')
    ]

    # Go to each hemisphere url and get the full image path and description
    for h_url in hemisphere_urls:
        browser.visit(h_url)
        time.sleep(0.5)

        h_html = browser.html
        # Separate name so the results-page soup above is not overwritten
        detail_soup = BeautifulSoup(h_html, 'html.parser')

        full_res_title = detail_soup.find('h2', class_="title").get_text(strip=True)

        # `string=` is the current name of the deprecated `text=` keyword
        full_res_url = detail_soup.find('a', string='Sample')['href']

        hemisphere_image_urls.append({
            'title': full_res_title,
            'img_url': full_res_url
        })
finally:
    # Close the headless browser even if one of the page visits or lookups fails
    browser.quit()

planet_data['hemisphere_image_urls'] = hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [None]:
# Store the scraped data as the collection's single document.
# insert_one() would add an '_id' key to planet_data, so running the cell a
# second time would raise DuplicateKeyError, and every fresh run would add
# another document. Replacing with upsert=True keeps the cell re-runnable:
# it inserts on the first run and overwrites the first document it finds afterwards.
mars_document = {key: value for key, value in planet_data.items() if key != '_id'}
planet_coll.replace_one({}, mars_document, upsert=True)