In [61]:
# Complete your initial scraping using Jupyter Notebook, BeautifulSoup, Pandas, and Requests/Splinter.
# Create a Jupyter Notebook file called mission_to_mars.ipynb and use this to complete all of your scraping and analysis tasks.
# The following outlines what you need to scrape.

# Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text.
# Assign the text to variables that you can reference later.

# Example:
# news_title = "NASA's Next Mars Mission to Investigate Interior of Red Planet"

# news_p = "Preparation of NASA's next spacecraft to Mars, InSight, has ramped up this summer,
#          on course for launch next May from Vandenberg Air Force Base in central California 
#          -- the first interplanetary launch in history from America's West Coast."

# First site:
# https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest

In [62]:
import pandas as pd
import pymongo

In [63]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup

# Windows Users

In [64]:
 executable_path = {'executable_path': 'chromedriver.exe'}
 browser = Browser('chrome', **executable_path, headless=False)

In [65]:
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

In [66]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [67]:
list_text = soup.find('div', class_= 'list_text')
list_text


<div class="list_text"><div class="list_date">September 21, 2018</div><div class="content_title"><a href="/news/8369/nasa-seeking-partner-in-contest-to-name-next-mars-rover/" target="_self">NASA Seeking Partner in Contest to Name Next Mars Rover</a></div><div class="article_teaser_body">NASA has a class assignment for corporations, nonprofits and educational organizations involved in science and space exploration: partner with the agency to inspire future engineers and scientists by sponsoring a contest to name the next rover to venture to the Red Planet.</div></div>

In [68]:
title = soup.find('div', class_='content_title')
title

<div class="content_title"><a href="/news/8369/nasa-seeking-partner-in-contest-to-name-next-mars-rover/" target="_self">NASA Seeking Partner in Contest to Name Next Mars Rover</a></div>

In [69]:
latest_news_title = title.find('a').text
latest_news_title

'NASA Seeking Partner in Contest to Name Next Mars Rover'

In [70]:
latest_news_paragraph = soup.find('div', class_='article_teaser_body').text
latest_news_paragraph

'NASA has a class assignment for corporations, nonprofits and educational organizations involved in science and space exploration: partner with the agency to inspire future engineers and scientists by sponsoring a contest to name the next rover to venture to the Red Planet.'

In [71]:
# Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string
# to a variable called featured_image_url.
# Make sure to find the image url to the full size .jpg image.
# Make sure to save a complete url string for this image.
# # Example:
# featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'

In [72]:
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url) 

In [73]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [74]:
full_image = soup.find('a', class_='button fancybox')
full_image

<a class="button fancybox" data-description="In this new view of the Andromeda, also known as M31, galaxy from the Herschel space observatory, cool lanes of forming stars are revealed in the finest detail yet." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA16682_ip.jpg" data-link="/spaceimages/details.php?id=PIA16682" data-title="Cool Andromeda" id="full_image">
					FULL IMAGE
				  </a>

In [75]:
full_image_url = full_image['data-fancybox-href']
full_image_url

'/spaceimages/images/mediumsize/PIA16682_ip.jpg'

In [76]:
featured_image_url = 'https://www.jpl.nasa.gov' + full_image['data-fancybox-href']
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA16682_ip.jpg'

In [77]:
# Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page.
# Save the tweet text for the weather report as a variable called mars_weather.
# Example:
# mars_weather = 'Sol 1801 (Aug 30, 2017), Sunny, high -21C/-5F, low -80C/-112F, pressure at 8.82 hPa, daylight 06:09-17:55'

In [78]:
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url) 

In [79]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [80]:
tweets = soup.find_all('li', class_='js-stream-item')
# tweets

In [81]:
# Find the latest tweet by 'Mars Weather', i.e. not a retweet
for tweet in tweets:
    if tweet.find('a', class_='account-group', href='/MarsWxReport'):
        break
author_block = tweet.find('span', class_='FullNameGroup')
author_block.text

'\nMars Weather\u200f\xa0'

In [82]:
text_container = tweet.find('div', class_='js-tweet-text-container')
text_container

<div class="js-tweet-text-container">
<p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" data-aria-label-part="0" lang="en">Sol 2171 (2018-09-14), high -12C/10F, low -65C/-84F, pressure at 8.79 hPa, daylight 05:43-17:59</p>
</div>

In [83]:
mars_weather = text_container.text.strip()
mars_weather

'Sol 2171 (2018-09-14), high -12C/10F, low -65C/-84F, pressure at 8.79 hPa, daylight 05:43-17:59'

In [84]:
# Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet
# including Diameter, Mass, etc.
# Use Pandas to convert the data to a HTML table string.

In [85]:
url = 'https://space-facts.com/mars/'

In [86]:
# Use the read_html function in Pandas to automatically scrape any tabular data from a page.
tables = pd.read_html(url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [87]:
type(tables)

list

In [88]:
df = tables[0]
df.columns = ['Facts about Mars', 'info']
df

Unnamed: 0,Facts about Mars,info
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [89]:
type(df)

pandas.core.frame.DataFrame

In [90]:
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Facts about Mars</th>\n      <th>info</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millen

In [91]:
# Strip unwanted newlines to clean up the table
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Facts about Mars</th>      <th>info</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>4</th>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <th>5</th>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>6</th>      <td>Surface Temperature:</td>      <td>-153 to 20 °C</td>    </tr>    <tr>      <th>7</th>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>8</th>      <td>Recorded By:</td>      <td>Egyptian ast

In [92]:
df.to_html('table.html', index=False)

In [60]:
html_table = df.to_html('table.html', index=False)

In [33]:
# Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
# You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
# Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name.
# Use a Python dictionary to store the data using the keys img_url and title.
# Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary
# for each hemisphere.
# Example:
# hemisphere_image_urls = [
#     {"title": "Valles Marineris Hemisphere", "img_url": "..."},
#     {"title": "Cerberus Hemisphere", "img_url": "..."},
#     {"title": "Schiaparelli Hemisphere", "img_url": "..."},
#     {"title": "Syrtis Major Hemisphere", "img_url": "..."},
#]

In [34]:
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [35]:
divs = soup.find_all('div', class_='description')
divs

[<div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div>,
 <div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><h3>Schiaparelli Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 35 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Schiaparelli hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. The images were acquired in 1980 during early northern…</p></div>,
 <div class="description"><a 

In [36]:
hemisphere_image_urls = []
for div in divs:
    link = div.find('a')
    href = 'https://astrogeology.usgs.gov' + link['href']
    title = div.find('h3').text
    hemisphere_image_urls.append({'title': title, 'pageURL': href})
    print(hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere Enhanced', 'pageURL': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'}]
[{'title': 'Cerberus Hemisphere Enhanced', 'pageURL': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'pageURL': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'}]
[{'title': 'Cerberus Hemisphere Enhanced', 'pageURL': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'pageURL': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'pageURL': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'}]
[{'title': 'Cerberus Hemisphere Enhanced', 'pageURL': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'pageURL': 'https://astrogeolo

In [55]:
for hemisphere_image_url in hemisphere_image_urls:
    url = hemisphere_image_url['pageURL']
    print(url)
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_='downloads')
    image_a = div.find('a', text='Sample')
    print(image_a)
    image_link = image_a['href']
    print(image_link + "\n")
    hemisphere_image_url['img_url'] = image_link
print(hemisphere_image_urls)

https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
<a href="http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg" target="_blank">Sample</a>
http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg

https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
<a href="http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg" target="_blank">Sample</a>
http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg

https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced
<a href="http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg" target="_blank">Sample</a>
http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg

https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced
<a href="http://as

In [38]:
# client = MongoClient(...)
# db = client.mission_to_mars_db
# collection.insert.one()
# collection.find.one()

In [39]:
# Create connection variable
conn = 'mongodb://localhost:27017'

# Pass connection to the pymongo instance.
client = pymongo.MongoClient(conn)

# Connect to a database. Will create one if not already available.
db = client.mission_to_mars_db

In [93]:
# Drops collection if available to remove duplicates
db.mission_to_mars.drop()

# Creates a collection in the database and inserts two documents
dict = {}
dict['latest_news_title'] = latest_news_title
dict['latest_news_paragraph'] = latest_news_paragraph

dict['featured_image_url'] = featured_image_url

dict['mars_weather'] = mars_weather
dict['table.html'] = html_table

dict['hemisphere_0_title'] = hemisphere_image_urls[0]['title']
dict['hemisphere_0_img_url'] = hemisphere_image_urls[0]['img_url']
dict['hemisphere_1_title'] = hemisphere_image_urls[1]['title']
dict['hemisphere_1_img_url'] = hemisphere_image_urls[1]['img_url']
dict['hemisphere_2_title'] = hemisphere_image_urls[2]['title']
dict['hemisphere_2_img_url'] = hemisphere_image_urls[2]['img_url']
dict['hemisphere_3_title'] = hemisphere_image_urls[3]['title']
dict['hemisphere_3_img_url'] = hemisphere_image_urls[3]['img_url']
print(dict)
db.mission_to_mars.insert_one(dict)

{'latest_news_title': 'NASA Seeking Partner in Contest to Name Next Mars Rover', 'latest_news_paragraph': 'NASA has a class assignment for corporations, nonprofits and educational organizations involved in science and space exploration: partner with the agency to inspire future engineers and scientists by sponsoring a contest to name the next rover to venture to the Red Planet.', 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA16682_ip.jpg', 'mars_weather': 'Sol 2171 (2018-09-14), high -12C/10F, low -65C/-84F, pressure at 8.79 hPa, daylight 05:43-17:59', 'table.html': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Facts about Mars</th>\n      <th>info</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n   

InvalidDocument: key 'table.html' must not contain '.'