# Mission to Mars

## Step 1 - Scraping

### Importing dependencies

In [1]:
# Importing dependencies
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
import requests
import time

### NASA Mars News
* Scrape the NASA Mars News site and collect the latest news title and paragraph text

In [2]:
# Root of the NASA Mars site; relative article links are joined onto this later
base_url = 'https://mars.nasa.gov'
# Listing page from which the newest headline and its article link are scraped
news_url = 'https://mars.nasa.gov/news/'



In [3]:
# Location of the chromedriver binary, packaged as keyword arguments for splinter
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# Start a visible (non-headless) Chrome session driven by splinter
browser = Browser('chrome', headless=False, **executable_path)

In [4]:
# Point the browser at the news listing page
browser.visit(news_url)
# Capture the page source as the browser currently holds it ...
news_html = browser.html
# ... and hand it to Beautiful Soup for querying
news_soup = bs(news_html, 'html.parser')

In [5]:
# The first 'bottom_gradient' div on the listing page holds the first story's title
title_div = news_soup.find('div', class_='bottom_gradient')
news_title = title_div.text
print(news_title)

Mars InSight Lander Seen in First Images from Space 


In [6]:
# Relative link to the first story: take the first anchor carrying an href inside
# the first 'image_and_description_container' div. It is joined to base_url in the
# next step to reach the article itself.
story_container = news_soup.find('div', class_='image_and_description_container')
story_links = story_container.find_all('a', href=True)
news_p_url = story_links[0]['href']
print(news_p_url)

/news/8400/mars-insight-lander-seen-in-first-images-from-space/


In [7]:
# Absolute address of the latest article: site root followed by the relative link
p_url = ''.join((base_url, news_p_url))
print(p_url)

https://mars.nasa.gov/news/8400/mars-insight-lander-seen-in-first-images-from-space/


In [8]:
# Open the article page so its paragraph text can be read
browser.visit(p_url)
# Capture the article's page source ...
p_html = browser.html
# ... and parse it with Beautiful Soup
p_soup = bs(p_html, 'html.parser')

In [9]:
# The article summary is taken from the second <p> element on the page (index 1)
paragraphs = p_soup.find_all('p')
news_p = paragraphs[1].text
print(news_p)

On Nov. 26, NASA's InSight mission knew the spacecraft touched down within an 81-mile-long (130-kilometer-long) landing ellipse on Mars. Now, the team has pinpointed InSight's exact location using images from HiRISE, a powerful camera onboard another NASA spacecraft, Mars Reconnaissance Orbiter (MRO).


### JPL Mars Space Images
* Visiting the JPL url
* Using splinter to navigate the site and find the image url for the current Featured Mars Image
* Save a complete URL string

In [10]:
# Close the Chrome session opened for the news scrape before starting another one;
# rebinding `browser` without quitting would leave the old window and its
# chromedriver process running in the background.
browser.quit()
# setting up the path for the Chrome driver
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# using splinter browser to execute through Chrome (visible, non-headless window)
browser = Browser('chrome', **executable_path, headless=False)

In [11]:
# Site root used later to turn the relative image path into a full URL
jpl_base_url = 'https://www.jpl.nasa.gov'
# Space-images page, filtered to the Mars category, that gets scraped
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
# Load that page in the browser
browser.visit(jpl_url)

In [12]:
# Capture the page source and parse it
jpl_html = browser.html
jpl_soup = bs(jpl_html, 'html.parser')
# First attempt: grab the first <article> inside the carousel
carousel = jpl_soup.find('div', class_='carousel_items')
images = carousel.find('article')
print(images)
# This prints the whole element; the image path sits inside its style attribute

<article alt="Voyager 1 Entering Interstellar Space (Artist Concept)" class="carousel_item" style="background-image: url('/spaceimages/images/wallpaper/PIA17462-1920x1200.jpg');">
<div class="default floating_text_area ms-layer">
<h2 class="category_title">
</h2>
<h2 class="brand_title">
				  FEATURED IMAGE
				</h2>
<h1 class="media_feature_title">
				  Voyager 1 Entering Interstellar Space (Artist Concept)				</h1>
<div class="description">
</div>
<footer>
<a class="button fancybox" data-description="This artist's concept depicts NASA's Voyager 1 spacecraft entering interstellar space. " data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA17462_ip.jpg" data-link="/spaceimages/details.php?id=PIA17462" data-title="Voyager 1 Entering Interstellar Space (Artist Concept)" id="full_image">
					FULL IMAGE
				  </a>
</footer>
</div>
<div class="gradient_container_top"></div>
<div class="gradient_container_bottom"></div>
</article>


In [13]:
# Second attempt: read only the style attribute of the carousel's first <article>
featured_article = jpl_soup.find('div', class_='carousel_items').find('article')
images = featured_article['style']
print(images)
# Closer, but the path is still wrapped in the CSS "background-image: url('...');" text

background-image: url('/spaceimages/images/wallpaper/PIA17462-1920x1200.jpg');


In [14]:
# Pull the path out of the CSS declaration "background-image: url('<path>');".
# str.strip() treats its argument as a SET of characters to trim from both ends,
# not as a literal prefix, so it would also eat the start of any path that begins
# with one of those characters. Splitting on the url( ... ) delimiters is exact.
# Taking [-1] / [0] leaves the value untouched when a delimiter is absent, so the
# cell is safe to re-run on an already-extracted path.
images = images.split('url(', 1)[-1].split(')', 1)[0].strip('\'" ')

In [15]:
# Trim any quote / closing-parenthesis characters left at either end of the value
images = images.lstrip("')").rstrip("')")
print(images)
# The bare relative image path is what the next step needs

/spaceimages/images/wallpaper/PIA17462-1920x1200.jpg


In [16]:
# Full image address: JPL site root followed by the relative image path
featured_image_url = ''.join((jpl_base_url, images))
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA17462-1920x1200.jpg


### Mars Weather
* Visit the Mars Weather Twitter account and scrape the latest Mars weather tweet

In [17]:
# Mars Weather account page that carries the latest weather report tweet
tweet_url = 'https://twitter.com/marswxreport?lang=en'
# requests has no default timeout, so set one to keep the notebook from hanging
# indefinitely on an unresponsive server
tweetRequest = requests.get(tweet_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
tweetRequest.raise_for_status()
# Parse the returned HTML with Beautiful Soup
tweet_soup = bs(tweetRequest.text, 'html.parser')

In [18]:
# The first tweet-text container on the page holds the most recent tweet
tweet_container = tweet_soup.find('div', class_='js-tweet-text-container')
# The container's text is padded with newlines; strip them so only the
# weather report itself is stored
mars_weather = tweet_container.text.strip()
print(mars_weather)


Sol 2258 (2018-12-13), high -6C/21F, low -70C/-93F, pressure at 8.41 hPa, daylight 06:37-18:51



### Mars Facts
* Visit the Mars Facts webpage and use Pandas to scrape the table containing facts about the planet
* Use Pandas to convert the data to an HTML table string

In [19]:
# Page whose facts table about Mars is read with Pandas
facts_url = 'http://space-facts.com/mars/'

In [20]:
# pd.read_html fetches the page and returns every HTML table it finds,
# each as its own DataFrame
tables = pd.read_html(facts_url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [21]:
# Confirm what kind of object read_html returned before indexing into it
type(tables)

list

In [22]:
# Take the first (and, in this case, only) table from the list. Work on a copy so
# the raw scrape in `tables` is left untouched and this cell can be re-run safely.
df = tables[0].copy()
# Give the two unnamed columns descriptive labels
df.columns = ['Profile Description', 'Facts']
# Every scraped row is real data, so no rows are dropped here.
# Use the description column as the index; assigning the result (rather than
# inplace=True) keeps the transformation explicit.
df = df.set_index('Profile Description')
df

Unnamed: 0_level_0,Facts
Profile Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [23]:
# Render the facts DataFrame as an HTML <table> string via DataFrame.to_html
html_table = df.to_html()
html_table


'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Facts</th>\n    </tr>\n    <tr>\n      <th>Profile Description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-153 to 20 째C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n 

### Mars Hemispheres
* Visiting the USGS Astrogeology site to obtain high-resolution images for each of Mars's hemispheres
* Going to each hemisphere's link to find the image url to the full resolution image
* Saving both the image url string and the Hemisphere title 
* Using a Python dictionary and appending the title and image url

In [24]:
# Close the Chrome session left over from the previous section before starting a
# new one; rebinding `browser` without quitting would leave the old window and
# its chromedriver process running.
browser.quit()
# setting up the path for the Chrome driver
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# using splinter browser to execute through Chrome (visible, non-headless window)
browser = Browser('chrome', **executable_path, headless=False)
# variable to hold base url for Mars Hemispheres information
mh_base_url = 'https://astrogeology.usgs.gov'
# variable to hold url with Mars Hemispheres information
mh_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
# load the search results page in the browser
browser.visit(mh_url)

In [25]:
# Capture the search results page source
mh_html = browser.html
# Parse it with Beautiful Soup
mh_soup = bs(mh_html, 'html.parser')
# Each 'description' div is one search result; its link to the hemisphere's
# detail page is read in the next cell
hemispheres = mh_soup.find_all('div', class_='description')
print(len(hemispheres))

4


In [26]:
# One {'title': ..., 'img_url': ...} dictionary is collected per hemisphere
hemisphere_image_urls = []

for item in hemispheres:
    # each search result carries a relative link to the hemisphere's detail page
    mh_href = item.find('a', href=True)['href']

    # combine it with the site root to get the full detail-page url
    full_url = mh_base_url + mh_href

    # load the detail page and parse its source with Beautiful Soup
    browser.visit(full_url)
    mh_soup_links = bs(browser.html, 'html.parser')

    # hemisphere title as shown in the page's <h2 class="title">
    mh_href_title = mh_soup_links.find('h2', class_="title").text

    # Resolve the image link for THIS page only. Resetting img_url on every pass
    # prevents a page without a usable downloads block from silently reusing the
    # previous hemisphere's link (or raising NameError on the first pass).
    img_url = None
    for downloads in mh_soup_links.find_all('div', class_='downloads'):
        links = downloads.find_all('a', href=True)
        if len(links) > 1:
            # the second anchor in the downloads block is the image link collected here
            img_url = links[1]['href']

    if img_url is None:
        # report and skip rather than store a wrong or missing url
        print('No image link found for ' + mh_href_title + ' -- skipping')
        continue

    # append the title and image link to the list of dictionaries
    hemisphere_image_urls.append({'title': mh_href_title, 'img_url': img_url})

# scraping is finished, so shut down the Chrome session instead of leaving it open
browser.quit()

print(hemisphere_image_urls)
    


[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]


## Step 2 - MongoDB and Flask Application

### Using MongoDB with Flask to create a new HTML page
* Convert the Jupyter Notebook into a Python script
* Create a route called /scrape
* Create a root route
* Create a template HTML file

See scrape_mars.py, mars_app.py and index.html for results