In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

# Step 1 - Scraping

## NASA Mars News

In [2]:
# A plain HTTP fetch was attempted first, but the HTML it returns is not the
# same as what the browser displays, so splinter drives a real Chrome session
# and the rendered page source is parsed instead.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# URL of page to be scraped
mars_news_url = 'https://mars.nasa.gov/news/'

# Collected articles: one {"news_title", "news_p"} dict per news slide
nasa_mars_news = []

try:
    browser.visit(mars_news_url)
    # browser.html is a plain string snapshot, so parsing can happen after quit()
    html = browser.html
finally:
    # Always release the Chrome session, even if the visit fails
    browser.quit()

soup_result = BeautifulSoup(html, 'html.parser')

# Retrieve the parent elements for all articles
results = soup_result.find_all(class_='slide')

for result in results:
    title_tag = result.find(class_='content_title')
    teaser_tag = result.find(class_='article_teaser_body')
    # Skip slides missing either part instead of hiding every error behind a
    # bare `except`; only complete title/teaser pairs are recorded
    if title_tag is None or teaser_tag is None:
        continue
    news_title = title_tag.text
    news_p = teaser_tag.text
    nasa_mars_news.append({
        "news_title": news_title,
        "news_p": news_p,
    })

nasa_mars_news

[{'news_title': 'NASA Social Media and Websites Win Webby Awards ',
  'news_p': 'NASA\'s social media presence, the InSight mission social media accounts, NASA.gov and SolarSystem.NASA.gov will be honored at the 2019 Webby Awards - "the Oscars of the Internet."'},
 {'news_title': "NASA's InSight Detects First Likely 'Quake' on Mars",
  'news_p': 'While their causes are still unknown, one of three shaking events looks a lot like the quakes detected on the Moon by the Apollo missions.'},
 {'news_title': "Things Are Stacking up for NASA's Mars 2020 Spacecraft",
  'news_p': 'As the July 2020 launch date inches closer, the next spacecraft headed to the Red Planet is assembled for more testing.'},
 {'news_title': "Curiosity Tastes First Sample in 'Clay-Bearing Unit'",
  'news_p': 'This new region on Mars might reveal more about the role of water on Mount Sharp.'},
 {'news_title': "More Testing for Mars InSight's 'Mole'",
  'news_p': "After the mission's heat probe began hammering last week, 

## JPL Mars Space Images - Featured Image

In [3]:
# Drive a visible Chrome window with splinter to reach the current
# Featured Mars Image on the JPL site.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

# First page: JPL Featured Space Image listing, filtered to Mars
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

# Follow the link whose text contains 'FULL IMAGE'
browser.click_link_by_partial_text('FULL IMAGE')

In [4]:
# Second page: in the already-open `browser` session, follow the link whose
# text contains 'more info'.
# NOTE(review): presumably this lands on the image detail page whose HTML is
# parsed afterwards - confirm against the live site.
browser.click_link_by_partial_text('more info')

In [5]:
# Parse the page currently shown in the open `browser` session
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Candidate download entries; the one whose text mentions "JPG" is wanted
image_urls = soup.find_all('div', class_='download_tiff')

# Start from None so a page without a JPG link fails loudly instead of raising
# NameError on a fresh kernel or silently reusing a value from an earlier run
featured_image_url = None
for image_url in image_urls:
    link = image_url.find('a')
    if "JPG" in image_url.text and link is not None and link.get('href'):
        # As before, the last matching entry wins
        featured_image_url = link['href']

if featured_image_url is None:
    raise ValueError("No JPG download link found in the 'download_tiff' entries")

# In the recorded run the href was protocol-relative ('//host/path');
# prepend the scheme only in that case so an absolute URL is left intact
if featured_image_url.startswith('//'):
    featured_image_url = 'https:' + featured_image_url
print(featured_image_url)

https://photojournal.jpl.nasa.gov/jpeg/PIA16567.jpg


In [6]:
# Close the splinter-driven browser session once the image URL has been read
browser.quit()

## Mars Weather

In [7]:
# URL of page to be scraped
# Mars Weather twitter account
mars_weather_twitter_url = 'https://twitter.com/marswxreport?lang=en'

# Retrieve page with the requests module; the timeout keeps a stalled
# connection from hanging the kernel and raise_for_status surfaces HTTP errors
response = requests.get(mars_weather_twitter_url, timeout=30)
response.raise_for_status()

# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

# Only the first tweet container on the page is needed, hence limit=1
results = soup.find_all(class_='js-tweet-text-container', limit=1)
if not results:
    raise ValueError('No tweet container found at ' + mars_weather_twitter_url)

result = results[0]
tweet_text = result.find(class_='TweetTextSize')
if tweet_text is None:
    raise ValueError('Tweet container has no TweetTextSize element')
mars_weather = tweet_text.text

# The tweet text may end with an embedded timeline link; cut it off when it is
# there, and keep the full text when the tweet carries no such link
link_tag = result.find(class_='twitter-timeline-link')
time_link = link_tag.text if link_tag is not None else ''
if time_link:
    mars_weather = mars_weather.split(time_link)[0]
print(mars_weather)

InSight sol 148 (2019-04-27) low -99.1ºC (-146.4ºF) high -18.2ºC (-0.7ºF)
winds from the W at 4.2 m/s (9.3 mph) gusting to 14.3 m/s (31.9 mph)
pressure at 7.40 hPa


## Mars Facts

In [8]:
# Mars Facts webpage
mars_facts_url = 'http://space-facts.com/mars/'

# NOTE(review): an earlier attempt called pd.read_html(url) and got
# "No tables found"; `url` there was the JPL image page from a previous cell,
# not mars_facts_url, which may be the whole reason - confirm before
# simplifying this cell to a direct pd.read_html(mars_facts_url).

# Fetch the page ourselves and hand only the first <table> to pandas
res = requests.get(mars_facts_url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at ' + mars_facts_url)

# Wrap the markup in StringIO: passing literal HTML text straight to
# read_html is deprecated in recent pandas, a file-like object is accepted
html_pd = pd.read_html(StringIO(str(table)))

In [9]:
# pd.read_html returned a list of DataFrames; take the first one
html_df = html_pd[0]
# Label the two columns (this renames them on the frame held in html_pd too,
# since html_df is the same object)
html_df.columns = ['Description', 'Facts']
# Display the resulting facts table
html_df

Unnamed: 0,Description,Facts
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


## Mars Hemispheres

In [63]:
# USGS Astrogeology site
astrogeology_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# Collected records: one {"title", "img_url"} dict per download link found
hemisphere_image_urls = []

# Use splinter to open each hemisphere's page from the search results and
# read the image download link from it
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit(astrogeology_url)

    # The result list itself is parsed from a plain HTTP fetch; the browser is
    # only used to click through to each result and back
    response = requests.get(astrogeology_url, timeout=30)
    response.raise_for_status()

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(response.text, 'html.parser')

    # One entry per search-result link
    hemisphere_results = soup.find_all(class_='itemLink')

    for result in hemisphere_results:
        description = result.find(class_='description')
        if description is None:
            # No description means no title text to click on; skip this link
            continue
        title = description.text
        browser.click_link_by_partial_text(title)

        html = browser.html
        soup_result = BeautifulSoup(html, 'html.parser')
        # Second <dd> of the first <dl> inside the content block holds the link
        image_urls = soup_result.find(class_='content').find(class_='block').find('dl').find('dd').find_next('dd')

        # Direct children carrying an href are the download links. Filtering
        # explicitly replaces the old bare `except`, which also hid real errors
        for image_url in image_urls.find_all(href=True, recursive=False):
            img_url = image_url['href']
            hemisphere_image_urls.append({
                "title": title,
                "img_url": img_url,
            })

        # always go back to the initial url window
        browser.back()
finally:
    # close browser, even when a lookup or click above fails
    browser.quit()

In [64]:
# Display the collected list of {"title", "img_url"} records
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]

# Step 2 - MongoDB and Flask Application