In [58]:
# Dependencies
from bs4 import BeautifulSoup
import requests
from splinter import Browser
import pandas as pd
import html5lib

In [59]:
# Container for everything scraped below; each later cell adds its own key(s)
data_scrape_dict = dict()

In [60]:
# Scrape the NASA Mars news site

url = 'https://mars.nasa.gov/news/'
news_html = ""

# Load the page in Chrome and capture the HTML the browser ends up with
with Browser('chrome', headless=False) as browser:
    browser.visit(url)
    news_html = browser.html

# Parse the captured markup with the built-in 'html.parser'
soup = BeautifulSoup(news_html, features='html.parser')

# Collect every <li> element that carries the 'slide' class
results = soup.find_all(name='li', class_='slide')

In [61]:
# Loop through returned results to collect the News Title and Paragraph Text.
# news_title_all[i] and news_para_all[i] always describe the same news item.

news_title_all = []
news_para_all = []

for result in results:
    # Look up both tags before appending anything: appending the title first
    # and then failing on the paragraph would leave the two lists misaligned.
    title = result.find('h3')
    para = result.find('div', class_='rollover_description_inner')

    # find() returns None when the tag is absent; skip incomplete items
    # explicitly instead of hiding every possible error behind a bare except.
    if title is None or para is None:
        print("Skipping a news item that is missing its title or description.")
        continue

    news_title_all.append(title.text)
    news_para_all.append(para.text)

In [62]:
# Keep the most recent article (first entry of each list) and record it in data_scrape_dict

news_title = news_title_all[0]
news_p = news_para_all[0]
print(news_title, news_p, sep='\n')

data_scrape_dict.update(news_title=news_title, news_p=news_p)

Take a Walk on Mars -- in Your Own Living Room
When NASA scientists want to follow the path of the Curiosity rover on Mars, they can don a mixed-reality headset and virtually explore the Martian landscape.


In [63]:
# Scrape the JPL Mars Space featured image page

# Base URL of the JPL site; relative image links are joined onto it later
jpl_url = 'https://www.jpl.nasa.gov'

url = '{}/spaceimages/?search=&category=Mars'.format(jpl_url)
image_html = ""

# Load the page in Chrome and capture the HTML the browser ends up with
with Browser('chrome', headless=False) as browser:
    browser.visit(url)
    image_html = browser.html

soup = BeautifulSoup(image_html, features='html.parser')

# Collect every <div> element that carries the 'carousel_items' class
results = soup.find_all(name='div', class_='carousel_items')

In [64]:
# Loop through returned results to collect image data.
# If several carousel items yield a URL, the last one is kept.

# Start from None so the save step can tell whether a URL was actually found,
# instead of hitting a NameError or reusing a value left over from an earlier run.
feat_image_url = None

for result in results:
    try:
        # Retrieve image title & description
        image_title = result.find('h1', class_='media_feature_title')
        image_desc = result.a['data-description']
        print(image_title.text)
        print(image_desc)
        print(' ')

        # Retrieve full image url (the link is relative to the JPL site root)
        image_link = result.a['data-fancybox-href']
        feat_image_url = jpl_url + image_link
        print(feat_image_url)
    except (AttributeError, KeyError, TypeError) as err:
        # AttributeError: the <h1> was not found (None.text)
        # TypeError:      the item has no <a> tag (None[...])
        # KeyError:       the <a> tag lacks one of the data-* attributes
        print("Could not read featured image data: {!r}".format(err))

# Save feat_image_url to data_scrape_dict only when one was found
if feat_image_url is None:
    print("No featured image URL found; 'feat_image_url' was not saved.")
else:
    data_scrape_dict['feat_image_url'] = feat_image_url


				  Study: Third of Big Groundwater Basins in Distress				
UC Irvine studies using NASA GRACE data find a third of Earth's largest groundwater basins are being rapidly depleted by human use, despite little data about how much water remains.
 
https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA19685_ip.jpg


In [65]:
# Scrape the @MarsWxReport Twitter page

url = 'https://twitter.com/marswxreport?lang=en'
weather_html = ""

# Load the page in Chrome and capture the HTML the browser ends up with
with Browser('chrome', headless=False) as browser:
    browser.visit(url)
    weather_html = browser.html

soup = BeautifulSoup(weather_html, features='html.parser')

# Only the first <div> with the 'js-tweet-text-container' class is kept
results = soup.find(name='div', class_='js-tweet-text-container')

In [66]:
# Store the whitespace-trimmed tweet text of the weather report in data_scrape_dict

mars_weather = results.get_text().strip()
data_scrape_dict.update(mars_weather=mars_weather)
mars_weather

'Sol 1848 (Oct 17, 2017), Sunny, high -28C/-18F, low -80C/-112F, pressure at 8.65 hPa, daylight 05:59-17:42'

In [67]:
# URL of the Mars facts page; a later cell passes it to pd.read_html
facts_url = 'https://space-facts.com/mars/'

In [68]:
# Use pandas to scrape any tabular data from the facts_url page.
# pd.read_html returns a list of DataFrames, so `table` is a list;
# a later cell takes its first element as the facts table.

table = pd.read_html(facts_url)
table

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [83]:
# Turn the scraped table into a DataFrame indexed by 'Description'

# Work on a copy and avoid inplace=True so the frame stored in `table` is
# never mutated. Otherwise re-running this cell fails: the second run would
# see a one-column frame and the two-name column assignment would raise a
# length-mismatch error.
df = table[0].copy()
df.columns = ['Description', 'Value']
df = df.set_index('Description')
df

Unnamed: 0_level_0,Value
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [84]:
# Render the DataFrame as HTML, then remove new lines and the outer <table> tags
# so that only the <thead>/<tbody> markup is left.

html_table = df.to_html()
for fragment in ('\n', '<table border="1" class="dataframe">', '</table>'):
    html_table = html_table.replace(fragment, '')

# Save html_table to data_scrape_dict
data_scrape_dict.update(html_table=html_table)

html_table

'  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>    <tr>      <th>Description</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-153 to 20 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody>'

In [71]:
# Mars hemispheres image scrape

# URLs of the individual hemisphere pages
hemisphere_urls = ['https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced',
                   'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
                   'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
                   'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'
                  ]

image_html = ""

# Base URL of the USGS site; the image 'src' values are joined onto it
usgs_url = 'https://astrogeology.usgs.gov'

hemisphere_image_urls = []

# Obtain the image for each of Mars' hemispheres.
# For every page, the hemisphere title and the full-resolution image URL are
# stored in a dict, and the dicts are collected in hemisphere_image_urls.

# One browser session serves all pages; launching Chrome once per URL is
# loop-invariant work.
with Browser('chrome', headless=False) as browser:
    for hemi_url in hemisphere_urls:
        browser.visit(hemi_url)
        image_html = browser.html

        soup = BeautifulSoup(image_html, 'html.parser')

        title_tag = soup.find('h2', class_='title')
        image_tags = soup.find_all('img', class_='wide-image')

        # find() returns None and find_all() an empty list when nothing
        # matches. Skip the page rather than crash or silently reuse the
        # title/image_url left over from the previous hemisphere.
        if title_tag is None or not image_tags:
            print('Skipping {}: title or full-resolution image not found'.format(hemi_url))
            continue

        title = title_tag.text
        print(title)

        # If a page has several matching images, the last one is kept
        for image_tag in image_tags:
            image_url = usgs_url + image_tag['src']
            print(image_url)

        # Make individual dict for each hemisphere
        img_dict = {
            'title': title,
            'image_url': image_url
        }

        # Append dict to hemisphere_image_urls list
        hemisphere_image_urls.append(img_dict)

Valles Marineris Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg
Cerberus Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg
Schiaparelli Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg
Syrtis Major Hemisphere Enhanced
https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg


In [72]:
# Save hemisphere_image_urls to data_scrape_dict
data_scrape_dict['hemisphere_image_urls'] = hemisphere_image_urls

# List contains one dictionary for each hemisphere.
# Only a cell's final expression is rendered, so the display goes last.
hemisphere_image_urls

In [73]:
# Display everything the earlier cells stored in data_scrape_dict
data_scrape_dict

{'feat_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA19685_ip.jpg',
 'hemisphere_image_urls': [{'image_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg',
   'title': 'Valles Marineris Hemisphere Enhanced'},
  {'image_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg',
   'title': 'Cerberus Hemisphere Enhanced'},
  {'image_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg',
   'title': 'Schiaparelli Hemisphere Enhanced'},
  {'image_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg',
   'title': 'Syrtis Major Hemisphere Enhanced'}],
 'html_table': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Description</th>      <th>Value</th>    </tr