In [29]:
# Dependencies
import re
import shutil
import time

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [30]:
# Chrome driver setup reference: https://splinter.readthedocs.io/en/latest/drivers/chrome.html
# To find where chromedriver is installed, run in a cell: !which chromedriver

In [31]:
# Resolve chromedriver from PATH so the notebook is not tied to one machine's
# directory layout; fall back to the original install location when the
# binary is not found on PATH.
executable_path = {
    'executable_path': shutil.which('chromedriver') or '/usr/local/bin/chromedriver'
}
# headless=False is passed through to the Chrome driver unchanged.
browser = Browser('chrome', **executable_path, headless=False)

## Get latest news article and paragraph

In [4]:
# URL of page to be scraped: the NASA Mars news listing, ordered by publish
# date (descending) according to the query string.
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
browser.visit(url)
# Instead of sleeping a fixed 2 s, wait (up to 10 s) for the first article
# card that the scraping cell reads. The result is deliberately ignored: if the
# element never appears, the scraping cell reports the problem.
browser.is_element_present_by_css('ul.item_list li.slide', wait_time=10)

In [6]:
# Scrape page
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# The first <li class="slide"> inside <ul class="item_list"> is the article
# whose title and teaser are extracted.
grid = soup.find('ul', class_='item_list')
article = grid.find('li', class_='slide') if grid is not None else None
if article is None:
    # Without this guard a missing list surfaces as an opaque
    # "'NoneType' object has no attribute 'find'" error.
    raise RuntimeError(
        'No news article found in the page source; the news page may not '
        'have finished loading. Re-run the previous cell and try again.'
    )

news_title = article.find('div', class_='content_title').text
news_p = article.find('div', class_='article_teaser_body').text

print('Scrapping Complete!')
print(news_title)
print(news_p)

Scrapping Complete!
NASA's Perseverance Rover Will Look at Mars Through These 'Eyes'
A pair of zoomable cameras will help scientists and rover drivers with high-resolution color images.


## Get featured image from Mars website

In [3]:
# URL of page to be scraped: JPL space images, filtered to the Mars category.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
# Instead of sleeping a fixed 2 s, wait (up to 10 s) for a link whose text
# contains 'FULL IMAGE', which is the link the scraping cell clicks.
# The result is deliberately ignored; a missing link is reported when clicked.
browser.is_element_present_by_xpath('//a[contains(., "FULL IMAGE")]', wait_time=10)

In [4]:
# Scrape page
# Open the featured image. The page source is parsed only after the click,
# because the <img class="fancybox-image"> read below is looked up in the
# post-click page (parsing before the click produced a soup that was
# immediately overwritten).
# Raises splinter's ElementDoesNotExist if no link contains 'FULL IMAGE'.
browser.links.find_by_partial_text('FULL IMAGE').click()
# Wait (up to 10 s) for the full-size image element instead of a fixed sleep.
browser.is_element_present_by_css('img.fancybox-image', wait_time=10)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

img_tag = soup.find('img', class_='fancybox-image')
if img_tag is None:
    # Without this guard a missing image surfaces as an opaque
    # "'NoneType' object is not subscriptable" error.
    raise RuntimeError(
        "Featured image not found: no <img class='fancybox-image'> in the "
        "page source after clicking 'FULL IMAGE'."
    )
image_relative_path = img_tag['src']

# src holds a relative path only, so prefix it with the site root
featured_image_url = 'https://www.jpl.nasa.gov' + image_relative_path

print('Scrapping Complete!')
featured_image_url



## Get latest weather from Twitter page

In [5]:
# URL of page to be scraped: the Mars weather report Twitter account.
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
# Instead of sleeping a fixed 2 s, wait (up to 10 s) for a <span> containing
# 'InSight sol', which is the text the scraping cell searches for.
# The result is deliberately ignored; the scraping cell reports a missing tweet.
browser.is_element_present_by_xpath('//span[contains(text(), "InSight sol")]', wait_time=10)

In [5]:
# Scrape page
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# First <span> whose text matches 'InSight sol'.
weather_span = soup.find('span', text=re.compile('InSight sol'))
if weather_span is None:
    # Without this guard a missing tweet surfaces as an opaque
    # "'NoneType' object has no attribute 'text'" error.
    raise RuntimeError(
        "No weather report found: no <span> containing 'InSight sol' in the "
        "page source. The page may not have finished loading."
    )

# Shorten the leading 'InSight sol ...' to 'Sol ...'
mars_weather = weather_span.text.replace('InSight s', 'S')

print('Scrapping Complete!')
mars_weather

Scrapping Complete!


'Sol 507 (2020-04-30) low -92.1ºC (-133.7ºF) high -2.4ºC (27.7ºF)\nwinds from the SW at 5.0 m/s (11.1 mph) gusting to 18.2 m/s (40.6 mph)\npressure at 6.80 hPa'

## Get Mars facts from the Space Facts website

In [32]:
# URL of page to be scraped: the Space Facts page for Mars.
url = 'https://space-facts.com/mars/'
browser.visit(url)
# Instead of sleeping a fixed 2 s, wait (up to 10 s) for a <table> element,
# which is what the scraping cells parse. The result is deliberately ignored.
browser.is_element_present_by_tag('table', wait_time=10)

In [63]:
# Scrape page using pandas: read_html returns every table it finds in the
# page source; the first one is used as the facts table.
html = browser.html
table = pd.read_html(html)

# Label the two columns, then use the description column as the row index.
table_df = (
    pd.DataFrame(table[0])
    .rename(columns={0: 'Description', 1: 'Value'})
    .set_index('Description')
)
# Clear the index name after using 'Description' as the index.
table_df.index.name = None

# Render to HTML without the column header row and strip the newlines.
mars_fact_table = table_df.to_html(header=False).replace('\n', '')

print('Scrapping Complete!')
mars_fact_table

Scrapping Complete!


'<table border="1" class="dataframe">  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

# Scrape page using Beautiful Soup (alternative to the pandas-only cell)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

facts_table_tag = soup.find('table', class_="tablepress tablepress-id-p-mars")
if facts_table_tag is None:
    # Without this guard, str(None) would be handed to read_html and fail
    # with an unrelated parsing error.
    raise RuntimeError(
        "Mars facts table not found: no <table class='tablepress "
        "tablepress-id-p-mars'> in the page source."
    )

# read_html returns a list of DataFrames. It is stored under its own name so
# that `mars_fact_table` keeps the HTML string built by the pandas cell, which
# the final data_entry dictionary stores under 'mars_fact_table'.
mars_fact_frames = pd.read_html(str(facts_table_tag))

print('Scrapping Complete!')
mars_fact_frames

## Get Hemisphere images

In [21]:
# URL of page to be scraped: USGS Astrogeology search results for enhanced
# Mars hemisphere images.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
# Instead of sleeping a fixed 2 s, wait (up to 10 s) for the first result link
# that the scraping cell reads. The result is deliberately ignored.
browser.is_element_present_by_css('a.itemLink.product-item', wait_time=10)

In [22]:
# Scrape page
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

hemisphere_image_urls = []
hemispheres = []
count = 0  # number of result links skipped because they carry no <h3> title

results = soup.find_all('a', class_='itemLink product-item')

# Get the hemisphere names from the main page. Links without an <h3> are
# skipped with an explicit check rather than a bare `except`, so unrelated
# errors are no longer silently swallowed.
for result in results:
    heading = result.find('h3')
    if heading is None:
        count += 1
        continue
    hemispheres.append(heading.text)

# Cycle through each hemisphere page and save its image URL
for hemisphere in hemispheres:

    # Wait (up to 10 s) for the result links to be present on the main page
    # (also after browser.back()) instead of sleeping a fixed 2 s.
    browser.is_element_present_by_css('a.itemLink.product-item', wait_time=10)

    # Go into the page of this hemisphere.
    # Raises splinter's ElementDoesNotExist if no link contains the name.
    browser.links.find_by_partial_text(hemisphere).click()
    # Wait (up to 10 s) for the full-size image element instead of sleeping.
    browser.is_element_present_by_css('img.wide-image', wait_time=10)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Scrape the relative path from src
    wide_image = soup.find('img', class_="wide-image")
    if wide_image is None:
        # Without this guard a missing image surfaces as an opaque
        # "'NoneType' object is not subscriptable" error.
        raise RuntimeError(
            "No <img class='wide-image'> found on the page for " + hemisphere
        )
    img_relative_path = wide_image['src']

    # Combine the relative path with the site root to get the full URL
    img_url = 'https://astrogeology.usgs.gov' + img_relative_path

    # Add one {'title', 'img_url'} record per hemisphere to the main list
    hemisphere_image_urls.append({'title': hemisphere, 'img_url': img_url})

    # Go back to the main page
    browser.back()

print('Scrapping Complete!')
hemisphere_image_urls



Scrapping Complete!


[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]

## Combine all scraped data into a dictionary

In [34]:
# Gather the results of the scraping cells into a single dictionary.
data_entry = {
    'news_title': news_title,
    'news_p': news_p,
    'featured_image': featured_image_url,
    'mars_weather': mars_weather,
    'mars_fact_table': mars_fact_table,
    'hemisphere_image_urls': hemisphere_image_urls,
}

data_entry

{'mars_fact_table': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>0</th>      <th>1</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>4</th>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>5</th>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>6</th>      <td>Surface Temperature:</td>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>7</th>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>8</th>      <td>Recorded By:</td>      <td>Egyptian as