In [11]:
# Dependencies
import os
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
import pandas as pd
import time

In [12]:
# First URL to scrape; the latest news item is read from this page
mars_news_url = 'https://mars.nasa.gov/news'

In [13]:
def init_browser(executable_path='chromedriver.exe', headless=False):
    """Start a splinter-controlled Chrome browser.

    Args:
        executable_path: Path to the chromedriver binary. Defaults to
            'chromedriver.exe', the value that used to be hard-coded.
        headless: Forwarded to splinter's ``Browser``; defaults to False,
            the value that used to be hard-coded.

    Returns:
        The splinter ``Browser`` instance.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)

browser = init_browser()

In [17]:
# Open the news page in the splinter-driven browser
browser.visit(mars_news_url)
# Pause briefly: the page sometimes has not finished loading when the html is read for soup
time.sleep(1)


In [19]:
# Snapshot the markup currently loaded in the browser
html = browser.html
# Build the soup with Python's built-in html parser
soup = bs(html, 'html.parser')
# The first <li class="slide"> on the page is taken as the latest news item
news_latest = soup.find('li', {'class': 'slide'})

<li class="slide">
 <div class="image_and_description_container">
  <a href="/news/8378/nasas-insight-will-study-mars-while-standing-still/" target="_self">
   <div class="rollover_description">
    <div class="rollover_description_inner">
     The lander's unique science can teach us how planets are born.
    </div>
    <div class="overlay_arrow">
     <img alt="More" src="/assets/overlay-arrow.png"/>
    </div>
   </div>
   <div class="list_image">
    <img alt="This artist's concept depicts NASA's InSight lander after it has deployed its instruments on the Martian surface." src="/system/news_items/list_view_images/8378_PIA22743-320x240.jpg"/>
   </div>
   <div class="bottom_gradient">
    <div>
     <h3>
      NASA's InSight Will Study Mars While Standing Still
     </h3>
    </div>
   </div>
  </a>
  <div class="list_text">
   <div class="list_date">
    October 24, 2018
   </div>
   <div class="content_title">
    <a href="/news/8378/nasas-insight-will-study-mars-while-standing-st

In [20]:
# Headline: text of the item's "content_title" div
title_div = news_latest.find('div', class_="content_title")
news_title = title_div.text
# Teaser paragraph: text of the item's "article_teaser_body" div
teaser_div = news_latest.find('div', class_="article_teaser_body")
news_p = teaser_div.text

In [21]:
# Display the teaser text extracted above
news_p

"The lander's unique science can teach us how planets are born."

In [24]:
# Begin second scrape
mars_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(mars_image_url)
# pause so the page can finish loading
time.sleep(1)
# Click the element with id "full_image"
full_image_button = browser.find_by_id("full_image")
full_image_button.click()
time.sleep(1)
# The 'more info' button is not clicked through splinter; instead its href is
# read with bs and turned into an absolute link that the browser visits
html = browser.html
soup = bs(html, 'html.parser')
buttons_div = soup.find('div', class_="buttons")
more_info_href = buttons_div.find('a', class_="button")['href']
image_url = 'https://www.jpl.nasa.gov' + more_info_href
browser.visit(image_url)

In [25]:
# Get the large image
html = browser.html
soup = bs(html, 'html.parser')
# The first link inside the page's first <figure> supplies the image path
large_image_path = soup.find('figure').find('a')['href']
featured_image_url = 'https://www.jpl.nasa.gov' + large_image_path
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA13664_hires.jpg'

In [26]:
# Scrape third website: the marswxreport Twitter timeline
twitter_mars_weather_url = "https://twitter.com/marswxreport?lang=en"
browser.visit(twitter_mars_weather_url)
# Parse the page currently loaded in the browser
html = browser.html
soup = bs(html, 'html.parser')

In [27]:
#print(soup.prettify())

In [28]:
# Not every tweet is a weather report, so take the first tweet whose text
# begins with the word "Sol".
weather_tweets = soup.find_all('li', {"class": ["js-stream-item", "stream-item"]})
# Stays None when no weather tweet is present, instead of silently keeping
# the text of whichever tweet happened to be last.
mars_weather = None
for weather_tweet in weather_tweets:
    paragraph = weather_tweet.find('p')
    if paragraph is None:
        # list item without a text paragraph: nothing to inspect
        continue
    tweet = paragraph.text
    # [:1] keeps the check safe for empty or whitespace-only tweet text
    if tweet.split()[:1] == ['Sol']:
        mars_weather = tweet
        break
mars_weather

'Sol 2174 (2018-09-17), high -22C/-7F, low -68C/-90F, pressure at 8.96 hPa, daylight 05:45-18:01'

In [29]:
# Use pandas to scrape table data; read_html returns a list of DataFrames, one per table found
mars_facts_url = "https://space-facts.com/mars"
facts_table = pd.read_html(mars_facts_url)
# Show the first table
facts_table[0]

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [30]:
# Work on a copy so the frame held in facts_table[0] (already displayed in an
# earlier cell) keeps its original column labels
mars_facts_df = facts_table[0].copy()
# Set column values
mars_facts_df.columns = ['Description', 'Values']
# Set first column as index
mars_facts_df = mars_facts_df.set_index('Description')
# Save to an html string; add border and class name for the table
mars_facts_html_table = mars_facts_df.to_html(border=1, classes="mars_facts_table")

In [31]:
# Drop every newline character so the table markup becomes a single line
mars_facts_html_table = "".join(mars_facts_html_table.split("\n"))
mars_facts_html_table

'<table border="1" class="dataframe mars_facts_table">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Values</th>    </tr>    <tr>      <th>Description</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-153 to 20 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [32]:
# Scrape fourth website: USGS astrogeology search results for the Mars hemispheres
mars_hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(mars_hemispheres_url)
# Parse the page currently loaded in the browser
html = browser.html
soup = bs(html, 'html.parser')

In [33]:
# Collect the <div class="item"> elements; the first link inside each one is
# followed later to reach a hemisphere's own page
image_hrefs = soup.find_all('div', class_="item")

In [34]:
# Visit each item's page in turn and record its title and image link.
# The result is a list of {'title': ..., 'img_url': ...} dicts.
hemisphere_image_urls = []
for image_href in image_hrefs:
    detail_path = image_href.find('a')['href']
    browser.visit("https://astrogeology.usgs.gov/" + detail_path)
    html = browser.html
    soup = bs(html, 'html.parser')
    # title comes from the <h2 class="title">; the url from the link labelled "Original"
    title_text = soup.find('h2', class_="title").text
    original_href = soup.find('a', text="Original")['href']
    hemisphere_image_urls.append({'title': title_text, 'img_url': original_href})

hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]

In [35]:
# Start the combined result dict with the facts table markup
mars_data_dict = {'facts': mars_facts_html_table}

In [36]:
# Display the dict assembled so far
mars_data_dict

{'facts': '<table border="1" class="dataframe mars_facts_table">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Values</th>    </tr>    <tr>      <th>Description</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-153 to 20 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'}

# Used for debugging python function
import os
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
import pandas as pd
import time

def init_browser(executable_path='chromedriver.exe', headless=False):
    """Start a splinter-controlled Chrome browser.

    Args:
        executable_path: Path to the chromedriver binary. Defaults to
            'chromedriver.exe', the value that used to be hard-coded.
        headless: Forwarded to splinter's ``Browser``; defaults to False,
            the value that used to be hard-coded.

    Returns:
        The splinter ``Browser`` instance.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)

def _first_weather_report(tweet_items):
    """Return the text of the first tweet whose first word is 'Sol'.

    Args:
        tweet_items: Iterable of BeautifulSoup elements, one per tweet.

    Returns:
        The tweet text, or None when no item qualifies (including when the
        iterable is empty or an item has no <p> element).
    """
    for tweet_item in tweet_items:
        paragraph = tweet_item.find('p')
        if paragraph is None:
            continue
        tweet = paragraph.text
        # [:1] keeps the check safe for empty or whitespace-only tweet text
        if tweet.split()[:1] == ['Sol']:
            return tweet
    return None


def scrape():
    """Scrape the Mars pages and collect the results in one dict.

    Returns:
        dict with the keys 'news_title', 'news_p', 'featured_image_url',
        'mars_weather' (None when no tweet starting with 'Sol' is found),
        'mars_facts_html_table' and 'hemisphere_image_urls' (a list of
        {'title', 'img_url'} dicts).

    The browser opened by init_browser() is always closed, even when one of
    the scraping steps raises.
    """
    browser = init_browser()
    try:
        # create mars data dict that we can insert into mongo
        mars_data_dict = {}

        # --- Latest news ---
        mars_news_url = 'https://mars.nasa.gov/news'
        browser.visit(mars_news_url)
        # give the page a moment to load before reading its html
        time.sleep(1)
        html = browser.html
        soup = bs(html, 'html.parser')
        # find first element which will be latest news
        news_latest = soup.find('li', class_='slide')
        news_title = news_latest.find('div', class_="content_title").text
        news_p = news_latest.find('div', class_="article_teaser_body").text
        mars_data_dict['news_title'] = news_title
        mars_data_dict['news_p'] = news_p

        # --- Featured image ---
        mars_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(mars_image_url)
        time.sleep(1)
        full_image_button = browser.find_by_id("full_image")
        full_image_button.click()
        time.sleep(1)
        # The 'more info' button is not clicked through splinter; its href is
        # read with bs and visited directly
        html = browser.html
        soup = bs(html, 'html.parser')
        image_url = 'https://www.jpl.nasa.gov' + soup.find('div', class_="buttons").find('a', class_="button")['href']
        browser.visit(image_url)
        time.sleep(1)
        # Get the large image
        html = browser.html
        soup = bs(html, 'html.parser')
        featured_image_url = 'https://www.jpl.nasa.gov' + soup.find('figure').find('a')['href']
        mars_data_dict['featured_image_url'] = featured_image_url

        # --- Weather tweet ---
        twitter_mars_weather_url = "https://twitter.com/marswxreport?lang=en"
        browser.visit(twitter_mars_weather_url)
        time.sleep(1)
        html = browser.html
        soup = bs(html, 'html.parser')
        weather_tweets = soup.find_all('li', {"class": ["js-stream-item", "stream-item"]})
        # sometimes first tweet isn't always weather so search through tweets
        mars_data_dict['mars_weather'] = _first_weather_report(weather_tweets)

        # --- Facts table ---
        mars_facts_url = "https://space-facts.com/mars"
        facts_table = pd.read_html(mars_facts_url)
        mars_facts_df = facts_table[0]
        mars_facts_df.columns = ['Description', 'Values']
        mars_facts_df = mars_facts_df.set_index('Description')
        mars_facts_html_table = mars_facts_df.to_html(border=1, classes="mars_facts_table")
        mars_facts_html_table = mars_facts_html_table.replace("\n", "")
        mars_data_dict['mars_facts_html_table'] = mars_facts_html_table

        # --- Hemisphere images ---
        mars_hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        browser.visit(mars_hemispheres_url)
        time.sleep(1)
        html = browser.html
        soup = bs(html, 'html.parser')
        image_hrefs = soup.find_all('div', class_="item")

        hemisphere_image_urls = []
        for image_href in image_hrefs:
            browser.visit("https://astrogeology.usgs.gov/" + image_href.find('a')['href'])
            time.sleep(1)
            html = browser.html
            soup = bs(html, 'html.parser')
            image_dict = {}
            image_dict['title'] = soup.find('h2', class_="title").text
            # uses the link labelled "Sample" on each hemisphere page
            image_dict['img_url'] = soup.find('a', text="Sample")['href']
            hemisphere_image_urls.append(image_dict)
        mars_data_dict['hemisphere_image_urls'] = hemisphere_image_urls

        return mars_data_dict
    finally:
        # always release the browser, whether scraping succeeded or raised
        browser.quit()

# Run the full scrape once and show the returned dict as the cell output
temp = scrape()
temp