In [1]:
# Step 1: Import the libraries and modules
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
from flask import Flask, render_template, redirect
from flask_pymongo import PyMongo
import pandas as pd
import requests
import re

In [2]:
# Step 2: Initialize Browser function from splinter library (necessary to begin all scrape functions below).
def init_browser():
    """Open a visible (non-headless) Chrome window driven by splinter.

    Returns:
        The splinter ``Browser`` object. Every scraper in this notebook
        calls ``quit()`` on it once it is done.
    """
    # @NOTE: Replace "chromedriver" with your actual path to the chromedriver
    return Browser("chrome", executable_path="chromedriver", headless=False)

In [None]:
# Step 3: Run Scrape functions

In [2]:
# Step 3.1.a
# First Scrape: Scrape latest headline and subheadline from mars.nasa.gov/news/
def scrape_mars_info():
    """Scrape the first headline and teaser shown on the NASA Mars news page.

    Returns:
        dict: ``{"headline": <title text>, "subhead": <teaser text>}``.

    Raises:
        AttributeError: if the page has no ``content_title`` or no
            ``article_teaser_body`` div (``soup.find`` returns ``None``).
    """
    browser = init_browser()
    try:
        # Visit the NASA Mars news page
        url = "https://mars.nasa.gov/news/"
        browser.visit(url)

        # Brief pause before reading the page source
        time.sleep(1)

        # Scrape page into Soup
        soup = bs(browser.html, "html.parser")

        # Text of the first matching div of each class
        title_text = soup.find('div', class_='content_title').text
        p_text = soup.find('div', class_='article_teaser_body').text
    finally:
        # Close the browser even when the visit or the parse fails, so a
        # failed scrape does not leave a Chrome window running.
        browser.quit()

    # Return results
    return {
        "headline": title_text,
        "subhead": p_text
    }

In [3]:
# Step 3.1.b
# First scrape verification: run the scraper once (this opens a browser window)
# and display the returned dictionary with its "headline" and "subhead" keys.
scrape_mars_info()

{'headline': 'Mars InSight Lander Seen in First Images from Space ',
 'subhead': "Look closely, and you can make out the lander's solar panels."}

In [9]:
# Step 3.2.a
# Second scrape: Scrape featured image from jpl.nasa.gov/spaceimages.  Make sure image is full size jpg.
def scrape_mars_img():
    """Scrape the featured Mars image URL from the JPL space-images page.

    The URL is read from the inline ``style`` attribute of the first
    ``article.carousel_item`` element: the text between the first and the
    last single quote of that attribute is appended to
    ``https://www.jpl.nasa.gov``.

    Returns:
        dict: ``{"featured_image_url": <absolute image URL>}``.

    Raises:
        IndexError: if the page has no ``article.carousel_item`` element.
        AttributeError: if the style attribute holds no single-quoted text
            (``re.search`` returns ``None``).
    """
    browser = init_browser()
    try:
        # Visit the JPL space-images page filtered to Mars
        url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(url)

        # Brief pause before reading the page source
        time.sleep(1)

        # Scrape page into Soup
        soup = bs(browser.html, "html.parser")

        # Get the current img: the path sits inside the quotes of the style text
        url_text = soup.select('article.carousel_item')[0]['style']
        partial_url = re.search("(?<=').+(?=')", url_text).group()
        full_url = "https://www.jpl.nasa.gov" + partial_url
    finally:
        # Close the browser even when the visit or the parse fails, so a
        # failed scrape does not leave a Chrome window running.
        browser.quit()

    return {
        "featured_image_url": full_url
    }

In [10]:
# Step 3.2.b
# Second scrape verification: run the scraper once and display the returned
# dictionary, whose single key is "featured_image_url".
scrape_mars_img()

{'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA19974-1920x1200.jpg'}

In [16]:
# Step 3.3.a
# Third Scrape: Scrape latest Mars weather from Mars Weather Twitter page.
# Note: Tweets are not always weather. Non-weather tweets will not render correctly in html.
def scrape_mars_weather():
    """Scrape the text of the first tweet on the Mars Weather Twitter page.

    The first tweet is not always a weather report, so the returned text
    may be unrelated to weather.

    Returns:
        dict: ``{"mars_weather": <tweet text>}``. The text is returned as
        found, including any surrounding newline characters.

    Raises:
        AttributeError: if the page has no ``js-tweet-text-container`` div
            (``soup.find`` returns ``None``).
    """
    browser = init_browser()
    try:
        # Visit the Mars Weather Twitter page
        url = "https://twitter.com/marswxreport?lang=en"
        browser.visit(url)

        # Brief pause before reading the page source
        time.sleep(1)

        # Scrape page into Soup
        soup = bs(browser.html, "html.parser")

        # Get the latest tweet: first tweet-text container on the page
        mars_weather = soup.find('div', class_='js-tweet-text-container').text
    finally:
        # Close the browser even when the visit or the parse fails, so a
        # failed scrape does not leave a Chrome window running.
        browser.quit()

    # Return results
    return {
        "mars_weather": mars_weather
    }

In [17]:
# Step 3.3.b
# Third scrape verification: run the scraper once and display the returned
# dictionary, whose single key is "mars_weather".
scrape_mars_weather()

{'mars_weather': '\nThe InSight lander, discarded heatshield, back shell and parachute have been spotted from orbit by the @HiRISE camera aboard the Mars Reconnaissance Orbiter  https://www.uahirise.org/releases/insight/hardware/\xa0…pic.twitter.com/6BM7QMWrU4\n'}

In [3]:
# Step 3.4.a
# Fourth Scrape: Parse Mars Fact table from space-facts.com/mars
# Parsed HTML table using requests and beautifulsoup. Results from two functions saved in a class.
 
# Page that holds the Mars facts table.
url = "https://space-facts.com/mars/"
# Fetch the page once as a quick check.
# NOTE(review): HTMLTableParser.parse_url downloads the URL again itself, so
# this response is not reused by the cells shown here.
response = requests.get(url)
# Not the last expression of the cell, so this preview is not displayed.
response.text[:100] # Access the HTML with the text property

# Create class to hold results of parse_url and parse_html_table functions
class HTMLTableParser:
    """Turn the ``<table>`` elements of a web page into pandas DataFrames."""

    def parse_url(self, url):
        """Download ``url`` and parse every ``<table>`` found in it.

        Args:
            url: Address of the page to fetch with ``requests``.

        Returns:
            list: One ``(table_id, DataFrame)`` tuple per ``<table>``
            element. ``table_id`` is ``None`` when the table has no ``id``
            attribute.
        """
        response = requests.get(url)
        soup = bs(response.text, 'lxml')
        # .get() rather than ['id']: a single table without an id attribute
        # must not abort the parse of the whole page with a KeyError.
        return [(table.get('id'), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame.

        Args:
            table: Object exposing ``find_all('tr')``; each row must expose
                ``find_all('td')`` / ``find_all('th')`` and each cell
                ``get_text()``.

        Returns:
            pandas.DataFrame: One row per ``<tr>`` that contains ``<td>``
            cells. Columns are named after the first row that has ``<th>``
            cells, or numbered from 0 when there is none. Columns whose
            values all convert to float are converted; the others keep
            their text.

        Raises:
            Exception: if column titles were found but their count differs
                from the number of cells in the first data row.
        """
        # Read the rows once; both passes below walk the same list.
        rows = table.find_all('tr')

        n_columns = 0
        n_rows = 0
        column_names = []

        # First pass: size the frame and pick up the column titles if any
        for row in rows:
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows += 1
                if n_columns == 0:
                    # The first data row fixes the number of columns
                    n_columns = len(td_tags)

            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                column_names = [th.get_text() for th in th_tags]

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        df = pd.DataFrame(columns=columns, index=range(0, n_rows))

        # Second pass: copy the cell text; rows without <td> cells are skipped
        row_marker = 0
        for row in rows:
            cells = row.find_all('td')
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()
            if len(cells) > 0:
                row_marker += 1

        # Convert to float if possible; text columns are deliberately left as-is
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

In [8]:
# 3.4.b
# Instantiate the HTMLTableParser class
hp = HTMLTableParser()
# 3.4.c
# parse_url returns a list of (table id, DataFrame) tuples; keep the DataFrame
# of the first table on the page.
table = hp.parse_url(url)[0][1] # Grabbing the table from the tuple
# 3.4.d
# Display the DataFrame to verify the parse.
table

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km\n"
1,Polar Diameter:,"6,752 km\n"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)\n
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [12]:
# 3.4.e
# Convert DataFrame into an HTML string. index=False leaves out the row index
# and classes='table' adds "table" to the class attribute of the <table> tag.
# NOTE(review): escape=False leaves characters such as "&" in the scraped text
# unescaped in the markup (see "Phobos & Deimos" in the output) — confirm this
# is intended before the string is rendered in a page.
html_table = table.to_html(classes='table',index=False,escape=False)
html_table

'<table border="1" class="dataframe table">\n  <thead>\n    <tr style="text-align: right;">\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km\\n</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km\\n</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos & Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)\\n</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

<table border="1" class="dataframe table">\n  <thead>\n    <tr style="text-align: right;">\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km\\n</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km\\n</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos & Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)\\n</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>

In [13]:
# Fourth Scrape Condensed for copy/paste purposes into scrape_mars.py file.

# Page that holds the Mars facts table.
url = "https://space-facts.com/mars/"
# Fetch the page once as a quick check.
# NOTE(review): HTMLTableParser.parse_url downloads the URL again itself, so
# this response is not reused by the cells shown here.
response = requests.get(url)
# Not the last expression of the cell, so this preview is not displayed.
response.text[:100] # Access the HTML with the text property

# Create class to hold results of parse_url and parse_html_table functions
class HTMLTableParser:
    """Turn the ``<table>`` elements of a web page into pandas DataFrames."""

    def parse_url(self, url):
        """Download ``url`` and parse every ``<table>`` found in it.

        Args:
            url: Address of the page to fetch with ``requests``.

        Returns:
            list: One ``(table_id, DataFrame)`` tuple per ``<table>``
            element. ``table_id`` is ``None`` when the table has no ``id``
            attribute.
        """
        response = requests.get(url)
        soup = bs(response.text, 'lxml')
        # .get() rather than ['id']: a single table without an id attribute
        # must not abort the parse of the whole page with a KeyError.
        return [(table.get('id'), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame.

        Args:
            table: Object exposing ``find_all('tr')``; each row must expose
                ``find_all('td')`` / ``find_all('th')`` and each cell
                ``get_text()``.

        Returns:
            pandas.DataFrame: One row per ``<tr>`` that contains ``<td>``
            cells. Columns are named after the first row that has ``<th>``
            cells, or numbered from 0 when there is none. Columns whose
            values all convert to float are converted; the others keep
            their text.

        Raises:
            Exception: if column titles were found but their count differs
                from the number of cells in the first data row.
        """
        # Read the rows once; both passes below walk the same list.
        rows = table.find_all('tr')

        n_columns = 0
        n_rows = 0
        column_names = []

        # First pass: size the frame and pick up the column titles if any
        for row in rows:
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows += 1
                if n_columns == 0:
                    # The first data row fixes the number of columns
                    n_columns = len(td_tags)

            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                column_names = [th.get_text() for th in th_tags]

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        df = pd.DataFrame(columns=columns, index=range(0, n_rows))

        # Second pass: copy the cell text; rows without <td> cells are skipped
        row_marker = 0
        for row in rows:
            cells = row.find_all('td')
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()
            if len(cells) > 0:
                row_marker += 1

        # Convert to float if possible; text columns are deliberately left as-is
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df


def scrape_mars_facts(url="https://space-facts.com/mars/"):
    """Fetch the first table on ``url`` and return it as an HTML string.

    These statements used to sit after ``return df`` inside
    ``parse_html_table``, where they could never run. They end in a
    ``return``, so they are kept as a function of their own.

    Args:
        url: Page to parse; defaults to the Mars facts page.

    Returns:
        str: ``DataFrame.to_html`` markup of the first table, with the
        newline characters removed.

    Raises:
        IndexError: if the page contains no ``<table>`` element.
    """
    # Use HTMLTableParser to parse results into table
    hp = HTMLTableParser()
    # Put table variable into pandas DataFrame
    table = hp.parse_url(url)[0][1] # Grabbing the table from the tuple
    # Convert DataFrame into HTML
    html_table = table.to_html(classes='table',index=False,escape=False)
    # str.replace returns a new string, so its result has to be the one returned
    return html_table.replace('\n', '')

In [15]:
# Step 4.1.a
# Fifth Scrape: Scrape titles and image URLs from astrogeology.usgs.gov/search/results.  
# Store results as lists within dictionary inside a single list. First the titles.
# Open a browser for the exploratory cells below. Without this line the cell
# only works when a `browser` variable is left over from an earlier session.
browser = init_browser()
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)
# Pause before reading the page source (presumably so the results can load)
time.sleep(3)
page_source = browser.html
soup = bs(page_source,"lxml")
# Text of every <h3> element on the results page
hemisphere_titles = [x.text for x in soup.select("h3")]
# Call variable to verify titles stored as a list
hemisphere_titles

['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

In [16]:
# 4.1.b
# Then the links that lead to the images: prefix the site root to the href of
# every ".product-item" element that is a direct child of an ".item" element.
all_links = ["https://astrogeology.usgs.gov" + x["href"] for x in soup.select(".item > .product-item")]

In [17]:
# 4.1.c
# Render image verification: open the first collected link in the browser.
browser.visit(all_links[0])

In [18]:
# 4.1.d
# Load URLs into image_links list
image_links = []
for link in all_links:
    browser.visit(link)
    # Parse each visited page into its own variable. Rebinding `soup` and
    # `page_source` here would replace the search-results page that cells
    # 4.1.a/4.1.b work on, so re-running 4.1.b afterwards would select from
    # the wrong page.
    detail_soup = bs(browser.html,"lxml")
    # src of the first ".wide-image" element, made absolute with the site root
    image_link = "https://astrogeology.usgs.gov" + str(detail_soup.select(".wide-image")[0]["src"])
    image_links.append(image_link)
# Display the list to verify the collected URLs
image_links

['https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg']

In [19]:
# 4.1.e
# Pair each title with its image URL in a {"title", "image_url"} dictionary and
# collect those dictionaries in a single list.
# Initialize empty singular parent list
list1 = []
# Loop over both lists in parallel; zip stops at the shorter of the two
for x,y in zip (hemisphere_titles,image_links):
    # One dictionary per title/URL pair
    temp_dict = {"title":x,"image_url":y}
    # Append the dictionary to the parent list
    list1.append(temp_dict)
# Display the list to verify it holds one dictionary per title/URL pair.
list1

[{'title': 'Cerberus Hemisphere Enhanced',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]

In [4]:
# Fifth Scrape Condensed for copy/paste purposes into scrape_mars.py file.
def scrape_mars_hemisphere():
    """Scrape hemisphere titles and image URLs from the USGS Astrogeology site.

    The search-results page supplies the titles (every ``<h3>``) and the
    links to follow (``.item > .product-item``). Each link is then visited
    to read the ``src`` of its first ``.wide-image`` element.

    Returns:
        list: One ``{"title": ..., "image_url": ...}`` dict per title/URL
        pair. The two lists are paired by position with ``zip``, so
        surplus entries of the longer list are dropped.

    Raises:
        IndexError: if a visited page has no ``.wide-image`` element.
    """
    site_root = "https://astrogeology.usgs.gov"
    browser = init_browser()
    try:
        #go to url
        url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        browser.visit(url)
        time.sleep(3)
        soup = bs(browser.html, "lxml")

        #get the titles
        hemisphere_titles = [x.text for x in soup.select("h3")]

        #get the links
        all_links = [site_root + x["href"]
                     for x in soup.select(".item > .product-item")]

        image_links = []
        for link in all_links:
            browser.visit(link)
            detail_soup = bs(browser.html, "lxml")
            image_links.append(
                site_root + str(detail_soup.select(".wide-image")[0]["src"]))
    finally:
        # Close the browser even when a visit or a parse fails, so a failed
        # scrape does not leave a Chrome window running.
        browser.quit()

    #zip titles and links into dictionary within a list
    return [{"title": title, "image_url": image_url}
            for title, image_url in zip(hemisphere_titles, image_links)]

In [30]:
# Step 4: Put scrape results from each function into a dictionary
def scrape():
    """Run every scraper and bundle the results into one dictionary.

    Returns:
        dict: Keys ``headlines_dict``, ``img_dict``, ``weather_dict`` and
        ``hemispheres_list`` hold the return values of the matching scrape
        functions. ``table_list`` holds the module-level ``html_table``
        string, which must already exist when this function is called.
    """
    # Evaluated one by one, in the same order as the keys of the result
    headlines = scrape_mars_info()
    featured_image = scrape_mars_img()
    weather = scrape_mars_weather()
    facts_html = html_table
    hemispheres = scrape_mars_hemisphere()

    return {
        "headlines_dict": headlines,
        "img_dict": featured_image,
        "weather_dict": weather,
        "table_list": facts_html,
        "hemispheres_list": hemispheres,
    }

In [31]:
# Step 4 verification
# Scrape dictionary verification: run every scraper through scrape() and
# display the combined dictionary.
scrape()

{'headlines_dict': {'headline': "NASA's InSight Takes Its First Selfie",
  'subhead': 'Two new image mosaics detail the lander\'s deck and "workspace" — the surface where it will eventually set down its science instruments.'},
 'img_dict': {'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/assets/images/logo_nasa_trio_black@2x.png'},
 'weather_dict': {'mars_weather': '\nSol 2255 (2018-12-10), high -11C/12F, low -71C/-95F, pressure at 8.41 hPa, daylight 06:36-18:50\n'},
 'table_list': '<table border="1" class="dataframe table">\n  <thead>\n    <tr style="text-align: right;">\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km\\n</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km\\n</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos & Deim

<table border="1" class="dataframe table">\n  <thead>\n    <tr style="text-align: right;">\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km\\n</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km\\n</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos & Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)\\n</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'