In [1]:
#Step 1 - Scraping

In [2]:
#Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. 
#Assign the text to variables that you can reference later.

In [2]:
# Dependencies
import time
from pprint import pprint

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By

In [4]:
# LATEST MARS NEWS

In [5]:
# URL of the NASA Mars news listing to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
# Retrieve the page with the requests module; the timeout keeps a stalled
# connection from hanging the notebook indefinitely
response = requests.get(url, timeout=30)
# Fail fast on an HTTP error status instead of silently parsing an error page
response.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response.text, 'lxml')

In [6]:
#print(soup.prettify())

In [7]:
# Pull the text of the document's <title> element and display it.
# NOTE(review): this is the page-level <title>, not a news headline - confirm
# that it is what the "latest News Title" step is meant to capture.
title = soup.find('title').get_text()
title

'News  – NASA’s Mars Exploration Program '

In [8]:
# Pull the text of the first <p> element in the document and display it.
# NOTE(review): "first <p> on the page" is not necessarily the latest news
# teaser - confirm against the "Paragraph Text" requirement.
paragraph = soup.p.get_text()
paragraph

'Managed by the Mars Exploration Program and the Jet Propulsion Laboratory for NASA’s Science Mission Directorate'

In [9]:
# FEATURED MARS IMAGE

In [10]:
#Use selenium to navigate the site and find the image url for the current Featured Mars Image and assign the url 
#string to a variable called featured_image_url.
#Make sure to find the image url to the full size .jpg image.

In [11]:
# Open the JPL space-images page in a new Chrome session and click the
# full-image button (the element whose id is 'full_image').
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
driver = webdriver.Chrome()
driver.get(url)
# find_element(By.ID, ...) is used instead of find_element_by_id: the
# find_element_by_* helpers were removed in Selenium 4.3, while this form
# works on both older and current Selenium releases.
python_button = driver.find_element(By.ID, 'full_image')
python_button.click()

In [12]:
# Parse the page as the browser currently renders it and display its <title>
# as a quick look at where the driver is after the click.
soup_level1 = bs(driver.page_source, features='lxml')
soup_level1.find('title').get_text()

'Space Images'

In [13]:
# Let the page catch up with the change triggered by the previous click
time.sleep(5)
# Locate the 'more info' link first, then click it. click() returns None, so
# chaining it onto the lookup would leave None in python_button.
# find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text, which
# was removed in Selenium 4.3.
python_button = driver.find_element(By.LINK_TEXT, 'more info')
python_button.click()

In [14]:
# Grab the path of the main image from the page the driver is on now and
# build the absolute URL of the featured image.
soup_level2 = bs(driver.page_source, 'lxml')
main_image = soup_level2.find('img', {'class': 'main_image'})
if main_image is None:
    # Clearer than the "'NoneType' object is not subscriptable" error that
    # indexing a missing tag would raise
    raise ValueError("no <img class='main_image'> found on the current page")
url_end = main_image['src']
# featured_image_url is the variable name this section asks for
featured_image_url = "https://www.jpl.nasa.gov" + url_end
# Keep the older name as an alias for any cell that still reads it
image_url = featured_image_url
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18322_hires.jpg'

In [15]:
# CURRENT WEATHER ON MARS

In [16]:
#Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page. 
#Save the tweet text for the weather report as a variable called mars_weather.

In [17]:
# Point the browser session opened earlier at the Mars Weather account
# (no new webdriver is created here)
url = 'https://twitter.com/marswxreport?lang=en'
driver.get(url)
# Parse the rendered page into soup
soup_level1 = bs(driver.page_source, features='lxml')

In [18]:
# Latest tweet: first <p> on the page matching the tweet-text class string
tweet_tag = soup_level1.find('p', {'class': 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'})
if tweet_tag is None:
    # Explicit error instead of an AttributeError on None.text
    raise ValueError("no tweet-text <p> element found on the current page")
# mars_weather is the variable name this section asks for
mars_weather = tweet_tag.text
# Keep the older name as an alias for any cell that still reads it
latest_tweet = mars_weather
mars_weather

'Sol 2060 (May 23, 2018), Sunny, high 4C/39F, low -73C/-99F, pressure at 7.43 hPa, daylight 05:20-17:20'

In [19]:
# MARS FACTS

In [20]:
#Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
#Use Pandas to convert the data to an HTML table string.

In [21]:
# Let pandas parse every <table> on the Mars facts page, keep the first one,
# and give its two columns readable names
url = 'https://space-facts.com/mars/'
scraped_tables = pd.read_html(url)
table = scraped_tables[0]
table.columns = ['Description', 'Value']
table

Unnamed: 0,Description,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [61]:
# Flatten the table into a single dict:
#   ch<i> -> header of column i
#   rh<j> -> value of row j in the first column
#   rd<j> -> value of row j in any later column (a later column overwrites
#            an earlier one under the same key)
table_dict = {}
for col_idx, col_name in enumerate(table.columns.values):
    table_dict[f"ch{col_idx}"] = col_name
    key_prefix = "rh" if col_idx == 0 else "rd"
    for row_idx, row in enumerate(table.values):
        table_dict[f"{key_prefix}{row_idx}"] = row[col_idx]
table_dict

{'ch0': 'Description',
 'ch1': 'Value',
 'rd0': '6,792 km',
 'rd1': '6,752 km',
 'rd2': '6.42 x 10^23 kg (10.7% Earth)',
 'rd3': '2 (Phobos & Deimos)',
 'rd4': '227,943,824 km (1.52 AU)',
 'rd5': '687 days (1.9 years)',
 'rd6': '-153 to 20 °C',
 'rd7': '2nd millennium BC',
 'rd8': 'Egyptian astronomers',
 'rh0': 'Equatorial Diameter:',
 'rh1': 'Polar Diameter:',
 'rh2': 'Mass:',
 'rh3': 'Moons:',
 'rh4': 'Orbit Distance:',
 'rh5': 'Orbit Period:',
 'rh6': 'Surface Temperature:',
 'rh7': 'First Record:',
 'rh8': 'Recorded By:'}

In [20]:
# Keep the table as an HTML string, as this section asks for; with no target
# given, to_html returns the markup instead of writing it anywhere
table_html = table.to_html()
# Export the same table to an html document on disk
table.to_html('table.html')

pandas.core.frame.DataFrame

In [23]:
# MARS HEMISPHERES

In [24]:
#Visit the USGS Astrogeology site here to obtain high resolution images for each of Mars's hemispheres.
#You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
#Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name.
#Use a Python dictionary to store the data using the keys img_url and title.
#Append the dictionary with the image url string and the hemisphere title to a list. 
#This list will contain one dictionary for each hemisphere.

In [25]:
# Point the browser session opened earlier at the USGS search results for
# the enhanced Mars hemisphere images (no new webdriver is created here)
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
driver.get(url)
# Parse the rendered page into soup
soup = bs(driver.page_source, features='lxml')

In [26]:
# Save all item tags from the search-results page parsed into `soup`
results = soup.find_all('div', class_='item')
# NOTE: despite its name, image_dict is a list; it receives one
# {'title': ..., 'img_url': ...} dict per result
image_dict = []

# Loop through returned results
for result in results:
    # Best-effort per item: report the problem and move on to the next result
    try:
        # The <h3> text is both the stored title and the link text to click
        hemisphere_title = result.find('h3').text

        # Open the result's detail page. find_element(By.LINK_TEXT, ...)
        # replaces find_element_by_link_text, removed in Selenium 4.3.
        driver.find_element(By.LINK_TEXT, hemisphere_title).click()
    except Exception as e:
        # The browser has not left the results page, so there is nothing to undo
        print(e)
        continue

    try:
        # Save the image url string for the full resolution hemisphere image
        detail_soup = bs(driver.page_source, 'lxml')
        wide_image = detail_soup.find('img', {'class': 'wide-image'})
        hemisphere_img_url = "https://astrogeology.usgs.gov" + wide_image['src']

        # Append the dict for this hemisphere to the list
        image_dict.append({'title': hemisphere_title, 'img_url': hemisphere_img_url})
    except Exception as e:
        print(e)
    finally:
        # Always go back to the results page, even when scraping the detail
        # page failed, so the next link lookup runs against the right page.
        # driver.back() is the WebDriver navigation command, used here in
        # place of running window.history.go(-1) through execute_script.
        driver.back()
pprint(image_dict)

[{'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]
