In [1]:
# Import Splinter and BeautifulSoup dependencies
from urllib.parse import urljoin

from bs4 import BeautifulSoup as soup
import pandas as pd
from splinter import Browser

In [2]:
# Start a Splinter-driven Chrome session; 'chromedriver' is passed as the
# driver's executable path.
browser = Browser('chrome', executable_path='chromedriver')

In [None]:
# Open the NASA Mars news page in the automated browser.
news_url = 'https://mars.nasa.gov/news/'
browser.visit(news_url)
# Optional delay for page load: check (waiting up to 1 second) for an
# <li class="slide"> inside <ul class="item_list">. The boolean result is
# shown as the cell output.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

In [None]:
# Parse the browser's current page source with BeautifulSoup.
news_soup = soup(browser.html, 'html.parser')
# `slide_elem` is the parent element (first `li.slide` under `ul.item_list`)
# that the following cells search within.
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [None]:
# Show the <div class="content_title"> element found inside the parent slide
slide_elem.find("div", attrs={'class': 'content_title'})

In [None]:
# Locate the `content_title` div inside the parent element, then keep only
# its text as `news_title`.
title_div = slide_elem.find("div", class_='content_title')
news_title = title_div.get_text()
news_title

In [None]:
# Locate the `article_teaser_body` div inside the parent element, then keep
# only its text as `news_p`.
teaser_div = slide_elem.find('div', class_="article_teaser_body")
news_p = teaser_div.get_text()
news_p

### Featured Images

In [3]:
# Open the JPL space-images page (Mars category) in the automated browser
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(jpl_url)

In [4]:
# Find and click the full image button.
# Clicking immediately after the page visit raised
# ElementNotInteractableException ("element has zero size"), so first wait
# (up to 5 seconds) for the button to become visible.
# NOTE(review): presumably the button had not finished rendering - confirm
# against the live page.
browser.is_element_visible_by_css('#full_image', wait_time=5)
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

ElementNotInteractableException: Message: element not interactable: element has zero size
  (Session info: chrome=84.0.4147.89)


In [None]:
# Check (waiting up to 1 second) for the 'more info' text, then locate the
# link whose text contains 'more info' and click it.
browser.is_element_present_by_text('more info', wait_time=1)
browser.links.find_by_partial_text('more info').click()

In [5]:
# Parse the browser's current page source with BeautifulSoup
img_soup = soup(browser.html, 'html.parser')

In [None]:
# The image changes frequently, so locate it by page structure: the <img>
# nested in an <a> nested in <figure class="lede">.
lede_img = img_soup.select_one('figure.lede a img')
# `.get("src")` reads the tag's `src` attribute, i.e. the link to the image.
img_url_rel = lede_img.get("src")
img_url_rel

In [None]:
# Build the absolute image URL from the site root and the scraped `src`.
# urljoin is used instead of plain string concatenation so the result is
# still a valid URL when `src` is already absolute or has no leading slash;
# for a root-relative `src` the result is the same as concatenation.
img_url = urljoin('https://www.jpl.nasa.gov', img_url_rel)
img_url

In [None]:
# Use pandas to read every HTML table on the Mars facts page; keep only the
# first table found (index 0).
mars_tables = pd.read_html('http://space-facts.com/mars/')
df = mars_tables[0]
# Name the two columns, then make `description` the index.
df.columns = ['description', 'value']
df = df.set_index('description')
df

In [None]:
# Convert the facts DataFrame back to HTML markup; because the table is
# re-read each time the notebook runs, the generated HTML reflects the
# table as scraped on that run.
df.to_html()

In [None]:
# End the automated browsing session.
# To fully automate the code created above, it must be converted into a .py file.
browser.quit()