# 0. PRE-WORK

In [1]:
#import dependencies
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from splinter import Browser

# 1. NASA MARS NEWS
The following code module opens NASA's Mars news site and scrapes its HTML for the latest Mars headline and summary text.

In [2]:
#address of NASA's Mars news listing, the first page to scrape
news_url = "https://mars.nasa.gov/news"

In [3]:
#use webdriver to load page with dynamic JS before scraping
driver = webdriver.Chrome()
driver.get(news_url)

#wait (up to 10 seconds) until the JS-rendered article teaser is present.
#A fixed one-second sleep scrapes an incomplete page when the site is slow
#and wastes time when it is fast; the explicit wait returns as soon as the
#element exists and raises selenium's TimeoutException if it never appears.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'article_teaser_body'))
)

In [None]:
#scrape the HTML with BS; the parser is named explicitly so the parse tree
#does not depend on which parsers happen to be installed and bs4 does not
#emit its "no parser was explicitly specified" warning
news_soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
#pull the first headline and the first teaser paragraph out of the parsed page
news_headline = news_soup.find('div', attrs={'class': 'content_title'}).get_text()
news_teaser = news_soup.find('div', attrs={'class': 'article_teaser_body'}).get_text()

#echo the scraped values as a sanity check
print("LATEST MARS HEADLINE + SUMMARY:")
print(50 * '-')
print(f"- {news_headline}")
print(f"- {news_teaser}")

# 2. JPL MARS IMAGES
The following code module will visit the JPL space images site and scrape the location of the featured headline image.

In [None]:
#JPL space images page, pre-filtered to the Mars category
jpl_url = (
    'https://www.jpl.nasa.gov/spaceimages/'
    '?search=&category=Mars'
)

In [None]:
#point splinter at the local chromedriver binary and open a visible (non-headless) Chrome window
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
#navigate the splinter-controlled browser to the JPL space images page
browser.visit(jpl_url)

In [None]:
#locate the element whose id is 'full_image' and click it
#NOTE(review): presumably this opens the full-size image overlay -- confirm against the live page
jpl_button = browser.find_by_id('full_image')
jpl_button.click()

In [None]:
#parse the browser's current HTML with Python's built-in parser
jpl_soup = BeautifulSoup(markup=browser.html, features='html.parser')

In [None]:
#close browser after scraping HTML; the later JPL cells only read from jpl_soup
browser.quit()

In [None]:
#grab the first <article> tag carrying the 'carousel_item' class
jpl_image = jpl_soup.find('article', attrs={'class': 'carousel_item'})

In [None]:
#read the tag's inline style attribute, which holds the full-sized image location
jpl_fullimage = jpl_image.attrs['style']

In [None]:
#use regex to pull the wallpaper image path out of the style string.
#The pattern is a raw string so \w and \d reach the regex engine without
#triggering Python's invalid-escape-sequence warning, and the dot before
#"jpg" is escaped so it matches a literal "." rather than any character.
jpl_regex = re.search(
    r'spaceimages/images/wallpaper/\w+-\d+\w\d+\.jpg', jpl_fullimage
)

In [None]:
#final - prepend the JPL host to the matched image path and show the result
featured_image_url = 'http://jpl.nasa.gov/' + jpl_regex.group(0)
print(featured_image_url)

# 3. MARS WEATHER
The following code module will scrape the latest weather update tweet from the @marswxreport account.

In [None]:
#twitter timeline that posts the Mars weather reports
weather_url = "https://twitter.com/marswxreport"

In [None]:
#request html; the timeout keeps the cell from hanging forever on a stalled
#connection (requests has no default timeout), and raise_for_status() stops
#on an HTTP error instead of letting an error page be parsed as the timeline
weather_request = requests.get(weather_url, timeout=30)
weather_request.raise_for_status()

In [None]:
#turn the response body into a searchable tree using the built-in parser
weather_soup = BeautifulSoup(markup=weather_request.text, features='html.parser')

In [None]:
#take the text of the first tweet paragraph (class 'TweetTextSize') on the page
mars_weather = weather_soup.find('p', attrs={'class': 'TweetTextSize'}).get_text()

In [None]:
#use regex to find the pic.twitter.com url inside the tweet text
#(re.search returns None when the tweet carries no such link).
#Raw string avoids the invalid-escape warning for \w; the dots are escaped
#so they match literal "." characters instead of any character.
tweet_tail = re.search(r'pic\.twitter\.com/\w+', mars_weather)

In [None]:
#remove tweet tail for final string; when no pic.twitter.com link was found
#tweet_tail is None, so keep the tweet text unchanged instead of failing
#with an AttributeError on .group()
if tweet_tail is None:
    mars_tweet = mars_weather
else:
    mars_tweet = mars_weather.replace(tweet_tail.group(), "")

In [None]:
#confirmation: show the tweet text with the picture link removed
print(mars_tweet)

# 4. MARS FACTS
The following code module will scrape a table of Mars facts from the target website.

In [None]:
#page holding the Mars facts table
facts_url = "https://space-facts.com/mars/"

In [None]:
#let pandas parse every table on the page and keep the first one
facts_read = pd.read_html(io=facts_url)[0]

In [None]:
#convert the facts DataFrame to an HTML table string (pandas defaults) and
#print the markup for inspection
facts_table = facts_read.to_html()
print(facts_table)

# 5. MARS HEMISPHERES
The following code module will extract four images of the Mars hemispheres from the target website.

In [4]:
#USGS astrogeology search results listing the enhanced Mars hemisphere products
hemi_url = (
    "https://astrogeology.usgs.gov/search/results"
    "?q=hemisphere+enhanced&k1=target&v1=Mars"
)

In [5]:
#point the selenium driver (opened in the news section) at the hemisphere search results
driver.get(hemi_url)

In [6]:
#scrape the HTML with BS; the parser is named explicitly so the parse tree
#does not depend on which parsers happen to be installed and bs4 does not
#emit its "no parser was explicitly specified" warning
hemi_soup = BeautifulSoup(driver.page_source, 'html.parser')

In [7]:
#collect every anchor carrying the 'product-item' class; these lead to the hemisphere pages
hemi_links = hemi_soup.find_all('a', attrs={'class': 'product-item'})

In [13]:
#display the scraped anchors to inspect their structure
hemi_links

[<a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/7677c0a006b83871b5a2f66985ab5857_schiaparelli_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><h3>Schiaparelli Hemisphere Enhanced</h3></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/syrtis_major_enhanced"><img alt="Syrtis Major Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/aae41197e40d6d4f3ea557f8cfe51d15_syrtis_major_enhanced.tif_thumb.png"/></a>,

In [8]:
#scrape the hemisphere names from the anchor text.
#Each hemisphere shows up as two consecutive anchors (thumbnail first, then
#the <h3> title), so the titles sit at the odd indices; slice them directly
#rather than looping over indices and testing x % 2.
hemi_names = [
    #drop the word "Enhanced" and re-join the remaining words of the title
    " ".join(word for word in link.text.split() if word != "Enhanced")
    for link in hemi_links[1::2]
]

In [17]:
#holder for image urls
hemi_images = []

#visit each hemisphere's detail page and save the href of its "Sample" link.
#Only the odd-indexed anchors are used: every hemisphere is listed twice
#(thumbnail anchor, then title anchor) with the same href.
for link in hemi_links[1::2]:
    d_url = "https://astrogeology.usgs.gov" + link['href']

    #driver.get() returns None, so its result is not kept
    driver.get(d_url)

    #name the parser explicitly so results do not depend on installed parsers
    d_soup = BeautifulSoup(driver.page_source, 'html.parser')

    #string= is the current name of bs4's deprecated text= argument
    d_link = d_soup.find('a', string="Sample")
    if d_link is None:
        #fail with the offending page instead of an opaque TypeError on None
        raise ValueError(f'no "Sample" link found on {d_url}')

    hemi_images.append(d_link['href'])

In [None]:
#shut down the selenium-driven Chrome session; no later cell uses the driver
driver.quit()

In [None]:
#combine hemi names with image urls into a list of {'title', 'img_url'} records.
#zip pairs up whatever was actually scraped, so a short scrape no longer
#dies with an IndexError at a hard-coded range(4), and each record is built
#once and reused for both the list and the confirmation print.
hemi_dict = []

for title, img_url in zip(hemi_names, hemi_images):
    record = {'title': title, 'img_url': img_url}
    hemi_dict.append(record)
    print(record)

In [None]:
#bundle every scraped value into one master dictionary
content = dict(
    news_headline=news_headline,
    news_summary=news_teaser,
    featured_image=featured_image_url,
    weather=mars_tweet,
    facts=facts_table,
    hemi_n=hemi_dict,
    hemi_i=hemi_images,
)

In [None]:
#display the assembled master dictionary as the cell output
content