In [13]:
from splinter import Browser
import pandas as pd
import datetime as dt
from bs4 import BeautifulSoup

In [14]:
def scraper():
    """Scrape every Mars data source and return the results as one dictionary.

    A headless Chrome browser is started, handed to each browser-based
    scraping helper, and always shut down afterwards, even when one of the
    helpers raises.

    Returns:
        dict: the keys ``news_title``, ``news_paragraph``, ``featured_image``,
        ``hemispheres``, ``weather``, ``mars_facts`` and
        ``last_modified_date`` (the ``datetime`` at which the dictionary was
        assembled).
    """
    browser = Browser("chrome", executable_path="chromedriver", headless=True)
    try:
        news_title, news_paragraph = mars_news(browser)

        news_data = {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "hemispheres": hemispheres(browser),
            "weather": twitter_weather(browser),
            "mars_facts": mars_facts(),
            "last_modified_date": dt.datetime.now()
        }
    finally:
        # Quit unconditionally so a failing helper does not leave the
        # browser running in the background.
        browser.quit()

    return news_data

In [15]:
def scrape_all():
    """Run all Mars scrapers and collect their results in a dictionary.

    Performs the same scrape as ``scraper`` but stores the facts table under
    ``facts`` and the timestamp under ``last_modified``. The headless Chrome
    browser is closed in every case, including when a helper raises.

    Returns:
        dict: the keys ``news_title``, ``news_paragraph``, ``featured_image``,
        ``hemispheres``, ``weather``, ``facts`` and ``last_modified`` (the
        ``datetime`` at which the dictionary was assembled).
    """
    browser = Browser("chrome", executable_path="chromedriver", headless=True)
    try:
        news_title, news_paragraph = mars_news(browser)

        data = {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "hemispheres": hemispheres(browser),
            "weather": twitter_weather(browser),
            "facts": mars_facts(),
            "last_modified": dt.datetime.now()
        }
    finally:
        # Always release the browser, otherwise an exception in any helper
        # would skip the quit() call.
        browser.quit()

    return data

In [16]:
def mars_news(browser):
    """Read the first headline and its teaser from the NASA Mars news page.

    Args:
        browser: browser object used to load the page; it must provide
            ``visit``, ``is_element_present_by_css`` and ``html``.

    Returns:
        tuple: ``(news_title, news_paragraph)`` as strings, or
        ``(None, None)`` when the slide or either of its text elements is
        missing from the page.
    """
    browser.visit("https://mars.nasa.gov/news/")

    # Wait briefly for the article list to show up; the result is not
    # checked because a missing list is handled below.
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=0.5)

    soup = BeautifulSoup(browser.html, "html.parser")

    try:
        first_slide = soup.select_one("ul.item_list li.slide")
        title_text, teaser_text = (
            first_slide.find("div", class_=css_class).get_text()
            for css_class in ("content_title", "article_teaser_body")
        )
    except AttributeError:
        # A lookup returned None, so the expected markup is not there.
        return None, None

    return title_text, teaser_text

In [17]:
def featured_image(browser):
    """Return the absolute URL of the featured image on the JPL space images page.

    The function opens the Mars image listing, clicks the element with id
    ``full_image``, follows the "more info" link, and reads the ``src`` of
    the image found by the selector ``figure.lede a img``.

    Args:
        browser: browser object used for navigation; it must provide
            ``visit``, ``find_by_id``, ``is_element_present_by_text``,
            ``find_link_by_partial_text`` and ``html``.

    Returns:
        str or None: the image URL prefixed with ``https://www.jpl.nasa.gov``,
        or ``None`` when the image element is missing or has no ``src``.
    """
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    full_image_elem = browser.find_by_id("full_image")
    full_image_elem.click()

    # Wait briefly for the "more info" link to appear before following it.
    browser.is_element_present_by_text("more info", wait_time=0.5)
    more_info_elem = browser.find_link_by_partial_text("more info")
    more_info_elem.click()

    img_soup = BeautifulSoup(browser.html, "html.parser")
    img = img_soup.select_one("figure.lede a img")
    if img is None:
        return None

    img_url_rel = img.get("src")
    if not img_url_rel:
        # Without this guard a missing src would be formatted into the
        # bogus URL "https://www.jpl.nasa.govNone".
        return None

    return f"https://www.jpl.nasa.gov{img_url_rel}"

In [18]:
def hemispheres(browser, count=4):
    """Collect title and image URL for the hemisphere pages of the USGS search.

    Each result link on the search page is clicked in turn, the page that
    opens is passed to ``scrape_hemisphere``, and the browser then goes back
    to the listing.

    Args:
        browser: browser object used for navigation; it must provide
            ``visit``, ``find_by_css``, ``html`` and ``back``.
        count: maximum number of result links to follow. Defaults to 4. If
            the page lists fewer links, only those are visited instead of
            indexing past the end of the result list.

    Returns:
        list: one dictionary per visited page, as returned by
        ``scrape_hemisphere``.
    """
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    link_selector = "a.product-item h3"
    browser.visit(url)

    available = len(browser.find_by_css(link_selector))
    hemisphere_image_url_list = []

    for i in range(min(count, available)):
        # Look the links up again on every pass, as the page is left and
        # returned to between iterations.
        browser.find_by_css(link_selector)[i].click()
        hemisphere_data = scrape_hemisphere(browser.html)
        hemisphere_image_url_list.append(hemisphere_data)
        browser.back()

    return hemisphere_image_url_list

In [19]:
def scrape_hemisphere(html_text):
    """Extract the title and the "Sample" link target from a hemisphere page.

    Args:
        html_text: HTML source of a single hemisphere page.

    Returns:
        dict: ``{"title": ..., "img_url": ...}``. Both values are ``None``
        when either the ``h2.title`` element or the "Sample" link is missing.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # Start from the fallback; the values are only filled in when both
    # lookups succeed.
    title = image_href = None
    try:
        found_title = soup.find("h2", class_="title").get_text()
        found_href = soup.find("a", text="Sample").get("href")
    except AttributeError:
        pass
    else:
        title, image_href = found_title, found_href

    return {"title": title, "img_url": image_href}


In [20]:
def twitter_weather(browser):
    """Return the text of the first Mars Weather tweet on the account page.

    Args:
        browser: browser object used to load the page; it must provide
            ``visit`` and ``html``.

    Returns:
        str or None: text of the first matching tweet, or ``None`` when no
        matching tweet or tweet text element is found. This mirrors the
        ``None`` fallback of the other scraping helpers instead of raising
        ``AttributeError``.
    """
    url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url)

    weather_soup = BeautifulSoup(browser.html, "html.parser")

    # A tweet is identified by its class together with the account name.
    tweet_attrs = {"class": "tweet", "data-name": "Mars Weather"}

    try:
        weather_tweet = weather_soup.find("div", attrs=tweet_attrs)
        weather_info = weather_tweet.find("p", "tweet-text").get_text()
    except AttributeError:
        # One of the lookups returned None: the expected markup is absent.
        return None

    return weather_info

In [23]:
def mars_facts():
    """Return the first table of the space-facts Mars page as an HTML table.

    The table's two columns are named ``description`` and ``value``, and
    ``description`` becomes the index before rendering.

    Returns:
        str or None: HTML of the table carrying the CSS classes
        ``table table-striped``, or ``None`` when the page cannot be read or
        contains no table.
    """
    try:
        df = pd.read_html("http://space-facts.com/mars/")[0]
    except Exception:
        # Catch ordinary errors only; BaseException would also swallow
        # KeyboardInterrupt and SystemExit.
        return None

    df.columns = ["description", "value"]

    # Build the indexed frame without mutating ``df`` in place.
    return df.set_index("description").to_html(classes="table table-striped")

In [24]:
if __name__ == "__main__":
    # Run the complete scrape and show the resulting dictionary.
    scraped = scraper()
    print(scraped)

{'news_title': "NASA's Mars 2020 Rover Closer to Getting Its Name", 'news_paragraph': "155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July.", 'featured_image': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA22911_hires.jpg', 'hemispheres': [{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}], 'w