In [None]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import datetime as dt
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def _scrape_hemispheres(browser):
    """Collect the full-size image URL and title of each Mars hemisphere.

    Visits https://marshemispheres.com/, clicks every thumbnail matched by
    ``a.product-item img`` in turn, reads the 'Sample' link and the page
    heading on the detail page, then navigates back.

    Args:
        browser: An open splinter ``Browser`` instance.

    Returns:
        list[dict]: One ``{'img_url': ..., 'title': ...}`` dict per thumbnail,
        in page order. Empty when no thumbnails are found.
    """
    browser.visit('https://marshemispheres.com/')

    hemispheres = []
    thumbnail_count = len(browser.find_by_css('a.product-item img'))

    for index in range(thumbnail_count):
        # Look the thumbnails up again on every pass instead of reusing the
        # first result; presumably the earlier element handles are no longer
        # valid once the browser has navigated away and back.
        browser.find_by_css('a.product-item img')[index].click()
        sample_elem = browser.links.find_by_text('Sample').first
        hemispheres.append({
            'img_url': sample_elem['href'],
            'title': browser.find_by_css('h2.title').text,
        })
        browser.back()

    return hemispheres


def scrape_all():
    """Run every Mars scraper and bundle the results into one dictionary.

    Starts a headless Chrome session, scrapes the latest news item, the
    hemisphere images, the featured image and the facts table, and always
    closes the browser afterwards, even when one of the scrapers raises.

    Returns:
        dict: Keys ``news_title``, ``news_paragraph``, ``featured_image``,
        ``facts``, ``last_modified``, ``url_title``, ``url_string`` and
        ``hemispheres``. ``url_title`` / ``url_string`` hold the title and
        image URL of the last hemisphere scraped, or ``None`` when no
        hemisphere was found.
    """
    # Initiate headless driver for deployment
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)

    try:
        news_title, news_paragraph = mars_news(browser)
        hemispheres = _scrape_hemispheres(browser)

        # The original code exposed the loop variable left over from the last
        # iteration; keep that contract without failing on an empty result.
        last_hemisphere = hemispheres[-1] if hemispheres else {}

        # Run the remaining scraping functions and store results in a dictionary
        data = {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "facts": mars_facts(),
            "last_modified": dt.datetime.now(),
            "url_title": last_hemisphere.get('title'),
            "url_string": last_hemisphere.get('img_url'),
            "hemispheres": hemispheres
        }
    finally:
        # Stop the webdriver no matter what, so no Chrome process is leaked
        browser.quit()

    return data

In [None]:
def mars_news(browser):
    """Scrape the first news item from the Mars news page.

    Args:
        browser: An open splinter ``Browser`` instance; it is navigated to the
            news page but not closed here.

    Returns:
        tuple: ``(title, teaser)`` text of the first ``div.list_text`` entry,
        or ``(None, None)`` when the expected elements are missing.
    """
    # Load the Mars news site
    browser.visit('https://data-class-mars.s3.amazonaws.com/Mars/index.html')

    # Give the article list a moment to show up before reading the page
    browser.is_element_present_by_css('div.list_text', wait_time=1)

    # Parse whatever the browser currently has rendered
    parsed_page = soup(browser.html, 'html.parser')

    try:
        # select_one/find return None on a miss, so a missing element surfaces
        # as AttributeError on the chained call below
        first_article = parsed_page.select_one('div.list_text')
        headline = first_article.find('div', class_='content_title').get_text()
        teaser = first_article.find('div', class_='article_teaser_body').get_text()
    except AttributeError:
        return None, None
    else:
        return headline, teaser

In [None]:
def featured_image(browser):
    """Return the absolute URL of the featured JPL space image.

    Uses the browser passed in by the caller; no additional browser session
    is started here.

    Args:
        browser: An open splinter ``Browser`` instance.

    Returns:
        str | None: Absolute image URL, or ``None`` when the page has no
        ``img.fancybox-image`` element or that element has no ``src``.
    """
    # Visit URL
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)

    # Find and click the full image button (the second <button> on the page)
    full_image_elem = browser.find_by_tag('button')[1]
    full_image_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = soup(html, 'html.parser')

    try:
        # find() returns None when the image is absent, which makes .get()
        # raise AttributeError
        img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
    except AttributeError:
        return None

    # An <img> without a src would otherwise yield a bogus '.../None' URL
    if not img_url_rel:
        return None

    # Use the base url to create an absolute url
    img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'

    return img_url

In [None]:
def mars_facts():
    """Scrape the Mars facts table and return it as Bootstrap-styled HTML.

    Returns:
        str | None: HTML for the first table on the facts page, indexed by
        'Description' and carrying the ``table table-striped`` classes, or
        ``None`` when the table cannot be read.
    """
    try:
        # Use 'read_html' to scrape the facts table into a dataframe
        df = pd.read_html('https://data-class-mars-facts.s3.amazonaws.com/Mars_Facts/index.html')[0]

    # Exception rather than BaseException, so Ctrl-C (KeyboardInterrupt) and
    # SystemExit still propagate instead of being reported as "no data"
    except Exception:
        return None

    # Assign columns and set index of dataframe
    df.columns = ['Description', 'Mars', 'Earth']
    df = df.set_index('Description')

    # Convert dataframe into HTML format, add bootstrap
    return df.to_html(classes="table table-striped")

if __name__ == "__main__":

    # When executed directly (rather than imported), run the full scrape
    # and print the resulting dictionary
    print(scrape_all())

The cells below contain the exploratory code first created for Deliverable 1.

In [2]:
# Set the executable path and initialize Splinter
# (headless=False here, whereas scrape_all runs the browser headless)
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# Load the USGS Astrogeology search results for enhanced Mars hemisphere images
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

# Parse the page as currently rendered; `soup_img` is queried by the following cells
html = browser.html
soup_img = soup(html, 'html.parser')

[WDM] - 

[WDM] - Current google-chrome version is 96.0.4664
[WDM] - Get LATEST driver version for 96.0.4664
[WDM] - Get LATEST driver version for 96.0.4664
[WDM] - Trying to download new driver from https://chromedriver.storage.googleapis.com/96.0.4664.45/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\dwest\.wdm\drivers\chromedriver\win32\96.0.4664.45]


In [3]:
# href of the first <a> with class "itemLink product-item" (the Cerberus page, per the output below)
img_cerberus = soup_img.find('a', 'itemLink product-item').get('href')

In [4]:
# Display the relative link extracted for Cerberus
img_cerberus

'/search/map/Mars/Viking/cerberus_enhanced'

In [5]:
# Index 2 yields the Schiaparelli link; matching anchors appear to come two per
# hemisphere, hence the even index -- TODO confirm against the page markup
img_schia = soup_img.find_all('a', 'itemLink product-item')[2].get('href')

In [6]:
# Display the relative link extracted for Schiaparelli
img_schia

'/search/map/Mars/Viking/schiaparelli_enhanced'

In [7]:
# Index 4 yields the Syrtis Major link (anchors appear to be paired per hemisphere -- TODO confirm)
img_syrt = soup_img.find_all('a', 'itemLink product-item')[4].get('href')

In [8]:
# Display the relative link extracted for Syrtis Major
img_syrt

'/search/map/Mars/Viking/syrtis_major_enhanced'

In [9]:
# Index 6 yields the Valles Marineris link (anchors appear to be paired per hemisphere -- TODO confirm)
img_valles = soup_img.find_all('a', 'itemLink product-item')[6].get('href')

In [10]:
# Display the relative link extracted for Valles Marineris
img_valles

'/search/map/Mars/Viking/valles_marineris_enhanced'

In [48]:
# NOTE(review): disabled cell -- a hand-written list of the four hemisphere
# records. The values match what the scraping loop further down collects into
# `url_list`, so this looks like a manual fallback; consider deleting it.
# img_list = []
# dict_1 = {"img_url" : "https://marshemispheres.com/images/full.jpg", "title" : "Cerberus Hemisphere Enhanced"} 
# dict_2 = {"img_url": "https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg", "title": "Schiaparelli Hemisphere Enhanced"} 
# dict_3 = {"img_url": "https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg", "title": "Syrtis Major Hemisphere Enhanced"} 
# dict_4 = {"img_url": "https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg", "title": "Valles Marineris Hemisphere Enhanced"}

# img_list.append(dict_1)
# img_list.append(dict_2)
# img_list.append(dict_3)
# img_list.append(dict_4)

# print(img_list)

[{'img_url': 'https://marshemispheres.com/images/full.jpg', 'title': 'Cerberus Hemisphere Enhanced'}, {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg', 'title': 'Schiaparelli Hemisphere Enhanced'}, {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg', 'title': 'Syrtis Major Hemisphere Enhanced'}, {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg', 'title': 'Valles Marineris Hemisphere Enhanced'}]


In [11]:
# Point the already-open browser at marshemispheres.com, the site the
# click-through scrape in the following cells runs against
url = 'https://marshemispheres.com/'
browser.visit(url)

In [12]:
# Accumulator: one {'img_url', 'title'} dict per hemisphere
url_list = []
# Thumbnail images inside the product links; the loop below only uses their count
links = browser.find_by_css('a.product-item img')

In [13]:
# Visit each hemisphere's detail page in turn and record its image link and title
for i in range(len(links)):
    hemisphere = {}
    # Look the thumbnails up again each pass rather than reusing `links`;
    # presumably the earlier handles are stale after navigating -- TODO confirm
    browser.find_by_css('a.product-item img')[i].click()
    # The 'Sample' link on the detail page points at the full-size image
    sample_elem = browser.links.find_by_text('Sample').first
    hemisphere['img_url'] = sample_elem['href']
    hemisphere['title'] = browser.find_by_css('h2.title').text
    url_list.append(hemisphere)
    # Return to the listing page so the next thumbnail can be clicked
    browser.back()

In [14]:
# Display the hemisphere records collected by the loop
url_list

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [15]:
# Close the Chrome session opened for this exploratory section
browser.quit()