# Part 1: Scraping

In [1]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import requests
import pymongo
import pandas as pd
import os

## NASA Mars News

In [2]:
# Start a non-headless Chrome session through splinter.
# ChromeDriverManager().install() supplies the path to the chromedriver binary.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [/Users/coreylawson-enos/.wdm/drivers/chromedriver/mac64/101.0.4951.41/chromedriver] found in cache


In [3]:
# Mars news site: store its address in `url` and load it in the splinter browser
url = 'https://redplanetscience.com/#'
browser.visit(url)

In [4]:
# Scrape website and store first news article's title and paragraph in variables.
# Wait (up to 5 seconds) for the article markup to be present before taking the
# HTML snapshot; otherwise a Restart & Run All can parse the page too early and
# `soup.find(...)` returns None, which surfaces as an opaque AttributeError.
if not browser.is_element_present_by_css('div.content_title', wait_time=5):
    raise RuntimeError("News content ('div.content_title') did not appear on the page")

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# `find` returns the first match, i.e. the first article listed on the page
news_title = soup.find('div', class_='content_title').text
news_p = soup.find('div', class_='article_teaser_body').text

print(news_title)
print(news_p)

Global Storms on Mars Launch Dust Towers Into the Sky
A Mars Dust Tower Stands Out Dust storms are common on Mars. But every decade or so, something unpredictable happens: a series of runaway storms break out, covering the entire planet in a dusty haze.


## JPL Mars Space Images—Featured Image

In [5]:
# Space images site: store its address in `url` and load it in the splinter browser
url = 'https://spaceimages-mars.com/'
browser.visit(url)

In [6]:
# Scrape website for featured image URL; includes click for full image.
# The click must happen BEFORE the HTML snapshot is taken: previously the page
# source was captured first, so the click had no effect on what was parsed.
browser.links.find_by_partial_text('FULL IMAGE').click()
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# The image's `src` is relative, so prefix it with the site root
featured_image = soup.find('img', class_='headerimage fade-in')
featured_image_url = 'https://spaceimages-mars.com/' + featured_image['src']

print(featured_image_url)

https://spaceimages-mars.com/image/featured/mars2.jpg


## Mars Facts

In [175]:
# Source url for the Mars facts page (kept in `url` for the parsing step)
url = 'https://galaxyfacts-mars.com/'

In [176]:
# Use pandas' `read_html` to parse the tables found at `url`
tables = pd.read_html(url)

# Work on a copy of the first table so the parsed `tables` list is left untouched
# and this cell does not rely on in-place mutation.
mars_facts_df = tables[0].copy() # Select Mars facts table
mars_facts_df.columns = ['Description', 'Mars', 'Earth'] # Rename the columns
mars_facts_df = mars_facts_df.set_index('Description') # Set the index to the first column

# NOTE(review): the recorded output shows a first data row
# ('Mars - Earth Comparison', 'Mars', 'Earth') that looks like the source table's
# own header row -- confirm whether it should be dropped before rendering.

# Convert the Mars facts DataFrame to html, removing unwanted newlines.
# `mars_facts` is only ever bound to the final HTML string, never to the DataFrame.
mars_facts = mars_facts_df.to_html().replace('\n', '')
mars_facts

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars</th>      <th>Earth</th>    </tr>    <tr>      <th>Description</th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Mars - Earth Comparison</th>      <td>Mars</td>      <td>Earth</td>    </tr>    <tr>      <th>Diameter:</th>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th>Moons:</th>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th>Distance from Sun:</th>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th>Length of Year:</th>      <td>687 Earth days</td>      <td>365.24 days</td>    </tr>    <tr>      <th>Temperature:</th>      <td>-87 to -5 °C</td>      <td>-88 to 58°C</td>    </tr>  </tbody></table>'

## Mars Hemispheres

In [220]:
# Mars hemispheres site: store its address in `url` and load it in the splinter browser
url = 'https://marshemispheres.com/'
browser.visit(url)

In [221]:
# Parse the page currently loaded in the browser
html = browser.html
soup = BeautifulSoup(html, 'lxml')

# The <h3> headings inside the results container carry the text of each hemisphere link
container = soup.find('div', class_="collapsible results")
products = container.find_all('h3')

# Hemisphere titles with the ' Enhanced' suffix removed, ready for later display
titles = [heading.text.replace(' Enhanced', '') for heading in products]

# For every title: follow its link, click 'Open', re-parse the page and record
# the title together with the absolute URL of the wide image.
hemisphere_image_urls = []
for title in titles:
    browser.links.find_by_partial_text(title).click()
    browser.links.find_by_text('Open').click()

    html = browser.html
    soup = BeautifulSoup(html, 'lxml')

    # The image `src` is relative, so prefix it with the site root
    img_url = 'https://marshemispheres.com/' + soup.find('img', class_="wide-image")['src']
    hemisphere_image_urls.append({'title': title, 'img_url': img_url})

    # Click 'Close' and then 'Back' before moving on to the next title
    browser.links.find_by_text('Close').click()
    browser.links.find_by_partial_text('Back').click()

print(hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere', 'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'}, {'title': 'Schiaparelli Hemisphere', 'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'}, {'title': 'Syrtis Major Hemisphere', 'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'}, {'title': 'Valles Marineris Hemisphere', 'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]
