# Step 1 - Scraping #

Use Jupyter Notebook, BeautifulSoup, Pandas, and Requests/Splinter.

In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
from webdriver_manager.chrome import ChromeDriverManager
import pymongo
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from splinter import Browser

### NASA Mars News

Scrape the Mars News Site (https://redplanetscience.com/) and;
collect the latest News Title and Paragraph Text. 
Assign the text to variables that you can reference later.

In [2]:
# URL of page to be scraped
url_nasa_mars_news = 'https://mars.nasa.gov/news/'
    # Tried using link provided, couldn't get it to work. 
    # Was advised by classmate, to use above link.
    # Was later advised by AskBSC redplanetscience was correct link.
    # Will use links provided in readme, moving forward.

# Retrieve page, with requests module
response_mars = requests.get(url_nasa_mars_news)

In [3]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response_mars.text, 'html.parser')
# print(soup.prettify())

In [4]:
# Find all div elements and class, for news titles
news_titles = soup.find_all('div', class_="slide")

In [5]:
# Retrieve latest news title
latest_title = news_titles[0].find('div', class_='content_title')
news_title = latest_title.text.strip()
print(news_title)

# Retrieve corresponding latest news paragraph
latest_paragrapph = news_titles[0].find('div', class_='rollover_description_inner')
news_p = latest_paragrapph.text.strip()
print(news_p)

NASA Will Inspire World When It Returns Mars Samples to Earth in 2033
This advanced mission architecture will include two sample recovery helicopters.


### JPL Mars Space Images - Featured Image

In [27]:
## 1) open URL via splinter to navigate to url
## 2) use beautiful soup to parse page and obtain full size image .jpg
## 3) use splinter to navigate by clicking full image

## 1) open URL via splinter in prep to navigate to full image =========================
# Import splinter and set up the chromedriver path
from splinter import Browser

# Set url
mars_space_images_url = 'https://spaceimages-mars.com'

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(mars_space_images_url)

# Setup BeautifulSoup
html = browser.html
fullimage_html = BeautifulSoup(html, "html.parser")

# Find link to full image page
fullimage_url_link = fullimage_html.find('img', class_='headerimage fade-in').get('src')
featured_image_url = (f'{mars_space_images_url}/{fullimage_url_link}')
print(f'featured_image_url = {featured_image_url}')


## 3) use splinter to navigate by clicking full image ==================================
# This was not required, as above link works to provide full image.
# button_by_text = browser.find_by_text(' FULL IMAGE')
# button_by_text.click()

## 4) Quit browser
browser.quit()


featured_image_url = https://spaceimages-mars.com/image/featured/mars2.jpg


### Mars Facts

In [2]:
# Dependencies
import pandas as pd

In [15]:
url_mars_facts = 'https://galaxyfacts-mars.com/'
tables = pd.read_html(url_mars_facts)
# print(tables)

# From type(tables), understand tables returned as list.append
# Want 2nd table:
mars_facts_df = tables[1]
# mars_facts_df.head()

# Rename columns
mars_facts_df.columns = ["Profile_component", "Value"]
mars_facts_df

# Convert df to HTML table string, using pandas
mars_facts_html = mars_facts_df.to_html
mars_facts_html

<bound method DataFrame.to_html of       Profile_component                          Value
0  Equatorial Diameter:                       6,792 km
1       Polar Diameter:                       6,752 km
2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
3                Moons:          2 ( Phobos & Deimos )
4       Orbit Distance:       227,943,824 km (1.38 AU)
5         Orbit Period:           687 days (1.9 years)
6  Surface Temperature:                   -87 to -5 °C
7         First Record:              2nd millennium BC
8          Recorded By:           Egyptian astronomers>

### Mars Hemispheres

In [176]:
# Dependencies
from splinter import Browser

executable_path = {'executable_path': ChromeDriverManager().install()}
browser_hemis = Browser('chrome', **executable_path, headless=False)
url_hemis = 'https://marshemispheres.com/'
browser_hemis.visit(url_hemis)


## Identify the full resolution links for all pictures
# 1) Setup BeautifulSoup
html_hemi = browser_hemis.html
soup_hemi = BeautifulSoup(html_hemi, "html.parser")







[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [None]:
# Search on description, as this is unique to the 4 pictures/links.
images = soup_hemi.find_all('div', class_="description")

# To capture each iteration
hemi_dict = {}

# To append into dictionary
hemisphere_image_url = []


## 2) Iterate through each link from base url:
for image in images:
    # Scrape titles of each photo
    title = image.find('h3').text
    # print(title)
    
    # Obtain url link for image
    url_hemi_images = image.find('a')['href']
    # Construct full link to 
    img_url_ind = f'{url_hemis}{url_hemi_images}'
    # print(img_url_ind)    
    
    # Setup splinter in each individual hemi page
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser_hemis_ind = Browser('chrome', **executable_path, headless=True)
    browser_hemis_ind.visit(img_url_ind)
    
    # Setup BS
    html_img_ind = browser_hemis_ind.html
    soup_img_ind = BeautifulSoup(html_img_ind, "html.parser")
    
    # Now find full image URL from this page
    image_url_ind = soup_img_ind.find_all('img', class_="wide-image")
    url_ind = image_url_ind[0]['src']
    
    # Create full image url with base url
    img_url = f'{url_hemis}{url_ind}'
    # print(img_url)
    
    # Add title to dictionary
    hemi_dict = {"title": title, "img_url": img_url}
    
    # Append each interation so not overwritten
    hemisphere_image_url.append(hemi_dict)


hemisphere_image_url