In [None]:
# Dependencies
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import time
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

### Scraping Headlines

In [None]:
# url to scrape for headlines
url = 'https://mars.nasa.gov/news/'

# launch a visible Chrome window driven by the local chromedriver binary
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# access the website and create a bs4 object
# The refactored version of this scrape further down pauses after visit()
# before reading the HTML; do the equivalent here by polling (up to 10 s) for
# the first teaser element, so the lookups below do not hit an empty result
# list and raise IndexError.
browser.visit(url)
browser.is_element_present_by_css('div.article_teaser_body', wait_time=10)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# save top headline and article teaser
# NOTE(review): the title uses index 1 while the teaser uses index 0 --
# presumably the first 'content_title' div is not an article headline;
# confirm against the live page.
news_title = soup.find_all('div', class_='content_title')[1].get_text()
news_p = soup.find_all('div', class_='article_teaser_body')[0].get_text()

### Scraping Featured Image

In [None]:
# Close the session left open by the headlines scrape before starting a new
# one, so Chrome windows and chromedriver processes do not accumulate.
try:
    browser.quit()
except NameError:
    pass  # no earlier session in this kernel (cell run on its own)

# start a fresh Chrome session for the JPL featured-image pages
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# open the jpl.nasa.gov space-images page with the category=Mars query
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [None]:
# click to get to full size image
# (follows a link whose text contains 'FULL IMAGE')
browser.click_link_by_partial_text('FULL IMAGE')

In [None]:
# second click: follow the 'more info' link, matched by partial text rather
# than by its exact label
browser.click_link_by_partial_text('more info')

In [None]:
# scrape and find the image url
# snapshot the page the browser is currently showing and parse it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# first <img> tag carrying the 'main_image' class (None if the page has none)
image = soup.find('img', class_='main_image')

In [None]:
# build the absolute image URL: site root followed by the tag's src attribute
main_image_tag = soup.find('img', class_='main_image')
featured_image_url = 'https://www.jpl.nasa.gov' + main_image_tag.get('src')

In [None]:
# show the assembled URL as the cell output
featured_image_url

### Scraping Twitter for Current Weather

In [None]:
# scraping twitter to get current Mars weather
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
# Pause before snapshotting the HTML, mirroring the 4-second wait used by the
# refactored Twitter scrape later in this notebook.
time.sleep(4)
html = browser.html

soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Mars weather from most recent tweet: the first text node mentioning "InSight".
# find() returns None when nothing matches, so mars_weather is always bound
# after this cell instead of being left undefined (or holding a stale value
# from an earlier run) when the parse fails.
mars_weather = soup.find(text=re.compile("InSight"))
if mars_weather is None:
    print("Parse failed")

### Scrape and Create a Mars Facts Table

In [None]:
# scrape to get Mars fact table
# (read_html returns a list of DataFrames, one per table it parses from the page)
url = 'https://space-facts.com/mars/'
tables = pd.read_html(url)

In [None]:
# create a dataframe from the table
# NOTE(review): assumes the first table on the page is the two-column facts
# table -- the column assignment below fails if it has any other width.
mars_df = tables[0]
mars_df.columns = ['info', 'Value']

In [None]:
# use the fact labels as the row index; re-running this cell without first
# re-running the previous one raises KeyError, because 'info' is then no
# longer a column
mars_df = mars_df.set_index('info')

In [None]:
# clear the index name so 'info' is not shown as a header, then display the frame
mars_df.index.name = None
mars_df

In [None]:
# render the facts table as one HTML string with every newline stripped out
html_table = mars_df.to_html().replace('\n', '')

### Scraping Hemisphere Images

In [None]:
# USGS Astrogeology search results for the enhanced Mars hemisphere images
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# Close the session used by the earlier scrapes before starting a new one, so
# Chrome windows and chromedriver processes do not accumulate.
try:
    browser.quit()
except NameError:
    pass  # no earlier session in this kernel (cell run on its own)

executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# The browser launched for this section has not loaded any page yet, so open
# the search results before snapshotting the HTML; without this visit the
# parse below sees no result items and hemi_titles stays empty.
browser.visit(url)
# poll (up to 10 s) for the first result item to be present
browser.is_element_present_by_css('div.item', wait_time=10)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# each <div class="item"> is one result; its <h3> text is the hemisphere name
links = soup.find_all('div', class_='item')
hemi_titles = [link.find('h3').get_text() for link in links]

In [None]:
# start from the search-results page and give it time to load
browser.visit(url)
time.sleep(4)

# one {'title', 'img_url'} dict per hemisphere
hemi_imgs = []

for hemi_title in hemi_titles:
    try:
        browser.click_link_by_partial_text(hemi_title)
    except Exception:
        # Was a bare `except:`, which also trapped KeyboardInterrupt and
        # SystemExit and made the loop impossible to interrupt cleanly.
        # NOTE(review): the fallback clicks the link labelled '2' and then
        # retries the title link -- presumably '2' navigates to a page where
        # the title link exists again; confirm against the live site.
        browser.find_link_by_text('2').first.click()
        browser.click_link_by_partial_text(hemi_title)

    # parse the page reached by the click and take the first link inside
    # the 'downloads' div as the image URL
    html = browser.html
    soup2 = BeautifulSoup(html, 'html.parser')
    hemi_soup = soup2.find('div', 'downloads')
    hemi_url = hemi_soup.a['href']

    # store the name without the ' Enhanced' suffix
    hemi_dict = {"title": hemi_title.replace(' Enhanced', ''), 'img_url': hemi_url}
    hemi_imgs.append(hemi_dict)

#### Testing Python code

In [None]:
def init_browser(executable_path='chromedriver.exe', headless=False):
    """Start a Chrome session through splinter and return it.

    Args:
        executable_path: Path to the chromedriver binary. Defaults to
            'chromedriver.exe', the value that used to be hard-coded here.
        headless: When True, ask for Chrome without a visible window.
            Defaults to False, matching the previous behaviour.

    Returns:
        Whatever splinter's Browser(...) call returns; the caller is
        responsible for calling quit() on it when finished.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)

In [None]:
# container for everything the scrape collects
mars_data = {}

# Close the session left over from the exploratory cells before opening the
# one used below, so Chrome windows and chromedriver processes do not pile up.
try:
    browser.quit()
except NameError:
    pass  # no earlier session in this kernel (section run on its own)

browser = init_browser()

In [None]:
# load the Mars news page and pause before parsing it
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
time.sleep(4)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# headline = second 'content_title' div, teaser = first 'article_teaser_body' div
title_divs = soup.find_all('div', class_='content_title')
teaser_divs = soup.find_all('div', class_='article_teaser_body')
mars_data['news_title'] = title_divs[1].get_text()
mars_data['news_p'] = teaser_divs[0].get_text()

In [None]:
# open the JPL space-images page (category=Mars) and pause while it loads
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
time.sleep(4)

# two clicks in sequence: the 'FULL IMAGE' link, then the 'more info' link
for link_text in ('FULL IMAGE', 'more info'):
    browser.click_link_by_partial_text(link_text)

# parse the resulting page and locate the main image tag
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
main_image_tag = soup.find('img', class_='main_image')

# absolute URL = site root + the tag's src attribute
mars_data['featured_image_url'] = 'https://www.jpl.nasa.gov' + main_image_tag.get('src')

In [None]:
# load the Mars weather Twitter page and pause before parsing it
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
time.sleep(4)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Mars weather: first text node mentioning "InSight" (IndexError if none match)
insight_pattern = re.compile("InSight")
weather_matches = soup.find_all(text=insight_pattern)
mars_data['mars_weather'] = weather_matches[0]

In [None]:
tables = pd.read_html('https://space-facts.com/mars/')

# Build the facts table the same way as the exploratory cells earlier in this
# notebook: the label column becomes the index and the index name is cleared.
# Previously the labels were left as an ordinary column with an empty header,
# so to_html() also emitted the default 0..N integer index as an extra column.
# NOTE(review): assumes the first table on the page has exactly two columns.
mars_df = tables[0]
mars_df.columns = ['info', 'Value']
mars_df = mars_df.set_index('info')
mars_df.index.name = None

# convert to HTML and drop any \n in the stored copy
html_table = mars_df.to_html()
mars_data['html_table'] = html_table.replace('\n', '')

In [None]:
# USGS Astrogeology search results for the enhanced Mars hemisphere images
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

browser.visit(url)
time.sleep(4)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# each <div class="item"> is one result; its <h3> text is the hemisphere name
hemi_links = soup.find_all('div', class_='item')
hemi_titles = [link.find('h3').get_text() for link in hemi_links]
hemi_imgs = []

for hemi_title in hemi_titles:
    try:
        browser.click_link_by_partial_text(hemi_title)
    except Exception:
        # Was a bare `except:`, which also trapped KeyboardInterrupt and
        # SystemExit and made the loop impossible to interrupt cleanly.
        # NOTE(review): the fallback clicks the link labelled '2' and then
        # retries the title link -- presumably '2' navigates to a page where
        # the title link exists again; confirm against the live site.
        browser.find_link_by_text('2').first.click()
        browser.click_link_by_partial_text(hemi_title)

    # parse the page reached by the click and take the first link inside
    # the 'downloads' div as the image URL
    html = browser.html
    soup2 = BeautifulSoup(html, 'html.parser')
    hemi_soup = soup2.find('div', 'downloads')
    hemi_url = hemi_soup.a['href']

    # store the name without the ' Enhanced' suffix
    hemi_dict = {"title": hemi_title.replace(' Enhanced', ''), 'img_url': hemi_url}
    hemi_imgs.append(hemi_dict)

mars_data['hemi_imgs'] = hemi_imgs