In [1]:
# Import dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Create a PyMongo client for the MongoDB server at localhost:27017.
# `conn` is the connection string; `client` is the resulting MongoClient handle.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

### NASA Mars News

In [3]:
# URL of the Mars news page to be scraped
news_url = 'https://redplanetscience.com/'

# Obtain a chromedriver through webdriver_manager and hand its location to
# splinter as the 'executable_path' keyword. headless=False starts Chrome with
# a visible window. This one `browser` object is reused for every page visit
# in the notebook.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [/Users/ellengrove/.wdm/drivers/chromedriver/mac64/101.0.4951.41/chromedriver] found in cache


In [4]:
# Open the news page, then wait (up to 5 seconds) for at least one article
# container to appear. The following cell snapshots browser.html immediately,
# so without this wait a "Restart & Run All" can capture the page before the
# article list exists and end up with empty results.
# The trailing semicolon keeps the True/False result out of the cell output.
browser.visit(news_url)
browser.is_element_present_by_css('div.list_text', wait_time=5);

In [5]:
# Snapshot the HTML of the page currently loaded in the browser and parse it
# with BeautifulSoup's 'html.parser' backend.
# Note: `html` and `soup` are rebound by later cells for the other pages, so
# they always refer to the most recently parsed page.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [6]:
# Collect every <div class="list_text"> element; each one wraps a single article
results = soup.find_all('div', class_='list_text')

# Headline text and teaser text, kept as two parallel lists in page order
titles = [item.find(class_='content_title').text for item in results]
para_text = [item.find(class_='article_teaser_body').text for item in results]

### Mars Facts

In [7]:
# URL for the table we are scraping using Pandas
table_url = 'https://galaxyfacts-mars.com/'

# pd.read_html fetches the page and returns a list of DataFrames, one per
# HTML table it finds; the first one is kept as `mars_facts`.
# The header text arrives as data row 0 and is promoted/dropped in the
# following cells.
tables = pd.read_html(table_url)
mars_facts = tables[0]

In [8]:
# Use the values of the row labelled 0 (the scraped header text) as the column names.
# The row itself stays in the frame; it is removed in a separate step.
# NOTE(review): this is order-dependent - once the header row has been dropped
# and the index reset, label 0 is a data row, so re-running this cell would
# install data values as column names.
mars_facts.columns = mars_facts.loc[0]

In [9]:
# Drop the leftover header row and renumber the index from 0.
# The guard makes this cell safe to re-run: the first row is removed only
# while it still duplicates the column names. The previous unconditional
# in-place drop deleted one more data row on every re-execution.
first_row_is_header = (
    len(mars_facts) > 0
    and mars_facts.iloc[0].tolist() == mars_facts.columns.tolist()
)
if first_row_is_header:
    # Rebind to a new frame instead of mutating with inplace=True
    mars_facts = mars_facts.iloc[1:].reset_index(drop=True)
mars_facts

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-87 to -5 °C,-88 to 58°C


### JPL Mars Space Images—Featured Image

In [10]:
# URL to scrape for featured image. The same string is also used as the
# prefix when the featured image's href is turned into a full URL.
img_url = 'https://spaceimages-mars.com/'

# Point the shared browser at the image page
browser.visit(img_url)

In [11]:
# Re-snapshot and re-parse: `html` and `soup` now describe the image page,
# replacing whatever page they held before.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [12]:
# Find the anchor(s) that wrap the featured image
links = soup.find_all('a', class_='showimg fancybox-thumbs')

# Fail loudly when nothing matches. Previously the loop body simply never ran,
# leaving featured_img_url undefined (or holding a stale value from an earlier
# run of the kernel).
if not links:
    raise ValueError(f"No featured-image link found on {img_url}")

# As before, the last matching anchor wins; its href is appended to the
# site URL to form the full image address.
featured_img_url = img_url + links[-1]['href']

### Mars Hemispheres

In [13]:
# Index page listing the Mars hemispheres. The same string is reused as the
# prefix for each hemisphere's detail-page link and full-size image link.
hemisphere_url = 'https://marshemispheres.com/'

# Point the shared browser at the hemispheres index page
browser.visit(hemisphere_url)

In [14]:
# Re-snapshot and re-parse: `html` and `soup` now describe the hemispheres
# index page, replacing whatever page they held before.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [15]:
# One {'title', 'img_url'} record per hemisphere, in page order
hemisphere_img_urls = []

# Every hemisphere on the index page is wrapped in a <div class="item">
images = soup.find_all('div', class_='item')

for image in images:
    # Follow the item's link to its detail page and parse that page
    browser.visit(hemisphere_url + image.a['href'])
    detail_soup = BeautifulSoup(browser.html, 'html.parser')

    # Among the anchors that open in a new tab, the third one (index 2) is
    # the one whose href is used as the full-size image path
    new_tab_links = detail_soup.find_all('a', target='_blank')

    # Record the heading text from the index page alongside the absolute image URL
    hemisphere_img_urls.append({
        'title': image.h3.text,
        'img_url': hemisphere_url + new_tab_links[2]['href'],
    })

hemisphere_img_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'}]