In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
from splinter import Browser
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager


In [2]:
# Start a visible (non-headless) Chrome session through splinter, pointing it
# at the chromedriver binary that webdriver_manager installs.
driver_location = ChromeDriverManager().install()
executable_path = {'executable_path': driver_location}
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\61430\.wdm\drivers\chromedriver\win32\99.0.4844.51]


In [3]:
# Connect PyMongo to the MongoDB server listening on localhost:27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [4]:
# Select the "mars" database and its "articles" collection
db = client['mars']
collection = db['articles']

# NASA Mars News

In [5]:
# URL of the Mars news page to be scraped
url = 'https://redplanetscience.com/'
browser.visit(url)

# Give the page a bounded amount of time to show its first headline before
# snapshotting the HTML. The following cells call .get_text() directly on the
# result of soup.find('div', class_='content_title'); if that element is not
# in the DOM yet, find() returns None and they fail with AttributeError.
# NOTE(review): assumes the article list may be populated after visit()
# returns — the call is harmless if the element is already present.
browser.is_element_present_by_css('div.content_title', wait_time=5)
html = browser.html

# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(html, 'html.parser')

In [6]:
# Text of the first <div class="content_title"> on the page (the first headline found)
title_div = soup.find('div', class_='content_title')
heading = title_div.get_text()
print(heading)



Media Get a Close-Up of NASA's Mars 2020 Rover


In [7]:
# Text of the first <div class="article_teaser_body"> on the page (the teaser paragraph)
teaser_div = soup.find("div", class_="article_teaser_body")
p_text = teaser_div.get_text()
print(p_text)

The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch.


# JPL Mars Space Images - Featured Image

In [8]:
# Visit the space-images site with splinter and snapshot its HTML
url = "https://spaceimages-mars.com/"
browser.visit(url)
html = browser.html

# Parse the page, read the src of the <img class="headerimage"> element,
# and prefix it with the site URL to get an absolute link
soup = BeautifulSoup(html, "html.parser")
header_img = soup.find("img", class_="headerimage")
image = header_img["src"]
featured_image_url = url + image
featured_image_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

# Mars Facts

In [9]:
# Read the first HTML table on the facts page into a DataFrame and show it raw
mars_tables = pd.read_html("https://galaxyfacts-mars.com/")
facts = mars_tables[0]
print(facts)

# Move the row index into a regular column, then label all four columns
facts = facts.reset_index()
facts.columns = ["ID", "Profile_Point", "Mars", "Earth"]
facts

                         0                1                2
0  Mars - Earth Comparison             Mars            Earth
1                Diameter:         6,779 km        12,742 km
2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
3                   Moons:                2                1
4       Distance from Sun:   227,943,824 km   149,598,262 km
5          Length of Year:   687 Earth days      365.24 days
6             Temperature:     -87 to -5 °C      -88 to 58°C


Unnamed: 0,ID,Profile_Point,Mars,Earth
0,0,Mars - Earth Comparison,Mars,Earth
1,1,Diameter:,"6,779 km","12,742 km"
2,2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,3,Moons:,2,1
4,4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,5,Length of Year:,687 Earth days,365.24 days
6,6,Temperature:,-87 to -5 °C,-88 to 58°C


# Mars Hemispheres

In [10]:
# Open the hemispheres index page and parse the HTML the browser currently holds
hemispheres_home = 'https://marshemispheres.com/'
browser.visit(hemispheres_home)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')



In [11]:
# Build the list of hemisphere titles: take the first
# <div class="collapsible results"> and read the text of each <h3> inside it
results = soup.find_all('div', class_="collapsible results")
hemi = results[0].find_all('h3')
hemispheres = [heading_tag.text for heading_tag in hemi]

hemispheres 

['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

In [12]:
# Gather every anchor in the first results block, keep only those that wrap
# an <img>, and turn each href into an absolute URL on the hemispheres site
link_results = results[0].find_all('a')
links = [
    'https://marshemispheres.com/' + anchor['href']
    for anchor in link_results
    if anchor.img
]

print(links)


['https://marshemispheres.com/cerberus.html', 'https://marshemispheres.com/schiaparelli.html', 'https://marshemispheres.com/syrtis.html', 'https://marshemispheres.com/valles.html']


In [17]:
# Visit each hemisphere page in `links` and record the absolute URL of its
# first <img class="wide-image">
imgs = []

for url in links:
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # The src attribute is relative, so prefix it with the site root
    wide_images = soup.find_all('img', class_='wide-image')
    imgs.append('https://marshemispheres.com/' + wide_images[0]['src'])

imgs

['https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg']

In [21]:
# Pair each hemisphere title with its image URL (by position) and build one
# {'img_url': ..., 'title': ...} record per pair
mars_hemi_combined = zip(hemispheres, imgs)

hemisphere_image_urls = [
    {'img_url': image_url, 'title': hemi_title}
    for hemi_title, image_url in mars_hemi_combined
]

hemisphere_image_urls


[{'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]