In [1]:
# Import dependencies (standard library first, then third-party)
import os
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Start a visible (non-headless) Chrome session through Splinter.
# ChromeDriverManager().install() supplies the path of the chromedriver binary.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser("chrome", headless=False, **executable_path)



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [C:\Users\Admin\.wdm\drivers\chromedriver\win32\90.0.4430.24\chromedriver.exe] found in cache


## Step 1 - Scraping

### NASA Mars Latest News

In [3]:
# Open the NASA Mars news listing in the automated browser
url = "https://mars.nasa.gov/news/"
browser.visit(url)

In [4]:
# Wait briefly for the article list to show up before snapshotting the page.
# NOTE(review): the list items are presumably rendered by script after the
# initial load -- confirm against the live page. The boolean result is
# ignored on purpose; a missing list surfaces in the cell that selects it.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=2)

# Snapshot the HTML currently rendered by the browser
html = browser.html

# Parse the snapshot with Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Retrieve the element that contains the latest news title and teaser paragraph.
# select_one() takes a CSS selector and returns the first match (or None):
# here, the first <li class="slide"> nested inside <ul class="item_list">.
# The selector expresses the nesting more directly than chained find() calls.
results = soup.select_one("ul.item_list li.slide")
results

<li class="slide"><div class="image_and_description_container"><a href="/news/8942/nasas-ingenuity-mars-helicopter-completes-first-one-way-trip/" target="_self"><div class="rollover_description"><div class="rollover_description_inner">The Red Planet rotorcraft headed south in support of furthering research into the potential use of aerial scouts on Mars in the future.</div><div class="overlay_arrow"><img alt="More" src="/assets/overlay-arrow.png"/></div></div><div class="list_image"><img alt="NASA’s Ingenuity Mars Helicopter’s fifth flight was captured on May 7, 2021, by one of the navigation cameras aboard the agency’s Perseverance rover. This was the first time it flew to a new landing site." src="/system/news_items/list_view_images/8942_PIA24647-320x240.jpg"/></div><div class="bottom_gradient"><div><h3>NASA's Ingenuity Mars Helicopter Completes First One-Way Trip  </h3></div></div></a><div class="list_text"><div class="list_date">May  7, 2021</div><div class="content_title"><a href=

In [6]:
   # scrape the article title 
    news_title = results.find('div', class_='content_title').text
    
    # scrape the article subheader / paragraph
    news_paragraph = results.find('div', class_='article_teaser_body').text
    
    # print latest title and paragraph
    print('---------------------------------------------------------------------')
    print(f"Latest title: {news_title}")
    print('---------------------------------------------------------------------')
    print(news_paragraph)

---------------------------------------------------------------------
Latest title: NASA's Ingenuity Mars Helicopter Completes First One-Way Trip  
---------------------------------------------------------------------
The Red Planet rotorcraft headed south in support of furthering research into the potential use of aerial scouts on Mars in the future.


### JPL Mars Space Images - Featured Image

In [7]:
# Open the Mars space images site in the automated browser
url = "https://spaceimages-mars.com/"
browser.visit(url)

In [8]:
# Snapshot the rendered page, then parse it with Beautiful Soup
html = browser.html
soup = BeautifulSoup(markup=html, features="html.parser")

In [9]:
# Find the featured (header) image and its caption with Beautiful Soup
results = soup.find_all('img', class_="headerimage fade-in")
image_title = soup.find('h1', class_="media_feature_title").text.strip()

# Fail with a clear message when no header image is found, rather than
# hitting a NameError on an unassigned variable further down.
if not results:
    raise ValueError("No featured image found on the Mars space images page")

# The src attribute holds the image location; if several images match,
# the last one is used.
featured_image_url = results[-1]['src']

# Base website URL
main_url = 'https://spaceimages-mars.com/'

# urljoin handles relative, root-relative and absolute src values, whereas
# string concatenation is only correct for a path without a leading slash.
featured_image_url = urljoin(main_url, featured_image_url)

# Display the full link to the featured image
print('featured_image_url:  ' + featured_image_url)

featured_image_url:  https://spaceimages-mars.com/image/featured/mars2.jpg


### Mars Facts

In [10]:
# Open the Mars facts page in the automated browser
url = "https://galaxyfacts-mars.com/"
browser.visit(url)

In [11]:
# pandas.read_html returns a list with one DataFrame per table it finds.
# For this page the list holds two tables:
#   index 0 -> Mars / Earth comparison
#   index 1 -> Mars facts
extraction_df = pd.read_html(io=url)
extraction_df

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [12]:
# Take the second table (index 1), the Mars facts, as the frame that will be
# exported to HTML
mars_facts_df = extraction_df[1]
mars_facts_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 ( Phobos & Deimos )
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [13]:
# Check the column names (read_html left them as positional integers)
mars_facts_df.columns

Int64Index([0, 1], dtype='int64')

In [14]:
# Give the two positional columns readable headers.
# rename() returns a new frame, so the table stored in the read_html list is
# not mutated and re-running this cell is harmless.
mars_facts_df = mars_facts_df.rename(columns={0: 'Mars facts', 1: 'Data/Description'})
mars_facts_df

Unnamed: 0,Mars facts,Data/Description
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 ( Phobos & Deimos )
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [15]:
# Render the facts frame as an HTML <table> string; index=False leaves the
# row index out of the markup
mars_fact_html = mars_facts_df.to_html(index=False)
mars_fact_html

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>Mars facts</th>\n      <th>Data/Description</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 ( Phobos &amp; Deimos )</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

In [16]:
# Write the facts table to 'mars_detailed_facts.html' in the current working
# directory. The context manager closes the file even if the write raises.
with open("mars_detailed_facts.html", "w", encoding='utf-8') as text_file:
    text_file.write(mars_fact_html)

In [17]:
# Also keep the Mars / Earth comparison table (index 0); it is needed for the
# scrape function later.
# The tables were already parsed into `extraction_df`, so reuse them instead of
# downloading the page again (which would also depend on the current value of
# the shared `url` variable). The frame is built in one chain so nothing is
# mutated in place:
#   - rename the positional columns to readable headers
#   - drop the first row, which only repeats those header labels
mars_earth_df = (
    extraction_df[0]
    .rename(columns={0: 'Mars - Earth Comparison', 1: 'Mars', 2: "Earth"})
    .iloc[1:, :]
)

# Render the comparison as an HTML <table> string without the row index
mars_earth_comparison_html = mars_earth_df.to_html(index=False)

In [18]:
# Write the comparison table to 'mars_earth_comparison.html' in the current
# working directory. The context manager closes the file even if the write raises.
with open("mars_earth_comparison.html", "w", encoding='utf-8') as text_file:
    text_file.write(mars_earth_comparison_html)

### Mars Hemispheres

In [19]:
# Open the Mars hemispheres site in the automated browser
url = 'https://marshemispheres.com/'
browser.visit(url)

In [20]:
# Snapshot the rendered page, then parse it with Beautiful Soup
html = browser.html
soup = BeautifulSoup(markup=html, features="html.parser")

In [21]:
# Collect every <div> whose class attribute is "collapsible results";
# the hemisphere entries are read from the first of them further down.
results = soup.find_all("div", class_="collapsible results")

### Titles

In [22]:
# Each hemisphere title is an <h3> inside the first "collapsible results" div
results = soup.find_all('div', class_="collapsible results")
hemispheres = results[0].find_all('h3')

# Keep just the heading text, in page order
hemi_names = [heading.text for heading in hemispheres]

hemi_names

['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

### Images

In [23]:
# Site root that the scraped relative links are appended to
base_url = "https://marshemispheres.com/"

# All anchors inside the first results container
thumbnail_results = results[0].find_all('a')

# Keep only the anchors that wrap an <img> (the thumbnails) and turn each
# href into a full link
thumbnail_links = [
    base_url + anchor['href']
    for anchor in thumbnail_results
    if anchor.img
]

thumbnail_links

['https://marshemispheres.com/cerberus.html',
 'https://marshemispheres.com/schiaparelli.html',
 'https://marshemispheres.com/syrtis.html',
 'https://marshemispheres.com/valles.html']

In [24]:
# Visit every hemisphere page and record the URL of its full-resolution image.
# Loop-local names (page_url, page_soup, wide_image) are used so the
# notebook-level `url`, `html`, `soup` and `results` are not overwritten
# as a side effect of running this cell.
full_imgs = []

for page_url in thumbnail_links:

    # Open the hemisphere page and parse what the browser rendered
    browser.visit(page_url)
    page_soup = BeautifulSoup(browser.html, 'html.parser')

    # The first <img class="wide-image"> carries the image path
    wide_image = page_soup.find('img', class_='wide-image')
    if wide_image is None:
        raise ValueError(f"No full-resolution image found on {page_url}")

    # urljoin copes with relative, root-relative and absolute src values
    full_imgs.append(urljoin(base_url, wide_image['src']))

for image in full_imgs:
    print(image)

https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg
https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg
https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg
https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg


### Create a dictionary

In [25]:
# zip() silently stops at the shorter input, so check that every hemisphere
# name has a matching image link before pairing them up.
assert len(hemi_names) == len(full_imgs), "hemisphere names and image links differ in length"

# One dictionary per hemisphere: {'title': ..., 'img_url': ...}
hemisphere_data = [
    {'title': title, 'img_url': img}
    for title, img in zip(hemi_names, full_imgs)
]

# Echo each pairing for a quick visual check
for entry in hemisphere_data:
    print("Title:  ", entry['title'])
    print("Image:  ", entry['img_url'])
    print("-------------------------------------------------")

hemisphere_data


Title:   Cerberus Hemisphere Enhanced
Image:   https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg
-------------------------------------------------
Title:   Schiaparelli Hemisphere Enhanced
Image:   https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg
-------------------------------------------------
Title:   Syrtis Major Hemisphere Enhanced
Image:   https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg
-------------------------------------------------
Title:   Valles Marineris Hemisphere Enhanced
Image:   https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg
-------------------------------------------------


[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [26]:
# Close the automated Splinter browser now that all scraping is done
browser.quit()