In [1]:
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Start a visible (non-headless) Chrome session through Splinter, using the
# driver binary that webdriver_manager installs.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
# Visit https://redplanetscience.com/
url = 'https://redplanetscience.com/'
browser.visit(url)

# Wait (up to 10 s) for a headline element to exist instead of sleeping for a
# fixed second: a slow page load would otherwise leave the soup without the
# elements the following cells look up.
if not browser.is_element_present_by_css('div.content_title', wait_time=10):
    raise RuntimeError(f"No 'div.content_title' element appeared on {url} within 10 seconds")

# Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

In [4]:
# Text of the first <div class="content_title"> in the parsed news page
news_title = soup.find('div', attrs={'class': 'content_title'}).get_text()

In [5]:
# Text of the first <div class="article_teaser_body"> in the parsed news page
news_intro = soup.find('div', attrs={'class': 'article_teaser_body'}).get_text()

In [6]:
# Visit https://spaceimages-mars.com/
url = 'https://spaceimages-mars.com/'
browser.visit(url)

# Wait (up to 10 s) for the header image element to exist instead of sleeping
# for a fixed second, so the soup below is not built from a half-loaded page.
if not browser.is_element_present_by_css('img.headerimage', wait_time=10):
    raise RuntimeError(f"No 'img.headerimage' element appeared on {url} within 10 seconds")

# Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

In [7]:
# First <img class="headerimage"> tag on the page (None when there is no match)
featured_image = soup.find('img', attrs={'class': 'headerimage'})

In [8]:
# Build the absolute image URL. urljoin resolves the tag's `src` against the
# site URL, so relative, root-relative and already-absolute values all yield a
# valid address (plain string concatenation only works for the first case).
featured_image_url = urljoin(url, featured_image['src'])

In [9]:
# Point `url` at the next site to scrape: galaxyfacts-mars.com
url = "https://galaxyfacts-mars.com/"

In [10]:
# Let pandas fetch the page and parse every HTML table on it into a DataFrame
tables = pd.read_html(io=url)

In [11]:
# Work on a copy of the first table. The following cells modify `mars_facts`
# in place; without the copy they would also alter `tables[0]`, and re-running
# from this cell would start from an already-modified frame.
mars_facts = tables[0].copy()

In [12]:
# Use the values of the first row as the column names
mars_facts.columns = mars_facts.iloc[0]

In [13]:
# Drop the first row, whose values were just used as the column names.
# NOTE: this is an in-place edit, so every re-run of this cell removes one more row.
mars_facts.drop(mars_facts.index[0], inplace=True)

In [16]:
# Rename the 'Mars - Earth Comparison' column to an empty string (in place)
mars_facts.rename(columns={'Mars - Earth Comparison' : ''}, inplace=True)

In [17]:
# Display the current state of the facts table
mars_facts

Unnamed: 0,Unnamed: 1,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [18]:
# Make the column named '' the index of the frame (in place).
# NOTE: set_index removes that column, so re-running this cell raises a KeyError.
mars_facts.set_index("", inplace=True)

In [19]:
# Display the facts table again after set_index
mars_facts

Unnamed: 0,Mars,Earth
,,
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [20]:
# Render the table as an HTML string with the listed CSS classes attached.
# The index is included in the output (index=False is not passed); at this
# point it holds the row labels set by set_index above.
mars_facts_html = mars_facts.to_html(classes=["table", "table-bordered", "table-striped", "table-hover", "col-lg-12"])

In [21]:
from pprint import pprint

In [22]:
# Pretty-print the generated HTML string to inspect the markup
pprint(mars_facts_html)

('<table border="1" class="dataframe table table-bordered table-striped '
 'table-hover col-lg-12">\n'
 '  <thead>\n'
 '    <tr style="text-align: right;">\n'
 '      <th></th>\n'
 '      <th>Mars</th>\n'
 '      <th>Earth</th>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th></th>\n'
 '      <th></th>\n'
 '      <th></th>\n'
 '    </tr>\n'
 '  </thead>\n'
 '  <tbody>\n'
 '    <tr>\n'
 '      <th>Diameter:</th>\n'
 '      <td>6,779 km</td>\n'
 '      <td>12,742 km</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Mass:</th>\n'
 '      <td>6.39 × 10^23 kg</td>\n'
 '      <td>5.97 × 10^24 kg</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Moons:</th>\n'
 '      <td>2</td>\n'
 '      <td>1</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Distance from Sun:</th>\n'
 '      <td>227,943,824 km</td>\n'
 '      <td>149,598,262 km</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Length of Year:</th>\n'
 '      <td>687 Earth days</td>\n'
 '      <td>365.24 days</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '

In [None]:
# Point `url` at the next site to scrape: marshemispheres.com
url = "https://marshemispheres.com/"

In [None]:
# Open the page currently stored in `url`
browser.visit(url)

# Wait (up to 10 s) for a 'div.description' element to exist instead of
# sleeping for a fixed second; the next cells collect exactly those elements
# and would silently produce empty lists from a half-loaded page.
if not browser.is_element_present_by_css('div.description', wait_time=10):
    raise RuntimeError(f"No 'div.description' element appeared on {url} within 10 seconds")

# Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

In [None]:
# Empty containers (each one a distinct list) used by the hemisphere-scraping steps
product_list, url_list, product_title = [], [], []
product_url_list, hemisphere_image_urls = [], []

In [None]:
# Every <div class="description"> on the page; each one is expected to hold a hemisphere's <h3> title and link
product_items = soup.find_all('div', attrs={'class': 'description'})

In [None]:
# Collect the title and link target of every hemisphere entry.
# The lists are rebuilt from scratch (instead of appended to) so that
# re-running this cell cannot leave duplicate entries behind.
product_title = [product.find('h3').text for product in product_items]
url_list = [product.find('a')['href'] for product in product_items]

# Absolute URL of each hemisphere page; urljoin also copes with hrefs that
# are root-relative or already absolute.
product_url_list = [urljoin(url, p_url) for p_url in url_list]

In [None]:
# Open the first hemisphere's page by clicking the link that carries its title,
# then parse the page that was opened.
browser.links.find_by_partial_text(product_title[0]).click()
html = browser.html
soup = bs(html, "html.parser")

# Go back to the index page: the next cell clicks the hemisphere links by their
# text and expects to start from the page those titles were collected from.
browser.visit(url)

In [None]:
# Visit every hemisphere page and record its title and image URL.
#
# Changes compared with a single try/bare-except around the whole loop:
#   * each hemisphere is handled on its own, so one failure no longer aborts
#     the remaining ones or gets silently swallowed;
#   * every iteration starts from the index page, whatever page the browser
#     was left on by an earlier cell;
#   * the result list is rebuilt, so re-running the cell adds no duplicates.
hemisphere_image_urls = []

for link_text in product_title:
    browser.visit(url)
    try:
        browser.links.find_by_partial_text(link_text).click()
    except ElementDoesNotExist:
        print(f"Skipping {link_text!r}: no matching link found on {url}")
        continue

    # Give the detail page a moment to expose its image; the checks below
    # handle the case where it never appears.
    browser.is_element_present_by_css('div#wide-image img', wait_time=5)

    html = browser.html
    soup = bs(html, "html.parser")
    heading = soup.find('h2', class_='title')
    wide_image = soup.find('div', id='wide-image')
    images = wide_image.find_all('img') if wide_image is not None else []
    # The second <img> inside #wide-image is the one whose src is stored
    image = images[1].get('src') if len(images) > 1 else None

    if heading is None or not image:
        print(f"Skipping {link_text!r}: title or image not found on its page")
        continue

    hemisphere_image_urls.append({'title': heading.text, 'img_url': urljoin(url, image)})

print(f"Collected {len(hemisphere_image_urls)} of {len(product_title)} hemisphere images")
print("Scraping Complete")

In [None]:
# Bundle every scraped result into one dictionary
mars_data_dict = dict(
    Featured_img=featured_image_url,
    NewsTitle=news_title,
    NewsIntro=news_intro,
    Mars_Facts=mars_facts_html,
    Hemisphere_Imgs=hemisphere_image_urls,
)

In [None]:
# Shut the browser session down now that all scraping steps above are done
browser.quit()

In [None]:
# Display the assembled result dictionary
mars_data_dict

In [None]:
# Print the image URL of each collected hemisphere, one per line
hemisphere_entries = mars_data_dict['Hemisphere_Imgs']
for hemisphere in hemisphere_entries:
    print(hemisphere['img_url'])