In [11]:
#import dependencies
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import requests

##Scraping

#NASA Mars News

In [12]:
#url and access it:
url = 'https://mars.nasa.gov/news/?page=0'
html = requests.get(url)

In [13]:
#use BeautifulSoup to convert to text and parse it:
soup = BeautifulSoup(html.text, 'html.parser')

In [14]:
#review of the html finds that the titles are contained within the "content_title" class, as we need the latest, conduct a .find (instead of find_all):
title_results = soup.find('div', class_='content_title')
title_results

<div class="content_title">
<a href="/news/8716/nasa-to-broadcast-mars-2020-perseverance-launch-prelaunch-activities/">
NASA to Broadcast Mars 2020 Perseverance Launch, Prelaunch Activities
</a>
</div>

In [15]:
#the title is contained within the a tag; conver to a string and strip the carriage returns:
title = title_results.find('a')
news_title = title.string.strip()
news_title

'NASA to Broadcast Mars 2020 Perseverance Launch, Prelaunch Activities'

In [16]:
#review of the html finds that the text associated with the titles is located within the "rollver_description_inner" class, as we need the latest, conduct a find instead of find_all
para_results = soup.find('div',class_='rollover_description_inner')
para_results

<div class="rollover_description_inner">
Starting July 27, news activities will cover everything from mission engineering and science to returning samples from Mars to, of course, the launch itself.
</div>

In [17]:
#invoke the .string and .strip() functions to produce just the text:
news_p = para_results.string.strip()
news_p

'Starting July 27, news activities will cover everything from mission engineering and science to returning samples from Mars to, of course, the launch itself.'

##JPL Mars Space Images - Featured Images

In [18]:
#activate a new Chrome browser
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Driver [C:\Users\cohnj\.wdm\drivers\chromedriver\win32\86.0.4240.22\chromedriver.exe] found in cache
 


In [19]:
#go to the base url and then click the button w/ class fancybox (the full image button)
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(image_url)

target = 'a[class="button fancybox"]'
browser.find_by_tag(target).click()


In [20]:
#then click the more info button
browser.find_link_by_partial_text('more info').click()


In [21]:
#store this url
html2 = requests.get(browser.url)
browser.quit()

In [22]:
#use Beatiful Soup to traverse the html
soup2 = BeautifulSoup(html2.text,'html.parser')
#soup2


In [23]:
#the url for the image is within the figure under the lede class:
img = soup2.find('figure', class_='lede')
img

<figure class="lede">
<a href="/spaceimages/images/largesize/PIA17793_hires.jpg"><img alt="Beam Wave Guide antennas located at Goldstone, CA, known as the 'Beam Waveguide Cluster'. " class="main_image" src="/spaceimages/images/largesize/PIA17793_hires.jpg" title="Beam Wave Guide antennas located at Goldstone, CA, known as the 'Beam Waveguide Cluster'. "/></a>
</figure>

In [24]:
#obtain the href from the anchor:
img_link = img.find('a')['href']
print(img_link)

/spaceimages/images/largesize/PIA17793_hires.jpg


In [25]:
#the url for the largesize image is:
final_img = f'https://www.jpl.nasa.gov{img_link}'
final_img

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17793_hires.jpg'

##Mars Facts

In [26]:
table_url='https://space-facts.com/mars/'


In [27]:
#note: I had to pip install lxml into my PythonData environment:
#use pandas' read_html function to obtain the tabular data from the url
table = pd.read_html(table_url)
table

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:     -87 to -5 °C      -88 to 58°C,
           

In [29]:
#the table is a list; only the 0th element of the list is necessary:
df = pd.DataFrame(table[0])
df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [30]:
#convert the dataframe to html tabular format:
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    <

##Mars Hemispheres

In [31]:
#base url to access the four hemisphere images:
hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [32]:
html3 = requests.get(hemisphere_url)


In [33]:
#use Beautiful Soup to parse the page:
hem_soup = BeautifulSoup(html3.text,'html.parser')

In [34]:
#the partial hemisphere links are contained within anchor tags associated with the class "itemlink product-item" within a div with class "item":
hem_results = hem_soup.find_all('div', class_='item')
print(hem_results)

[<div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/><div class="description"><h3>Cerberus Hemisphere Enhanced</h3></div></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div>, <div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png"/><div class="description"><h3>Schiaparelli Hemisphere Enhanced</h3></div></a><span

In [36]:
#for loop to traverse the results, finding the anchor containing the href:
hemisphere_image_urls = []


for hem_result in hem_results:
    
    hem_items = {}

    #create the link to access each hemisphere image's page:
    base_url = 'https://astrogeology.usgs.gov'
    link = hem_result.find('a')['href']
    hem_url = f'{base_url}{link}'
    
    #open a browser using Splinter and then go to the page containing the full image and title, store       the browser information to be used in Beautiful Soup and the close the browser:
    executable_path2 = {'executable_path': ChromeDriverManager().install()}
    browser2 = Browser('chrome', **executable_path, headless=False)
    
    browser2.visit(hem_url)
    browser2.find_link_by_partial_text('Open').click()
    html4 = requests.get(browser2.url)
    browser2.quit()

    #find the title from the html, strip it to clean the tags and then split to remove the text after       the pipe:
    soup3 = BeautifulSoup(html4.text,'html.parser')
    h_title = soup3.find('title')
    hem_title = h_title.string.strip()
    hemisphere_title = hem_title.split(' |')[0]
    hemisphere_title

    #the first item in the list contains the link to the full size image:
    h_li = soup3.find('li')
    hem_li = h_li.a['href']
    hem_li

    #add the title and img_url dictionary to the list of urls:
    hem_items = {'title': hemisphere_title, 'img_url': hem_li} 
    hemisphere_image_urls.append(hem_items)




[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Driver [C:\Users\cohnj\.wdm\drivers\chromedriver\win32\86.0.4240.22\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Driver [C:\Users\cohnj\.wdm\drivers\chromedriver\win32\86.0.4240.22\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Driver [C:\Users\cohnj\.wdm\drivers\chromedriver\win32\86.0.4240.22\chromedriver.exe] found in cache
 
[WDM] - Current google-chrome version is 86.0.4240
[WDM] - Get LATEST driver version for 86.0.4240
[WDM] - Driver [C:\Users\cohnj\.wdm\drivers\chromedriver\win32\86.0.4240.22\chromedriver.exe] found in cache
 


In [37]:
#print out of the dictionary containing the image titles and urls:
hemisphere_image_urls



[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]