# web-scraping-challenge
    by Diane Scherpereel      November 2019

In [90]:
# Dependencies
import shutil

from bs4 import BeautifulSoup
import pandas as pd
import pymongo
import requests
from splinter import Browser

In [2]:
# Open a PyMongo client against the MongoDB server at localhost:27017
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(host=conn)

### NASA Mars News

In [3]:
# Handles for the NASA news database and its "items" collection
nasa_db = client["nasa_db"]
nasa_collection = nasa_db["items"]

In [42]:
# URL of the NASA Mars news listing to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

# Retrieve the page with the requests module.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed as if it were the news listing.
nasa_response = requests.get(url, timeout=30)
nasa_response.raise_for_status()

# Create BeautifulSoup object and parse the returned HTML
nasa_soup = BeautifulSoup(nasa_response.text, 'html.parser')


In [60]:
# Take the first element carrying the "content_title" class and keep its
# whitespace-stripped text in "nasa_news_title"
nasa_news_title_raw = nasa_soup.find(attrs={"class": "content_title"})
nasa_news_title = nasa_news_title_raw.get_text().strip()
nasa_news_title

'NASA Invites Students to Name Mars 2020 Rover'

In [51]:
# Inspect the first "slide" element to see where the teaser paragraph lives
nasa_paragraph_class = nasa_soup.find(attrs={"class": "slide"})
nasa_paragraph_class

<div class="slide">
<div class="image_and_description_container">
<a href="/news/8508/nasa-invites-students-to-name-mars-2020-rover/">
<div class="rollover_description">
<div class="rollover_description_inner">
Through Nov. 1, K-12 students in the U.S. are encouraged to enter an essay contest to name NASA's next Mars rover.
</div>
<div class="overlay_arrow">
<img alt="More" src="/assets/overlay-arrow.png"/>
</div>
</div>
<img alt="NASA Invites Students to Name Mars 2020 Rover" class="img-lazy" data-lazy="/system/news_items/list_view_images/8508_Name_A_unannotated_MAIN-th.jpg" src="/assets/loading_320x240.png"/>
</a>
</div>
<div class="content_title">
<a href="/news/8508/nasa-invites-students-to-name-mars-2020-rover/">
NASA Invites Students to Name Mars 2020 Rover
</a>
</div>
</div>

In [59]:
# The teaser text sits in the first "rollover_description_inner" element;
# keep its whitespace-stripped text in "nasa_paragraph"
nasa_paragraph_raw = nasa_soup.find(attrs={"class": "rollover_description_inner"})
nasa_paragraph = nasa_paragraph_raw.get_text().strip()
nasa_paragraph

"Through Nov. 1, K-12 students in the U.S. are encouraged to enter an essay contest to name NASA's next Mars rover."

### JPL Mars Space Images

In [218]:
# Set up a Chrome browser driven by chromedriver.
# Prefer a chromedriver found on PATH so the notebook also runs on machines
# where it is not installed under /usr/local/bin; fall back to that original
# location when nothing is found on PATH.
executable_path = {'executable_path': shutil.which('chromedriver') or '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [219]:
# Open the JPL space-images listing (Mars category) in the browser
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(jpl_url)

In [220]:
# Parse the HTML of the page currently loaded in the browser
jpl_html = browser.html
jpl_soup = BeautifulSoup(markup=jpl_html, features="lxml")
# Pick out the "fancybox" anchors inside li.slide; they carry the image urls
featured_image = jpl_soup.select("li.slide a.fancybox")
featured_image

[<a class="fancybox" data-description="This image from NASAs Mars Odyssey shows an unnamed crater located in Arabia Terra. The small crater seen in the image is located on the floor of a much larger crater." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/largesize/PIA23537_hires.jpg" data-link="/spaceimages/details.php?id=PIA23537" data-thumbnail="/spaceimages/images/wallpaper/PIA23537-640x350.jpg" data-title="Terraced Wall Crater">
 <div class="image_and_description_container">
 <div class="rollover_description">
 <h3 class="release_date">November 5, 2019</h3>
 <div class="item_tease_overlay">Terraced Wall Crater</div>
 <div class="overlay_arrow">
 <img alt="more arrow" src="/assets/images/overlay-arrow.png"/>
 </div>
 </div>
 <div class="img">
 <img alt="Terraced Wall Crater" class="thumb" src="/spaceimages/images/wallpaper/PIA23537-640x350.jpg" title="Terraced Wall Crater"/>
 </div>
 <div class="list_text_content">
 <div class="article_teaser_body">November 5, 

In [222]:
# Keep only the data-fancybox-href value of every selected anchor
img_list = [
    anchor.get("data-fancybox-href")
    for anchor in featured_image
]
img_list

['/spaceimages/images/largesize/PIA23537_hires.jpg',
 '/spaceimages/images/largesize/PIA23536_hires.jpg',
 '/spaceimages/images/largesize/PIA23525_hires.jpg',
 '/spaceimages/images/largesize/PIA23524_hires.jpg',
 '/spaceimages/images/largesize/PIA23523_hires.jpg',
 '/spaceimages/images/largesize/PIA23522_hires.jpg',
 '/spaceimages/images/largesize/PIA23521_hires.jpg',
 '/spaceimages/images/largesize/PIA23213_hires.jpg',
 '/spaceimages/images/largesize/PIA23510_hires.jpg',
 '/spaceimages/images/largesize/PIA23378_hires.jpg',
 '/spaceimages/images/largesize/PIA23509_hires.jpg',
 '/spaceimages/images/largesize/PIA23508_hires.jpg',
 '/spaceimages/images/largesize/PIA23507_hires.jpg',
 '/spaceimages/images/largesize/PIA23530_hires.jpg',
 '/spaceimages/images/largesize/PIA23529_hires.jpg',
 '/spaceimages/images/largesize/PIA23528_hires.jpg',
 '/spaceimages/images/largesize/PIA23527_hires.jpg',
 '/spaceimages/images/largesize/PIA23506_hires.jpg',
 '/spaceimages/images/largesize/PIA23505_hires

In [223]:
# Scheme and host that the relative image paths are resolved against
img_base_url = "https://www.jpl.nasa.gov"
# The featured image is the first entry of img_list, prefixed with the base url
featured_image_url = "".join((img_base_url, img_list[0]))
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA23537_hires.jpg'

### Mars Weather

In [83]:
# Handles for the Mars weather database and its "items" collection
mars_weather_db = client["mars_weather_db"]
mars_weather_collection = mars_weather_db["items"]

In [86]:
# URL of the Mars weather Twitter page to be scraped
mars_weather_url = 'https://twitter.com/marswxreport?lang=en'

# Retrieve the page with the requests module.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed as if it were the tweet timeline.
mars_weather_response = requests.get(mars_weather_url, timeout=30)
mars_weather_response.raise_for_status()

# Create BeautifulSoup object and parse the returned HTML
mars_weather_soup = BeautifulSoup(mars_weather_response.text, 'html.parser')

In [89]:
# The first element with the "TweetTextSize" class is taken as the latest
# Mars weather tweet; keep its whitespace-stripped text
mars_weather_raw = mars_weather_soup.find(attrs={"class": "TweetTextSize"})
mars_weather = mars_weather_raw.get_text().strip()
mars_weather

'InSight sol 334 (2019-11-04) low -100.0ºC (-148.1ºF) high -23.8ºC (-10.8ºF)\nwinds from the SSW at 5.1 m/s (11.4 mph) gusting to 19.9 m/s (44.4 mph)\npressure at 7.00 hPapic.twitter.com/D4EX1MROay'

### Mars Facts

In [91]:
# Page holding the Mars facts tables
mars_facts_url = "https://space-facts.com/mars/"

In [93]:
# Use pandas to read the tables on the mars facts page.
# pd.read_html returns a list of DataFrames, so the individual tables are
# picked out by position further down.
mars_facts_table = pd.read_html(mars_facts_url)
mars_facts_table

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:    -153 to 20 °C      -88 to 58°C,
           

In [94]:
# Check the type of mars_facts_table to confirm it is a list of tables rather
# than a single DataFrame
print(type(mars_facts_table))

<class 'list'>


In [230]:
# Build the facts frame from the first scraped table, indexed by column 0.
# set_index() without inplace returns a new DataFrame, so the frame stored in
# mars_facts_table is left untouched and this cell can be re-run safely; the
# in-place form removed column 0 from that shared frame and raised KeyError on
# a second run.
mars_facts_db1 = mars_facts_table[0].set_index(0)

# Delete the index name ('0')
mars_facts_db1.index.names = [None]

# Blank out the remaining column name ('1')
mars_facts_db1.columns = ['']

mars_facts_db1

Unnamed: 0,Unnamed: 1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [233]:
# Convert the pandas dataframe to an HTML table string (to_html with default
# options), then display the raw string
mars_facts_html_table = mars_facts_db1.to_html()
mars_facts_html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

In [234]:
# Remove every newline character from the generated HTML string
mars_facts_html_table = "".join(mars_facts_html_table.split("\n"))
mars_facts_html_table

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [99]:
# The second table returned by pd.read_html is a Mars/Earth comparison.
# It is interesting, but it won't be used in the rest of this project.
mars_earth_comparison = mars_facts_table[1]
mars_earth_comparison

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-153 to 20 °C,-88 to 58°C


### Mars Hemispheres

In [215]:
# Open the USGS Astrogeology search results for the enhanced Mars hemisphere
# images
usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(usgs_url)

# Parse the results page as currently rendered by the browser
soup = BeautifulSoup(markup=browser.html, features="html.parser")

In [216]:
# Get the hemisphere entries from div.item (4 at the time of scraping); each
# one holds the link, thumbnail, <h3> title and description of one hemisphere
hemispheres = soup.select('div.item')
hemispheres

[<div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png"/></a><div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div> <!-- end description --></div>,
 <div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/7677c0a006b83871b5a2f66985ab5857_schiapa

In [217]:
# Loop through each hemisphere and collect its title and full-size image url
# as a list of {'title': ..., 'img_url': ...} dictionaries.

hemisphere_image_urls = []

try:
    for h in hemispheres:
        # The stored title is the listing heading without the " Enhanced" suffix
        title = (h.find('h3').text).replace(' Enhanced', '')

        # click the hemisphere to open its detail page
        browser.click_link_by_partial_text(title)

        # Parse the detail page under its own name so the results-page `soup`
        # (used to build `hemispheres`) is not overwritten by this loop
        hemisphere_soup = BeautifulSoup(browser.html, 'html.parser')

        # find the full image: the anchor whose text is exactly "Sample"
        # (`string=` is the current name of Beautiful Soup's legacy `text=` argument)
        full = hemisphere_soup.find('a', string='Sample')
        if full is None:
            # Fail with a clear message instead of "'NoneType' object is not subscriptable"
            raise ValueError("No 'Sample' link found on the page for {!r}".format(title))

        # get the img url
        img_url = full['href']

        # make a dictionary and append to the list
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})

        # go back to the results page for the next hemisphere
        browser.back()
finally:
    # close the browser even if one of the pages could not be scraped
    browser.quit()

hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]