# web-scraping-challenge
    by Diane Scherpereel      November 2019

In [90]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pymongo
from splinter import Browser

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

### Nasa Mars News

In [3]:
# Define database and collection
nasa_db = client.nasa_db
nasa_collection = nasa_db.items

In [42]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

# Retrieve page with the requests module
nasa_response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
nasa_soup = BeautifulSoup(nasa_response.text, 'html.parser')


In [60]:
# Examine the results, then determine how to get the title and save it to the variable "nasa_news_title"
nasa_news_title_raw = nasa_soup.find(class_='content_title')
nasa_news_title = nasa_news_title_raw.text.strip()
nasa_news_title

'NASA Invites Students to Name Mars 2020 Rover'

In [51]:
# Do the same to determine how to get the paragraph
nasa_paragraph_class = nasa_soup.find(class_="slide")
nasa_paragraph_class

<div class="slide">
<div class="image_and_description_container">
<a href="/news/8508/nasa-invites-students-to-name-mars-2020-rover/">
<div class="rollover_description">
<div class="rollover_description_inner">
Through Nov. 1, K-12 students in the U.S. are encouraged to enter an essay contest to name NASA's next Mars rover.
</div>
<div class="overlay_arrow">
<img alt="More" src="/assets/overlay-arrow.png"/>
</div>
</div>
<img alt="NASA Invites Students to Name Mars 2020 Rover" class="img-lazy" data-lazy="/system/news_items/list_view_images/8508_Name_A_unannotated_MAIN-th.jpg" src="/assets/loading_320x240.png"/>
</a>
</div>
<div class="content_title">
<a href="/news/8508/nasa-invites-students-to-name-mars-2020-rover/">
NASA Invites Students to Name Mars 2020 Rover
</a>
</div>
</div>

In [59]:
# Find and save the paragraph to the variable "nasa_paragraph"
nasa_paragraph_raw = nasa_soup.find(class_='rollover_description_inner')
nasa_paragraph = nasa_paragraph_raw.text.strip()
nasa_paragraph

"Through Nov. 1, K-12 students in the U.S. are encouraged to enter an essay contest to name NASA's next Mars rover."

### JPL Mars Space Images

In [62]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [63]:
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(jpl_url)

In [121]:
# After clicking the button that says "Full Image", it leads to the screen with this link below.
jpl_html = browser.html
jpl_soup = BeautifulSoup(jpl_html, 'lxml')
featured_image_url_full_test2 = jpl_soup.find(class_='fancybox-image')
featured_image_url_full_test2

<img class="fancybox-image" src="/spaceimages/images/mediumsize/PIA19101_ip.jpg" style="display: inline;"/>

In [111]:
# After clicking the button that says "Full Image", it leads to the screen with this link below.
jpl_html = browser.html
jpl_soup = BeautifulSoup(jpl_html, 'lxml')
featured_image_url_full_test = jpl_soup.find(class_='fancybox-image')
featured_image_url_full_test

<img class="fancybox-image" src="/spaceimages/images/mediumsize/PIA19101_ip.jpg" style="display: inline;"/>

In [130]:
# Import Regex for search
import re
test3 = jpl_soup.find('img',{'src':re.compile('.*(jpg).*')})
test3

<img alt="Aromatum Chaos" class="thumb" src="/spaceimages/images/wallpaper/PIA23523-640x350.jpg" title="Aromatum Chaos"/>

In [133]:
test3.text.strip()
test3

<img alt="Aromatum Chaos" class="thumb" src="/spaceimages/images/wallpaper/PIA23523-640x350.jpg" title="Aromatum Chaos"/>

In [115]:
# returned an empty string
#jpl_featured_image_url = featured_image_url_full_test.text
#jpl_featured_image_url

''

### Mars Weather

In [83]:
# Define database and collection
mars_weather_db = client.mars_weather_db
mars_weather_collection = mars_weather_db.items

In [86]:
# URL of page to be scraped
mars_weather_url = 'https://twitter.com/marswxreport?lang=en'

# Retrieve page with the requests module
mars_weather_response = requests.get(mars_weather_url)
# Create BeautifulSoup object; parse with 'lxml'
mars_weather_soup = BeautifulSoup(mars_weather_response.text, 'html.parser')

In [89]:
# Find and save the latest tweet for Mars weather
mars_weather_raw = mars_weather_soup.find(class_='TweetTextSize')
mars_weather = mars_weather_raw.text.strip()
mars_weather

'InSight sol 334 (2019-11-04) low -100.0ºC (-148.1ºF) high -23.8ºC (-10.8ºF)\nwinds from the SSW at 5.1 m/s (11.4 mph) gusting to 19.9 m/s (44.4 mph)\npressure at 7.00 hPapic.twitter.com/D4EX1MROay'

### Mars Facts

In [91]:
# Label the mars facts url
mars_facts_url = 'https://space-facts.com/mars/'

In [93]:
# Use pandas to read the mars facts table
mars_facts_table = pd.read_html(mars_facts_url)
mars_facts_table

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:    -153 to 20 °C      -88 to 58°C,
           

In [94]:
# Check the type of the mars_facts_table
print(type(mars_facts_table))

<class 'list'>


In [97]:
# Put the table into a pandas dataframe
mars_facts_db1 = mars_facts_table[0]
mars_facts_db1

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [99]:
# There is also a comparison table between Mars and Earth - interesting but won't be used in the rest of this project. 
mars_earth_comparison = mars_facts_table[1]
mars_earth_comparison

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-153 to 20 °C,-88 to 58°C


In [None]:
# Convert to html yet

### Mars Hemispheres