# Using Web Scraping to Deliver the News About Mars
#### This notebook will provide the following information:
* NASA Mars News
* JPL Mars Space Images
* Mars Weather
* Mars Facts
* Mars Hemispheres

In [1]:
# Dependencies
import pymongo
import os
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser 

#### Setting up Mongo Database for Mars News

In [2]:
# Connect to MongoDB default port
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Defining 'MarsDB' database and collection
mars_db = client.marsDB
collection = mars_db.articles

#### URLs of Webpages to be Scraped

In [3]:
# Mars urls
news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA18904'
weather_url = 'https://twitter.com/marswxreport?lang=en'
facts_url = 'https://space-facts.com/mars/'
hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [4]:
# Retrieve pages with the requests module
news_response = requests.get(news_url)
jpl_response = requests.get(jpl_url)
weather_response = requests.get(weather_url)
facts_response = requests.get(facts_url)
hemi_response = requests.get(hemi_url)

In [5]:
# Beautiful Soup objects
news_soup = bs(news_response.text, 'html.parser')
jpl_soup = bs(jpl_response.text, 'html.parser')
weather_soup = bs(weather_response.text, 'html.parser')
facts_soup = bs(facts_response.text, 'html.parser')
hemi_soup = bs(hemi_response.text, 'html.parser')

#### Examining results before determining elements that contain sought info

In [6]:
# Uncomment to examine results

# print(news_soup.prettify())
# print(jpl_soup.prettify())
# print(weather_soup.prettify())
# print(facts_soup.prettify())
# print(hemi_soup.prettify())

## NASA Mars News
#### Latest news and paragraph text from the NASA Mars News Site.

In [54]:
# Determining elements that contain sought info; Retrieving parent divs
news_results = news_soup.find_all('div', class_='content_title')
news_title = news_results[0].text
print(news_title)



NASA Prepares for Moon and Mars With New Addition to Its Deep Space Network




In [None]:
# Loop through returned results
for result in news_results:
    
    # Retrieve news title
    news_title = result.find('div', class_='list_date')
    
    # Access thread's text content
#     news_p = news_title.a.text
#     print(news_p)
    
    # Dictionary to be inserted into MongoDB
    post = {
        'Title': news_title,
#         'Text': news_p, 
    }
    
    # Insert dictionary into MongoDB as a document
    collection.insert_one(post)

## JPL Mars Space Images
#### Featured image from JPL's Mars programme.

In [18]:
# Using Splinter to navigate site and find image url 
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

browser.visit(jpl_url)
html = browser.html
jpl_soup = bs(html, 'html.parser')
url_prefix = 'https://www.jps.nasa.gov'

In [19]:
images_a = jpl_soup.find('figure', class_='lede')
images_b = images_a.find('a')['href']

featured_image_url = url_prefix + images_b
featured_image_url

'https://www.jps.nasa.gov/spaceimages/images/largesize/PIA18904_hires.jpg'

## Mars Weather
#### Mars weather report tweet from Mars Weather twitter account.

In [62]:
# Dependency
import sys

# Using Splinter to navigate site and find image url
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

browser.visit(weather_url)
html = browser.html
weather_soup = bs(html, 'html.parser')

In [63]:
tweets = weather_soup.find_all('div', {'class': 'css-1dbjc4n'})

In [70]:
def get_mars_tweets(weather_soup):
    tweet_list = list()
    tweets = weather_soup.find_all('div', {'class': 'css-1dbjc4n'})
    for tweet in tweets:
        tweet_data = None
        try:
            tweet_data = get_tweet_text(tweet)
        except Exception as e:
            continue
            
        if tweet_data:
            tweets_list.append(tweet_data)
            print('.', end='')
            sys.stdout.flush()
            
    return tweets_list

print(get_mars_tweets)

<function get_mars_tweets at 0x000001C3F6331950>


## Mars Facts
#### General facts about the "Red Planet."

In [88]:
# Dependency
import urllib
import pandas as pd

In [122]:
# Function to loop over all rows for 'tr' tag to obtain text of 'td' cells
mars_facts = []

def get_mars_facts():
    """Finds all 'tr' tags and assigns them to a variable."""
    facts = facts_soup.find_all('tr')[:8]
    for tr in facts:
        """Finds all 'td' tags within each 'tr' tag and assigns it to a variable. Then appends to
        a list."""
        tds = tr.find_all('td')[:8]
        mars_facts.append({
            tds[0].text, tds[1].text,
        })
    print(mars_facts)

get_mars_facts()

[{'Equatorial Diameter:', '6,792 km'}, {'Polar Diameter:', '6,752 km'}, {'Mass:', '6.39 × 10^23 kg (0.11 Earths)'}, {'Moons:', '2 (Phobos & Deimos)'}, {'Orbit Distance:', '227,943,824 km (1.38 AU)'}, {'Orbit Period:', '687 days (1.9 years)'}, {'Surface Temperature: ', '-87 to -5 °C'}, {'First Record:', '2nd millennium BC'}]


In [92]:
facts_df = pd.DataFrame(mars_facts)
facts_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC


In [98]:
# Render pandas dataframe to html table
facts_html = facts_df.to_html()
print(facts_html)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
      <th>1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Equatorial Diameter:</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Polar Diameter:</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Mass:</td>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Moons:</td>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Orbit Distance:</td>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>5</th>
      <td>Orbit Period:</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Surface Temperature:</td>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>7</th>
      <td>First Record:</td>
      <td>2nd millennium BC</td>
    </tr>
  </tbody>
</table>


## Mars Hemispheres
#### High resolution photos for each of Mars' hemispheres.

In [99]:
# Using Splinter to navigate site and find image url 
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

browser.visit(hemi_url)
html = browser.html
hemi_soup = bs(html, 'html.parser')

In [174]:
# Empty list to store hemisphere urls
hemisphere_urls = []
hemi_url_prefix = 'https://astrogeology.usgs.gov'

def get_hemi_urls(hemi_soup):
    """Querying through html code to find image url suffixes"""
    containers = hemi_soup.find('div', {'class': 'container'})
    items = containers.find_all('div', {'class': 'item'})
    
    for item in items:
        """Joining image url strings and hemisphere titles to a list."""
        hemi_url = hemi_url_prefix + item.find('a')['href']
#         hemi_url = hemi_url_prefix + item.find('img')['src']
#         h3 = item.find('h3')
#         hemisphere_urls.append({'Title': h3.text, 'img_url': hemi_url})
        hemisphere_urls.append(hemi_url)
        
    print(hemisphere_urls)

hemi_urls = get_hemi_urls(hemi_soup)
hemi_urls

['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']


In [176]:
# For verification
hemisphere_urls

['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']

In [232]:
# Empty list to store image urls
hemisphere_image_urls = []
links = hemisphere_urls

executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

for link in links:
    browser.visit(link)
    html = browser.html
    soup = bs(html, 'html.parser')
    
    """Querying each link to find each full resolution image url and its respective title."""
    containers = soup.find('div', {'class': 'container'})
    
    """Searching for urls."""
    downloads = containers.find('div', {'class': 'downloads'})
    uls = downloads.find('ul')
    lis = uls.find_all('li')[0]
    a = lis.find('a')['href']
    
    """Searching for titles."""
    contents = containers.find('div', {'class': 'content'})
    h2 = contents.find('h2', {'class': 'title'})
    
    hemisphere_image_urls.append({'title': h2.text,
                                  'img_url': a})
print(hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]
