In [1]:
from  splinter import Browser 
from bs4 import BeautifulSoup as bs
import time 
from webdriver_manager.chrome import ChromeDriverManager 
import pandas as pd 

In [2]:
def scrape_mars_info():
    """Scrape the most recent news title and teaser from redplanetscience.com.

    Opens a Chrome window through Splinter, loads the news page, and reads
    the first ``content_title`` and ``article_teaser_body`` divs found inside
    the first ``list_text`` block of the page.

    Returns:
        dict: ``{'recent_title': str, 'recent_parag': str}``.

    Raises:
        AttributeError: If the page contains no ``list_text`` block
            (``soup.find`` returns ``None``).
        IndexError: If the block has no title or teaser div.
    """
    # Set Up Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    try:
        # Visit Mars Website
        url = 'https://redplanetscience.com/'
        browser.visit(url)

        # Short pause before reading the DOM (presumably to let the page render).
        time.sleep(1)

        # Capture the rendered page; the browser is not needed after this.
        html = browser.html
    finally:
        # Always release the Chrome/chromedriver process, even if the visit fails.
        browser.quit()

    # Scrape page into Soup
    soup = bs(html, "html.parser")

    # Get Titles And Paragraphs
    news_block = soup.find('div', class_='list_text')

    recent_title = news_block.find_all('div', class_='content_title')[0].text
    recent_p = news_block.find_all('div', class_='article_teaser_body')[0].text

    # Store Data In Dictionary
    mars_data = {
        'recent_title': recent_title,
        'recent_parag': recent_p
    }

    return mars_data

In [3]:
# Run the news scraper; the returned dict is shown as the cell output.
scrape_mars_info()



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


{'recent_title': "NASA's Mars Helicopter Attached to Mars 2020 Rover ",
 'recent_parag': 'The helicopter will be first aircraft to perform flight tests on another planet.'}

In [4]:
def scrape_featured_space_img():
    """Scrape the URL of the featured image from spaceimages-mars.com.

    Opens a Chrome window through Splinter, loads the site, and reads the
    ``href`` of the first anchor inside the ``floating_text_area`` div. The
    href is appended to the site URL with a ``/`` separator.

    Returns:
        dict: ``{'featured_img_url': str}``.

    Raises:
        AttributeError: If the page has no ``floating_text_area`` div
            (``soup.find`` returns ``None``).
        TypeError: If that div contains no anchor (subscripting ``None``).
        KeyError: If the anchor has no ``href`` attribute.
    """
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://spaceimages-mars.com'

    try:
        browser.visit(url)

        # Short pause before reading the DOM (presumably to let the page render).
        time.sleep(1)

        # Capture the rendered page; the browser is not needed after this.
        html = browser.html
    finally:
        # Always release the Chrome/chromedriver process, even if the visit fails.
        browser.quit()

    soup = bs(html, "html.parser")

    # Scrape URL For The Featured Image
    featured_img_block = soup.find('div', class_='floating_text_area')

    featured_image_path = featured_img_block.find('a')["href"]
    featured_image_url = f'{url}/{featured_image_path}'

    featured_img = {
        'featured_img_url':featured_image_url
    }

    return featured_img

In [5]:
# Run the featured-image scraper; the returned dict is shown as the cell output.
scrape_featured_space_img()



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


{'featured_img_url': 'https://spaceimages-mars.com/image/featured/mars1.jpg'}

In [6]:
def scrape_mars_facts_table():
    """Return the first table on galaxyfacts-mars.com as single-line HTML.

    The table is read with ``pd.read_html``. Its integer column labels
    0/1/2 are renamed to 'Mars - Earth Comparison'/'Mars'/'Earth', the row
    labelled 0 is dropped, and the comparison column becomes the index
    before rendering.

    Returns:
        str: The table rendered by ``DataFrame.to_html`` with every newline
        character removed.
    """
    source = 'https://galaxyfacts-mars.com'
    column_labels = {0: 'Mars - Earth Comparison', 1: 'Mars', 2: 'Earth'}

    facts = (
        pd.read_html(source)[0]
        .rename(columns=column_labels)
        .drop([0])  # row 0 is discarded (it holds the header text in the recorded output)
        .set_index('Mars - Earth Comparison')
    )

    return facts.to_html().replace('\n', '')

In [7]:
# Run the facts-table scraper; the returned HTML string is shown as the cell output.
scrape_mars_facts_table()

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars</th>      <th>Earth</th>    </tr>    <tr>      <th>Mars - Earth Comparison</th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Diameter:</th>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th>Moons:</th>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th>Distance from Sun:</th>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th>Length of Year:</th>      <td>687 Earth days</td>      <td>365.24 days</td>    </tr>    <tr>      <th>Temperature:</th>      <td>-87 to -5 °C</td>      <td>-88 to 58°C</td>    </tr>  </tbody></table>'

In [8]:
def scrape_mars_hemispheres():
    """Scrape the title and first download-image URL for each Mars hemisphere.

    Visits each of the four hemisphere pages on marshemispheres.com in a
    single Chrome session. For each page it reads the ``href`` of the first
    link in the ``downloads`` list and the text of the ``h2.title`` heading.

    Returns:
        list[dict]: One ``{'title': str, 'img_url': str}`` dict per page, in
        the same order as the hard-coded page list.

    Raises:
        AttributeError: If an expected element (``downloads`` div, its
            ``ul``, or the ``h2.title`` heading) is missing from a page.
        IndexError: If the ``downloads`` list has no ``li`` entries.
    """
    hemisphere_urls = [
        'https://marshemispheres.com/cerberus.html',
        'https://marshemispheres.com/schiaparelli.html',
        'https://marshemispheres.com/syrtis.html',
        'https://marshemispheres.com/valles.html'
    ]

    # Image hrefs on the pages are appended to this base.
    image_url = 'https://marshemispheres.com/'

    url_list = []

    # Driver setup does not depend on the page, so launch one browser and
    # reuse it for every page instead of starting a new one per iteration.
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    try:
        for page_url in hemisphere_urls:
            browser.visit(page_url)

            # Short pause before reading the DOM (presumably to let the page render).
            time.sleep(1)

            html = browser.html
            soup = bs(html, "html.parser")

            # The first <li> link in the downloads block is the image used.
            hemisphere_block = soup.find('div', class_ = 'downloads')
            hemisphere_ulsection = hemisphere_block.find('ul')
            hemisphere_lisection = hemisphere_ulsection.find_all('li')[0]
            hemisphere_path = hemisphere_lisection.find('a')["href"]
            hemisphere_url = image_url + hemisphere_path

            hemispheretitle = soup.find('h2', class_ = 'title').text

            hemisphere_image_title_urls = {
                'title':hemispheretitle,'img_url':hemisphere_url
            }

            url_list.append(hemisphere_image_title_urls)
    finally:
        # Always release the Chrome/chromedriver process, even if a page fails.
        browser.quit()

    return url_list

In [9]:
# Run the hemisphere scraper; the returned list of dicts is shown as the cell output.
scrape_mars_hemispheres()



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'}]

In [10]:
# Checking Scrape_Mars.py

# def scrape():
#     url_list = [
#         'https://redplanetscience.com/',
#         'https://spaceimages-mars.com',
#         'https://galaxyfacts-mars.com',
#         'https://marshemispheres.com/cerberus.html',
#         'https://marshemispheres.com/schiaparelli.html',
#         'https://marshemispheres.com/syrtis.html',
#         'https://marshemispheres.com/valles.html'
#     ]

#     loop_count = 0
#     count_hemispheres = 0 
#     hemisphere_list = []

#     for i in range(len(url_list)):
#         executable_path = {'executable_path': ChromeDriverManager().install()}
#         browser = Browser('chrome', **executable_path, headless=False)

#         browser.visit(url_list[i])

#         time.sleep(1)
        
#         html = browser.html
#         soup = bs(html, "html.parser")

#         if loop_count == 0:
#             news_block = soup.find('div', class_='list_text')
    
#             recent_title = news_block.find_all('div', class_='content_title')[0].text
#             recent_p = news_block.find_all('div', class_='article_teaser_body')[0].text

#             mars_scrapedata = {
#                 'recent_title': recent_title,
#                 'recent_parag': recent_p
#             }

#             browser.quit()
        
#         if loop_count == 1:
#             featured_img_block = soup.find('div', class_='floating_text_area')
    
#             featured_image_path = featured_img_block.find('a')["href"]
#             featured_image_url = f'{url_list[i]}/{featured_image_path}'

#             mars_scrapedata['featured_img_url'] = featured_image_url

#             browser.quit()
        
#         if loop_count == 2:
#             tables = pd.read_html(url_list[i])
#             df = tables[0]
#             mars_html_table = df.to_html()
#             clean_html = mars_html_table.replace('\n', '')
#             mars_scrapedata['mars_table'] = clean_html
#             browser.quit()
            
#         if loop_count > 2:
#             image_url = 'https://marshemispheres.com/'
            
#             hemisphere_block = soup.find('div', class_ = 'downloads')
#             hemisphere_ulsection = hemisphere_block.find('ul')
#             hemisphere_lisection = hemisphere_ulsection.find_all('li')[0]
#             hemisphere_path = hemisphere_lisection.find('a')["href"]
#             hemisphere_url = image_url + hemisphere_path
    
#             hemispheretitle = soup.find('h2', class_ = 'title').text
    
#             hemisphere_image_title_urls = {
#                 'title':hemispheretitle,'img_url':hemisphere_url
#             }
        
#             hemisphere_list.append(hemisphere_image_title_urls)

#             browser.quit()
            
#             count_hemispheres += 1
            
#             if count_hemispheres == 4:
#                 mars_scrapedata['hemisphere_urls'] = hemisphere_list
                    
#         loop_count += 1   
            
#     return mars_scrapedata

In [11]:
# scrape()



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [/Users/darrensagucio/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Current google-chrome version is 90.0.4430
Get LATEST driv

{'recent_title': "InSight's 'Mole' Team Peers into the Pit",
 'recent_parag': 'Efforts to save the heat probe continue.',
 'featured_img_url': 'https://spaceimages-mars.com/image/featured/mars2.jpg',
 'mars_table': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>0</th>      <th>1</th>      <th>2</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Mars - Earth Comparison</td>      <td>Mars</td>      <td>Earth</td>    </tr>    <tr>      <th>1</th>      <td>Diameter:</td>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th>4</th>      <td>Distance from Sun:</td>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th>5</th>      <td>Length of Year:</td>      <td>687 Earth da