In [1]:
# General
import os
import shutil
from pathlib import Path
import pandas as pd

# Web scraping
import requests as rq
from bs4 import BeautifulSoup

In [2]:
# Lift pandas' column-width limit so long download URLs are shown in full
pd.options.display.max_colwidth = None

In [3]:
# Load the gzip-compressed Landsat-8 scene list published on Amazon S3
# into a DataFrame
scene_list_url = 'http://landsat-pds.s3.amazonaws.com/c1/L8/scene_list.gz'
s3_scenes = pd.read_csv(scene_list_url, compression='gzip')

In [7]:
# Narrow the scene list down to the one tile we care about (path 108, row 29)
on_tile = (s3_scenes.path == 108) & (s3_scenes.row == 29)

# Drop products whose id contains "_T2" or "_RT"
# (presumably the Tier-2 and Real-Time products - confirm against the Landsat naming docs)
not_t2 = ~s3_scenes.productId.str.contains("_T2")
not_rt = ~s3_scenes.productId.str.contains("_RT")

filter_scenes = s3_scenes[on_tile & not_t2 & not_rt]

In [8]:
# Order the filtered scenes newest-first and preview the top four
newest_first = filter_scenes.sort_values(by='acquisitionDate', ascending=False)
newest_first.head(4)

Unnamed: 0,productId,entityId,acquisitionDate,cloudCover,processingLevel,path,row,min_lat,min_lon,max_lat,max_lon,download_url
2143328,LC08_L1TP_108029_20201029_20201105_01_T1,LC81080292020303LGN00,2020-10-29 01:19:56.787745,72.67,L1TP,108,29,43.52281,139.84209,45.66428,142.77196,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/108/029/LC08_L1TP_108029_20201029_20201105_01_T1/index.html
2098069,LC08_L1TP_108029_20200911_20200918_01_T1,LC81080292020255LGN00,2020-09-11 01:19:50.525672,69.32,L1TP,108,29,43.52557,139.83824,45.66424,142.76833,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/108/029/LC08_L1TP_108029_20200911_20200918_01_T1/index.html
2054326,LC08_L1TP_108029_20200725_20200807_01_T1,LC81080292020207LGN01,2020-07-25 01:19:32.506670,27.89,L1TP,108,29,43.5227,139.84979,45.66436,142.77938,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/108/029/LC08_L1TP_108029_20200725_20200807_01_T1/index.html
2049779,LC08_L1TP_108029_20200709_20200721_01_T1,LC81080292020191LGN00,2020-07-09 01:19:27.365750,71.34,L1TP,108,29,43.52281,139.84209,45.66428,142.77196,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/108/029/LC08_L1TP_108029_20200709_20200721_01_T1/index.html


In [10]:
# Largest acquisitionDate value among the filtered scenes
# (compared in whatever dtype the column was read as)
latest_acquisition = filter_scenes.acquisitionDate.max()

# Index label(s) of the row(s) carrying that acquisition date
is_latest = filter_scenes.acquisitionDate == latest_acquisition
recent_idx = filter_scenes.loc[is_latest].index

# Pull the matching download URL(s) out of the frame as a plain list
url = filter_scenes.loc[recent_idx, 'download_url'].tolist()

# Keep just the first URL string
download_url = url[0]

# Show it as a sanity check
print(download_url)

https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/108/029/LC08_L1TP_108029_20201029_20201105_01_T1/index.html


In [11]:
# Path of the folder that holds the downloaded rasters
# (will eventually be a db)
work_dir_path = str(Path.home().joinpath("Desktop", "forest_mon", "data", "L8"))

In [12]:
# Fetch the scene's index page
response = rq.get(download_url)

# Fail loudly on an HTTP error (e.g. 403/404); otherwise the error page
# would be parsed below and the download loop would silently find nothing
response.raise_for_status()

# Parse the page so its file links can be walked
page_html = BeautifulSoup(response.content, 'html.parser')

In [13]:
# Define bands to download - don't want all
bands = ["B2", "B3", "B4", "B5", "QA"]

# str.endswith() needs a tuple of suffixes; build it once, outside the loop
band_suffixes = tuple(bands)

# Make sure the target folder exists, otherwise open() below would fail
os.makedirs(work_dir_path, exist_ok=True)

# Loop thru li to get names
for li in page_html.find_all('li'):

    # Get the href attribute from the first <a> that follows this <li>
    link = li.find_next('a')
    f = link.get('href') if link is not None else None

    # Skip list items that have no usable file link
    if not f:
        continue

    # Get the band name: first part of filename
    # Start by splitting filename at the dots
    fname_list = f.split(".")

    # Get only some bands: filename ends in one of "bands" above
    # and is not a .ovr file
    if not fname_list[0].endswith(band_suffixes) or fname_list[-1] == "ovr":
        continue

    # Create download url for this file of the scene
    scene_url = download_url.replace("index.html", f)

    print(f"Downloading {f}...")

    # Create unique image path
    image_path = os.path.join(work_dir_path, f)

    # Stream the band; the context manager releases the connection
    # even if the download or the write fails
    with rq.get(scene_url, stream=True) as band_response:

        # Don't save an HTTP error body as if it were a raster
        band_response.raise_for_status()

        # .raw bypasses requests' decoding; ask urllib3 to undo any
        # Content-Encoding so the bytes on disk are the actual file
        band_response.raw.decode_content = True

        # Save the image
        with open(image_path, 'wb') as out_file:
            shutil.copyfileobj(band_response.raw, out_file)

    print("...complete\n")

Downloading LC08_L1TP_108029_20201029_20201105_01_T1_B3.TIF...
...complete

Downloading LC08_L1TP_108029_20201029_20201105_01_T1_B5.TIF...
...complete

Downloading LC08_L1TP_108029_20201029_20201105_01_T1_B4.TIF...
...complete

Downloading LC08_L1TP_108029_20201029_20201105_01_T1_B2.TIF...
...complete

Downloading LC08_L1TP_108029_20201029_20201105_01_T1_BQA.TIF...
...complete

