In [2]:
# General
import os
import shutil
from pathlib import Path
import pandas as pd

# Web scraping
import requests as rq
from bs4 import BeautifulSoup

In [3]:
# Lift the column-width cap so long cell values (e.g. URLs) are shown in full
pd.options.display.max_colwidth = None

In [4]:
# Load the list of available Landsat-8 scenes published on Amazon S3.
# The listing is served gzip-compressed, so pandas decompresses it on read.
s3_scenes = pd.read_csv(
    'http://landsat-pds.s3.amazonaws.com/c1/L8/scene_list.gz',
    compression='gzip',
)

In [5]:
# Keep only the scenes we want: a single path/row tile, minus any product
# whose id carries a "_T2" or "_RT" tag
# (presumably Tier-2 and real-time products -- confirm against the Landsat docs)
on_tile = s3_scenes.path.eq(149) & s3_scenes.row.eq(38)
has_t2_tag = s3_scenes.productId.str.contains("_T2")
has_rt_tag = s3_scenes.productId.str.contains("_RT")

filter_scenes = s3_scenes[on_tile & ~has_t2_tag & ~has_rt_tag]

In [7]:
# Order by acquisition date, newest first, and display the top four rows
filter_scenes.sort_values(by='acquisitionDate', ascending=False).iloc[:4]

Unnamed: 0,productId,entityId,acquisitionDate,cloudCover,processingLevel,path,row,min_lat,min_lon,max_lat,max_lon,download_url
2143782,LC08_L1TP_149038_20201028_20201106_01_T1,LC81490382020302LGN00,2020-10-28 05:36:55.374455,0.02,L1TP,149,38,30.66,72.79719,32.78319,75.261,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20201028_20201106_01_T1/index.html
2116637,LC08_L1TP_149038_20200926_20201007_01_T1,LC81490382020270LGN00,2020-09-26 05:36:53.547874,36.71,L1TP,149,38,30.6599,72.79079,32.78308,75.25473,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200926_20201007_01_T1/index.html
2099002,LC08_L1TP_149038_20200910_20200918_01_T1,LC81490382020254LGN00,2020-09-10 05:36:48.739707,0.16,L1TP,149,38,30.65995,72.79399,32.78314,75.261,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200910_20200918_01_T1/index.html
2079687,LC08_L1TP_149038_20200825_20200905_01_T1,LC81490382020238LGN00,2020-08-25 05:36:41.783784,32.84,L1TP,149,38,30.66016,72.8068,32.78336,75.27039,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200825_20200905_01_T1/index.html


In [36]:
# Get index position(s) of the most recent scene.
# acquisitionDate is compared as loaded from the CSV; the values shown above
# are ISO-style timestamps, which order chronologically even as plain strings.
recent_idx = filter_scenes[filter_scenes.acquisitionDate == filter_scenes.acquisitionDate.max()].index

# Extract url(s) from df using the index computed above
# (this previously referenced an undefined name `idx`, which raises NameError
# on a fresh kernel)
url = filter_scenes.loc[recent_idx].download_url.tolist()

# Fail with a clear message instead of an IndexError when the filter is empty
if not url:
    raise ValueError("No scenes matched the filter; cannot pick a download URL")

# Take the first URL out of the list
download_url = url[0]

# Print to check
print(download_url)

https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20201028_20201106_01_T1/index.html


In [37]:
# Build the path of the directory that holds the downloaded rasters
# NOTE: will eventually be a db
work_dir_path = os.path.join(str(Path.home()), "Desktop", "forest_mon", "data", "L8")

# Actually create the folder (and any missing parents) so that later writes
# into it do not fail with FileNotFoundError; exist_ok keeps the cell re-runnable
os.makedirs(work_dir_path, exist_ok=True)

In [38]:
# Fetch the scene's index page.
# The timeout stops the cell from hanging forever on a stalled connection.
response = rq.get(download_url, timeout=30)

# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page
# as if it were the file listing
response.raise_for_status()

# Parse the page so its elements can be searched
page_html = BeautifulSoup(response.content, 'html.parser')

In [41]:
# Define bands to download - don't want all
bands = ["B2", "B3", "B4", "B5", "QA"]

# str.endswith accepts a tuple of suffixes; build it once, outside the loop
band_suffixes = tuple(bands)

# Make sure the target folder exists before anything is written into it
os.makedirs(work_dir_path, exist_ok=True)

# Loop thru li to get names
for li in page_html.find_all('li'):

    # Get the <a> inside this <li>; skip entries that carry no link instead
    # of crashing on None (find_next could also pick an <a> outside the <li>)
    link = li.find('a', href=True)
    if link is None:
        continue
    f = link['href']

    # Get the band name: first part of filename
    # Start by splitting filename at the dots
    fname_list = f.split(".")

    # Get only some bands: filename stem ends in one of "bands" above
    # and the file is not a .ovr file
    if not fname_list[0].endswith(band_suffixes) or fname_list[-1] == "ovr":
        continue

    # Create download url for this scene
    scene_url = download_url.replace("index.html", f)

    print(f"Downloading {f}...")

    # Create unique image path
    image_path = os.path.join(work_dir_path, f)

    # Stream this band; the context manager releases the connection when
    # done (a plain `del` of the response does not)
    with rq.get(scene_url, stream=True, timeout=60) as band_response:

        # Do not save an HTTP error body as if it were a raster
        band_response.raise_for_status()

        # Save the image in chunks; unlike response.raw, iter_content
        # applies any transfer content-decoding
        with open(image_path, 'wb') as out_file:
            for chunk in band_response.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)

    print("...complete\n")

Downloading LC08_L1TP_149038_20201028_20201106_01_T1_B2.TIF...
...complete

Downloading LC08_L1TP_149038_20201028_20201106_01_T1_B4.TIF...
...complete

Downloading LC08_L1TP_149038_20201028_20201106_01_T1_B5.TIF...
...complete

Downloading LC08_L1TP_149038_20201028_20201106_01_T1_BQA.TIF...
...complete

Downloading LC08_L1TP_149038_20201028_20201106_01_T1_B3.TIF...
...complete

