In [1]:
# General
import os
import shutil
from pathlib import Path
import pandas as pd

# Web scraping
import requests as rq
from bs4 import BeautifulSoup

In [2]:
# Set max col width to see URLs
pd.set_option('display.max_colwidth', None)

In [3]:
# Check available Landsat-8 scenes from Amazon S3
s3_scenes = pd.read_csv('http://landsat-pds.s3.amazonaws.com/c1/L8/scene_list.gz', compression='gzip')

In [4]:
# Filter scenes to get only those we want
filter_scenes = s3_scenes[(s3_scenes.path == 149) &
                         (s3_scenes.row == 38) & 
                         (~s3_scenes.productId.str.contains("_T2")) &
                         (~s3_scenes.productId.str.contains("_RT"))]

In [12]:
filter_scenes

Unnamed: 0,productId,entityId,acquisitionDate,cloudCover,processingLevel,path,row,min_lat,min_lon,max_lat,max_lon,download_url
354,LC08_L1TP_149038_20170411_20170415_01_T1,LC81490382017101LGN00,2017-04-11 05:36:05.467366,0.05,L1TP,149,38,30.66234,72.77478,32.78280,75.23908,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20170411_20170415_01_T1/index.html
7561,LC08_L1TP_149038_20170326_20170329_01_T1,LC81490382017085LGN00,2017-03-26 05:36:12.517157,0.00,L1TP,149,38,30.66260,72.79079,32.78308,75.25161,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20170326_20170329_01_T1/index.html
11833,LC08_L1TP_149038_20170310_20170317_01_T1,LC81490382017069LGN00,2017-03-10 05:36:22.300401,19.16,L1TP,149,38,30.66255,72.78759,32.78302,75.24848,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20170310_20170317_01_T1/index.html
29980,LC08_L1TP_149038_20170427_20170515_01_T1,LC81490382017117LGN00,2017-04-27 05:35:54.853904,0.94,L1TP,149,38,30.66234,72.77478,32.78280,75.23908,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20170427_20170515_01_T1/index.html
46908,LC08_L1TP_149038_20170513_20170525_01_T1,LC81490382017133LGN00,2017-05-13 05:36:00.166180,18.10,L1TP,149,38,30.66021,72.81000,32.78342,75.27353,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20170513_20170525_01_T1/index.html
...,...,...,...,...,...,...,...,...,...,...,...,...
2068346,LC08_L1TP_149038_20200809_20200821_01_T1,LC81490382020222LGN00,2020-08-09 05:36:33.996902,10.32,L1TP,149,38,30.66031,72.81640,32.78353,75.27979,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200809_20200821_01_T1/index.html
2079687,LC08_L1TP_149038_20200825_20200905_01_T1,LC81490382020238LGN00,2020-08-25 05:36:41.783784,32.84,L1TP,149,38,30.66016,72.80680,32.78336,75.27039,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200825_20200905_01_T1/index.html
2085900,LC08_L1TP_149038_20200825_20200905_01_T1,LC81490382020238LGN00,2020-08-25 05:36:41.783784,32.84,L1TP,149,38,30.66016,72.80680,32.78336,75.27039,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200825_20200905_01_T1/index.html
2099002,LC08_L1TP_149038_20200910_20200918_01_T1,LC81490382020254LGN00,2020-09-10 05:36:48.739707,0.16,L1TP,149,38,30.65995,72.79399,32.78314,75.26100,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200910_20200918_01_T1/index.html


In [10]:
# Sort scenes to get most recent value first
filter_scenes.sort_values('acquisitionDate', ascending=False).head(10)

Unnamed: 0,productId,entityId,acquisitionDate,cloudCover,processingLevel,path,row,min_lat,min_lon,max_lat,max_lon,download_url
2116637,LC08_L1TP_149038_20200926_20201007_01_T1,LC81490382020270LGN00,2020-09-26 05:36:53.547874,36.71,L1TP,149,38,30.6599,72.79079,32.78308,75.25473,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200926_20201007_01_T1/index.html
2099002,LC08_L1TP_149038_20200910_20200918_01_T1,LC81490382020254LGN00,2020-09-10 05:36:48.739707,0.16,L1TP,149,38,30.65995,72.79399,32.78314,75.261,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200910_20200918_01_T1/index.html
2079687,LC08_L1TP_149038_20200825_20200905_01_T1,LC81490382020238LGN00,2020-08-25 05:36:41.783784,32.84,L1TP,149,38,30.66016,72.8068,32.78336,75.27039,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200825_20200905_01_T1/index.html
2085900,LC08_L1TP_149038_20200825_20200905_01_T1,LC81490382020238LGN00,2020-08-25 05:36:41.783784,32.84,L1TP,149,38,30.66016,72.8068,32.78336,75.27039,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200825_20200905_01_T1/index.html
2068346,LC08_L1TP_149038_20200809_20200821_01_T1,LC81490382020222LGN00,2020-08-09 05:36:33.996902,10.32,L1TP,149,38,30.66031,72.8164,32.78353,75.27979,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200809_20200821_01_T1/index.html
2054776,LC08_L1TP_149038_20200724_20200807_01_T1,LC81490382020206LGN00,2020-07-24 05:36:30.784134,7.2,L1TP,149,38,30.66011,72.8036,32.78331,75.26726,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200724_20200807_01_T1/index.html
2051386,LC08_L1TP_149038_20200708_20200721_01_T1,LC81490382020190LGN00,2020-07-08 05:36:25.545774,33.83,L1TP,149,38,30.66,72.79719,32.78319,75.261,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200708_20200721_01_T1/index.html
2028214,LC08_L1TP_149038_20200622_20200707_01_T1,LC81490382020174LGN00,2020-06-22 05:36:18.328941,1.78,L1TP,149,38,30.66,72.79713,32.7859,75.261,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200622_20200707_01_T1/index.html
2000860,LC08_L1TP_149038_20200606_20200608_01_T1,LC81490382020158LGN00,2020-06-06 05:36:09.092175,72.36,L1TP,149,38,30.66011,72.8036,32.78331,75.26726,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200606_20200608_01_T1/index.html
1986340,LC08_L1TP_149038_20200521_20200527_01_T1,LC81490382020142LGN00,2020-05-21 05:36:02.094923,0.05,L1TP,149,38,30.66005,72.80033,32.78595,75.26413,https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200521_20200527_01_T1/index.html


In [6]:
# Create dir to hold rasters
"""Will eventually be a db"""
work_dir_path = os.path.join(str(Path.home()), "Desktop", "forest_mon", "data", "L8")

In [7]:
# Test url
download_url = "https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/149/038/LC08_L1TP_149038_20200809_20200821_01_T1/index.html"

In [8]:
# Get response item
response = rq.get(download_url)

# Get html items
html = BeautifulSoup(response.content, 'html.parser')

In [13]:
# Define bands to download - don't want all
bands = ["B2", "B3", "B4", "B5", "QA"]

# Loop thru li to get names
for li in html.find_all('li'):
    
    # Get the <href> attribute from each <a>
    f = li.find_next('a').get('href')
    
    # Get the band name: first part of filename
    # Start by splitting filename at the dots
    fname_list = f.split(".")
    
    # Get only some bands
    if (fname_list[0].endswith(tuple(bands)) and not fname_list[-1] == "ovr"):
        
        # Create download url for this scene
        scene_url = download_url.replace("index.html", f)
        
        print(f"Downloading {f}...")
        
        # Get response item for this particular band in scene
        band_response = rq.get(scene_url, stream=True)
        
        # Create unique image path
        image_path = os.path.join(work_dir_path, f)
        
        # Save the image
        with open(image_path, 'wb') as out_file:
            shutil.copyfileobj(band_response.raw, out_file)
        
        # Delete the response... for... cleanliness?
        del band_response
        
        print("...complete\n")

Downloading LC08_L1TP_149038_20200809_20200821_01_T1_BQA.TIF...
...complete

Downloading LC08_L1TP_149038_20200809_20200821_01_T1_B4.TIF...
...complete

Downloading LC08_L1TP_149038_20200809_20200821_01_T1_B3.TIF...
...complete

Downloading LC08_L1TP_149038_20200809_20200821_01_T1_B2.TIF...
...complete

Downloading LC08_L1TP_149038_20200809_20200821_01_T1_B5.TIF...
...complete

