## Semi-auto Stream in Landsat from USGS
Solution: Use browser-cookie3 to "sign" the URL request with the browser's login cookies so a Python script can stream data from USGS.
Cons: Low bandwidth (~200 KB/s).

In [None]:
#%pip install browser-cookie3

In [2]:
# Prerequisite: log in to USGS EROS in Firefox first, so that the browser
# cookies let the request skip the USGS login redirect.
import browser_cookie3
# Cookie jar read from Firefox; it is passed as `cookies=` to the download request.
cj = browser_cookie3.firefox()

In [4]:
# Example download target: one Collection 2 Level-2 band file (SR_B7) served
# by landsatlook.usgs.gov.  Split across lines only for readability.
url = (
    "https://landsatlook.usgs.gov/data/collection02/level-2/standard/oli-tirs/"
    "2020/047/027/LC08_L2SP_047027_20201204_20210313_02_T1/"
    "LC08_L2SP_047027_20201204_20210313_02_T1_SR_B7.TIF"
)

In [11]:
import requests

# stream=True defers the body download so it can be copied to disk piece by
# piece; the browser cookie jar `cj` authenticates the request.
# timeout=(connect, read) keeps the cell from hanging indefinitely if the
# server stops responding (requests has no timeout by default).
r = requests.get(url, stream=True, cookies=cj, timeout=(10, 120))

In [8]:
import shutil

# Local file name for the streamed band file.
fname = "LC08_L2SP_047027_20201204_20210313_02_T1_SR_B7.TIF"

try:
    if r.status_code == 200:
        # Let the raw stream undo any Content-Encoding while it is read
        # (original note: without this the saved file ended up with size zero).
        r.raw.decode_content = True

        # Copy the response body to disk in binary mode, chunk by chunk.
        with open(fname, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    else:
        # Report the failure instead of silently writing nothing.
        print(f"Download skipped: HTTP {r.status_code} for {r.url}")
finally:
    # A streamed response holds its connection until it is closed.
    r.close()

In [None]:
import json
import urllib.request
from datetime import date

import pandas as pd
from dateutil.rrule import rrule, DAILY

## Collect inventory from USGS
Query guide: https://landsatlook.usgs.gov/stac-server/api.html#tag/Item-Search


Example URL:
https://landsatlook.usgs.gov/stac-server/collections/landsat-c2l1/items?limit=10000&datetime=2022-01-06T00:00:00Z/2022-01-07T00:00:00Z&fields=id,-type,-geometry,-bbox,-properties,-links,-assets,-collection,-features

In [None]:


# Walk the date range one day at a time and print each day as YYYY-MM-DD.
# After the loop `dt` and `yymmdd` hold the last day of the range; later
# cells read both names.
a = date(2022, 9, 30)
b = date(2022, 10, 8)

for dt in rrule(DAILY, dtstart=a, until=b):
    yymmdd = dt.strftime("%Y-%m-%d")
    print(yymmdd)

In [None]:
# Item request against the landsat-c2l1 collection for the single day held
# in `yymmdd` (00:00:00Z through 23:59:59Z); `fields` asks for the id and the
# properties and excludes the other listed members.
url = (
    'https://landsatlook.usgs.gov/stac-server/collections/landsat-c2l1/items'
    '?limit=10000'
    f'&datetime={yymmdd}T00:00:00Z/{yymmdd}T23:59:59Z'
    '&fields=id,-type,-geometry,-bbox,properties,-links,-assets,-collection,-features'
)

In [None]:
# Collects one summary dict per queried day (date, matched/returned totals,
# per-platform counts); filled by the fetch cell that follows.
stat_list = []

In [None]:

# Fetch the item list for the day encoded in `url`, record per-platform
# counts in `stat_list`, and save the flattened features as a CSV.
with urllib.request.urlopen(url) as uo:
    data = json.load(uo)

# Flatten nested feature dicts into dotted columns such as `properties.platform`.
feature_df = pd.json_normalize(data['features'])
l9 = feature_df.query('`properties.platform`=="LANDSAT_9"')
l8 = feature_df.query('`properties.platform`=="LANDSAT_8"')
stat_list.append({
    'date': yymmdd,
    'total': data['numberMatched'],
    'return': data['numberReturned'],
    'LC09': len(l9),
    'LC08': len(l8),  # count the Landsat 8 rows, not the Landsat 9 ones
})
# The request excludes `type`, so the column may be absent: drop it only if present.
# The file is named after `yymmdd`, the same day that was put into the request URL.
feature_df.drop(columns='type', errors='ignore').to_csv(
    f'../../../data/Landsat/stac/{yymmdd}_C2L1.csv', index=False
)


In [None]:
# Write the per-day summary to CSV, keeping only rows dated after 2021-09-27.
(
    pd.DataFrame(stat_list)
    .query('`date` > "2021-09-27"')
    .to_csv('../../../data/Landsat/day_sum_LC09_LC08.csv', index=False)
)

In [None]:
# Same per-day summary (rows dated after 2021-09-27), kept in memory.
day_sum = (
    pd.DataFrame(stat_list)
    .query('`date` > "2021-09-27"')
)

In [None]:
# Display the LANDSAT_9 rows of the response currently held in `data`.
(
    pd.json_normalize(data['features'])
    .query('`properties.platform`=="LANDSAT_9"')
)

### Batch processing

In [None]:
# Query the item list for every day in [a, b], record per-platform counts in
# `stat_list`, and save each day's LANDSAT_9 rows to their own CSV.
a = date(2021, 9, 15)
b = date(2022, 10, 8)
stat_list = []
for dt in rrule(DAILY, dtstart=a, until=b):
    yymmdd = dt.strftime("%Y-%m-%d")
    print(yymmdd)
    url = f'''https://landsatlook.usgs.gov/stac-server/collections/landsat-c2l1/items?limit=10000&datetime={yymmdd}T00:00:00Z/{yymmdd}T23:59:59Z&fields=id,-type,-geometry,-bbox,properties,-links,-assets,-collection,-features'''
    try:
        with urllib.request.urlopen(url) as uo:
            data = json.load(uo)

        # Flatten nested feature dicts into dotted columns such as `properties.platform`.
        feature_df = pd.json_normalize(data['features'])
        l9 = feature_df.query('`properties.platform`=="LANDSAT_9"')
        l8 = feature_df.query('`properties.platform`=="LANDSAT_8"')
        stat_list.append({
            'date': yymmdd,
            'total': data['numberMatched'],
            'return': data['numberReturned'],
            'LC09': len(l9),
            'LC08': len(l8),
        })
        if len(l9) > 0:
            # The request excludes `type`, so drop the column only if present.
            l9.drop(columns='type', errors='ignore').to_csv(
                f'../../../data/Landsat/stac/{yymmdd}_C2L1_LC09.csv', index=False
            )
        else:
            print('no LC09')
        print(stat_list[-1])
    except Exception as e:
        # Best-effort batch: report the failing day and continue with the next
        # one.  (`except e:` raised NameError instead of catching anything.)
        print(e)

## Compare file entries on the local node
First, run `find . -type f -name '*.tar' -exec basename {} .tar \; > landsat_avail.txt` on the local inventory to list the names of the locally available `.tar` files (without the extension).

In [None]:
# Local listing for 2021: the file has no header row, so the single column
# is named `scene_id` here.
glad21 = pd.read_csv(
    "../../../data/Landsat/landsat9_avail_2021.txt",
    header=None,
    names=['scene_id'],
)

In [None]:
# Local listing for 2022: the file has no header row, so the single column
# is named `scene_id` here.
glad22 = pd.read_csv(
    "../../../data/Landsat/landsat_avail_2022.txt",
    header=None,
    names=['scene_id'],
)

In [None]:
# Combined local inventory: all 2021 rows plus the 2022 rows whose scene id
# starts with "LC09".
glad_inventory = pd.concat(
    [
        glad21,
        glad22.loc[glad22['scene_id'].str.startswith("LC09")],
    ]
)

In [None]:
import glob

In [None]:
# Stack every CSV found in the stac/ output directory into one frame.
usgs_inventory = pd.concat(
    list(map(pd.read_csv, glob.glob('../../../data/Landsat/stac/*.csv')))
)

In [None]:
# Display the combined local inventory for a visual check.
glad_inventory

In [None]:
# Left join: keep every USGS row and attach the matching local scene id;
# rows without a local match get a null `scene_id`.
inventory_merged = usgs_inventory.merge(
    glad_inventory,
    how='left',
    left_on='id',
    right_on='scene_id',
)

In [None]:
# Rows with no local match after the left join, i.e. a null `scene_id`.
missing = inventory_merged.loc[inventory_merged['scene_id'].isnull()]

In [None]:
# Display (rows, columns) of the missing-scene table.
missing.shape

In [None]:
# Export only the `id` column of the missing scenes.
missing.to_csv(
    '../../../data/Landsat/missing_id.csv',
    columns=['id'],
    index=False,
)

In [None]:
# Display the column dtypes of the missing-scene table.
missing.dtypes

### Spatial and temporal visualization on the missing scenes

In [None]:
import seaborn as sns

In [None]:
# Count missing scenes per (wrs_path, wrs_row) pair and reshape into a
# row-by-path grid; pairs that never occur are filled with 0.
pathrow_missing = (
    missing
    .groupby(["properties.landsat:wrs_path", "properties.landsat:wrs_row"])['id']
    .count()
    .reset_index()
    .set_axis(['path', 'row', 'count'], axis=1)
    .pivot(index='row', columns='path', values='count')
    .fillna(0)
)

In [None]:
# Heatmap of the row-by-path grid of missing-scene counts.
sns.heatmap(
    pathrow_missing,
    cbar_kws=dict(label='missing scenes'),
)


In [None]:
# Convert the acquisition timestamp column to datetime64 for plotting.
# `missing` is a filtered selection of another frame, so assigning a column
# into it in place triggers pandas' SettingWithCopyWarning; `assign` rebinds
# `missing` to a new frame with the converted column instead.
missing = missing.assign(**{
    'properties.datetime': missing['properties.datetime'].astype('datetime64[ns]'),
})

In [None]:
# Histogram of missing scenes over acquisition time (360 bins, wide layout).
sns.displot(
    data=missing,
    x='properties.datetime',
    bins=360,
    aspect=4,
)