In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)

#### For RiverSP

In [3]:
data_path = '/nas/cee-water/cjgleason/data/SWOT/RiverSP_v2_0_20240624/GR/Reach/'

In [4]:
# Get all filenames in directory
files = []
for file in os.listdir(data_path):
    # will deal with other four types later
    if file.endswith(".shp"):
        files.append(os.path.join(file))

In [5]:
# Make DataFrame of filenames
granules = pd.DataFrame({'files': files})

In [6]:
granules['cycle'] = granules['files'].str.slice(25,28)
granules['pass'] = granules['files'].str.slice(29,32)
granules['version'] = granules['files'].str.slice(-11,-7)
granules['counter'] = granules['files'].str.slice(-6,-4)
# granules['date_0'] = granules['files'].str.slice(36,44) # don't think we need this bc cycle captures

In [7]:
granules

Unnamed: 0,files,cycle,pass,version,counter
0,SWOT_L2_HR_RiverSP_Reach_476_009_GR_20230331T051911_20230331T051922_PGC0_01.shp,476,009,PGC0,01
1,SWOT_L2_HR_RiverSP_Reach_476_016_GR_20230331T103031_20230331T103037_PGC0_01.shp,476,016,PGC0,01
2,SWOT_L2_HR_RiverSP_Reach_478_009_GR_20230402T050027_20230402T050038_PGC0_01.shp,478,009,PGC0,01
3,SWOT_L2_HR_RiverSP_Reach_478_016_GR_20230402T101147_20230402T101152_PGC0_01.shp,478,016,PGC0,01
4,SWOT_L2_HR_RiverSP_Reach_479_009_GR_20230403T045104_20230403T045106_PGC0_01.shp,479,009,PGC0,01
...,...,...,...,...,...
1569,SWOT_L2_HR_RiverSP_Reach_017_038_GR_20240620T091836_20240620T091848_PIC0_01.shp,017,038,PIC0,01
1570,SWOT_L2_HR_RiverSP_Reach_017_039_GR_20240620T110004_20240620T110010_PIC0_01.shp,017,039,PIC0,01
1571,SWOT_L2_HR_RiverSP_Reach_017_040_GR_20240620T110253_20240620T110302_PIC0_01.shp,017,040,PIC0,01
1572,SWOT_L2_HR_RiverSP_Reach_017_042_GR_20240620T124434_20240620T124435_PIC0_01.shp,017,042,PIC0,01


In [8]:
dupe = granules[granules.duplicated(['cycle', 'pass'])]

In [9]:
dupe

Unnamed: 0,files,cycle,pass,version,counter
115,SWOT_L2_HR_RiverSP_Reach_001_229_GR_20230729T094809_20230729T094818_PGC0_02.shp,1,229,PGC0,2
132,SWOT_L2_HR_RiverSP_Reach_001_292_GR_20230731T150755_20230731T150804_PGC0_02.shp,1,292,PGC0,2
346,SWOT_L2_HR_RiverSP_Reach_004_484_GR_20231009T015814_20231009T015817_PGC0_02.shp,4,484,PGC0,2
354,SWOT_L2_HR_RiverSP_Reach_004_511_GR_20231010T015628_20231010T015636_PGC0_02.shp,4,511,PGC0,2
678,SWOT_L2_HR_RiverSP_Reach_010_007_GR_20240125T061336_20240125T061339_PIC0_02.shp,10,7,PIC0,2
680,SWOT_L2_HR_RiverSP_Reach_010_009_GR_20240125T075948_20240125T075953_PIC0_02.shp,10,9,PIC0,2
682,SWOT_L2_HR_RiverSP_Reach_010_010_GR_20240125T080233_20240125T080236_PIC0_02.shp,10,10,PIC0,2
684,SWOT_L2_HR_RiverSP_Reach_010_011_GR_20240125T094354_20240125T094357_PIC0_02.shp,10,11,PIC0,2
686,SWOT_L2_HR_RiverSP_Reach_010_012_GR_20240125T094631_20240125T094639_PIC0_02.shp,10,12,PIC0,2
688,SWOT_L2_HR_RiverSP_Reach_010_014_GR_20240125T112821_20240125T112832_PIC0_02.shp,10,14,PIC0,2


In [10]:
# Example with two version of PGC0, _01 comes first
granules[(granules['cycle'] == '001') & (granules['pass'] == '229')]

Unnamed: 0,files,cycle,pass,version,counter
114,SWOT_L2_HR_RiverSP_Reach_001_229_GR_20230729T094809_20230729T094818_PGC0_01.shp,1,229,PGC0,1
115,SWOT_L2_HR_RiverSP_Reach_001_229_GR_20230729T094809_20230729T094818_PGC0_02.shp,1,229,PGC0,2


In [11]:
# Sort the files
granules = granules.sort_values(by=['cycle', 'pass', 'version', 'counter'],
                                ascending=[True, True, True, False])

In [12]:
# Example with two version of PGC0, now _01 comes first
granules[(granules['cycle'] == '001') & (granules['pass'] == '229')]

Unnamed: 0,files,cycle,pass,version,counter
115,SWOT_L2_HR_RiverSP_Reach_001_229_GR_20230729T094809_20230729T094818_PGC0_02.shp,1,229,PGC0,2
114,SWOT_L2_HR_RiverSP_Reach_001_229_GR_20230729T094809_20230729T094818_PGC0_01.shp,1,229,PGC0,1


In [13]:
# Keep only the best version of each granule
granules = granules.drop_duplicates(subset=['cycle', 'pass'], keep='first')

In [14]:
granules[(granules['cycle'] == '001') & (granules['pass'] == '229')]

Unnamed: 0,files,cycle,pass,version,counter
115,SWOT_L2_HR_RiverSP_Reach_001_229_GR_20230729T094809_20230729T094818_PGC0_02.shp,1,229,PGC0,2


In [16]:
keep_files = list(granules['files'])

In [75]:
# NEED TO BE ABLE TO READ IN ALL FIVE FILE TYPES FOR EACH KEPT GRANULE'S
# FILENAME.

#### For PIXC

In [3]:
data_path = '/nas/cee-water/cjgleason/fiona/data_downloads/'

In [4]:
# Get all filenames in dir
files = []
for file in os.listdir(data_path):
    files.append(os.path.join(file))

In [24]:
# Make DataFrame of filenames
granules = pd.DataFrame({'files': files})

In [25]:
# Extract cycle, pass, tile, version, and counter
granules['cycle'] = granules['files'].str.slice(16, 19)
granules['pass'] = granules['files'].str.slice(20, 23)
granules['tile'] = granules['files'].str.slice(24, 28)
granules['version'] = granules['files'].str.slice(-10, -6)
granules['counter'] = granules['files'].str.slice(-5, -3)

In [26]:
granules

Unnamed: 0,files,cycle,pass,tile,version,counter
0,SWOT_L2_HR_PIXC_474_013_237L_20230329T085422_20230329T085433_PGC0_01.nc,474,013,237L,PGC0,01
1,SWOT_L2_HR_PIXC_474_013_238L_20230329T085432_20230329T085443_PGC0_01.nc,474,013,238L,PGC0,01
2,SWOT_L2_HR_PIXC_475_013_237L_20230330T084500_20230330T084511_PGC0_01.nc,475,013,237L,PGC0,01
3,SWOT_L2_HR_PIXC_478_013_237L_20230402T081652_20230402T081703_PGC0_01.nc,478,013,237L,PGC0,01
4,SWOT_L2_HR_PIXC_475_013_238L_20230330T084510_20230330T084521_PGC0_01.nc,475,013,238L,PGC0,01
...,...,...,...,...,...,...
2876,SWOT_L2_HR_PIXC_017_552_071L_20240708T181403_20240708T181414_PIC0_01.nc,017,552,071L,PIC0,01
2877,SWOT_L2_HR_PIXC_018_039_237L_20240711T073431_20240711T073442_PIC0_01.nc,018,039,237L,PIC0,01
2878,SWOT_L2_HR_PIXC_018_246_071R_20240718T163621_20240718T163632_PIC0_01.nc,018,246,071R,PIC0,01
2879,SWOT_L2_HR_PIXC_018_039_238L_20240711T073441_20240711T073452_PIC0_01.nc,018,039,238L,PIC0,01


In [27]:
# View duplicates
dupe = granules[granules.duplicated(['cycle', 'pass', 'tile'])]

In [28]:
len(dupe)

36

In [8]:
# Example with two versions of PGC0, _02 comes first
granules[(granules['cycle'] == '006') & (granules['pass'] == '552') & (granules['tile'] == '071L')]

Unnamed: 0,files,cycle,pass,tile,version,counter
218,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PIC0_01.nc,6,552,071L,PIC0,1
219,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PGC0_01.nc,6,552,071L,PGC0,1


In [9]:
# Example with PGC0 vs PIC0, PIC0 comes first
granules[(granules['cycle'] == '006') & (granules['pass'] == '552')]

Unnamed: 0,files,cycle,pass,tile,version,counter
218,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PIC0_01.nc,6,552,071L,PIC0,1
219,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PGC0_01.nc,6,552,071L,PGC0,1
220,SWOT_L2_HR_PIXC_006_552_072L_20231122T055822_20231122T055833_PIC0_01.nc,6,552,072L,PIC0,1
221,SWOT_L2_HR_PIXC_006_552_072L_20231122T055822_20231122T055833_PGC0_01.nc,6,552,072L,PGC0,1


In [10]:
# Sort the files
granules = granules.sort_values(by=['cycle', 'pass', 'tile', 'version', 'counter'],
                                ascending=[True, True, True, True, False])

In [11]:
# Example with two versions of PGC0, _02 comes first
granules[(granules['cycle'] == '006') & (granules['pass'] == '552') & (granules['tile'] == '071L')]

Unnamed: 0,files,cycle,pass,tile,version,counter
219,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PGC0_01.nc,6,552,071L,PGC0,1
218,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PIC0_01.nc,6,552,071L,PIC0,1


In [12]:
# Example with PGC0 vs PIC0, now PGC0 comes first
granules[(granules['cycle'] == '006') & (granules['pass'] == '552')]

Unnamed: 0,files,cycle,pass,tile,version,counter
219,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PGC0_01.nc,6,552,071L,PGC0,1
218,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PIC0_01.nc,6,552,071L,PIC0,1
221,SWOT_L2_HR_PIXC_006_552_072L_20231122T055822_20231122T055833_PGC0_01.nc,6,552,072L,PGC0,1
220,SWOT_L2_HR_PIXC_006_552_072L_20231122T055822_20231122T055833_PIC0_01.nc,6,552,072L,PIC0,1


In [13]:
# Keep only the best version of each granule
granules = granules.drop_duplicates(subset=['cycle', 'pass', 'tile'], keep='first')

In [14]:
# Example with two versions of PGC0, _02 comes first
granules[(granules['cycle'] == '006') & (granules['pass'] == '552') & (granules['tile'] == '071L')]

Unnamed: 0,files,cycle,pass,tile,version,counter
219,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PGC0_01.nc,6,552,071L,PGC0,1


In [30]:
# Example with PGC0 vs PIC0, only PGC0 remains
granules[(granules['cycle'] == '006') & (granules['pass'] == '552')]

Unnamed: 0,files,cycle,pass,tile,version,counter
219,SWOT_L2_HR_PIXC_006_552_071L_20231122T055812_20231122T055823_PGC0_01.nc,6,552,071L,PGC0,1
221,SWOT_L2_HR_PIXC_006_552_072L_20231122T055822_20231122T055833_PGC0_01.nc,6,552,072L,PGC0,1


In [17]:
# Get list of just the best/latest version of each granule
keep_files = list(granules['files'])

In [19]:
len(files)

2881

In [20]:
len(keep_files)

2845