In [1]:
import re
import opera_utils
import pandas as pd
import geopandas as gpd
import asf_search as asf
from datetime import datetime
from shapely import Polygon
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

### Read in the data and add needed columns

In [49]:
# Load the burst footprints, keep the first row per `burst_id_jpl`, and add the
# two columns used later in the joins: `pathNumber` and `flightDirection`.
df_burst_geoms = (
    opera_utils.get_burst_id_geojson(True)
    .drop_duplicates(subset='burst_id_jpl', keep='first')
    .assign(
        # Digits between the leading "t" and the first underscore of the burst ID.
        pathNumber=lambda bursts: bursts['burst_id_jpl'].str.extract(r't(\d+)_').astype(int),
        # Same values as `orbit_pass`, under the name the frame table uses.
        flightDirection=lambda bursts: bursts['orbit_pass'],
    )
    .drop(columns=['orbit_pass'])
    .reset_index(drop=True)
)
df_burst_geoms.head()

Unnamed: 0,burst_id_jpl,is_land,is_north_america,geometry,pathNumber,flightDirection
0,t001_000025_iw1,1,0,"POLYGON Z ((2.5319 4.75701 0, 3.29631 4.91201 ...",1,ASCENDING
1,t001_000025_iw2,1,0,"POLYGON Z ((3.25912 4.9746 0, 4.06055 5.13657 ...",1,ASCENDING
2,t001_000025_iw3,1,0,"POLYGON Z ((4.02659 5.18854 0, 4.73485 5.32978...",1,ASCENDING
3,t001_000026_iw1,1,0,"POLYGON Z ((2.49611 4.92361 0, 3.26087 5.07851...",1,ASCENDING
4,t001_000026_iw2,1,0,"POLYGON Z ((3.22392 5.14115 0, 4.02565 5.30297...",1,ASCENDING


### Read in SLCs and filter to Priority 1

In [51]:
# Read the frame polygons and expose the orbit direction as `flightDirection`,
# matching the column name on the burst table.
gdf = gpd.read_file('NApriorityrollout_framebased_v5_aggresive.json')
gdf = gdf.assign(flightDirection=gdf['orbit_pass']).drop(columns=['orbit_pass'])

# Keep only the frames whose priority is exactly 1.
is_top_priority = gdf['priority'] == 1.0
high_priority = gdf[is_top_priority]
print(f"total high priority frames: {len(high_priority)}")
high_priority.head()

total high priority frames: 275


Unnamed: 0,frame_id,region_name,priority,geometry,flightDirection
0,18635,Oklahoma,1.0,"POLYGON ((-94.47977 33.93097, -94.17424 35.462...",DESCENDING
1,18636,Louisiana,1.0,"POLYGON ((-94.74179 32.60346, -94.43886 34.135...",DESCENDING
2,14881,Sonora,1.0,"POLYGON ((-110.1448 27.46252, -109.8468 28.996...",DESCENDING
3,14882,Chihuahua,1.0,"POLYGON ((-110.40354 26.13349, -110.10488 27.6...",DESCENDING
4,14883,Baja California Sur,1.0,"POLYGON ((-110.6604 24.80394, -110.36372 26.33...",DESCENDING


In [52]:
# Split the priority frames into five consecutive slices of 55 rows each
# (rows 0-54, 55-109, 110-164, 165-219 and 220-274).
df_subset1, df_subset2, df_subset3, df_subset4, df_subset5 = (
    high_priority.iloc[start:start + 55] for start in range(0, 275, 55)
)

In [None]:
# Draw the five subsets on one interactive map, one colour per subset.
# NOTE(review): `map` shadows the Python builtin of the same name from here on.
subset_layers = (
    (df_subset1, 'red'),
    (df_subset2, 'blue'),
    (df_subset3, 'green'),
    (df_subset4, 'orange'),
    (df_subset5, 'purple'),
)

map = None  # with m=None (explore's default) the first call creates the map
for subset_frames, layer_color in subset_layers:
    map = subset_frames.explore(
        m=map,
        color=layer_color,
        opacity=0.8,
        legend=False
    )
map

### Make a new geodataframe containing only burst IDs within the frames of interest
We produce a new subset dataframe containing the CSLC burst IDs that intersect with the frames of interest. The `flightDirection` column is used to ensure only bursts with the same direction (ascending/descending) are returned for the frame. Below we perform the workflow for the first subset of frames, `df_subset1`.

<span style="color:red">Note: If you would like to investigate a different subset of CSLC bursts, ensure the subset is a geodataframe with name `df_burst_geoms` and has fields `burst_id_jpl`, `pathNumber`, `flightDirection`, and `geometry` before running the next cell.</span>


In [54]:
# Label both tables as WGS84 (EPSG:4326) so the join compares like with like.
# From this cell on, `high_priority` refers to the frames of `df_subset1` only.
df_burst_geoms = df_burst_geoms.set_crs("EPSG:4326", allow_override=True)
high_priority = df_subset1.set_crs("EPSG:4326", allow_override=True)

# Every (burst, frame) pair whose polygons intersect.
intersecting_bursts = gpd.sjoin(df_burst_geoms, high_priority, how="inner", predicate="intersects")

# Keep pairs flown in the same direction (pathNumber is not checked at this stage).
same_direction = (
    intersecting_bursts['flightDirection_left'] == intersecting_bursts['flightDirection_right']
)
kept_columns = ['burst_id_jpl', 'flightDirection_left', 'geometry', 'frame_id', 'region_name']

# `frame_id_left` is not among the kept columns, so that mapping entry leaves the
# frame unchanged; only `flightDirection_left` is actually renamed.
result_df_intersecting = intersecting_bursts.loc[same_direction, kept_columns].rename(columns={
    'frame_id_left': 'frame_id',
    'flightDirection_left': 'flightDirection'
})

# Frames in red, matching bursts in faint blue on top.
m = high_priority.explore(color='red', opacity=0.75)
result_df_intersecting.explore(m=m, color='blue', opacity=0.05)

### Get path number and add to frames (workaround)
The frames intersect bursts that are not technically part of that frame. So we use their centroid to filter out the bursts that do not belong.

In [55]:
# Reduce every frame to its centroid point.
frame_centroids = high_priority.copy()
frame_centroids['geometry'] = frame_centroids['geometry'].centroid

# For each frame, find a burst that covers the centroid. Only bursts that the
# spatial join already paired with this frame (same `frame_id`, hence same flight
# direction) are candidates, so a burst belonging to a different frame can never
# decide this frame's path number.
frame_centroids['center_burst_id'] = None
for idx, frame in frame_centroids.iterrows():
    candidates = result_df_intersecting[result_df_intersecting['frame_id'] == frame['frame_id']]
    covering = candidates[candidates.intersects(frame['geometry'])]
    if not covering.empty:
        # When several bursts cover the centroid the last one is used.
        frame_centroids.at[idx, 'center_burst_id'] = covering['burst_id_jpl'].iloc[-1]

# Digits after the leading "t" of the burst ID; nullable Int64 so frames with no
# covering burst stay <NA> instead of raising.
frame_centroids['pathNumber'] = frame_centroids['center_burst_id'].str.extract(r't(\d+)')[0].astype('Int64')

# Drop any `pathNumber` left by a previous run of this cell so the merge does not
# produce `pathNumber_x` / `pathNumber_y` columns.
high_priority = (
    high_priority
    .drop(columns=['pathNumber'], errors='ignore')
    .merge(frame_centroids[['frame_id', 'pathNumber']], on='frame_id', how='left')
)

### Redo intersection with pathNumber as a parameter
We redo the intersection using this filtered geodataframe

In [56]:
# Label both tables as WGS84 (EPSG:4326) before joining.
df_burst_geoms = df_burst_geoms.set_crs("EPSG:4326", allow_override=True)
high_priority = high_priority.set_crs("EPSG:4326", allow_override=True)

# Every (burst, frame) pair whose polygons intersect.
intersecting_bursts = gpd.sjoin(df_burst_geoms, high_priority, how="inner", predicate="intersects")

# A pair is kept only when both the flight direction and the path number agree.
same_direction = (
    intersecting_bursts['flightDirection_left'] == intersecting_bursts['flightDirection_right']
)
same_path = intersecting_bursts['pathNumber_left'] == intersecting_bursts['pathNumber_right']
kept_columns = ['burst_id_jpl', 'flightDirection_left', 'pathNumber_left', 'geometry', 'frame_id', 'region_name']

result_df_intersecting = (
    intersecting_bursts.loc[same_direction & same_path, kept_columns]
    .rename(columns={
        'frame_id_left': 'frame_id',
        'flightDirection_left': 'flightDirection',
        'pathNumber_left': 'pathNumber'
    })
    # One row per burst ID, keeping the first frame it was paired with.
    .drop_duplicates(subset='burst_id_jpl', keep='first')
)

# Frames in red, matching bursts in faint blue on top.
m = high_priority.explore(color='red', opacity=0.75)
result_df_intersecting.explore(m=m, color='blue', opacity=0.05)

### Scale down the dataframe and add a column for SLC burst IDs

In [57]:
# One row per burst: the CSLC burst ID, the SLC form of the same ID (leading "t"
# removed) and the burst footprint.
cslc_ids = result_df_intersecting['burst_id_jpl']
burst_ids = pd.DataFrame({
    'CSLC_burst_id': cslc_ids,
    'SLC_burst_id': cslc_ids.str.replace('^t', '', regex=True),
    'geometry': result_df_intersecting['geometry'],
}).reset_index(drop=True)
burst_ids.head()

Unnamed: 0,CSLC_burst_id,SLC_burst_id,geometry
0,t027_056712_iw1,027_056712_iw1,"POLYGON Z ((-109.92708 35.19724 0, -110.86164 ..."
1,t027_056712_iw2,027_056712_iw2,"POLYGON Z ((-110.84829 35.27344 0, -111.83056 ..."
2,t027_056712_iw3,027_056712_iw3,"POLYGON Z ((-111.81462 35.35996 0, -112.68407 ..."
3,t027_056713_iw1,027_056713_iw1,"POLYGON Z ((-109.96634 35.03124 0, -110.89889 ..."
4,t027_056713_iw2,027_056713_iw2,"POLYGON Z ((-110.88558 35.10745 0, -111.86628 ..."


### Find SLC and CSLC sensing dates/versions `asf search`
Below we use `asf_search()` to return all of the metadata for the SLCs and CSLCs. NOTE: This takes a while if you do not use multithreading (usually ~50-60 min for this dataset).

In [None]:
# No start/end dates are passed to the searches; the date window is applied
# later, when the results are parsed.

# SLC bursts: VV polarisation, looked up by the SLC form of the burst ID.
opts_ASFBURST = dict(
    dataset='SLC-BURST',
    fullBurstID=list(burst_ids['SLC_burst_id']),
    polarization='VV',
)

# OPERA CSLC products, looked up by the CSLC burst ID.
opts_OPERABURST = dict(
    dataset='OPERA-S1',
    processingLevel='CSLC',
    operaBurstID=list(burst_ids['CSLC_burst_id']),
)

n_burst_ids = len(burst_ids)
print(f"Searching over the range from 2016-07-01 to today for {n_burst_ids} burst IDs in ASF and OPERA...")
SLC_burst_results = asf.search(**opts_ASFBURST)
CSLC_burst_results = asf.search(**opts_OPERABURST)

In [None]:
# Number of granules returned by each of the two searches.
n_slc_results = len(SLC_burst_results)
n_cslc_results = len(CSLC_burst_results)
print(f"length of SLC results: {n_slc_results}")
print(f"length of CSLC results: {n_cslc_results}")

### Parse the metadata and add dates/versions to the dataframe

In [None]:
# SLCs

# Inclusive date window applied to the search results.
min_date = datetime(2016, 7, 1).date()
max_date = datetime(2024, 11, 15).date()

# Map "<6-digit burst number>_iw<n>" (lower case) -> sensing dates inside the window.
fileID_dict_SLC = defaultdict(list)
for granule in SLC_burst_results:
    props = granule.properties
    found = re.search(r'(\d{6}_IW\d)', props['fileID'])
    if not found:
        continue
    # Only the calendar date of the stop time is kept.
    sensed_on = datetime.strptime(props['stopTime'], '%Y-%m-%dT%H:%M:%SZ').date()
    if min_date <= sensed_on <= max_date:
        fileID_dict_SLC[found.group(1).lower()].append(sensed_on)

# Put each burst's dates in ascending order.
for burst_dates in fileID_dict_SLC.values():
    burst_dates.sort()

# Map sorted dates to the DataFrame
def get_SLC_dates(row):
    """Return the SLC sensing dates recorded for the burst in ``row``.

    The "<6 digits>_iw<n>" part of ``row['SLC_burst_id']`` is matched
    case-insensitively, lower-cased and looked up in ``fileID_dict_SLC``.
    An empty list is returned when the ID has no such part or the key is
    not in the dictionary.
    """
    found = re.search(r'(\d{6}_iw\d)', row['SLC_burst_id'], re.IGNORECASE)
    if not found:
        return []
    return fileID_dict_SLC.get(found.group(1).lower(), [])

# Call get_SLC_dates once per row (axis=1) and store what it returns — a list of
# dates, possibly empty — in the new `SLC_sensing_dates` column.
burst_ids['SLC_sensing_dates'] = burst_ids.apply(get_SLC_dates, axis=1)

In [None]:
# CSLCs

# Inclusive date window applied to the search results.
min_date = datetime(2016, 7, 1).date()
max_date = datetime(2024, 11, 15).date()

# Both dictionaries are keyed by the lower-case CSLC burst ID ("tNNN_NNNNNN_iwN").
fileID_dict_CSLC = defaultdict(list)   # burst ID -> sensing dates
version_dict = defaultdict(list)       # burst ID -> (date, pgeVersion) pairs, later versions only

for granule in CSLC_burst_results:
    props = granule.properties
    found = re.search(r'T(\d{3})-(\d{6})-IW(\d)', props['fileID'])
    if not found:
        continue
    burst_key = f"t{found.group(1)}_{found.group(2)}_iw{found.group(3)}".lower()
    # Only the calendar date of the stop time is kept.
    sensed_on = datetime.strptime(props['stopTime'], '%Y-%m-%dT%H:%M:%SZ').date()
    in_window = min_date <= sensed_on <= max_date
    # Products are counted only when inside the window AND produced by PGE 2.1.1.
    if in_window and props['pgeVersion'] == '2.1.1':
        fileID_dict_CSLC[burst_key].append(sensed_on)
        version_dict[burst_key].append((sensed_on, props['pgeVersion']))

# Order the dates, and reduce the (date, version) pairs to versions in date order.
for burst_key in fileID_dict_CSLC:
    fileID_dict_CSLC[burst_key].sort()
    version_dict[burst_key] = [pge_version for _, pge_version in sorted(version_dict[burst_key])]

# Map sorted dates and versions to the DataFrame
def get_CSLC_dates_and_versions(row):
    """Return ``(dates, versions)`` recorded for the burst in ``row``.

    ``row['CSLC_burst_id']`` is lower-cased and looked up in
    ``fileID_dict_CSLC`` and ``version_dict``; each half of the pair is an
    empty list when the key is absent from the corresponding dictionary.
    """
    burst_key = row['CSLC_burst_id'].lower()
    dates = fileID_dict_CSLC.get(burst_key, [])
    versions = version_dict.get(burst_key, [])
    return dates, versions

# get_CSLC_dates_and_versions returns a (dates, versions) pair for each row.
# Wrapping the pair in a pd.Series makes the row-wise apply produce two columns,
# which are stored as `CSLC_sensing_dates` and `CSLC_versions` in that order.
burst_ids[['CSLC_sensing_dates', 'CSLC_versions']] = burst_ids.apply(
    lambda row: pd.Series(get_CSLC_dates_and_versions(row)),
    axis=1
)
burst_ids.head()

### Find SLC burst dates that do not have corresponding CSLC dates and find duplicates
Create two new columns in the dataframe: the dates for which SLC bursts exist with no corresponding CSLC bursts, and a count of the number of duplicated CSLCs.

In [None]:
from collections import Counter

# Find missing CSLC dates
def find_diff_dates(row):
    """Return, in ascending order, the dates present in
    ``row['SLC_sensing_dates']`` but absent from ``row['CSLC_sensing_dates']``.
    Each date appears at most once in the result.
    """
    slc_only = set(row['SLC_sensing_dates']).difference(row['CSLC_sensing_dates'])
    return sorted(slc_only)

# Function to find grouped duplicates with their corresponding versions
def grouped_duplicates_with_versions(dates, versions):
    """Group the dates that occur more than once together with their versions.

    ``dates`` and ``versions`` are walked in parallel. Returns a pair
    ``(duplicate_dates, duplicate_versions)`` with one inner list per repeated
    date, in order of first appearance: ``duplicate_dates`` repeats the date
    once per collected version and ``duplicate_versions`` holds those versions.
    """
    occurrences = Counter(dates)

    # repeated date -> versions seen for it
    versions_by_date = {}
    for day, pge_version in zip(dates, versions):
        if occurrences[day] > 1:
            versions_by_date.setdefault(day, []).append(pge_version)

    duplicate_dates = [[day] * len(seen) for day, seen in versions_by_date.items()]
    duplicate_versions = list(versions_by_date.values())
    return duplicate_dates, duplicate_versions

# Function to count total duplicates (excluding the original occurrence)
def count_total_duplicates(grouped_dates):
    """Return the number of surplus entries across all groups: each group
    contributes its length minus one (the first occurrence is not counted)."""
    surplus = 0
    for same_date_group in grouped_dates:
        surplus += len(same_date_group) - 1
    return surplus

# Function to check if each inner list contains '2.1.1'
def check_contains_version(duplicate_versions, target_version='2.1.1'):
    """Return one boolean per inner list of ``duplicate_versions``: True when
    that list contains ``target_version``."""
    flags = []
    for versions_for_date in duplicate_versions:
        flags.append(target_version in versions_for_date)
    return flags


# Function to count False values in the 'contains_2.1.1' column
def count_missing_versions(contains_version_list):
    """Return how many entries of ``contains_version_list`` are falsy."""
    return sum(1 for has_version in contains_version_list if not has_version)

# Dates with an SLC burst but no CSLC burst, and how many of them per burst ID.
burst_ids['missing_CSLC_dates'] = burst_ids.apply(find_diff_dates, axis=1)
burst_ids['count_missing_CSLCs'] = burst_ids['missing_CSLC_dates'].apply(len)

# Repeated CSLC dates and the versions found for each of them.
duplicate_columns = ['duplicate_CSLC_dates', 'duplicate_CSLC_versions']
burst_ids[duplicate_columns] = burst_ids.apply(
    lambda row: pd.Series(grouped_duplicates_with_versions(row['CSLC_sensing_dates'], row['CSLC_versions'])),
    axis=1
)

# Remaining columns are each computed from one existing column, in this order:
# (new column, source column, function applied to every value).
derived_columns = (
    ('count_CSLC_duplicates', 'duplicate_CSLC_dates', count_total_duplicates),
    ('contains_2.1.1', 'duplicate_CSLC_versions', check_contains_version),
    ('count_missing_CSLC_version_2.1.1', 'contains_2.1.1', count_missing_versions),
)
for target_column, source_column, summarize in derived_columns:
    burst_ids[target_column] = burst_ids[source_column].apply(summarize)

burst_ids.head()

### Make the final dataframe
Fields: `CSLC_burst_id`, `count_asfburst`, `count_operaburst`, `count_missing_CSLCs`, `missing_CSLC_dates`, `count_CSLC_duplicates`, `duplicate_CSLC_dates`, `geometry`

In [None]:
# Today's date as YYYYMMDD (not used in the output name below, which is fixed).
today = datetime.today().strftime('%Y%m%d')

# Assemble the summary table column by column, then mark `geometry` as the
# active geometry in EPSG:4326.
missing_data = gpd.GeoDataFrame({
    'CSLC_burst_id': burst_ids['CSLC_burst_id'],
    'count_asfburst': burst_ids['SLC_sensing_dates'].apply(len),
    'count_operaburst': burst_ids['CSLC_sensing_dates'].apply(len),
    'count_missing_CSLCs': burst_ids['missing_CSLC_dates'].apply(len),
    'missing_CSLC_dates': burst_ids['missing_CSLC_dates'],
    'count_CSLC_duplicates': burst_ids['count_CSLC_duplicates'],
    'duplicate_CSLC_dates': burst_ids['duplicate_CSLC_dates'],
    'geometry': burst_ids['geometry'],
}).set_geometry('geometry', crs='EPSG:4326')

missing_data.to_file('missing_CSLC_bursts_20160701_20241115_P1_ss1.geojson', driver='GeoJSON')
missing_data.tail()

In [None]:
# NOTE(review): this reads the "ss2" export from an absolute, user-specific path,
# while the cell above writes the "ss1" file to the working directory — confirm
# which file is intended before relying on these numbers.
test = gpd.read_file('/Users/colespeed/opera/OPERA_Applications/CSLC/troubleshooting/priority_1/CSLC/missing_CSLC_bursts_20160701_20241115_P1_ss2.geojson')

missing_counts = test['count_missing_CSLCs']
duplicate_counts = test['count_CSLC_duplicates']
bursts_with_missing = test[missing_counts > 0]
bursts_with_duplicates = test[duplicate_counts > 0]

print(f"Number of CSLC burst IDs investigated: {len(test)}")
print(f"Number of CSLC burst IDs with missing dates: {len(bursts_with_missing)}")
print(f"Number of CSLC burst IDs with duplicates: {len(bursts_with_duplicates)}")
print(f"Number of total CSLC duplicates: {duplicate_counts.sum()}")
print(f"Number of missing CSLC bursts: {missing_counts.sum()}")

### Report stats

In [None]:
# Headline numbers for the summary table built above.
missing_counts = missing_data['count_missing_CSLCs']
duplicate_counts = missing_data['count_CSLC_duplicates']
bursts_with_missing = missing_data[missing_counts > 0]
bursts_with_duplicates = missing_data[duplicate_counts > 0]

print(f"Number of CSLC burst IDs investigated: {len(missing_data)}")
print(f"Number of CSLC burst IDs with missing dates: {len(bursts_with_missing)}")
print(f"Number of CSLC burst IDs with duplicates: {len(bursts_with_duplicates)}")
print(f"Number of total CSLC duplicates: {duplicate_counts.sum()}")
print(f"Number of missing CSLC bursts: {missing_counts.sum()}")

In [None]:
import matplotlib.pyplot as plt

# Distribution of the per-burst missing-CSLC counts, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(missing_data['count_missing_CSLCs'], bins=100, color='skyblue', edgecolor='black', alpha=0.7)
ax.set_title('Histogram of Missing CSLC Counts (Increased Bins)', fontsize=16)
ax.set_xlabel('Count of Missing CSLCs', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

### Export locations of missing CSLC data to HTML

In [None]:
# Keep only the bursts that are missing at least one CSLC.
has_missing = missing_data['count_missing_CSLCs'] != 0
filtered_data = missing_data[has_missing]

# Colour the polygons by their missing count (Viridis), half transparent, with a legend.
explore_options = dict(
    column='count_missing_CSLCs',
    cmap='viridis',
    opacity=0.5,
    legend=True,
)
map = filtered_data.explore(**explore_options)

# Write the interactive map to an HTML file.
map.save('missing_CSLC_bursts_20160701_20241115_P1_ss1.html')

### Read in the geojson
In the event that the data is not already in memory, we read it in below as a GeoDataFrame.

In [None]:
import re
from datetime import date
from datetime import datetime
import geopandas as gpd


# Function to parse the `missing_CSLC_dates` column
def parse_dates(date_str):
    """Parse the text form of a list of ``datetime.date`` objects.

    Every ``datetime.date(Y, M, D)`` occurrence in ``date_str`` is turned into
    a ``date``; nesting is ignored, so nested lists come back flattened.

    Parameters
    ----------
    date_str : str or None or any object
        Usually a string such as ``"[datetime.date(2024, 4, 5)]"``. ``None``
        gives an empty list. Any other non-string value is converted with
        ``str()`` first, so a value that is already a list of dates (for
        example when this function is applied a second time) parses to the
        same dates instead of raising.

    Returns
    -------
    list of datetime.date
        The dates in order of appearance; empty when none are found.
    """
    if date_str is None:
        return []
    if not isinstance(date_str, str):
        # Covers NaN ("nan" has no matches) and already-parsed lists of dates.
        date_str = str(date_str)

    # An empty list ("[]" or "[ ]") simply has no matches, so no special case is needed.
    date_matches = re.findall(r"datetime\.date\((\d+), (\d+), (\d+)\)", date_str)
    return [date(year=int(y), month=int(m), day=int(d)) for y, m, d in date_matches]

# Load the exported summary and turn the two date columns from text back into
# lists of date objects.
data = gpd.read_file('missing_CSLC_bursts_20160701_20241115_P1_ss1.geojson')
for date_column in ('missing_CSLC_dates', 'duplicate_CSLC_dates'):
    data[date_column] = data[date_column].apply(parse_dates)
data.head()

Unnamed: 0,CSLC_burst_id,count_asfburst,count_operaburst,count_missing_CSLCs,missing_CSLC_dates,count_CSLC_duplicates,duplicate_CSLC_dates,geometry
0,t027_056712_iw1,217,219,0,[],2,"[2024-04-05, 2024-04-05, 2024-04-17, 2024-04-17]","POLYGON Z ((-109.92708 35.19724 0, -110.86164 ..."
1,t027_056712_iw2,217,219,0,[],2,"[2024-04-05, 2024-04-05, 2024-04-17, 2024-04-17]","POLYGON Z ((-110.84829 35.27344 0, -111.83056 ..."
2,t027_056712_iw3,217,219,0,[],2,"[2024-04-05, 2024-04-05, 2024-04-17, 2024-04-17]","POLYGON Z ((-111.81462 35.35996 0, -112.68407 ..."
3,t027_056713_iw1,217,219,0,[],2,"[2024-04-05, 2024-04-05, 2024-04-17, 2024-04-17]","POLYGON Z ((-109.96634 35.03124 0, -110.89889 ..."
4,t027_056713_iw2,217,219,0,[],2,"[2024-04-05, 2024-04-05, 2024-04-17, 2024-04-17]","POLYGON Z ((-110.88558 35.10745 0, -111.86628 ..."


### Decompose the df to all dates
Right now, the structure of the GeoDataFrame is such that, for a single CSLC burst ID, there may be one or more dates that are missing. To perform the query for their corresponding SAFE file, we must break down the data to individual CSLC burst IDs and a single corresponding date.

In [62]:
import pandas as pd

# Columns of the decomposed table, in output order.
decomposed_columns = [
    'CSLC_burst_id',
    'SLC_burst_id',
    'count_asfburst',
    'count_operaburst',
    'count_missing_CSLCs',
    'missing_CSLC_dates',
    'count_CSLC_duplicates',
    'geometry',
]

# One record per (burst, missing date) pair. Bursts with no missing dates
# contribute nothing; the burst-level values are repeated for every date.
burst_date_records = [
    (
        row['CSLC_burst_id'],
        re.sub(r'^t', '', row['CSLC_burst_id']),  # SLC form: leading "t" removed
        row['count_asfburst'],
        row['count_operaburst'],
        row['count_missing_CSLCs'],
        missing_date,
        row['count_CSLC_duplicates'],
        row['geometry'],
    )
    for _, row in data.iterrows()
    for missing_date in row['missing_CSLC_dates']
]

decomposed_df = pd.DataFrame(burst_date_records, columns=decomposed_columns)
print('total CSLC bursts with missing dates:', len(decomposed_df))
decomposed_df.head()

total CSLC bursts with missing dates: 14193


Unnamed: 0,CSLC_burst_id,SLC_burst_id,count_asfburst,count_operaburst,count_missing_CSLCs,missing_CSLC_dates,count_CSLC_duplicates,geometry
0,t027_056732_iw2,027_056732_iw2,250,251,1,2024-07-22,2,"POLYGON Z ((-111.577163 31.951283 0, -112.5217..."
1,t027_056738_iw1,027_056738_iw1,247,249,1,2024-07-22,3,"POLYGON Z ((-110.913907 30.8767 0, -111.803057..."
2,t027_056738_iw2,027_056738_iw2,247,249,1,2024-07-22,3,"POLYGON Z ((-111.790802 30.953879 0, -112.7252..."
3,t027_056753_iw3,027_056753_iw3,218,220,1,2023-01-17,3,"POLYGON Z ((-113.216971 28.54969 0, -114.02334..."
4,t027_056773_iw2,027_056773_iw2,215,216,1,2024-07-10,2,"POLYGON Z ((-113.014727 25.131766 0, -113.8986..."


### Map back to SLC SAFE file
We again use `asf_search()` to retrieve the SAFE file corresponding to the CSLC burst ID and date.

In [28]:
from datetime import timedelta

# Parallel lists, one entry per (burst, missing date) pair, filled in the same order.
missing_CSLC_burst_id = []
missing_SLC_burst_id = []
missing_dates = []
SAFE_file_id = []
SLC_geoms = []

total_bursts = len(decomposed_df)
burst_date_pairs = zip(
    decomposed_df['CSLC_burst_id'],
    decomposed_df['SLC_burst_id'],
    decomposed_df['missing_CSLC_dates'],
    decomposed_df['geometry'],
)

for i, (cslc_id, burst, missing_date, footprint) in enumerate(burst_date_pairs):

    if i % 10 == 0:
        print(f"working on burst number {i} out of {total_bursts} total bursts: {burst}")

    # Search a one-day window starting on the missing date.
    results = asf.search(fullBurstID=[burst], start=missing_date, end=missing_date + timedelta(days=1))

    if len(results) == 0:
        # Keep the row (with no SAFE file) instead of aborting the whole loop
        # with an IndexError on results[0].
        print(f"no SLC burst found for {burst} on {missing_date}; recording no SAFE file")
        file_id = None
    else:
        # The first hit's InputGranules entry identifies the source SAFE file.
        file_id = results[0].umm['InputGranules']

    missing_CSLC_burst_id.append(cslc_id)
    missing_SLC_burst_id.append(burst)
    missing_dates.append(missing_date)
    SLC_geoms.append(footprint)
    SAFE_file_id.append(file_id)

working on burst number 0 out of 27 total bursts: 100_213572_iw1
working on burst number 10 out of 27 total bursts: 100_213572_iw3
working on burst number 20 out of 27 total bursts: 100_213576_iw1


In [None]:
# One row per (burst, missing date) pair with the SAFE file it maps back to.
# The footprints originate from the EPSG:4326 GeoJSON export, so the CRS is set
# explicitly here; without it the written file and the map carry no CRS.
slc = gpd.GeoDataFrame(
    {
        'CSLC_burst_id': missing_CSLC_burst_id,
        'SLC_burst_id': missing_SLC_burst_id,
        'missing_CSLC_dates': missing_dates,
        # A list of input granules is reduced to its first entry (None when the
        # list is empty); any other value is stored unchanged.
        'fileID': [
            (ids[0] if ids else None) if isinstance(ids, list) else ids
            for ids in SAFE_file_id
        ],
        'geometry': SLC_geoms,
    },
    geometry='geometry',
    crs='EPSG:4326',
)
slc.head()

27


Unnamed: 0,CSLC_burst_id,SLC_burst_id,missing_CSLC_dates,fileID,geometry
0,t100_213572_iw1,100_213572_iw1,2022-04-09,S1A_IW_SLC__1SDV_20220409T133823_20220409T1338...,
1,t100_213572_iw1,100_213572_iw1,2022-04-21,S1A_IW_SLC__1SDV_20220421T133824_20220421T1338...,
2,t100_213572_iw1,100_213572_iw1,2022-05-03,S1A_IW_SLC__1SDV_20220503T133824_20220503T1338...,
3,t100_213572_iw1,100_213572_iw1,2022-05-15,S1A_IW_SLC__1SDV_20220515T133825_20220515T1338...,
4,t100_213572_iw1,100_213572_iw1,2022-05-27,S1A_IW_SLC__1SDV_20220527T133826_20220527T1338...,


In [None]:
# Keep the first row for every (burst ID, missing date) combination and show
# how many rows remain.
dedup_keys = ["CSLC_burst_id", "missing_CSLC_dates"]
slc_unique = slc.drop_duplicates(subset=dedup_keys, keep="first")
len(slc_unique)

In [None]:
# Output to a file (feel free to drop some columns, if desired).
# NOTE(review): this writes `slc`, not the de-duplicated `slc_unique` computed in
# the previous cell — confirm that is intended.
slc.to_file('missing_SAFE_files_20160701_20241115_ss1.geojson', driver='GeoJSON')

In [None]:
# Visualize the results: show every row of `slc` on an interactive map
# (the map is the cell's output).
slc.explore()

<h2 style="color:red;">Testing: Parallelize the process (Currently not recommended, due to CMR issues)</h2>


### Scale down the dataframe and add a column for SLC burst IDs

In [None]:
# One row per burst: the CSLC burst ID, the SLC form of the same ID (leading "t"
# removed) and the burst footprint.
cslc_ids = result_df_intersecting['burst_id_jpl']
burst_ids = pd.DataFrame({
    'CSLC_burst_id': cslc_ids,
    'SLC_burst_id': cslc_ids.str.replace('^t', '', regex=True),
    'geometry': result_df_intersecting['geometry'],
}).reset_index(drop=True)
burst_ids.head()

In [None]:
from concurrent.futures import ThreadPoolExecutor

def fetch_results(opts):
    """Run one ASF search with the keyword options in ``opts`` and return its results."""
    return asf.search(**opts)

# First entry: VV SLC bursts by SLC burst ID. Second entry: OPERA CSLCs by CSLC burst ID.
opts_list = [
    dict(
        dataset='SLC-BURST',
        fullBurstID=list(burst_ids['SLC_burst_id']),
        polarization='VV',
    ),
    dict(
        dataset='OPERA-S1',
        processingLevel='CSLC',
        operaBurstID=list(burst_ids['CSLC_burst_id']),
    ),
]

# Submit both searches to a thread pool, then collect the results in submission order.
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(fetch_results, opts) for opts in opts_list]
    results = [future.result() for future in futures]

# Same order as opts_list: SLC first, CSLC second.
SLC_burst_results, CSLC_burst_results = results

### Parse the metadata and add dates/versions to the dataframe

In [None]:
from collections import defaultdict
import re

# SLCs

# Earliest sensing date that is kept (no upper bound in this section).
min_date = datetime(2016, 7, 1).date()

# Map "<6-digit burst number>_iw<n>" (lower case) -> sensing dates on or after min_date.
fileID_dict_SLC = defaultdict(list)
for granule in SLC_burst_results:
    props = granule.properties
    found = re.search(r'(\d{6}_IW\d)', props['fileID'])
    if not found:
        continue
    # Only the calendar date of the stop time is kept.
    sensed_on = datetime.strptime(props['stopTime'], '%Y-%m-%dT%H:%M:%SZ').date()
    if sensed_on >= min_date:
        fileID_dict_SLC[found.group(1).lower()].append(sensed_on)

# Put each burst's dates in ascending order.
for burst_dates in fileID_dict_SLC.values():
    burst_dates.sort()

# Map sorted dates to the DataFrame
def get_SLC_dates(row):
    """Return the SLC sensing dates recorded for the burst in ``row``.

    The "<6 digits>_iw<n>" part of ``row['SLC_burst_id']`` is matched
    case-insensitively, lower-cased and looked up in ``fileID_dict_SLC``.
    An empty list is returned when the ID has no such part or the key is
    not in the dictionary.
    """
    found = re.search(r'(\d{6}_iw\d)', row['SLC_burst_id'], re.IGNORECASE)
    if not found:
        return []
    return fileID_dict_SLC.get(found.group(1).lower(), [])

# Call get_SLC_dates once per row (axis=1) and store what it returns — a list of
# dates, possibly empty — in the new `SLC_sensing_dates` column.
burst_ids['SLC_sensing_dates'] = burst_ids.apply(get_SLC_dates, axis=1)

In [None]:
# CSLCs

# Earliest sensing date that is kept (no upper bound and no version filter here).
min_date = datetime(2016, 7, 1).date()

# Both dictionaries are keyed by the lower-case CSLC burst ID ("tNNN_NNNNNN_iwN").
fileID_dict_CSLC = defaultdict(list)   # burst ID -> sensing dates
version_dict = defaultdict(list)       # burst ID -> (date, pgeVersion) pairs, later versions only

for granule in CSLC_burst_results:
    props = granule.properties
    found = re.search(r'T(\d{3})-(\d{6})-IW(\d)', props['fileID'])
    if not found:
        continue
    burst_key = f"t{found.group(1)}_{found.group(2)}_iw{found.group(3)}".lower()
    # Only the calendar date of the stop time is kept.
    sensed_on = datetime.strptime(props['stopTime'], '%Y-%m-%dT%H:%M:%SZ').date()
    if sensed_on >= min_date:
        fileID_dict_CSLC[burst_key].append(sensed_on)
        version_dict[burst_key].append((sensed_on, props['pgeVersion']))

# Order the dates, and reduce the (date, version) pairs to versions in date order.
for burst_key in fileID_dict_CSLC:
    fileID_dict_CSLC[burst_key].sort()
    version_dict[burst_key] = [pge_version for _, pge_version in sorted(version_dict[burst_key])]

# Map sorted dates and versions to the DataFrame
def get_CSLC_dates_and_versions(row):
    """Return ``(dates, versions)`` recorded for the burst in ``row``.

    ``row['CSLC_burst_id']`` is lower-cased and looked up in
    ``fileID_dict_CSLC`` and ``version_dict``; each half of the pair is an
    empty list when the key is absent from the corresponding dictionary.
    """
    burst_key = row['CSLC_burst_id'].lower()
    dates = fileID_dict_CSLC.get(burst_key, [])
    versions = version_dict.get(burst_key, [])
    return dates, versions

# get_CSLC_dates_and_versions returns a (dates, versions) pair for each row.
# Wrapping the pair in a pd.Series makes the row-wise apply produce two columns.
# The second column must be named `CSLC_versions`: the duplicate analysis further
# down reads row['CSLC_versions'] and raises a KeyError under any other name.
burst_ids[['CSLC_sensing_dates', 'CSLC_versions']] = burst_ids.apply(
    lambda row: pd.Series(get_CSLC_dates_and_versions(row)),
    axis=1
)
burst_ids.head()

### Find SLC burst dates that do not have corresponding CSLC dates and find duplicates
Create two new columns in the dataframe: the dates for which SLC bursts exist with no corresponding CSLC bursts, and a count of the number of duplicated CSLCs.

In [None]:
from collections import Counter

# Find missing CSLC dates
def find_diff_dates(row):
    """Return, in ascending order, the dates present in
    ``row['SLC_sensing_dates']`` but absent from ``row['CSLC_sensing_dates']``.
    Each date appears at most once in the result.
    """
    slc_only = set(row['SLC_sensing_dates']).difference(row['CSLC_sensing_dates'])
    return sorted(slc_only)

# Function to find grouped duplicates with their corresponding versions
def grouped_duplicates_with_versions(dates, versions):
    """Group the dates that occur more than once together with their versions.

    ``dates`` and ``versions`` are walked in parallel. Returns a pair
    ``(duplicate_dates, duplicate_versions)`` with one inner list per repeated
    date, in order of first appearance: ``duplicate_dates`` repeats the date
    once per collected version and ``duplicate_versions`` holds those versions.
    """
    occurrences = Counter(dates)

    # repeated date -> versions seen for it
    versions_by_date = {}
    for day, pge_version in zip(dates, versions):
        if occurrences[day] > 1:
            versions_by_date.setdefault(day, []).append(pge_version)

    duplicate_dates = [[day] * len(seen) for day, seen in versions_by_date.items()]
    duplicate_versions = list(versions_by_date.values())
    return duplicate_dates, duplicate_versions

# Function to count total duplicates
def count_total_duplicates(grouped_dates):
    """Return the number of surplus entries across all groups.

    Each group contributes its length minus one, i.e. the first occurrence of a
    date is not itself counted as a duplicate. This matches the earlier
    definition of the same function in this notebook, so both sections report
    `count_CSLC_duplicates` on the same basis.
    """
    return sum(len(group) - 1 for group in grouped_dates)

# Function to check if each inner list contains '2.1.1'
def check_contains_version(duplicate_versions, target_version='2.1.1'):
    """Return one boolean per inner list of ``duplicate_versions``: True when
    that list contains ``target_version``."""
    flags = []
    for versions_for_date in duplicate_versions:
        flags.append(target_version in versions_for_date)
    return flags


# Function to count False values in the 'contains_2.1.1' column
def count_missing_versions(contains_version_list):
    """Return how many entries of ``contains_version_list`` are falsy."""
    return sum(1 for has_version in contains_version_list if not has_version)

# Dates with an SLC burst but no CSLC burst, and how many of them per burst ID.
burst_ids['missing_CSLC_dates'] = burst_ids.apply(find_diff_dates, axis=1)
burst_ids['count_missing_CSLCs'] = burst_ids['missing_CSLC_dates'].apply(len)

# Repeated CSLC dates and the versions found for each of them.
duplicate_columns = ['duplicate_CSLC_dates', 'duplicate_CSLC_versions']
burst_ids[duplicate_columns] = burst_ids.apply(
    lambda row: pd.Series(grouped_duplicates_with_versions(row['CSLC_sensing_dates'], row['CSLC_versions'])),
    axis=1
)

# Remaining columns are each computed from one existing column, in this order:
# (new column, source column, function applied to every value).
derived_columns = (
    ('count_CSLC_duplicates', 'duplicate_CSLC_dates', count_total_duplicates),
    ('contains_2.1.1', 'duplicate_CSLC_versions', check_contains_version),
    ('count_missing_CSLC_version_2.1.1', 'contains_2.1.1', count_missing_versions),
)
for target_column, source_column, summarize in derived_columns:
    burst_ids[target_column] = burst_ids[source_column].apply(summarize)

burst_ids.head(5)

### Make the final dataframe
Fields: `CSLC_burst_id`, `count_asfburst`, `count_operaburst`, `count_missing_CSLCs`, `missing_CSLC_dates`, `count_CSLC_duplicates`, `count_duplicates_missing_CSLC_version_2.1.1`, `geometry`

In [None]:
# Today's date as YYYYMMDD; it becomes the end-date part of the output file name.
today = datetime.today().strftime('%Y%m%d')

# Assemble the summary table column by column, then mark `geometry` as the
# active geometry in EPSG:4326.
missing_data = gpd.GeoDataFrame({
    'CSLC_burst_id': burst_ids['CSLC_burst_id'],
    'count_asfburst': burst_ids['SLC_sensing_dates'].apply(len),
    'count_operaburst': burst_ids['CSLC_sensing_dates'].apply(len),
    'count_missing_CSLCs': burst_ids['missing_CSLC_dates'].apply(len),
    'missing_CSLC_dates': burst_ids['missing_CSLC_dates'],
    'count_CSLC_duplicates': burst_ids['count_CSLC_duplicates'],
    'count_duplicates_missing_CSLC_version_2.1.1': burst_ids['count_missing_CSLC_version_2.1.1'],
    'geometry': burst_ids['geometry'],
}).set_geometry('geometry', crs='EPSG:4326')

missing_data.to_file(f'missing_CSLC_bursts_20160701_{today}.geojson', driver='GeoJSON')
missing_data.head()

### Report stats

In [None]:
# Report on `missing_data`, the table built in this section. The `data` frame
# loaded earlier from the ss1 GeoJSON belongs to a different run and has no
# `count_duplicates_missing_CSLC_version_2.1.1` column, so reading it here
# raised a KeyError and mixed two datasets.
missing_counts = missing_data['count_missing_CSLCs']
duplicate_counts = missing_data['count_CSLC_duplicates']

print(f"Number of CSLC burst IDs investigated: {len(burst_ids)}")
print(f"Number of CSLC burst IDs with missing dates: {len(missing_data[missing_counts > 0])}")
print(f"Number of CSLC burst IDs with duplicates: {len(missing_data[duplicate_counts > 0])}")
print(f"Number of total CSLC duplicates: {duplicate_counts.sum()}")
print(f"Number of duplicates missing version 2.1.1: {missing_data['count_duplicates_missing_CSLC_version_2.1.1'].sum()}")
print(f"Number of missing CSLC bursts: {missing_counts.sum()}")
print(f"Number of CSLCs burst IDs missing zero dates: {len(missing_data[missing_counts == 0])}")
print(f"Number of CSLCs burst IDs missing one date: {len(missing_data[missing_counts == 1])}")
print(f"Number of CSLCs burst IDs missing two dates: {len(missing_data[missing_counts == 2])}")
print(f"Number of CSLCs burst IDs missing three dates: {len(missing_data[missing_counts == 3])}")

# One entry per (burst, missing date); bursts with no missing dates explode to
# NaN and are dropped.
missed_dates = missing_data['missing_CSLC_dates'].explode().dropna()
if missed_dates.empty:
    # mode()[0] and idxmax() have nothing to report when no date is missing.
    print("Most frequently missed date: none (no missing dates)")
    print("Most frequently missed CSLC burst ID: none (no missing dates)")
else:
    print(f"Most frequently missed date: {missed_dates.mode()[0]}, with {missed_dates.value_counts().max()} missing dates")
    print(f"Most frequently missed CSLC burst ID: {missing_data.loc[missing_counts.idxmax(), 'CSLC_burst_id']}, with {missing_counts.max()} missing dates")