In [2]:
# IMPORT LIBRARIES

import pandas as pd
import glob
import os
from tqdm import tqdm

In [3]:
# SPLIT THE RESULT CSVS BY SENSOR
# (nothing in this cell computes a maximum; rows are only routed into one CSV per sensor)

# Folder that split_csvs_by_sensor() scans for '*.csv' inputs.
# NOTE: the CONFIGURATION cell further down assigns INPUT_FOLDER again with a
# different folder, so the value seen by a function depends on which cell ran last.
INPUT_FOLDER = '../../local_data/EarthEngineResultsRF'
# Destination files for rows whose 'sensor' value is Landsat8 / Landsat9 / Sentinel2.
OUTPUT_L8 = '../../local_data/ListofImages/metadata_L8.csv'
OUTPUT_L9 = '../../local_data/ListofImages/metadata_L9.csv'
OUTPUT_S2 = '../../local_data/ListofImages/metadata_S2.csv'

def _write_sensor_csv(df_list, filename):
    """Concatenate ``df_list`` and write the result to ``filename``.

    Prints a one-line summary in both cases. Nothing is written when
    ``df_list`` is empty. The parent directory of ``filename`` is created
    when it does not exist yet, so a fresh checkout does not fail on export.
    """
    if not df_list:
        print(f"No data found for {filename}")
        return

    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    full_df = pd.concat(df_list, ignore_index=True)
    full_df.to_csv(filename, index=False)
    print(f"Saved {len(full_df)} rows to {filename}")


def split_csvs_by_sensor(input_folder=None, outputs=None):
    """Route the rows of every CSV in a folder into one CSV per sensor.

    Column names are stripped and lower-cased, the 'sensor' values are
    stripped, and rows are grouped by exact match on the sensor name.

    Parameters
    ----------
    input_folder : str, optional
        Folder scanned for '*.csv'. Defaults to the module-level INPUT_FOLDER,
        read at call time.
    outputs : dict[str, str], optional
        Maps a sensor name (as it appears in the 'sensor' column) to the CSV
        path that receives its rows. Defaults to Landsat8 / Landsat9 /
        Sentinel2 -> OUTPUT_L8 / OUTPUT_L9 / OUTPUT_S2.

    Returns
    -------
    None
        Results are written to disk; progress is reported with print().

    Notes
    -----
    Files that cannot be read are reported and skipped (best effort). Files
    without a 'sensor' column are skipped and counted in a summary line.
    """
    if input_folder is None:
        input_folder = INPUT_FOLDER
    if outputs is None:
        outputs = {
            'Landsat8': OUTPUT_L8,
            'Landsat9': OUTPUT_L9,
            'Sentinel2': OUTPUT_S2,
        }

    # glob returns files in arbitrary order; sorting keeps the row order of
    # the exported CSVs (and any seeded sampling done on them) reproducible.
    csv_files = sorted(glob.glob(os.path.join(input_folder, '*.csv')))
    print(f"Found {len(csv_files)} CSV files")

    # One list of dataframes per requested sensor
    frames_by_sensor = {sensor: [] for sensor in outputs}
    files_without_sensor = []

    for file in tqdm(csv_files, desc = 'Separating sensors'):
        try:
            df = pd.read_csv(file)

            # Normalize columns
            df.columns = [c.strip().lower() for c in df.columns]

            if 'sensor' not in df.columns:
                files_without_sensor.append(file)
                continue

            # Clean up sensor column (remove whitespace)
            df['sensor'] = df['sensor'].astype(str).str.strip()

            for sensor, bucket in frames_by_sensor.items():
                # copy() so each stored subset is independent of df
                subset = df[df['sensor'] == sensor].copy()
                if not subset.empty:
                    bucket.append(subset)

        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the batch.
            print(f"Error reading {file}: {e}")

    if files_without_sensor:
        print(f"Skipped {len(files_without_sensor)} file(s) without a 'sensor' column")

    # Save files
    for sensor, filename in outputs.items():
        _write_sensor_csv(frames_by_sensor[sensor], filename)

# Run the split now; the per-sensor CSVs are written to disk as a side effect.
split_csvs_by_sensor()

Found 1077 CSV files


Separating sensors: 100%|██████████| 1077/1077 [00:02<00:00, 486.29it/s]


Saved 120120 rows to ../../local_data/ListofImages/metadata_L8.csv
Saved 11958 rows to ../../local_data/ListofImages/metadata_L9.csv
Saved 72461 rows to ../../local_data/ListofImages/metadata_S2.csv


In [4]:
# Reload the three per-sensor metadata CSVs and make sure the
# 'total_pixels' column is an integer column in each of them.

metadata_paths = (
    '../../local_data/ListofImages/metadata_L8.csv',
    '../../local_data/ListofImages/metadata_L9.csv',
    '../../local_data/ListofImages/metadata_S2.csv',
)
L8, L9, S2 = map(pd.read_csv, metadata_paths)

for sensor_frame in (L8, L9, S2):
    # astype(int) raises if the column holds missing or non-numeric values
    sensor_frame['total_pixels'] = sensor_frame['total_pixels'].astype(int)

In [7]:
# Display the Sentinel-2 metadata frame.
# NOTE(review): the execution counts show this cell was last run after the
# filtering cell below, so the saved output may include that cell's column changes.
S2

Unnamed: 0,system:index,area,cloud_frac,column,date,hazy_ice_frac,hazy_water_frac,land_frac,ocean_frac,pond_frac,row,sea_ice_frac,sensor,snow_frac,total_pixels,.geo,year
0,000000000000000019be,5.757198e+08,0.000000,356,2022-06-14,0.000000,0.000000,0.0,0.866430,0.104459,103,0.133570,Sentinel2,0.0,1142758,"{""type"":""Polygon"",""coordinates"":[[[-67.6865266...",2022
1,00000000000000001a4a,5.750966e+08,0.000000,357,2022-07-12,0.000000,2.363427,0.0,0.999974,0.000113,104,0.000026,Sentinel2,0.0,380500,"{""type"":""Polygon"",""coordinates"":[[[-67.1219419...",2022
2,00000000000000001ad0,5.765665e+08,0.000000,356,2022-07-12,0.000000,0.442957,0.0,0.999991,0.000033,105,0.000009,Sentinel2,0.0,896805,"{""type"":""Polygon"",""coordinates"":[[[-66.8823356...",2022
3,00000000000000001ad1,5.755148e+08,0.000000,357,2022-09-24,0.000000,0.000000,0.0,1.000000,0.000000,105,0.000000,Sentinel2,0.0,1248365,"{""type"":""Polygon"",""coordinates"":[[[-66.7210507...",2022
4,00000000000000001b52,5.811304e+08,0.000000,352,2022-06-22,0.000000,0.000000,0.0,0.594349,0.145232,106,0.405651,Sentinel2,0.0,1170391,"{""type"":""Polygon"",""coordinates"":[[[-67.1354918...",2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72456,00000000000000000955,5.021612e+08,0.004058,151,2021-02-25,2.066869,0.000125,0.0,0.000338,0.001413,74,0.999662,Sentinel2,0.0,415323,"{""type"":""Polygon"",""coordinates"":[[[178.7410717...",2021
72457,000000000000000009f7,5.958169e+08,0.000000,151,2021-04-14,0.362230,0.000000,0.0,0.000000,0.000000,75,1.000000,Sentinel2,0.0,773137,"{""type"":""Polygon"",""coordinates"":[[[178.5855767...",2021
72458,00000000000000000a98,5.972568e+08,0.000000,152,2021-04-14,0.453989,0.000000,0.0,0.000000,0.000000,76,1.000000,Sentinel2,0.0,1045314,"{""type"":""Polygon"",""coordinates"":[[[178.5679038...",2021
72459,00000000000000000a99,5.979876e+08,0.000000,153,2021-04-14,1.093666,0.000000,0.0,0.009543,0.016503,76,0.990457,Sentinel2,0.0,604516,"{""type"":""Polygon"",""coordinates"":[[[178.9190758...",2021


In [6]:
# filter for pixel sizes

# Inclusive total_pixels window applied to every sensor.
# NOTE(review): where these two bounds come from is not recorded in this notebook — confirm.
MIN_TOTAL_PIXELS = 585540
MAX_TOTAL_PIXELS = 731880


def _with_parsed_columns(frame):
    """Return a copy of ``frame`` with a datetime 'date', a derived 'year'
    and an integer 'total_pixels' column. The input frame is not modified."""
    return frame.assign(
        date=lambda d: pd.to_datetime(d['date']),
        year=lambda d: d['date'].dt.year,
        total_pixels=lambda d: d['total_pixels'].astype(int),
    )


def _within_pixel_window(frame, low=MIN_TOTAL_PIXELS, high=MAX_TOTAL_PIXELS):
    """Return the rows of ``frame`` whose 'total_pixels' lies in [low, high]."""
    return frame[frame['total_pixels'].between(low, high)]


# Each name is rebound to a new, typed frame, so re-running this cell is safe.
S2 = _with_parsed_columns(S2)
S2_filtered = _within_pixel_window(S2)
print(f'{len(S2_filtered)} images for S2')

L8 = _with_parsed_columns(L8)
L8_filtered = _within_pixel_window(L8)
print(f'{len(L8_filtered)} images for L8')

L9 = _with_parsed_columns(L9)
L9_filtered = _within_pixel_window(L9)
print(f'{len(L9_filtered)} images for L9')

1471 images for S2
4226 images for L8
376 images for L9


In [42]:
# sample

def _sample_rows_per_year(frame, n=10, random_state=12):
    """Draw ``n`` seeded rows from every 'year' group of ``frame``.

    GroupBy.sample raises ValueError when a group holds fewer than ``n`` rows.
    Years that are too small are therefore kept whole (with a printed note)
    instead of aborting the cell. When every year has at least ``n`` rows the
    result is exactly what ``groupby('year').sample(n=n, ...)`` returns.
    Rows with a missing 'year' are left out, as groupby does by default.
    """
    year_sizes = frame.groupby('year')['year'].transform('size')
    large_years = frame[year_sizes >= n]
    small_years = frame[year_sizes < n]

    sampled = large_years.groupby('year', group_keys=False).sample(
        n=n, random_state=random_state
    )
    if small_years.empty:
        return sampled

    n_small = small_years['year'].nunique()
    print(f"Note: {n_small} year(s) have fewer than {n} rows; keeping all of their rows")
    if sampled.empty:
        return small_years.sort_values('year', kind='stable')
    # stable sort keeps the drawn order inside each year
    return pd.concat([sampled, small_years]).sort_values('year', kind='stable')


S2_sample = _sample_rows_per_year(S2_filtered)
L8_sample = _sample_rows_per_year(L8_filtered)
L9_sample = _sample_rows_per_year(L9_filtered)

# save all samples

S2_sample.to_csv('../../local_data/ListofImages/S2samples.csv', index = False)
L8_sample.to_csv('../../local_data/ListofImages/L8samples.csv', index = False)
L9_sample.to_csv('../../local_data/ListofImages/L9samples.csv', index = False)

In [None]:
# NOTE(review): `sample_metadata` is not assigned anywhere in the visible part of
# this notebook, so this cell would raise NameError on a fresh kernel — confirm
# whether it is a leftover that can be removed.
sample_metadata

In [None]:
# CONFIGURATION
# Settings read by create_gee_upload_csv() at call time.

# Folder scanned for '*.csv' files.
# NOTE: this rebinds INPUT_FOLDER, which the first cell of the notebook set to a
# different folder; re-run this cell before create_gee_upload_csv() if that
# earlier cell has been executed again in the meantime.
INPUT_FOLDER = '../../local_data/ArcticTensorsRaw'
# Relative path, so the CSV is written to the notebook's working directory.
OUTPUT_FILE = 'check_samples_dynamic.csv'
# Maximum number of rows written to OUTPUT_FILE.
SAMPLE_SIZE = 40

In [25]:
def _rows_above_threshold(df, thresholds):
    """Return the rows of ``df`` whose 'total_pixels' exceeds the threshold
    of their sensor.

    Sensor names are stripped before the lookup; names missing from
    ``thresholds`` use its 'DEFAULT' entry (or are never selected when there
    is no such entry). Non-numeric 'total_pixels' values become NaN and are
    never selected. The returned frame carries the cleaned 'sensor' and
    numeric 'total_pixels' columns.
    """
    sensor_names = df['sensor'].astype(str).str.strip()
    pixels = pd.to_numeric(df['total_pixels'], errors='coerce')
    fallback = thresholds.get('DEFAULT', float('inf'))
    limits = sensor_names.map(thresholds).fillna(fallback)
    return df.assign(sensor=sensor_names, total_pixels=pixels)[pixels > limits]


def create_gee_upload_csv(input_folder=None, output_file=None, sample_size=None,
                          thresholds=None, random_state=12):
    """Collect rows above a per-sensor pixel threshold and export a sample.

    Every '*.csv' in the input folder that has both a 'sensor' and a
    'total_pixels' column (after stripping / lower-casing the header) is
    filtered with a strict ``total_pixels > threshold`` test. The surviving
    rows are renamed to Row / Col / Date / Scene_ID / Sensor where those
    columns exist, pooled, and at most ``sample_size`` of them are written.

    Parameters
    ----------
    input_folder : str, optional
        Folder to scan. Defaults to the module-level INPUT_FOLDER.
    output_file : str, optional
        CSV path to write. Defaults to the module-level OUTPUT_FILE.
    sample_size : int, optional
        Maximum number of rows to export. Defaults to SAMPLE_SIZE.
    thresholds : dict, optional
        Sensor name -> pixel threshold, plus a 'DEFAULT' entry for any other
        sensor. Defaults to the table below.
    random_state : int or None, default 12
        Seed for the row sample, so reruns export the same rows. Pass None
        for a different draw on every call.

    Returns
    -------
    None
        The sample is written to ``output_file``; nothing is written when no
        row passes the filter.
    """
    if input_folder is None:
        input_folder = INPUT_FOLDER
    if output_file is None:
        output_file = OUTPUT_FILE
    if sample_size is None:
        sample_size = SAMPLE_SIZE
    if thresholds is None:
        # Define your thresholds here
        # These names must match the 'sensor' column values in your CSVs
        thresholds = {
            'Landsat8': 5102547,
            'Landsat9': 4545984,
            'Sentinel2': 4891425,
            'DEFAULT': 100000000
        }

    # find all CSVs (sorted: glob order is arbitrary, and the seeded sample
    # below depends on the order in which rows are pooled)
    csv_files = sorted(glob.glob(os.path.join(input_folder, '*.csv')))
    print(f"Found {len(csv_files)} CSV files.")
    df_list = []

    keep_cols = ['Row', 'Col', 'Date', 'Scene_ID', 'Sensor', 'total_pixels']

    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df.columns = [c.strip().lower() for c in df.columns]

            if 'total_pixels' not in df.columns or 'sensor' not in df.columns:
                continue
            if df.empty:
                continue

            valid_rows = _rows_above_threshold(df, thresholds)
            if valid_rows.empty:
                continue

            # Normalizing Column Names for GEE ('column' wins over 'col')
            col_source = 'column' if 'column' in valid_rows.columns else 'col'
            valid_rows = valid_rows.rename(columns={
                col_source: 'Col',
                'row': 'Row',
                'date': 'Date',
                'scene_id': 'Scene_ID',
                'sensor': 'Sensor',
            })

            final_cols = [c for c in keep_cols if c in valid_rows.columns]
            df_list.append(valid_rows[final_cols])

        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the batch.
            print(f"Error reading {file}: {e}")

    if not df_list:
        print("No valid data found.")
        return

    full_df = pd.concat(df_list, ignore_index=True)
    print(f"Total valid cells found: {len(full_df)}")

    # sampling and export
    if len(full_df) > sample_size:
        sampled_df = full_df.sample(n=sample_size, random_state=random_state)
    else:
        sampled_df = full_df
        if len(full_df) < sample_size:
            print(f"Warning: Only found {len(full_df)} cells. Using all.")

    sampled_df.to_csv(output_file, index=False)
    print(f"Success! Saved {len(sampled_df)} samples to {output_file}")

In [26]:
# RUN FUNCTION
# Reads the CSVs found in INPUT_FOLDER and writes the sampled rows to OUTPUT_FILE.

create_gee_upload_csv()

Found 90 CSV files.
Total valid cells found: 176
Success! Saved 40 samples to check_samples_dynamic.csv


In [23]:
# Load and display a saved sample list.
# NOTE(review): this reads 'check_samples_static.csv', not the
# 'check_samples_dynamic.csv' (OUTPUT_FILE) written by create_gee_upload_csv();
# nothing in the visible notebook writes the static file — confirm it is the intended input.
samples_test = pd.read_csv('check_samples_static.csv')
samples_test

Unnamed: 0,Row,Col,Date,Sensor
0,74,214,2024-06-12,Landsat8
1,98,293,2024-04-22,Landsat8
2,118,302,2024-04-23,Landsat8
3,205,204,2024-07-07,Landsat8
4,114,262,2024-04-30,Landsat8
...,...,...,...,...
95,96,247,2024-05-17,Landsat8
96,179,199,2024-04-21,Landsat8
97,129,259,2024-04-24,Landsat8
98,123,156,2024-03-21,Landsat8


In [None]:
# Draw a seeded sample of 15 rows from every sensor group and save it.
# (GroupBy.sample raises ValueError if a sensor has fewer than 15 rows.)
samples = pd.read_csv('../../local_data/ListofImages/sample_images.csv')
per_sensor = samples.groupby("sensor", group_keys=False)
samples_balanced = per_sensor.sample(n=15, random_state=12)
samples_balanced.to_csv(
    '../../local_data/ListofImages/sample_images_balanced.csv',
    index = False,
)

# check list

68, 318, 2020-04-15
196, 195, 2024-06-20
140, 163, 2024-05-30
124, 253, 2024-05-09
123, 285, 2023-04-13
73, 214, 2023-03-29
178, 268, 2021-04-24
246, 222, 2017-04-24
251, 321, 2016-06-14
88, 258, 2013-08-10
73, 160, 2013-04-11
116, 270, 2017-05-02
82	206	2024-06-12
201	193	2024-04-15
75	187	2023-05-21
126	324	2023-03-06
100	116	2022-07-22
120	304	2022-05-05
100	240	2024-06-04
107	166	2024-04-24
208	324	2024-04-22
147	181	2021-05-27
81	241	2021-05-15
69	259	2021-04-14
203	199	2019-05-04
86	240	2018-06-03
200	221	2018-04-27
132	302	2018-05-12
122	321	2017-07-29
76	176	2017-04-18

Unnamed: 0,system:index,area_m2,cloud_cover,col,date,row,sensor,total_pixels,.geo,year
131,00000000000000001aa9,635635100.0,0.01,290,2016-03-24,105,Landsat8,705543,"{""type"":""MultiPoint"",""coordinates"":[]}",2016
191,00000000000000003ca8,647813000.0,0.0,191,2022-06-25,190,Landsat8,719138,"{""type"":""MultiPoint"",""coordinates"":[]}",2022
126,00000000000000004169,568410700.0,0.03,239,2015-07-19,200,Landsat8,630869,"{""type"":""MultiPoint"",""coordinates"":[]}",2015
199,0000000000000000169e,637030600.0,0.0,280,2022-04-18,98,Landsat8,707182,"{""type"":""MultiPoint"",""coordinates"":[]}",2022
172,00000000000000001879,610797000.0,0.0,322,2020-04-07,101,Landsat8,678078,"{""type"":""MultiPoint"",""coordinates"":[]}",2020
151,00000000000000003831,594942600.0,0.0,282,2018-04-26,182,Landsat8,660981,"{""type"":""MultiPoint"",""coordinates"":[]}",2018
171,00000000000000001750,609782300.0,0.0,322,2020-04-07,99,Landsat8,677126,"{""type"":""MultiPoint"",""coordinates"":[]}",2020
101,00000000000000000ffe,603229100.0,0.5,318,2013-05-02,85,Landsat8,670052,"{""type"":""MultiPoint"",""coordinates"":[]}",2013
197,00000000000000000dc3,634842200.0,0.0,254,2022-04-29,81,Landsat8,704794,"{""type"":""MultiPoint"",""coordinates"":[]}",2022
111,0000000000000000336c,653788000.0,0.0,263,2014-08-17,172,Landsat8,725814,"{""type"":""MultiPoint"",""coordinates"":[]}",2014
