In [4]:
# IMPORT LIBRARIES

import pandas as pd
import glob
import os
from tqdm import tqdm

In [36]:
# SPLIT RAW METADATA CSVs BY SENSOR (configuration for split_csvs_by_sensor)

# Folder that is searched for '*.csv' input files
INPUT_FOLDER = '../../local_data/ArcticTensorsRaw'
# Destination files: one merged CSV per value of the 'sensor' column
# (Landsat8, Landsat9 and Sentinel2 respectively)
OUTPUT_L8 = '../../local_data/ListofImages/metadata_L8.csv'
OUTPUT_L9 = '../../local_data/ListofImages/metadata_L9.csv'
OUTPUT_S2 = '../../local_data/ListofImages/metadata_S2.csv'

def split_csvs_by_sensor():
    """Merge the raw metadata CSVs into one CSV per sensor.

    Reads every ``*.csv`` in ``INPUT_FOLDER``, strips and lower-cases the
    column names, strips whitespace from the ``sensor`` values and collects
    the rows whose sensor is exactly ``Landsat8``, ``Landsat9`` or
    ``Sentinel2``. The collected rows are concatenated and written to
    ``OUTPUT_L8``, ``OUTPUT_L9`` and ``OUTPUT_S2``.

    Files are processed in sorted path order so that the row order of the
    merged CSVs (and therefore any seeded sampling done on them later) does
    not depend on the order in which the filesystem lists the folder.

    Files that cannot be read are reported and skipped; files without a
    ``sensor`` column are reported and skipped as well. A sensor with no rows
    produces a message and no file is written for it.

    Returns
    -------
    None
    """
    # Sensor label (as found in the 'sensor' column) -> destination CSV
    outputs = {
        'Landsat8': OUTPUT_L8,
        'Landsat9': OUTPUT_L9,
        'Sentinel2': OUTPUT_S2,
    }

    # sorted(): glob returns paths in arbitrary, filesystem-dependent order
    csv_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, '*.csv')))
    print(f"Found {len(csv_files)} CSV files")

    # One list of per-file dataframes for each sensor
    frames_by_sensor = {sensor: [] for sensor in outputs}

    for file in tqdm(csv_files, desc = 'Separating sensors'):
        try:
            df = pd.read_csv(file)

            # Normalize column names so ' Sensor ' and 'sensor' are treated alike
            df.columns = [c.strip().lower() for c in df.columns]

            if 'sensor' not in df.columns:
                print(f"Skipping {file}: no 'sensor' column")
                continue

            # Clean up sensor column (remove whitespace)
            df['sensor'] = df['sensor'].astype(str).str.strip()

            for sensor, frames in frames_by_sensor.items():
                # copy() so the subset is independent of df (avoids SettingWithCopy warnings)
                subset = df[df['sensor'] == sensor].copy()
                if not subset.empty:
                    frames.append(subset)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole split
            print(f"Error reading {file}: {e}")

    # Helper function to save list to CSV
    def save_list_to_csv(df_list, filename):
        if df_list:
            # Make sure the destination folder exists before writing
            out_dir = os.path.dirname(filename)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            full_df = pd.concat(df_list, ignore_index=True)
            full_df.to_csv(filename, index=False)
            print(f"Saved {len(full_df)} rows to {filename}")
        else:
            print(f"No data found for {filename}")

    # Save files (same order as the outputs mapping: L8, L9, S2)
    for sensor, filename in outputs.items():
        save_list_to_csv(frames_by_sensor[sensor], filename)

# Run the split: reads the CSVs in INPUT_FOLDER and writes the per-sensor metadata CSVs
split_csvs_by_sensor()

Found 222 CSV files


Separating sensors: 100%|██████████| 222/222 [00:00<00:00, 408.10it/s]


Saved 110878 rows to ../../local_data/ListofImages/metadata_L8.csv
Saved 8379 rows to ../../local_data/ListofImages/metadata_L9.csv
Saved 85735 rows to ../../local_data/ListofImages/metadata_S2.csv


In [38]:
# Load the merged per-sensor metadata tables for inspection,
# casting total_pixels to an integer column as each table is read.

L8 = pd.read_csv('../../local_data/ListofImages/metadata_L8.csv').astype({'total_pixels': int})
L9 = pd.read_csv('../../local_data/ListofImages/metadata_L9.csv').astype({'total_pixels': int})
S2 = pd.read_csv('../../local_data/ListofImages/metadata_S2.csv').astype({'total_pixels': int})

In [39]:
# filter for pixel sizes

# Inclusive total_pixels range that a row must fall in to be kept.
# NOTE(review): the notebook does not record where these two bounds come from —
# document their origin here.
MIN_TOTAL_PIXELS = 622098
MAX_TOTAL_PIXELS = 731880


def add_year_and_filter_pixels(df, min_pixels=MIN_TOTAL_PIXELS, max_pixels=MAX_TOTAL_PIXELS):
    """Prepare one sensor's metadata table and select the rows in the pixel range.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a ``date`` and a ``total_pixels`` column.
    min_pixels, max_pixels : int
        Inclusive bounds applied to ``total_pixels``.

    Returns
    -------
    (prepared, filtered) : tuple of pandas.DataFrame
        ``prepared`` is a copy of ``df`` with ``date`` parsed to datetime, a new
        ``year`` column and ``total_pixels`` cast to int. ``filtered`` is an
        independent copy of the rows of ``prepared`` whose ``total_pixels`` lies
        in ``[min_pixels, max_pixels]``.
    """
    # Work on a copy so re-running the cell never depends on earlier in-place edits
    prepared = df.copy()
    prepared['date'] = pd.to_datetime(prepared['date'])
    prepared['year'] = prepared['date'].dt.year
    prepared['total_pixels'] = prepared['total_pixels'].astype(int)

    in_range = prepared['total_pixels'].between(min_pixels, max_pixels)
    # copy() so later edits to the filtered frame do not trigger SettingWithCopy warnings
    return prepared, prepared[in_range].copy()


# Same names as before: S2/L8/L9 gain 'date' (datetime) and 'year'; *_filtered hold the kept rows
S2, S2_filtered = add_year_and_filter_pixels(S2)
L8, L8_filtered = add_year_and_filter_pixels(L8)
L9, L9_filtered = add_year_and_filter_pixels(L9)

In [42]:
# Draw 10 rows per year from each filtered table (seeded, so the draw is repeatable).
# Note: pandas raises ValueError here if any year has fewer than 10 rows.

def _ten_per_year(filtered):
    """Return 10 seeded random rows for every distinct value of 'year'."""
    by_year = filtered.groupby('year', group_keys=False)
    return by_year.sample(n=10, random_state = 12)

S2_sample = _ten_per_year(S2_filtered)
L8_sample = _ten_per_year(L8_filtered)
L9_sample = _ten_per_year(L9_filtered)

# Write the three samples out (S2, then L8, then L9), without the index column

for sample_df, sample_path in (
    (S2_sample, '../../local_data/ListofImages/S2samples.csv'),
    (L8_sample, '../../local_data/ListofImages/L8samples.csv'),
    (L9_sample, '../../local_data/ListofImages/L9samples.csv'),
):
    sample_df.to_csv(sample_path, index = False)

In [None]:
# NOTE(review): `sample_metadata` is not assigned in any visible cell and this cell has
# no execution count. Unless it is defined elsewhere, this raises NameError on
# Restart & Run All — confirm and either define it above or remove this cell.
sample_metadata

In [None]:
# CONFIGURATION (module-level settings read by create_gee_upload_csv)

# Folder that is searched for '*.csv' input files
INPUT_FOLDER = '../../local_data/ArcticTensorsRaw'
# CSV the sampled rows are written to (relative to the working directory)
OUTPUT_FILE = 'check_samples_dynamic.csv'
# Number of rows to sample; when no more than this many rows qualify, all of them are used
SAMPLE_SIZE = 40

In [25]:
def create_gee_upload_csv(input_folder=None, output_file=None, sample_size=None, random_state=12):
    """Sample rows whose pixel count exceeds a per-sensor threshold and save them as CSV.

    Every ``*.csv`` in the input folder is read (in sorted path order) and its
    column names are stripped and lower-cased. Files lacking a ``total_pixels``
    or ``sensor`` column are ignored. A row is kept when its numeric
    ``total_pixels`` is strictly greater than the threshold for its
    (whitespace-stripped) ``sensor`` value; sensors not listed in
    ``THRESHOLDS`` use the ``'DEFAULT'`` threshold, and rows whose
    ``total_pixels`` is not numeric are dropped.

    Kept rows are reduced to the columns ``Row, Col, Date, Scene_ID, Sensor,
    total_pixels`` (whichever of them exist), concatenated across files, and a
    random sample of them is written to the output file without the index.

    Parameters
    ----------
    input_folder : str, optional
        Folder to search for CSVs. Defaults to the module-level ``INPUT_FOLDER``.
    output_file : str, optional
        Destination CSV. Defaults to the module-level ``OUTPUT_FILE``.
    sample_size : int, optional
        Number of rows to sample. Defaults to the module-level ``SAMPLE_SIZE``.
        If no more than this many rows qualify, all of them are written.
    random_state : int or None, default 12
        Seed passed to ``DataFrame.sample`` so the sample is reproducible
        (12 matches the seed used for the per-year sampling in this notebook).
        Pass ``None`` for a different sample on every call.

    Returns
    -------
    None
        Progress and the result are reported with ``print``.
    """
    # Fall back to the notebook-level configuration when no override is given
    input_folder = INPUT_FOLDER if input_folder is None else input_folder
    output_file = OUTPUT_FILE if output_file is None else output_file
    sample_size = SAMPLE_SIZE if sample_size is None else sample_size

    # Define your thresholds here
    # These names must match the 'sensor' column values in your CSVs
    THRESHOLDS = {
        'Landsat8': 5102547,
        'Landsat9': 4545984,
        'Sentinel2': 4891425,
        'DEFAULT': 100000000
    }

    # Lower-cased input column -> column name used in the exported CSV
    column_renames = {
        'row': 'Row',
        'date': 'Date',
        'scene_id': 'Scene_ID',
        'sensor': 'Sensor',
    }
    keep_cols = ['Row', 'Col', 'Date', 'Scene_ID', 'Sensor', 'total_pixels']

    # find all CSVs; sorted() because glob order is filesystem-dependent and
    # the seeded sample below depends on the row order of the merged table
    csv_files = sorted(glob.glob(os.path.join(input_folder, '*.csv')))
    print(f"Found {len(csv_files)} CSV files.")
    df_list = []

    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df.columns = [c.strip().lower() for c in df.columns]

            if 'total_pixels' not in df.columns or 'sensor' not in df.columns:
                continue

            # Non-numeric pixel counts become NaN and never pass the comparison below
            pixels = pd.to_numeric(df['total_pixels'], errors='coerce')

            # Vectorised per-row threshold lookup; unknown sensors get the default
            sensor_names = df['sensor'].astype(str).str.strip()
            thresholds = (
                sensor_names.map(THRESHOLDS)
                .fillna(THRESHOLDS['DEFAULT'])
                .astype(float)
            )
            valid_rows = df.assign(total_pixels=pixels).loc[pixels > thresholds]

            if valid_rows.empty:
                continue

            # Normalizing Column Names for GEE ('column' takes precedence over 'col')
            renames = dict(column_renames)
            if 'column' in valid_rows.columns:
                renames['column'] = 'Col'
            elif 'col' in valid_rows.columns:
                renames['col'] = 'Col'
            valid_rows = valid_rows.rename(columns=renames)

            final_cols = [c for c in keep_cols if c in valid_rows.columns]
            df_list.append(valid_rows[final_cols])

        except Exception as e:
            # Best effort: report the file and carry on with the others
            print(f"Error reading {file}: {e}")

    if not df_list:
        print("No valid data found.")
        return

    full_df = pd.concat(df_list, ignore_index=True)
    print(f"Total valid cells found: {len(full_df)}")

    # sampling and export
    if len(full_df) > sample_size:
        sampled_df = full_df.sample(n=sample_size, random_state=random_state)
    else:
        sampled_df = full_df
        print(f"Warning: Only found {len(full_df)} cells. Using all.")

    # Make sure the destination folder exists (no-op for a bare file name)
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    sampled_df.to_csv(output_file, index=False)
    print(f"Success! Saved {len(sampled_df)} samples to {output_file}")

In [26]:
# RUN FUNCTION
# Reads the CSVs in INPUT_FOLDER and writes the sampled rows to OUTPUT_FILE

create_gee_upload_csv()

Found 90 CSV files.
Total valid cells found: 176
Success! Saved 40 samples to check_samples_dynamic.csv


In [23]:
# Load and display the static sample list.
# NOTE(review): no visible cell writes 'check_samples_static.csv' — presumably it is
# produced elsewhere; confirm it exists before running this cell.
samples_test = pd.read_csv('check_samples_static.csv')
samples_test

Unnamed: 0,Row,Col,Date,Sensor
0,74,214,2024-06-12,Landsat8
1,98,293,2024-04-22,Landsat8
2,118,302,2024-04-23,Landsat8
3,205,204,2024-07-07,Landsat8
4,114,262,2024-04-30,Landsat8
...,...,...,...,...
95,96,247,2024-05-17,Landsat8
96,179,199,2024-04-21,Landsat8
97,129,259,2024-04-24,Landsat8
98,123,156,2024-03-21,Landsat8


In [None]:
# Keep the first 20 rows of the dynamic sample as a small test set and save it.
samples = pd.read_csv('check_samples_dynamic.csv')
samples_test = samples.head(20)
# index=False: do not write the row index as an extra unnamed column, so this file
# has the same columns as check_samples_dynamic.csv and the other exports in this notebook
samples_test.to_csv('test_samples_dynamic.csv', index=False)
samples_test

Unnamed: 0,Row,Col,Date,Sensor
0,126,269,2024-04-21,Landsat8
1,179,263,2024-07-16,Landsat8
2,161,358,2024-06-13,Landsat8
3,127,285,2024-07-08,Landsat8
4,140,180,2024-06-18,Landsat8
...,...,...,...,...
95,93,172,2024-06-22,Landsat8
96,97,285,2024-05-11,Landsat8
97,143,172,2024-06-17,Landsat8
98,64,232,2024-04-17,Landsat8
