This notebook presents the use of embeddings from the MajorTOM dataset. They were extracted from Sentinel-2 L1C satellite imagery. These embeddings can be used for tasks like classification, regression and change detection. The example demonstrates how embeddings facilitate efficient processing of large satellite datasets.

## Importing the required libraries

In [None]:
import os
import re
import io
from io import BytesIO
import boto3
import pyarrow.dataset as ds
import pyarrow.fs as fs
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering
import matplotlib.pyplot as plt
from matplotlib import cm, colors
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster
import branca.colormap as bcm
from staticmap import StaticMap, CircleMarker
from PIL import Image
from IPython.display import display
from collections import defaultdict

## Defining a bounding box and searching for matching grid cells in the parquet files

In this example, the data covers the area around Girona. The task is to classify the land into three categories: forests, highly urbanized areas, and low-urbanization areas/farmland.

## Steps:
## 1. Connecting to the S3 service
## 2. Defining AOI
## 3. Filtering data based on the AOI

In [None]:
# S3 client pointed at the Copernicus Data Space endpoint. Uncomment and fill
# in the two credential lines if your environment does not supply them.
s3_client = boto3.client(
    's3',
    endpoint_url='https://eodata.dataspace.copernicus.eu',
#    aws_access_key_id='YOUR KEY',
#    aws_secret_access_key='YOUR ACCESS',
    region_name='default'
)

bucket_name = "eodata"
prefix = "auxdata/MajorTOM/embeddings/Core-S2L1C-SSL4EO/"

# your bounding box (AOI) - Girona (example)
lon_min, lon_max =  2.78, 2.9
lat_min, lat_max = 41.9, 42

# list_objects_v2 returns the listing in pages, so keep requesting pages until
# the service reports that the listing is no longer truncated. A single call
# would silently miss every parquet file beyond the first page.
parquet_files = []
list_kwargs = {"Bucket": bucket_name, "Prefix": prefix}
while True:
    response = s3_client.list_objects_v2(**list_kwargs)
    parquet_files.extend(
        item['Key']
        for item in response.get('Contents', [])
        if item['Key'].endswith(".parquet")
    )
    if not response.get('IsTruncated'):
        break
    list_kwargs["ContinuationToken"] = response['NextContinuationToken']

# pyarrow filesystem for the same endpoint; used to scan the parquet files
# with a filter instead of downloading them in full.
s3_fs = fs.S3FileSystem(
#    access_key='YOUR KEY',
#    secret_key='YOUR ACCESS',
    endpoint_override='https://eodata.dataspace.copernicus.eu'
)

# The bounding-box predicate is the same for every file, so build it once.
aoi_filter = (
    (ds.field("centre_lon") >= lon_min) &
    (ds.field("centre_lon") <= lon_max) &
    (ds.field("centre_lat") >= lat_min) &
    (ds.field("centre_lat") <= lat_max)
)

results = []

for key in parquet_files:
    s3_path = f"{bucket_name}/{key}"
    dataset = ds.dataset(s3_path, filesystem=s3_fs, format="parquet")

    # Only the three columns needed to locate patches are read here; the
    # embeddings themselves are loaded in a later step.
    table = dataset.to_table(
        filter=aoi_filter,
        columns=["grid_cell", "centre_lon", "centre_lat"]
    )

    if table.num_rows > 0:
        df_tmp = table.to_pandas()
        # remember which parquet file each hit came from
        df_tmp["parquet_file"] = key
        results.append(df_tmp)


if results:
    df_result = pd.concat(results, ignore_index=True)
    df_result = df_result.drop_duplicates()
    print("Found data")

else:
    print("No grid cells were found in the bounding box")

## Get list of files

In [None]:
# Rebuild the de-duplicated hit table from the per-file results and print the
# name of every parquet file that contributed at least one grid cell.
if not results:
    print("No grid cells were found in the bounding box")
else:
    df_result = pd.concat(results, ignore_index=True).drop_duplicates()

    unique_parquets = df_result['parquet_file'].unique()
    print("List of files:")
    for f in unique_parquets:
        print(f)

## Loading filtered parquets
 
Steps:
1. Connecting to the S3 service
2. Loading filtered parquets

In [None]:
# Resource-style S3 handle used to download whole parquet objects.
s3 = boto3.resource(
    service_name='s3',
    region_name='default',
    endpoint_url='https://eodata.dataspace.copernicus.eu',
#    aws_access_key_id='YOUR KEY',
#    aws_secret_access_key='YOUR ACCESS',
)
all_dfs = []

for parquet_file in df_result['parquet_file'].unique():
    # Download the complete object into memory and let pyarrow parse the buffer.
    obj = s3.Object(bucket_name, parquet_file)
    payload = obj.get()['Body'].read()
    file_stream = io.BytesIO(payload)

    table = pq.read_table(file_stream)
    df_parquet = table.to_pandas()

    # Keep only the rows whose grid cell was matched for this particular file.
    from_this_file = df_result['parquet_file'] == parquet_file
    grid_cells = df_result.loc[from_this_file, 'grid_cell'].unique()
    wanted_rows = df_parquet['grid_cell'].isin(grid_cells)
    df_filtered = df_parquet[wanted_rows].copy()

    all_dfs.append(df_filtered)


df_embeddings = pd.concat(all_dfs, ignore_index=True)
for grid, group in df_embeddings.groupby('grid_cell'):
    # Order the patches of one grid cell by longitude (ascending), then
    # latitude (descending).
    group = group.sort_values(by=['centre_lon', 'centre_lat'], ascending=[True, False])
    embeds = np.stack(group['embedding'].values)
    idxs = group.index

    # Only grid cells with exactly 25 patches are rearranged; others are left as loaded.
    if embeds.shape[0] == 25:
        # View the 25 sorted embeddings as a 5x5 block, swap its two leading
        # axes, and write the result back against the sorted index order.
        # NOTE(review): presumably this converts between column-major and
        # row-major patch order within the cell - confirm against the dataset layout.
        mat = embeds.reshape(5, 5, -1)
        mat_T = np.swapaxes(mat, 0, 1)

        df_embeddings.loc[idxs, 'embedding'] = pd.Series(
            [vec for vec in mat_T.reshape(25, -1)], index=idxs
        )
print(f"Loaded filtered parquets")

## Analytical module

This example shows how areas can be classified by the similarity of their embeddings.

Steps:

1. Collecting all embeddings into the matrix 

2. Computing cosine similarity between embeddings

3. Using the SpectralClustering method to split the points into n groups based on how similar they are

4. Saving the results in the new column

5. Reducing the embeddings to two dimensions with PCA

6. Visualisation of the results


In [None]:
# Stack the per-patch embeddings into one (n_patches, embedding_dim) matrix.
X = np.stack(df_embeddings['embedding'].values)

# Pairwise cosine similarity is used as the precomputed affinity; negative
# similarities are clipped to zero so the affinity matrix is non-negative.
cos_mat = cosine_similarity(X)
cos_mat[cos_mat < 0] = 0


n_clusters = 3
spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)
labels = spectral.fit_predict(cos_mat)
df_embeddings['cluster'] = labels

# plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and has
# been removed from newer Matplotlib releases; it accepts the same
# (name, lut) arguments.
cmap = plt.get_cmap('tab20', n_clusters)
label_to_color = {lbl: colors.rgb2hex(cmap(lbl)) for lbl in range(n_clusters)}
df_embeddings['color'] = df_embeddings['cluster'].map(label_to_color)

# 2-D projection of the raw embeddings, used for plotting only (the
# clustering above works on the cosine-similarity matrix).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=df_embeddings['color'], alpha=0.7, s=20)
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('PCA - Spectral Clustering')
plt.grid(True)
plt.show()

def plot_clusters_staticmap(df):
    """Render every patch as a coloured circle on a static OpenStreetMap image.

    Reads the ``centre_lon``, ``centre_lat`` and ``color`` columns of *df*,
    adds one circle marker per row and opens the rendered image with
    ``Image.show()``. Nothing is returned.
    """
    static_map = StaticMap(1800, 800, url_template='http://a.tile.openstreetmap.org/{z}/{x}/{y}.png')

    # markers take (lon, lat) coordinates
    marker_data = zip(df['centre_lon'], df['centre_lat'], df['color'])
    for lon, lat, fill in marker_data:
        static_map.add_marker(CircleMarker((lon, lat), fill, 12))

    rendered = static_map.render(zoom=10)
    rendered.show()


plot_clusters_staticmap(df_embeddings)

In [None]:
# Interactive map centred on the mean patch position.
map_centre = [df_embeddings['centre_lat'].mean(), df_embeddings['centre_lon'].mean()]
m_spectral = folium.Map(location=map_centre, zoom_start=10, tiles='CartoDB positron')

# One filled, outline-free circle per patch, coloured by its cluster; the
# tooltip shows the cluster label.
patch_columns = df_embeddings[['centre_lat', 'centre_lon', 'color', 'cluster']]
for lat, lon, colour, cluster_id in patch_columns.itertuples(index=False, name=None):
    circle = folium.Circle(
        location=[lat, lon],
        radius=500,
        color=colour,
        fill=True,
        fill_color=colour,
        fill_opacity=0.7,
        weight=0,
        tooltip=f"Cluster: {cluster_id}"
    )
    circle.add_to(m_spectral)

display(m_spectral)

## Presentation of random images from different clusters. The images come from MajorTOM resources.

Steps:

1. Finding the correct S3 folder for each parquet file and loading the available thumbnails

2. Picking a few random examples from each cluster and cropping the images to the specified area

3. Showing the images with their coordinates

In [None]:
# Low-level S3 client for the Copernicus Data Space endpoint; the helper
# functions in this cell read it (and bucket_name) as module-level globals.
# Uncomment and fill in the credential lines if they are needed.
s3_client = boto3.client(
    service_name="s3",
    region_name="default",
    endpoint_url="https://eodata.dataspace.copernicus.eu",
#    aws_access_key_id='YOUR KEY',
#    aws_secret_access_key='YOUR ACCESS',
)
bucket_name = "eodata"

def resolve_core_folder(parquet_path: str) -> str:
    """Return the Core-S2L1C folder whose part range covers *parquet_path*.

    The parquet file name is expected to look like ``part_<start>-<end>``.
    The sub-folders of ``auxdata/MajorTOM/Core-S2L1C/`` are listed and the
    first one named ``part_<left>-<right>`` with ``left <= start <= right``
    is returned, with a trailing slash.

    Args:
        parquet_path: S3 key (or path) of an embeddings parquet file.

    Returns:
        The matching folder prefix, ending in ``/``.

    Raises:
        ValueError: if the file name does not match ``part_<start>-<end>``,
            or if no folder covers ``start``.
    """
    part_pattern = re.compile(r"part_(\d+)-(\d+)")

    parquet_name = os.path.basename(parquet_path).replace(".parquet", "")
    m = part_pattern.match(parquet_name)
    if not m:
        raise ValueError(f"Error - parquet name: {parquet_name}")
    start = int(m.group(1))

    list_kwargs = {
        "Bucket": bucket_name,
        "Prefix": "auxdata/MajorTOM/Core-S2L1C/",
        "Delimiter": "/",
    }
    # The listing is paged; follow continuation tokens so folders beyond the
    # first response page are also considered.
    while True:
        resp = s3_client.list_objects_v2(**list_kwargs)
        for cp in resp.get("CommonPrefixes", []):
            folder = cp["Prefix"].rstrip("/")
            m2 = part_pattern.match(os.path.basename(folder))
            if not m2:
                continue
            left, right = int(m2.group(1)), int(m2.group(2))
            if left <= start <= right:
                return folder + "/"
        if not resp.get("IsTruncated"):
            break
        list_kwargs["ContinuationToken"] = resp["NextContinuationToken"]
    raise ValueError(f"Not found: {parquet_path}")

def load_thumbnail_from_s3(grid_cell, core_folders):
    """Return the thumbnail image of *grid_cell*, or ``None`` if unavailable.

    Each folder in *core_folders* is tried in order with the key
    ``<folder><grid_cell>/thumbnail.png``; the first object that can be
    downloaded and opened as an image is returned.

    Args:
        grid_cell: grid-cell identifier used as the sub-folder name.
        core_folders: iterable of folder prefixes (each ending in ``/``).

    Returns:
        A PIL image, or ``None`` when no folder yields a readable thumbnail.
    """
    for folder in core_folders:
        key = f"{folder}{grid_cell}/thumbnail.png"
        try:
            obj = s3_client.get_object(Bucket=bucket_name, Key=key)
            img_bytes = obj['Body'].read()
            return Image.open(BytesIO(img_bytes))
        except Exception:
            # Best-effort lookup: a missing key or unreadable image just means
            # "try the next folder". Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
    return None

def show_examples_per_cluster(df, unique_parquets, n_examples=3):
    """Plot a few randomly sampled patch images for every cluster.

    Args:
        df: frame with ``cluster``, ``grid_cell``, ``pixel_bbox``,
            ``centre_lat`` and ``centre_lon`` columns.
        unique_parquets: parquet keys used to resolve the Core-S2L1C folders
            that hold the thumbnails.
        n_examples: maximum number of patches drawn per cluster.

    Returns:
        None. One matplotlib figure is shown per cluster; if no folder can be
        resolved a message is printed and nothing is plotted.
    """
    # Resolve the image folder for each parquet file; failures are printed
    # and skipped.
    core_folders = []
    for parquet_key in unique_parquets:
        try:
            resolved = resolve_core_folder(parquet_key)
        except Exception as e:
            print(e)
        else:
            core_folders.append(resolved)
    if len(core_folders) == 0:
        print("Not found folder Core-S2L1C.")
        return

    for cl in df['cluster'].unique():
        members = df[df['cluster'] == cl]
        # never ask for more rows than the cluster has
        subset = members.sample(n=min(n_examples, len(members)), replace=False)

        plt.figure(figsize=(n_examples * 3, 3))
        plt.suptitle(f"Cluster {cl}", y=1.05, fontsize=14)

        for slot, (_, row) in enumerate(subset.iterrows(), start=1):
            img = load_thumbnail_from_s3(row['grid_cell'], core_folders)
            if img is None:
                print(f"No image for grid_cell: {row['grid_cell']}")
                continue

            # cut the patch out of the grid-cell thumbnail
            img_cropped = img.crop(row['pixel_bbox'])

            ax = plt.subplot(1, n_examples, slot)
            ax.imshow(img_cropped)
            caption = f"lat={row['centre_lat']:.5f}\nlon={row['centre_lon']:.5f}"
            ax.set_title(caption, fontsize=9)
            ax.axis('off')

        plt.tight_layout()
        plt.show()


# Show up to three randomly sampled patches for each cluster.
show_examples_per_cluster(df_embeddings, unique_parquets, n_examples=3)


In [None]:
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np

def com(df, unique_parquets, cols=6, alpha=0.15):
    """Draw each grid-cell thumbnail with its patches shaded by cluster.

    For every unique ``grid_cell`` in *df* the thumbnail is loaded, and each
    row's ``pixel_bbox`` (unpacked as ``x1, y1, x2, y2``) is overlaid as a
    translucent rectangle coloured by the row's ``cluster``.

    Args:
        df: frame with ``grid_cell``, ``cluster`` and ``pixel_bbox`` columns.
        unique_parquets: parquet keys used to resolve the Core-S2L1C folders
            that hold the thumbnails.
        cols: number of thumbnails per row of the figure.
        alpha: opacity of the cluster rectangles.

    Returns:
        None. A single matplotlib figure is shown; if no folder can be
        resolved or no thumbnail can be loaded, a message is printed instead.
    """
    # Resolve the image folder for each parquet file; failures are printed
    # and skipped. (The loop variable is deliberately not called ``pq`` so it
    # does not shadow the pyarrow.parquet alias.)
    core_folders = []
    for parquet_key in unique_parquets:
        try:
            core_folders.append(resolve_core_folder(parquet_key))
        except Exception as e:
            print(e)
    if not core_folders:
        print("Not found folder Core-S2L1C.")
        return

    grid_cells = df['grid_cell'].unique()
    print(f"Found {len(grid_cells)} unique grid_cells.")

    clusters = sorted(df['cluster'].unique())
    # plt.get_cmap replaces plt.cm.get_cmap, which is deprecated and has been
    # removed from newer Matplotlib releases.
    cmap = plt.get_cmap("tab20", len(clusters))
    cluster_colors = {cl: cmap(i) for i, cl in enumerate(clusters)}

    thumbs = []
    for gc in grid_cells:
        img = load_thumbnail_from_s3(gc, core_folders)
        if img is None:
            print(f"No thumbnail for {gc}")
            continue
        thumbs.append((gc, img))

    # Without any thumbnail the grid would have zero rows; skip plotting
    # rather than creating a zero-height figure.
    if not thumbs:
        print("No thumbnails could be loaded.")
        return

    rows = (len(thumbs) + cols - 1) // cols  # ceiling division
    plt.figure(figsize=(cols * 4, rows * 4))

    for i, (gc, img) in enumerate(thumbs):
        ax = plt.subplot(rows, cols, i + 1)
        ax.imshow(img)

        rows_for_gc = df[df['grid_cell'] == gc]

        for _, row in rows_for_gc.iterrows():
            cluster = row['cluster']
            x1, y1, x2, y2 = row['pixel_bbox']

            rect = patches.Rectangle(
                (x1, y1),
                x2 - x1,
                y2 - y1,
                linewidth=1.5,
                edgecolor=cluster_colors[cluster],
                facecolor=cluster_colors[cluster],
                alpha=alpha
            )
            ax.add_patch(rect)

        ax.set_title(gc, fontsize=9)
        ax.axis('off')

    plt.tight_layout()
    plt.show()
# Overlay the cluster assignment of every patch on its grid-cell thumbnail.
com(df_embeddings, unique_parquets)


## Finding the most similar patches in clusters

Steps:

1. Storing indices, cosine similarity, and location/grid cell info for pairs in the same cluster

2. Selecting the pairs with highest similarity per cluster

3. Listing the top pairs

In [None]:
cluster_pairs = defaultdict(list)
N = len(df_embeddings)

# Pull the needed columns out once as plain Python lists. Indexing the frame
# with .iloc inside the O(N^2) pair loop would build a new row object for
# every access, which dominates the running time.
lats = df_embeddings['centre_lat'].tolist()
lons = df_embeddings['centre_lon'].tolist()
cluster_ids = df_embeddings['cluster'].tolist()
grid_cell_ids = df_embeddings['grid_cell'].tolist()

for i in range(N):
    lat_i, lon_i = lats[i], lons[i]
    cluster_i = cluster_ids[i]
    grid_cell_i = grid_cell_ids[i]

    for j in range(i+1, N):
        lat_j, lon_j = lats[j], lons[j]

        # rows at the very same location are not considered a pair
        if lat_i == lat_j and lon_i == lon_j:
            continue

        cluster_j = cluster_ids[j]

        # only pairs inside one cluster are of interest
        if cluster_i != cluster_j:
            continue

        grid_cell_j = grid_cell_ids[j]

        # (row position i, row position j, similarity, patch i info, patch j info)
        cluster_pairs[cluster_i].append((
            i,
            j,
            cos_mat[i, j],
            (lat_i, lon_i, grid_cell_i),
            (lat_j, lon_j, grid_cell_j)
        ))


# Keep the five most similar pairs of every cluster.
top_pairs_per_cluster = {}
for cluster, pairs in cluster_pairs.items():
    if pairs:
        pairs_sorted = sorted(pairs, key=lambda x: x[2], reverse=True)[:5]
        top_pairs_per_cluster[cluster] = pairs_sorted


for cluster, pairs in top_pairs_per_cluster.items():
    print(f"CLUSTER {cluster}")
    for idx, (i, j, score, cell_i, cell_j) in enumerate(pairs, 1):
        print(f"  Pair {idx}: {i} and {j} | Cosine similarity: {score:.4f}")
        print(f"    Patch 1: lat={cell_i[0]:.6f}, lon={cell_i[1]:.6f}, grid_cell={cell_i[2]}")
        print(f"    Patch 2: lat={cell_j[0]:.6f}, lon={cell_j[1]:.6f}, grid_cell={cell_j[2]}")
    print()

## Images of the most similar patches in clusters

Steps:

1. Identifying the correct S3 folder for each parquet file and loading thumbnails for the grid cells

2. Selecting the top pairs per cluster with the highest cosine similarity

3. Cropping the images to the defined area and displaying them 

In [None]:
# Low-level S3 client for the Copernicus Data Space endpoint, set up again in
# this cell with the same settings as before; the helpers below read it (and
# bucket_name) as module-level globals.
s3_client = boto3.client(
    service_name="s3",
    region_name="default",
    endpoint_url="https://eodata.dataspace.copernicus.eu",
#    aws_access_key_id='YOUR KEY',
#    aws_secret_access_key='YOUR ACCESS',
)
bucket_name = "eodata"

def resolve_core_folder(parquet_path: str) -> str:
    """Return the Core-S2L1C folder whose part range covers *parquet_path*.

    The parquet file name is expected to look like ``part_<start>-<end>``.
    The sub-folders of ``auxdata/MajorTOM/Core-S2L1C/`` are listed and the
    first one named ``part_<left>-<right>`` with ``left <= start <= right``
    is returned, with a trailing slash.

    Args:
        parquet_path: S3 key (or path) of an embeddings parquet file.

    Returns:
        The matching folder prefix, ending in ``/``.

    Raises:
        ValueError: if the file name does not match ``part_<start>-<end>``,
            or if no folder covers ``start``.
    """
    part_pattern = re.compile(r"part_(\d+)-(\d+)")

    parquet_name = os.path.basename(parquet_path).replace(".parquet", "")
    m = part_pattern.match(parquet_name)
    if not m:
        # Name the offending file; the previous message had no placeholder
        # and gave no hint about which parquet was rejected.
        raise ValueError(f"Error (name of parquet): {parquet_name}")
    start = int(m.group(1))

    list_kwargs = {
        "Bucket": bucket_name,
        "Prefix": "auxdata/MajorTOM/Core-S2L1C/",
        "Delimiter": "/",
    }
    # The listing is paged; follow continuation tokens so folders beyond the
    # first response page are also considered.
    while True:
        resp = s3_client.list_objects_v2(**list_kwargs)
        for cp in resp.get("CommonPrefixes", []):
            folder = cp["Prefix"].rstrip("/")
            m2 = part_pattern.match(os.path.basename(folder))
            if not m2:
                continue
            left, right = int(m2.group(1)), int(m2.group(2))
            if left <= start <= right:
                return folder + "/"
        if not resp.get("IsTruncated"):
            break
        list_kwargs["ContinuationToken"] = resp["NextContinuationToken"]
    raise ValueError(f"Not found file for {parquet_path}")

def load_thumbnail_from_s3(grid_cell, core_folders):
    """Return the thumbnail image of *grid_cell*, or ``None`` if unavailable.

    Each folder in *core_folders* is tried in order with the key
    ``<folder><grid_cell>/thumbnail.png``; the first object that can be
    downloaded and opened as an image is returned.

    Args:
        grid_cell: grid-cell identifier used as the sub-folder name.
        core_folders: iterable of folder prefixes (each ending in ``/``).

    Returns:
        A PIL image, or ``None`` when no folder yields a readable thumbnail.
    """
    for folder in core_folders:
        key = f"{folder}{grid_cell}/thumbnail.png"
        try:
            obj = s3_client.get_object(Bucket=bucket_name, Key=key)
            img_bytes = obj['Body'].read()
            return Image.open(BytesIO(img_bytes))
        except Exception:
            # Best-effort lookup: a missing key or unreadable image just means
            # "try the next folder". Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
    return None

def show_top_pairs_per_cluster(df, top_pairs_per_cluster, unique_parquets, top_n=4):
    """Show the most similar patch pairs of every cluster side by side.

    Args:
        df: frame with ``centre_lat``, ``centre_lon``, ``grid_cell`` and
            ``pixel_bbox`` columns.
        top_pairs_per_cluster: mapping cluster -> list of
            ``(i, j, score, cell_i, cell_j)`` tuples, where each cell is
            ``(lat, lon, grid_cell)``.
        unique_parquets: parquet keys used to resolve the Core-S2L1C folders
            that hold the thumbnails.
        top_n: number of leading pairs shown per cluster.

    Returns:
        None. One two-panel figure is shown per pair; pairs with a missing
        thumbnail are reported and skipped.
    """
    # Resolve the image folder for each parquet file; failures are printed
    # and skipped.
    core_folders = []
    for parquet_key in unique_parquets:
        try:
            resolved = resolve_core_folder(parquet_key)
        except Exception as e:
            print(e)
        else:
            core_folders.append(resolved)
    if len(core_folders) == 0:
        print("Not found file")
        return

    def first_row_at(lat, lon):
        # rows are looked up by exact coordinate match; the first hit wins
        return df[(df['centre_lat']==lat) & (df['centre_lon']==lon)].iloc[0]

    for cluster, pairs in top_pairs_per_cluster.items():
        for idx, (_, _, score, cell_i, cell_j) in enumerate(pairs[:top_n], 1):
            row_i = first_row_at(cell_i[0], cell_i[1])
            row_j = first_row_at(cell_j[0], cell_j[1])

            bbox_i, bbox_j = row_i['pixel_bbox'], row_j['pixel_bbox']

            img_i = load_thumbnail_from_s3(row_i['grid_cell'], core_folders)
            img_j = load_thumbnail_from_s3(row_j['grid_cell'], core_folders)
            if img_i is None or img_j is None:
                print(f"No images for {cluster}, pair {idx}")
                continue

            # cut both patches out of their grid-cell thumbnails
            img_i_cropped = img_i.crop(bbox_i)
            img_j_cropped = img_j.crop(bbox_j)

            fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(8,4))
            fig.suptitle(f"CLUSTER {cluster} - pair {idx}", fontsize=12)

            left_ax.imshow(img_i_cropped)
            left_ax.set_title(f"Patch 1:\nlat={cell_i[0]:.6f}\nlon={cell_i[1]:.6f}", fontsize=10)
            left_ax.axis('off')

            right_ax.imshow(img_j_cropped)
            right_ax.set_title(f"Patch 2:\nlat={cell_j[0]:.6f}\nlon={cell_j[1]:.6f}", fontsize=10)
            right_ax.axis('off')

            plt.tight_layout()
            # leave room for the title above and the similarity score below
            fig.subplots_adjust(top=0.85, bottom=0.15)
            fig.text(0.5, 0.05, f"Cosine similarity: {score:.4f}", ha='center', fontsize=10)
            plt.show()



# Display the two most similar patch pairs of every cluster.
show_top_pairs_per_cluster(df_embeddings, top_pairs_per_cluster, unique_parquets, top_n=2)