# SWISSIMAGE loader

1) [Production of extents](#production-of-extents) \
    1.1) [Loading and data visualization](#loading-and-data-visualization) \
    1.2) [Add flight_year to points](#add-flight_year-to-points) \
    1.3) [Add flight_year to merged polygons](#add-flight_year-to-merged-polygons) 
2) [Processing of extents](#processing-of-extents) \
    2.1) [Binarization of masks](#binarization-of-masks) \
    2.2) [Splitting images into 512x512 subset](#splitting-images-into-512x512-subset) \
    2.3) [Remove tiles that don't contain landslides](#remove-tiles-that-dont-contain-landslides) \
    2.4) [Sort into subfolders](#sort-into-subfolders)
3) [Find mask with different years](#find-mask-with-different-years)

## Dependencies

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from copy import deepcopy

## Production of extents

### Loading and data visualization

In [2]:
# Input datasets for the "Production of extents" stage (absolute paths on the author's machine).
# Landslide polygon shapefile; not read by any cell visible in this notebook.
src_polygons = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\BD glissements spontanés\BE_SL_all_aggreg_PQ.shp"
# Landslide point layer; loaded as `points_intersect`.
src_points_intersect = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\points_landslide_bern_intersect_polygons.gpkg"
# Grid layer whose cells carry a comma-separated `flight_year` attribute; loaded as `grid`.
src_km_grid = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\swissIMAGE_HIST.gpkg"
# Merged landslide polygons; loaded as `merged_polygons`.
src_landslides_merged = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\landslides_anriss_auslauf_merged.gpkg"

In [3]:
# Read the landslide points and report which event years they cover.
points_intersect = gpd.read_file(src_points_intersect)
years_present = np.sort(points_intersect.year.unique())
print("Number of years: ", len(years_present))
print(years_present)
# Placeholder column of zeros; the real values are written by the flight-year assignment step.
points_intersect['flight_year'] = np.zeros(len(points_intersect))
print(points_intersect.columns)

Number of years:  24
[1962 1974 1980 1985 1992 1993 1998 1999 2000 2004 2005 2006 2007 2008
 2009 2010 2012 2014 2015 2016 2017 2018 2021 2022]
Index(['RS_ID', 'year', 'outline', 'source', 'ID_alt', 'Anrissmäc', 'Volumen',
       'ID', 'Jahr', 'Datum', 'SHAPE_Leng', 'SHAPE_Area', 'geometry',
       'flight_year'],
      dtype='object')


In [4]:
# Read the grid layer, preview its first rows, then show the flight_year attribute on its own.
grid = gpd.read_file(src_km_grid)
grid_preview = grid.head()
print(grid_preview)
print(grid["flight_year"])


                                         flight_year resolution_of_origin  \
0  1926,1943,1946,1954,1969,1975,1976,1981,1987,1...        50,100,,25,10   
1  1926,1943,1946,1954,1955,1956,1969,1975,1976,1...        50,100,,25,10   
2  1926,1943,1946,1955,1956,1969,1975,1976,1981,1...        50,100,,25,10   
3  1926,1931,1946,1951,1962,1965,1969,1971,1975,1...        50,100,,25,10   
4  1926,1943,1946,1955,1956,1969,1975,1976,1981,1...        50,100,,25,10   

      id     left      top    right   bottom  \
0  45727  2670000  1206000  2671000  1205000   
1  46200  2672000  1201000  2673000  1200000   
2  46201  2672000  1200000  2673000  1199000   
3  45024  2667000  1207000  2668000  1206000   
4  46202  2672000  1199000  2673000  1198000   

                                            geometry  
0  MULTIPOLYGON (((2670601.819 1206000, 2671000 1...  
1  MULTIPOLYGON (((2672000 1201000, 2673000 12010...  
2  MULTIPOLYGON (((2672000 1199083.191, 2672000 1...  
3  MULTIPOLYGON (((2667000 1

### Add flight_year to points

In [7]:
# For every landslide point, find the single grid cell containing it and pick the earliest
# flight year of that cell that is not older than the point's `Jahr`. Points left at 0 either
# matched no (or several) grid cells or had no usable flight year.
lst_points_too_recent = []
lst_flight_years = np.zeros(len(points_intersect), dtype=int)
# Write results by row position rather than by index label, so the loop stays correct even
# when `points_intersect` no longer has a 0..n-1 index (e.g. after it has been filtered).
for position, point in enumerate(tqdm(points_intersect.itertuples(), total=len(points_intersect))):
    intersection = grid[grid.geometry.contains(point.geometry)]

    if len(intersection) != 1:
        print("Problem (no matching polygon) with : ", point)
        continue

    try:
        candidate_years = [
            int(x) for x in intersection.flight_year.values[0].split(',') if int(x) >= point.Jahr
        ]
    except (AttributeError, TypeError, ValueError):
        # Missing or malformed flight_year / Jahr values: treated like "no usable year", as before,
        # but without a bare `except` that would also hide KeyboardInterrupt or coding errors.
        candidate_years = []

    if not candidate_years:
        lst_points_too_recent.append(point)
        continue

    lst_flight_years[position] = min(candidate_years)

points_intersect['flight_year'] = lst_flight_years

100%|██████████| 556/556 [00:03<00:00, 176.64it/s]


In [8]:
# Entries still equal to 0 were never assigned a flight year by the assignment loop.
num_unassigned = np.sum(lst_flight_years == 0)
print("Number of points with year too high for the grid: ", num_unassigned)
distinct_flight_years = set(points_intersect.flight_year.values)
print("Different flight years: ", distinct_flight_years)

Number of points with year too high for the grid:  15
Different flight years:  {np.int64(0), np.int64(1933), np.int64(1934), np.int64(1936), np.int64(1939), np.int64(1940), np.int64(1944), np.int64(1945), np.int64(1946), np.int64(1999), np.int64(2000), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2012), np.int64(2015), np.int64(2016), np.int64(2021)}


In [9]:
# Save the points beside the input file, with "_w_flight_year" appended to its name.
points_out_path = src_points_intersect.partition('.gpkg')[0] + '_w_flight_year.gpkg'
points_intersect.to_file(points_out_path, driver="GPKG")

### Add flight_year to merged polygons

In [10]:
merged_polygons = gpd.read_file(src_landslides_merged)
# NOTE(review): a reload of the saved "_w_flight_year.gpkg" points file was fused into a
# commented-out debug print here, so it never ran; the in-memory `points_intersect` is used as-is.
# Keep only points whose flight year is 1998 or later (unassigned points, stored as 0, drop out too).
points_intersect = points_intersect[points_intersect.flight_year >= 1998]
# Materialise the point rows once instead of rebuilding the iterator for every polygon.
candidate_points = list(points_intersect.itertuples())
flight_year = np.zeros(len(merged_polygons), dtype=int)
list_polygons_too_old = []
for polygon in tqdm(merged_polygons.itertuples(), total=len(merged_polygons)):
    # Flight years of all retained points lying inside this polygon.
    flight_years = [p.flight_year for p in candidate_points if polygon.geometry.contains(p.geometry)]
    distinct_years = set(flight_years)
    if len(distinct_years) == 1:
        flight_year[polygon.Index] = flight_years[0]
    elif len(distinct_years) > 1:
        # Points disagree: keep the most frequent year (argmax returns the smallest year on a tie).
        flight_year[polygon.Index] = np.bincount(flight_years).argmax()
        print(flight_years)
        print(flight_year[polygon.Index])
    else:
        # No retained point falls inside this polygon; its flight_year stays 0.
        list_polygons_too_old.append(polygon)

  0%|          | 0/498 [00:00<?, ?it/s]

 58%|█████▊    | 289/498 [00:01<00:01, 155.53it/s]

[2021, 2007, 2007]
2007


100%|██████████| 498/498 [00:03<00:00, 150.96it/s]


In [11]:
# Polygons collected in the loop's fallback branch, i.e. those containing none of the retained points.
print("Number of polygons too old: ", len(list_polygons_too_old))

Number of polygons too old:  140


In [15]:
# Attach the per-polygon flight year and save a "_w_flight_year" copy beside the input file.
merged_polygons['flight_year'] = flight_year
polygons_out_path = src_landslides_merged.partition('.gpkg')[0] + "_w_flight_year.gpkg"
merged_polygons.to_file(polygons_out_path, driver="GPKG")

In [14]:
# Sanity check: distinct flight years stored on the polygons (0 is the initial "nothing assigned" value)
# followed by the total number of polygons.
print("Different years: ", sorted(set(merged_polygons.flight_year)))
print(len(merged_polygons))

Different years:  [0, 1999, 2000, 2004, 2005, 2006, 2007, 2012, 2015, 2016, 2021]
498


## Processing of extents

In [43]:
# Folder holding the exported extents (images and masks) to process.
src_data = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\final_extent"
# Folder receiving the tiles; this is also the default output of tile_image_overlap for files in src_data.
src_tiles = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\final_extent\tiles"
# Destination for tiles whose binary mask is entirely 0.
src_no_landslide = os.path.join(src_tiles, "no_landslide")
# Final sub-folders. Mind the naming: colour masks go to 'labels', binary masks go to 'masks'.
src_images = os.path.join(src_tiles, 'images')
src_masks = os.path.join(src_tiles, 'labels')
src_masks_bin = os.path.join(src_tiles, 'masks')

### Binarization of masks

In [44]:

# Raster files (png/jpg/jpeg) found directly in src_data, skipping anything named '*mask_bin*'.
image_extensions = ('png', 'jpg', 'jpeg')
list_images_to_process = [
    os.path.join(src_data, name)
    for name in os.listdir(src_data)
    if name.split('.')[-1].lower() in image_extensions and 'mask_bin' not in name
]
# Decide on the file name only, so a 'mask' substring in a parent directory cannot flag every file as a mask.
list_masks = [path for path in list_images_to_process if 'mask' in os.path.basename(path)]

In [45]:
# Turn every colour mask into a single-band 0/1 mask, saved beside it as a .tif whose file name
# has 'mask' replaced by 'bin'. Each result is queued in list_images_to_process for tiling.
for mask in tqdm(list_masks, total=len(list_masks)):
    # The context manager releases the file handle once the pixels are copied into the array.
    with Image.open(mask) as img:
        img_arr = np.array(img)
    # A pixel is foreground when any of its first three channels is non-zero (further channels are ignored).
    img_bin = (img_arr[..., 0:3] != 0).any(axis=2)
    result = Image.fromarray(img_bin.astype(np.uint8))
    # Rename on the file name only: dots or 'mask' in a directory name must not alter the target path.
    mask_dir, mask_name = os.path.split(mask)
    result_src = os.path.join(mask_dir, os.path.splitext(mask_name.replace("mask", "bin"))[0] + '.tif')
    result.save(result_src)
    # Avoid queuing the same file twice when this cell is re-run.
    if result_src not in list_images_to_process:
        list_images_to_process.append(result_src)
    

100%|██████████| 449/449 [00:24<00:00, 18.47it/s]


### Splitting images into 512x512 subset

In [56]:
def _tile_origins(length, tile_size, stride):
    """Return the start offsets of the tiles along one axis of size `length`."""
    if length <= 0:
        return []
    last = length - tile_size
    if last <= 0:
        # Axis not larger than a tile: one tile, anchored so that it ends on the border
        # (a negative start means the crop box extends past the image's top/left edge).
        return [last]
    origins = list(range(0, last + 1, stride))
    if origins[-1] != last:
        # Final tile is shifted back so it ends exactly on the border.
        origins.append(last)
    return origins


def tile_image_overlap(in_path, tile_size=512, overlap=0, save_dir=None, verbose=False):
    """Cut an image into square tiles and save each one as "<image name>_<tile id>.tif".

    Tiles are produced row by row (top to bottom, left to right within a row) and numbered
    from 0. The last tile of a row/column is shifted back so that it ends on the image border.

    Args:
        in_path: path of the image to tile.
        tile_size: side length of the tiles, in pixels.
        overlap: number of pixels shared by consecutive tiles; 0 gives edge-to-edge tiles.
        save_dir: output folder; defaults to a 'tiles' folder beside `in_path`. Created if missing.
        verbose: when True, print the number of tiles written.

    Returns:
        The number of tiles written.

    Raises:
        ValueError: if `overlap` is negative or not smaller than `tile_size`.
    """
    if not 0 <= overlap < tile_size:
        raise ValueError(f"overlap must be in [0, tile_size), got {overlap} for tile_size={tile_size}")

    if save_dir is None:
        save_dir = os.path.join(os.path.dirname(in_path), 'tiles')
    os.makedirs(save_dir, exist_ok=True)

    stride = tile_size - overlap
    base_name = os.path.splitext(os.path.basename(in_path))[0]

    tile_id = 0
    # Context manager so the source file is closed even if saving a tile fails.
    with Image.open(in_path) as img:
        w, h = img.size
        for y in _tile_origins(h, tile_size, stride):
            for x in _tile_origins(w, tile_size, stride):
                tile = img.crop((x, y, x + tile_size, y + tile_size))
                tile.save(os.path.join(save_dir, f"{base_name}_{tile_id}.tif"))
                tile_id += 1
    if verbose:
        print(f"Saved {tile_id} overlapping tiles.")
    return tile_id

In [57]:
# Tile every queued file and tally how many tiles were written in total.
num_samples = 0
for image_path in tqdm(list_images_to_process, total=len(list_images_to_process)):
    num_samples = num_samples + tile_image_overlap(image_path)

print("Number of created tiles: ", num_samples)

100%|██████████| 1347/1347 [00:52<00:00, 25.82it/s] 

Number of created tiles:  14055





### Remove tiles that don't contain landslides

In [58]:
# Split the entries of the tile folder by name: colour-mask tiles vs. binary-mask tiles.
tile_names = os.listdir(src_tiles)
list_masks = [
    os.path.join(src_tiles, name)
    for name in tile_names
    if 'mask' in name and 'bin' not in name
]
list_masks_bin = [os.path.join(src_tiles, name) for name in tile_names if 'bin' in name]

In [59]:

# Compare the number of colour-mask tiles with the number of binary-mask tiles.
# NOTE(review): the two counts are presumably expected to match; the check below is left disabled.
print("Number of masks: ", len(list_masks))
print("Number of masks bin: ", len(list_masks_bin))
# assert len(list_masks) == len(list_masks_bin)

Number of masks:  4685
Number of masks bin:  4685


In [60]:
# A mask tile is orphaned when the same path with 'mask' swapped for 'image' does not exist.
lst_no_correspondance = [
    mask_path for mask_path in list_masks
    if not os.path.exists(mask_path.replace('mask', 'image'))
]
if lst_no_correspondance:
    print("The following masks don't have corresponding image:")
    for orphan in lst_no_correspondance:
        print(f"\t{orphan}")

In [None]:
# Move every tile triplet (binary mask, colour mask, image) whose binary mask is entirely 0
# into the "no landslide" folder.
os.makedirs(src_no_landslide, exist_ok=True)

for mask_bin in tqdm(list_masks_bin, total=len(list_masks_bin)):
    # np.unique finds the distinct pixel values in compiled code instead of hashing every pixel in
    # Python; the `with` block closes the file before it is renamed.
    with Image.open(mask_bin) as img:
        pixel_values = set(np.unique(np.array(img)).tolist())
    assert pixel_values in [{0}, {1}, {0, 1}]
    if pixel_values == {0}:
        # Derive the sibling names from the file name only, so that 'bin'/'mask' substrings
        # in a parent directory are left alone.
        tile_dir, bin_name = os.path.split(mask_bin)
        mask_name = bin_name.replace('bin', 'mask')
        image_name = mask_name.replace('mask', 'image')
        for tile_name in (bin_name, mask_name, image_name):
            os.rename(os.path.join(tile_dir, tile_name), os.path.join(src_no_landslide, tile_name))


100%|██████████| 4685/4685 [02:33<00:00, 30.53it/s]


### Sort into subfolders

In [63]:
# Create the three destination folders (images, colour masks, binary masks) if they are missing.
for category_dir in (src_images, src_masks, src_masks_bin):
    os.makedirs(category_dir, exist_ok=True)

In [64]:
# Every regular file still sitting directly in the tile folder.
list_tiles = [os.path.join(src_tiles, x) for x in os.listdir(src_tiles) if not os.path.isdir(os.path.join(src_tiles,x))]

print(len(list_tiles))

# Keyword -> destination, checked in this order; the matching "<keyword>_" is stripped from the name.
tile_categories = (('image', src_images), ('mask', src_masks), ('bin', src_masks_bin))
for tile_src in tqdm(list_tiles, total=len(list_tiles)):
    # Classify on the file name only: a keyword inside a directory name (e.g. "...image...")
    # must not send every tile to the same folder.
    tile_name = os.path.basename(tile_src)
    for keyword, target_dir in tile_categories:
        if keyword in tile_name:
            os.rename(tile_src, os.path.join(target_dir, tile_name.replace(keyword + '_', '')))
            break
    else:
        print(f"No category for: {tile_src}")

10683


100%|██████████| 10683/10683 [00:01<00:00, 5751.78it/s]


## Find mask with different years
### (when comparing '>' and '>=' versions)

In [2]:
# Two exports of the same dataset to compare: masks and images of set 1 and of set 2.
src_mask_1 = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\masks_1"
src_mask_2 = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\masks_2"
src_img_1 = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\img_1"
src_img_2 = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\img_2"

In [3]:
# Load every mask of both sets into memory, keyed by file name.
# (Despite the `lst_` prefix, the two `*_data` objects are dicts: file name -> pixel array.)
lst_masks_1 = os.listdir(src_mask_1)
lst_masks_2 = os.listdir(src_mask_2)

lst_masks_1_data = {}
for file_name in tqdm(lst_masks_1, total=len(lst_masks_1)):
    lst_masks_1_data[file_name] = np.asarray(Image.open(os.path.join(src_mask_1, file_name)))

lst_masks_2_data = {}
for file_name in tqdm(lst_masks_2, total=len(lst_masks_2)):
    lst_masks_2_data[file_name] = np.asarray(Image.open(os.path.join(src_mask_2, file_name)))

100%|██████████| 449/449 [00:04<00:00, 98.75it/s] 
100%|██████████| 358/358 [00:03<00:00, 91.55it/s] 


In [4]:
# Quick look at the array layout: shape of the first loaded mask of each set.
print(list(lst_masks_1_data.values())[0].shape)
print(list(lst_masks_2_data.values())[0].shape)

(1200, 1400, 4)
(1500, 900, 4)


In [30]:
# For each mask of set 1, look for pixel-identical masks in set 2 and classify the outcome:
#   lst_not_found      - no identical mask in set 2
#   lst_problems       - more than one identical mask in set 2
#   lst_found          - exactly one identical mask, same year token in both file names
#   lst_different_year - exactly one identical mask, different year tokens; stores
#                        [mask path 1, mask path 2, image path 1, image path 2]
lst_different_year = []
lst_not_found = []
lst_found = []
lst_problems = []
for mask_name, mask_data in tqdm(lst_masks_1_data.items(), total=len(lst_masks_1)):
    corresponding_masks = [
        other_name
        for other_name, other_data in lst_masks_2_data.items()
        if np.array_equal(other_data, mask_data)
    ]

    if not corresponding_masks:
        lst_not_found.append(mask_name)
        continue
    if len(corresponding_masks) > 1:
        lst_problems.append(mask_name)
        continue

    mask_name_2 = corresponding_masks[0]
    # The year is the second '_'-separated token of the file name.
    year_1 = mask_name.split('_')[1]
    year_2 = mask_name_2.split('_')[1]
    if year_1 == year_2:
        lst_found.append(mask_name)
        continue

    lst_different_year.append([
        os.path.join(src_mask_1, mask_name),
        os.path.join(src_mask_2, mask_name_2),
        os.path.join(src_img_1, mask_name.replace('mask', 'image')),
        os.path.join(src_img_2, mask_name_2.replace('mask', 'image')),
    ])

100%|██████████| 449/449 [00:06<00:00, 74.06it/s] 


In [31]:
# Size of each outcome list produced by the comparison loop:
# same year / different year / no identical mask in set 2 / several identical masks in set 2.
print("Number of samples the same year: ", len(lst_found))
print("Number of samples not the same year: ", len(lst_different_year))
print("Number of samples not found: ", len(lst_not_found))
print("Number of problems: ", len(lst_problems))

Number of samples the same year:  281
Number of samples not the same year:  73
Number of samples not found:  95
Number of problems:  0


In [None]:
# For every pair with differing years, show: image of set 1 | mask of set 1 | image of set 2.
# The outer panels are titled with the year token taken from each image's file name.
for mask_path_1, _, img_path_1, img_path_2 in lst_different_year:
    mask = Image.open(mask_path_1)
    img1 = Image.open(img_path_1)
    img2 = Image.open(img_path_2)

    fig, axs = plt.subplots(1,3, figsize=(15,4))
    for ax, picture in zip(axs, (img1, mask, img2)):
        ax.imshow(picture)

    axs[0].set_title(os.path.basename(img_path_1).split('_')[1])
    axs[2].set_title(os.path.basename(img_path_2).split('_')[1])
    plt.suptitle(os.path.basename(mask_path_1))
    plt.show()
    plt.close()
    

In [42]:

def copyfile_and_mask(src_img, src_mask, dest):
    """Copy an image and its mask into `dest`, renumbering the pair on a name collision.

    File names are expected to look like "<prefix>_<year>_<index>.<ext>". If the image name
    already exists in `dest`, the pair is stored under the next free index for that year
    (largest index among the files of that year in `dest`, plus one); the mask then takes the
    new image name with 'image' replaced by 'mask'.

    Args:
        src_img: path of the image to copy.
        src_mask: path of the matching mask.
        dest: existing destination directory.
    """
    img_name = os.path.basename(src_img)
    dest_img = os.path.join(dest, img_name)
    dest_mask = os.path.join(dest, os.path.basename(src_mask))
    if os.path.exists(dest_img):
        print(f"{dest_img} already exists!")
        # Work on the file name only: splitting the whole path on "<year>_" breaks as soon as
        # a directory name contains that text.
        stem, ext = os.path.splitext(img_name)
        tokens = stem.split('_')
        prefix, year = tokens[0], tokens[1]
        # Indices already used for this year (images and masks alike). The year must be the
        # second token, and files without a numeric index are skipped.
        used_indices = []
        for existing in os.listdir(dest):
            parts = os.path.splitext(existing)[0].split('_')
            if len(parts) >= 3 and parts[1] == year and parts[-1].isdecimal():
                used_indices.append(int(parts[-1]))
        # Keep the source extension instead of assuming ".png".
        new_name = f"{prefix}_{year}_{max(used_indices, default=0) + 1}{ext}"
        dest_img = os.path.join(dest, new_name)
        dest_mask = os.path.join(dest, new_name.replace('image', 'mask'))
        print(f"new name: {dest_img}")
        print('---')

    shutil.copyfile(src_img, dest_img)
    shutil.copyfile(src_mask, dest_mask)

# save final selection
# Builds the final dataset folder from three sources: masks found with the same year, masks with
# no identical twin in set 2, and - for masks whose twin has a different year - the version chosen
# in the selection CSV.
src_selection = r"D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\selection_for_different.csv"
list_selection = pd.read_csv(src_selection, sep=';').selection.values
# CSV rows are paired with lst_different_year by position; refuse to copy anything if they do not line up.
if len(list_selection) != len(lst_different_year):
    raise ValueError(
        f"{len(list_selection)} selections for {len(lst_different_year)} samples with different years"
    )
final_dest = os.path.join(os.path.dirname(src_mask_1), 'final_extent')
os.makedirs(final_dest, exist_ok=True)
# Same-year matches first, then masks without a match: both are taken from set 1.
for mask_names in (lst_found, lst_not_found):
    for mask in tqdm(mask_names, total=len(mask_names)):
        copyfile_and_mask(
            os.path.join(src_img_1, mask.replace('mask', "image")),
            os.path.join(src_mask_1, mask),
            final_dest
        )
# Differing years: a selection value of 1 keeps the set-1 version, any other value keeps set 2.
for sample, val in tqdm(zip(lst_different_year, list_selection), total=len(list_selection)):
    use_first = val == 1
    img_src = sample[2] if use_first else sample[3]
    mask_dir = src_mask_1 if use_first else src_mask_2
    mask_src = os.path.join(mask_dir, os.path.basename(img_src).replace('image', 'mask'))
    copyfile_and_mask(
        img_src,
        mask_src,
        final_dest
    )


100%|██████████| 281/281 [00:00<00:00, 445.73it/s]
100%|██████████| 95/95 [00:00<00:00, 554.95it/s]
100%|██████████| 73/73 [00:00<00:00, 419.93it/s]

D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\final_extent\image_2007_25.png already exists!
new name: D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\final_extent\image_2007_117.png
---
D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\final_extent\image_2000_1.png already exists!
new name: D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\final_extent\image_2000_2.png
---
D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\final_extent\image_2000_2.png already exists!
new name: D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\final_extent\image_2000_3.png
---
D:\Terranum_SD\99_Data\Landslide\data\Bern_glissements_spontane_shpfiles\test_different_years\final_extent\image_2015_43.png already exists!
new name: D:\Terranum_




# Trash

In [None]:
# One-off clean-up: rename '*mask_bin*.tif' files of the no-landslide folder to '*bin*.tif'.
list_masks_bin = [
    os.path.join(src_no_landslide, name)
    for name in os.listdir(src_no_landslide)
    if "mask_bin" in name and name.endswith('.tif')
]
print(len(list_masks_bin))
for old_path in list_masks_bin:
    new_path = old_path.replace('mask_bin', 'bin')
    os.rename(old_path, new_path)

1124


In [None]:
# One-off clean-up: re-save every non-.tif file of the no-landslide folder as .tif, then delete the original.
list_img = [os.path.join(src_no_landslide, x) for x in os.listdir(src_no_landslide) if not os.path.isdir(os.path.join(src_no_landslide,x)) and ".tif" not in x]
print(len(list_img))
for img_src in tqdm(list_img, total=len(list_img)):
    # splitext only touches the file's own extension, so dots in directory names are preserved;
    # the `with` block guarantees the source is closed before it is removed.
    with Image.open(img_src) as img:
        img.save(os.path.splitext(img_src)[0] + '.tif')
    os.remove(img_src)

2248


100%|██████████| 2248/2248 [00:27<00:00, 82.36it/s] 


In [16]:
# Load one saved confusion matrix (first CSV column used as index) and inspect its values and dtype.
confmat_path = r"D:\GitHubProjects\Terranum_repo\LandSlides\segformerlandslides\results\training\20251216_093805_50_epochs_Bern_b0_from_scratch\logs\confmats\values\confusion_matrix_ep_6.0.csv"
confmat = pd.read_csv(confmat_path, sep=';', index_col = 0)
confmat_values = confmat.values
print(confmat.head())
print(confmat_values)
print(confmat_values.dtype)

           0         1
0  710720621  15972302
1   32867394  80349059
[[710720621  15972302]
 [ 32867394  80349059]]
int64
