In [None]:
import geopandas as gpd
from pyspatialml import Raster
# from copy import deepcopy
import os
import glob
# import tempfile
# import rasterio.plot
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize

import pandas as pd
import numpy as np

In [None]:
# set working directory
# The project root can be overridden with the DEF_REGIONALE_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used, so existing runs behave as before.
os.chdir(
    os.environ.get(
        "DEF_REGIONALE_DIR",
        "/Users/giandomenico/Documents/SAPIENZA/AR/ABDAC/def_regionale",
    )
)

In [None]:
#%% Read Files
# Input layers (paths are relative to the working directory)
lip_path = "dati/landslides_lip_AOI.gpkg"
stable_path = "dati/stable_points_100k.gpkg"

# LIP points: only the 'ls_vel' attribute is requested from the file
lip = gpd.read_file(lip_path, include_fields=['ls_vel'])

# Keep the 'Slow' records, replace the velocity attribute with label = 1
# and reproject to EPSG:32632
lip_slow = (
    lip.loc[lip['ls_vel'] == 'Slow']
    .drop(columns=['ls_vel'])
    .assign(label=1)
    .to_crs('epsg:32632')
)

# Stable points: the 'id' attribute is skipped on read; label = 0, same CRS
stable = (
    gpd.read_file(stable_path, ignore_fields=['id'])
    .assign(label=0)
    .to_crs('epsg:32632')
)

In [None]:
#%% create dataset LIPs + stable pt
def sampling_perc(perc_rate_df1, df1, df2):
    """Stack ``df1`` with enough leading rows of ``df2`` to reach a target share.

    Parameters
    ----------
    perc_rate_df1 : float
        Fraction of the combined table that should consist of ``df1`` rows.
        Must satisfy ``0 < perc_rate_df1 <= 1``.
    df1 : pandas.DataFrame
        Rows that are always kept in full.
    df2 : pandas.DataFrame
        Rows used to fill up the table. They are taken from the top in their
        current order (no shuffling); if ``df2`` is too short, all of it is
        used and the share of ``df1`` ends up higher than requested.

    Returns
    -------
    pandas.DataFrame
        ``df1`` followed by the selected ``df2`` rows, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``perc_rate_df1`` is outside ``(0, 1]``. A rate of 0 would divide by
        zero, and a rate above 1 would yield a negative slice bound that
        silently selects the wrong rows of ``df2``.
    """
    if not 0 < perc_rate_df1 <= 1:
        raise ValueError(
            f"perc_rate_df1 must be in the interval (0, 1], got {perc_rate_df1!r}"
        )

    # Total size is len(df1) / rate; whatever exceeds len(df1) comes from df2.
    n_from_df2 = int((len(df1) / perc_rate_df1) - len(df1))
    # .iloc makes the positional meaning of the slice explicit.
    df2_ = df2.iloc[:n_from_df2]
    data = pd.concat([df1, df2_]).reset_index(drop=True)
    return data

# Target share of landslide (label = 1) rows in the combined dataset
lip_rate = 0.35
# All slow-landslide points followed by the leading stable points needed to
# reach that share; the result gets a fresh 0..n-1 index
data = sampling_perc(lip_rate, lip_slow, stable)
# data['label'] = data['label'].astype('category')


In [None]:
#%% PREDICTORS
# Folder containing the predictor rasters and the pattern selecting .tif files
folder_path = 'dati/predictors'
file_pattern = "*.tif"
# Combine the folder path and file pattern to search for files
search_pattern = os.path.join(folder_path, file_pattern)

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the layer order of the stack (and of every table built from it) reproducible.
predictors = sorted(glob.glob(search_pattern))
if not predictors:
    # Fail here with a clear message rather than inside Raster() on an empty list
    raise FileNotFoundError(f"No raster files match {search_pattern!r}")

stack = Raster(predictors)
# stack.rename({}, inplace=True) # rename raster layers if needed

In [None]:
# plot points over raster
name = 'slope_deg'

# Display settings for the selected raster layer
stack[name].cmap = 'terrain'
stack[name].norm = Normalize(0, 45)
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap performs the same lookup and is still supported.
cmap = plt.get_cmap('Set1')

fig, ax = plt.subplots(figsize=(9, 9))
stack[name].plot(ax=ax, legend=True)
# Overlay the training points, coloured by their 0/1 label
data.plot(column="label", ax=ax, legend=True, s=0.5, cmap=cmap)
plt.show()

In [None]:
# Extract data from rasters at the training point locations:
# Workaround: re-create the np.int / np.float / np.bool aliases on the numpy
# module before calling extract_vector -- presumably the installed pyspatialml
# still refers to these removed names; TODO confirm and drop once not needed.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# Raster values sampled at every point of the combined (LIP + stable) dataset
xy = stack.extract_vector(data)

# Same extraction for the stable points alone (not used further in this cell)
x_stable = stack.extract_vector(stable)
#subsample x_stable based on slope and rel_relief.

# Same extraction for the slow-landslide points alone (not used further in this cell)
x_lip = stack.extract_vector(lip_slow)

#%%
# join the extracted pixel data back with the training data
# droplevel(0) removes the outer index level so the remaining one can be
# matched against the index of `data` -- assumes that level identifies the
# input geometry each value was sampled for; TODO confirm
xy = xy.droplevel(0).merge(
    data.loc[:, ("label")], 
    left_index=True, 
    right_index=True
    )

# Filter out rows with -9999 or -999
# (presumably the rasters' nodata markers -- verify): cells equal to either
# value are masked to NaN, then every row containing a NaN is dropped
xy = xy[(xy != -9999) & (xy != -999)].dropna()#.drop(columns='geometry_idx')

# Cast the label back to integer (the masking above may have changed its dtype)
xy['label'] = xy['label'].astype('int')

# save dataset
xy.to_file("dati/xy.gpkg", driver='GPKG')

In [None]:
# Handling Categorical vars
# variables to encode
cat_var = ['lithology', 'wlc']
# Cast the categorical predictors to integers in one vectorised step.
# (apply(..., axis=1) performed the cast once per row: same values, but a
# Python-level loop over the whole table.)
xy[cat_var] = xy[cat_var].astype(int)
# change column type to be categorical
# xy[cat_var] = xy[cat_var].astype('category')

# Create subsets for model training
X = xy.drop(columns=['label', 'geometry']).values # select predictors
y = xy['label'].values # select label class 0/1


# save X, y
np.save("dati/X.npy", X)
np.save("dati/y.npy", y)

In [None]:
#%% Collinearity detection
import seaborn as sns

# xy = gpd.read_file("dati/xy.gpkg").drop(columns=['geometry_idx'])

def correlation_matrix(df: pd.DataFrame):
    """
    Compute the Spearman correlation matrix of ``df`` and draw it as a heatmap.

    Parameters
    ----------
    df : pd.DataFrame
        Table of variables. A ``geometry`` column, if present, is left out and
        only numeric columns enter the correlation.

    Returns
    -------
    pd.DataFrame
        The full square correlation matrix. The heatmap only shows the cells
        below the diagonal; the diagonal and upper triangle are masked.
    """
    # errors='ignore' lets plain DataFrames without a geometry column through;
    # numeric_only=True matches remove_multicollinearity and avoids a failure
    # when a text column is present.
    matrix = (
        df.drop(columns='geometry', errors='ignore')
        .corr(method='spearman', numeric_only=True)
    )

    # Diverging palette centred on a light colour (0 correlation)
    cmap = sns.diverging_palette(250, 15, s=75, l=40,
                                 n=9, center="light", as_cmap=True)
    # Hide the diagonal and the redundant upper triangle
    mask = np.triu(np.ones_like(matrix, dtype=bool))

    # Large, high-resolution canvas so the annotations stay readable
    fig, ax = plt.subplots(figsize=(15, 10), dpi=300)

    # Plot the matrix
    sns.heatmap(matrix, vmin=-1, vmax=1, mask=mask, center=0, annot=True,
                fmt='.2f', square=True, cmap=cmap, ax=ax, cbar=False,
                linewidths=0.1, annot_kws={"fontsize": 8})
    # plt.savefig('figure/heat_collinearity_spearman.png', format='png', dpi=600)

    return matrix

corr_matrix = correlation_matrix(xy)

#%% Multi Collinearity Removal
def remove_multicollinearity(data, exclude=None, threshold=.75):
    """Drop one column from every pair of strongly rank-correlated columns.

    The Spearman correlation is computed over all numeric columns of ``data``.
    For each pair whose absolute correlation exceeds ``threshold`` exactly one
    column is marked for removal, so the information shared by the pair is
    still represented in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. Non-numeric columns are ignored by the correlation and
        are always kept.
    exclude : list of str, optional
        Columns that must never be removed. When one member of a correlated
        pair is listed here the other member is removed instead; when both are
        listed the pair is left untouched. Every numeric column takes part in
        the comparison, so a target column should be listed here as well.
    threshold : float, default 0.75
        Absolute correlation above which two columns count as collinear.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without the removed columns.
    """
    exclude = exclude if exclude is not None else []
    corr_matrix = data.corr(method='spearman', numeric_only=True)
    columns = corr_matrix.columns

    correlated_features = set()
    # Walk the lower triangle so each unordered pair is visited once.
    for i in range(len(columns)):
        for j in range(i):
            if not abs(corr_matrix.iloc[i, j]) > threshold:
                continue

            colname1 = columns[i]
            colname2 = columns[j]

            # One member already scheduled for removal: the pair is resolved,
            # removing the other one too would discard the shared signal.
            if colname1 in correlated_features or colname2 in correlated_features:
                continue

            # Remove the later column by default; fall back to the earlier one
            # only when the later column is protected by the exclusion list.
            if colname1 not in exclude:
                correlated_features.add(colname1)
            elif colname2 not in exclude:
                correlated_features.add(colname2)
    print(f"Columns to be removed due to correlation above {threshold}: {correlated_features}")

    return data.drop(columns=list(correlated_features), errors='ignore')

# Remove predictors whose absolute Spearman correlation exceeds 0.80, never
# removing slope_deg, rel_relief or twi.
# NOTE(review): `xy_clean` is assigned again from `xy` in the point-cleaning
# cell below, so this result is not carried into the saved dataset -- confirm
# whether that is intended.
xy_clean = remove_multicollinearity(xy, exclude=['slope_deg','rel_relief', 'twi'], threshold=.80)

In [None]:
# clean LIPs removing points with LOW values of slope and rel_relief.
# `&` binds tighter than `|`, so the terrain test needs its own parentheses;
# without them the label test only guarded the slope condition and low-relief
# points of the other class were dropped as well.
# NOTE(review): this starts again from `xy`, not from the `xy_clean` produced
# by remove_multicollinearity above -- confirm that is intended.
low_terrain = (xy['rel_relief'] < 5) | (xy['slope_deg'] < 5)
condition = low_terrain & (xy['label'] == 1)
# Drop rows where condition is True
xy_clean = xy.drop(xy[condition].index)

# clean stable removing points with HIGH values of slope and rel_relief.
# Same grouping as above: the terrain test applies to label == 0 rows only.
high_terrain = (xy_clean['rel_relief'] > 50) | (xy_clean['slope_deg'] > 45)
condition = high_terrain & (xy_clean['label'] == 0)
# Drop rows where condition is True
xy_clean2 = xy_clean.drop(xy_clean[condition].index)


# SAVE XY Clean
xy_clean2.to_file("dati/xy_clean.gpkg", driver='GPKG')

In [None]:
# Data Visualization
DATA = xy_clean2

# Cumulative density of relative relief, one curve per label
fig_cdf, ax_cdf = plt.subplots(dpi=150)
sns.kdeplot(data=DATA, x="rel_relief", hue="label", cumulative=True, ax=ax_cdf)

# Joint density of slope and relative relief, filled contours per label
fig_joint, ax_joint = plt.subplots(dpi=150)
sns.kdeplot(
    data=DATA,
    x="slope_deg",
    y="rel_relief",
    hue="label",
    fill=True,
    levels=10,
    thresh=0.1,
    alpha=0.5,
    ax=ax_joint,
)