In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj geopandas

In [3]:
import os
import sys
import numpy as np
import xarray as xr
import pandas as pd
import netCDF4 as nc
import geopandas as gpd
from pathlib import Path
from scipy.spatial import KDTree
from shapely.geometry import Point
# Calculate the path to the scripts folder relative to the current notebook.
scripts_dir = Path("../../").resolve()

# Add the scripts directory to the sys.path if it's not already there.
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

In [4]:
def analyze_nasadem_file(file_path: str) -> None:
    """
    Analyze a NASADEM NetCDF file and print out its contents, such as variable names,
    dimensions, and min/max values for longitude and latitude.

    Args:
    file_path (str): The path to the NASADEM NetCDF file.
    """
    # Open the NetCDF file
    dataset = nc.Dataset(file_path, 'r')

    # Print general information about the file
    print('##################################################')
    print(f"Analyzing file: {file_path}")
    print("Variables in this file:")
    for var in dataset.variables:
        print(f" - {var}: {dataset.variables[var].dimensions}, {dataset.variables[var].shape}")

    # Check for common variables like longitude and latitude
    if 'lon' in dataset.variables and 'lat' in dataset.variables:
        lon = dataset.variables['lon'][:]
        lat = dataset.variables['lat'][:]
        print(f"Longitude range: {np.min(lon)} to {np.max(lon)}")
        print(f"Latitude range: {np.min(lat)} to {np.max(lat)}")
    
    # Close the dataset
    dataset.close()

# Paths to your files
file_paths = [
    '../../data/original_data/jay_files/Topography_Products/Curvature.nc',
    '../../data/original_data/jay_files/Topography_Products/Drainage_Density.nc',
    '../../data/original_data/jay_files/Topography_Products/Slope.nc',
    '../../data/original_data/jay_files/hydrogeology.nc'
]

# Analyze each file
for path in file_paths:
    analyze_nasadem_file(file_path=path)

##################################################
Analyzing file: ../../data/original_data/jay_files/Topography_Products/Curvature.nc
Variables in this file:
 - lon: ('lon',), (11915,)
 - lat: ('lat',), (2872,)
 - Curvatu_tif2: ('lat', 'lon'), (2872, 11915)
Longitude range: -17.078611111124776 to -13.769166666697293
Latitude range: 13.077222222232688 to 13.874722222230263
##################################################
Analyzing file: ../../data/original_data/jay_files/Topography_Products/Drainage_Density.nc
Variables in this file:
 - lon: ('lon',), (10937,)
 - lat: ('lat',), (2873,)
 - Drainage_density: ('lat', 'lon'), (2873, 10937)
Longitude range: -16.806781756020996 to -13.769003978257615
Latitude range: 13.077083333343863 to 13.874861111119214
##################################################
Analyzing file: ../../data/original_data/jay_files/Topography_Products/Slope.nc
Variables in this file:
 - lon: ('lon',), (11915,)
 - lat: ('lat',), (2872,)
 - Slope_tif2: ('lat', 'lon')

In [5]:
import xarray as xr
import pandas as pd

def nc_to_dataframe(file_path: str, var_name: str, dataset_name: str) -> pd.DataFrame:
    """
    Convert a NetCDF file to a pandas DataFrame and print the percentage of NaN values for the specified variable.

    Args:
    file_path (str): Path to the .nc file.
    var_name (str): Name of the variable in the .nc file to convert.
    dataset_name (str): Descriptive name for the dataset.

    Returns:
    pd.DataFrame: DataFrame representation of the NetCDF variable.
    """
    # Open the .nc file
    ds = xr.open_dataset(file_path)

    # Select the variable
    data = ds[var_name]

    # Convert to DataFrame
    df = data.to_dataframe().reset_index()

    # Calculate and print the percentage of NaN values
    nan_percentage = df[var_name].isna().mean() * 100
    print(f"DataFrame from {file_path} - Variable: {var_name}")
    print(df.head())  # Display the first few rows of the DataFrame
    print(f"Percentage of NaN values in '{var_name}': {nan_percentage:.2f}%")
    print('-' * 50)  # Separator

    # Store the DataFrame in a global variable
    globals()[f'df_{dataset_name}'] = df

# File paths, variable names, and dataset names
file_var_pairs = [
    ('../../data/original_data/jay_files/Topography_Products/Curvature.nc', 'Curvatu_tif2', 'curvature'),
    ('../../data/original_data/jay_files/Topography_Products/Drainage_Density.nc', 'Drainage_density', 'drainage_density'),
    ('../../data/original_data/jay_files/Topography_Products/Slope.nc', 'Slope_tif2', 'slope'),
    ('../../data/original_data/jay_files/hydrogeology.nc', 'Hydrogeo', 'Hydrogeology')
]

# Process each file
for file_path, var_name, dataset_name in file_var_pairs:
    nc_to_dataframe(file_path=file_path, var_name=var_name, dataset_name=dataset_name)

# After running this script, you can access each DataFrame directly, e.g., df_ndvi, df_ndwi, etc.

DataFrame from ../../data/original_data/jay_files/Topography_Products/Curvature.nc - Variable: Curvatu_tif2
         lat        lon  Curvatu_tif2
0  13.874722 -17.078611           NaN
1  13.874722 -17.078333           NaN
2  13.874722 -17.078056           NaN
3  13.874722 -17.077778           NaN
4  13.874722 -17.077500           NaN
Percentage of NaN values in 'Curvatu_tif2': 2.38%
--------------------------------------------------
DataFrame from ../../data/original_data/jay_files/Topography_Products/Drainage_Density.nc - Variable: Drainage_density
         lat        lon  Drainage_density
0  13.874861 -16.806782               0.0
1  13.874861 -16.806504               0.0
2  13.874861 -16.806226               0.0
3  13.874861 -16.805948               0.0
4  13.874861 -16.805671               0.0
Percentage of NaN values in 'Drainage_density': 0.00%
--------------------------------------------------
DataFrame from ../../data/original_data/jay_files/Topography_Products/Slope.nc - Variab

In [16]:
df = pd.read_csv("../../data/final_dataset/original/gambia_lat_and_long.csv")
df

Unnamed: 0,long,lat
0,-16.670616,13.488346
1,-16.665998,13.488376
2,-16.684438,13.483734
3,-16.679820,13.483765
4,-16.675202,13.483796
...,...,...
2869,-16.746177,13.076569
2870,-16.741567,13.076600
2871,-16.736957,13.076631
2872,-16.732347,13.076662


In [6]:
df_drainage_density

Unnamed: 0,lat,lon,Drainage_density
0,13.874861,-16.806782,0.000000
1,13.874861,-16.806504,0.000000
2,13.874861,-16.806226,0.000000
3,13.874861,-16.805948,0.000000
4,13.874861,-16.805671,0.000000
...,...,...,...
31421996,13.077083,-13.770115,10.032833
31421997,13.077083,-13.769837,10.017525
31421998,13.077083,-13.769560,9.998237
31421999,13.077083,-13.769282,9.974904


In [21]:
from scipy.spatial import KDTree
import pandas as pd

def find_nearest_drainage_density(df_wells: pd.DataFrame, df_drainage_density: pd.DataFrame) -> pd.DataFrame:
    """
    Encuentra el vecino más cercano en df_drainage_density para cada punto en df_wells
    y combina los datos, incluyendo solamente la columna Drainage_density.
    """
    # Verifica que las columnas estén correctamente nombradas y alineadas
    assert 'lat' in df_wells and 'long' in df_wells, "Las columnas 'lat' y 'long' deben estar en df_wells"
    assert 'lat' in df_drainage_density and 'lon' in df_drainage_density, "Las columnas 'lat' y 'lon' deben estar en df_drainage_density"

    # Construir KDTree para búsqueda eficiente del vecino más cercano
    tree = KDTree(df_drainage_density[['lat', 'lon']])

    # Lista para almacenar las filas combinadas
    merged_rows = []

    # Iterar sobre cada fila en df_wells
    for index, well in df_wells.iterrows():
        print(f"Analizando fila {index + 1}/{len(df_wells)}")

        # Encontrar el vecino más cercano para esta fila
        distance, nearest_idx = tree.query([well['lat'], well['long']], k=1)
        nearest_row = df_drainage_density.iloc[nearest_idx]

        print(f"Coincidencia más cercana encontrada en el índice {nearest_idx}: {nearest_row['Drainage_density']}")

        # Combinar los datos, incluyendo solamente Drainage_density
        merged_row = well.to_dict()
        merged_row['Drainage_density'] = nearest_row['Drainage_density']
        merged_rows.append(merged_row)

        print(f"Datos combinados: {merged_row}\n")
        
    merged_df = pd.DataFrame(merged_rows)

    # Devolver el DataFrame resultante
    return merged_df



def save_data(merged_df: pd.DataFrame, save_path: str) -> None:
    """Save the merged data to a CSV file."""
    merged_df.to_csv(save_path, index=False)


In [22]:
merged_df = find_nearest_drainage_density(df_wells=df, df_drainage_density=df_drainage_density)
save_data(merged_df=merged_df, save_path='../../data/final_dataset/processed_data/jay_merged_data.csv')

Analizando fila 1/2874
Coincidencia más cercana encontrada en el índice 15213857: 13.499495506286621
Datos combinados: {'long': -16.6706159583, 'lat': 13.48834562, 'Drainage_density': 13.499495506286621}

Analizando fila 2/2874
Coincidencia más cercana encontrada en el índice 15213874: 14.573663711547852
Datos combinados: {'long': -16.6659979412, 'lat': 13.4883763217, 'Drainage_density': 14.573663711547852}

Analizando fila 3/2874
Coincidencia más cercana encontrada en el índice 15399736: 15.60971450805664
Datos combinados: {'long': -16.6844382698, 'lat': 13.4837337841, 'Drainage_density': 15.60971450805664}

Analizando fila 4/2874
Coincidencia más cercana encontrada en el índice 15399753: 13.571120262145996
Datos combinados: {'long': -16.6798203741, 'lat': 13.4837647298, 'Drainage_density': 13.571120262145996}

Analizando fila 5/2874
Coincidencia más cercana encontrada en el índice 15399770: 13.922872543334961
Datos combinados: {'long': -16.6752024668, 'lat': 13.4837955906, 'Drainage_