In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj

In [2]:
import os
import netCDF4 as nc
import numpy as np
import xarray as xr
import pandas as pd

In [3]:
def analyze_nasadem_file(file_path: str) -> None:
    """
    Print a short summary of a NetCDF file: every variable with its dimensions
    and shape, plus the min/max of the 'lon' and 'lat' variables when both exist.

    Args:
    file_path (str): The path to the NetCDF file.

    Returns:
    None. The summary is written to stdout.
    """
    # Open via a context manager so the file handle is released even if reading
    # a variable or printing raises part-way through.
    with nc.Dataset(file_path, 'r') as dataset:
        # Print general information about the file
        print('##################################################')
        print(f"Analyzing file: {file_path}")
        print("Variables in this file:")
        for var_name, variable in dataset.variables.items():
            print(f" - {var_name}: {variable.dimensions}, {variable.shape}")

        # The coordinate range is only reported when both 'lon' and 'lat' are present
        if 'lon' in dataset.variables and 'lat' in dataset.variables:
            lon = dataset.variables['lon'][:]
            lat = dataset.variables['lat'][:]
            print(f"Longitude range: {np.min(lon)} to {np.max(lon)}")
            print(f"Latitude range: {np.min(lat)} to {np.max(lat)}")

# NetCDF inputs to summarise: the files under NDVI_NDWI_NDMI/ followed by the
# files under Topography_Products/
file_paths = [
    '../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDMI.nc',
    '../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDVI.nc',
    '../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDWI.nc',
    '../../data/original_data/jay_files/Topography_Products/Curvature.nc',
    '../../data/original_data/jay_files/Topography_Products/Drainage_Density.nc',
    '../../data/original_data/jay_files/Topography_Products/Slope.nc',
]

# Print the variable/coordinate summary of every input, in list order
for path in file_paths:
    analyze_nasadem_file(path)

##################################################
Analyzing file: ../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDMI.nc
Variables in this file:
 - lon: ('lon',), (12007,)
 - lat: ('lat',), (2955,)
 - MNDWI_GAMBIA_landsat.tif: ('lat', 'lon'), (2955, 12007)
Longitude range: -16.96940029570738 to -13.733848305361875
Latitude range: 13.028850470919723 to 13.824937475707381
##################################################
Analyzing file: ../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDVI.nc
Variables in this file:
 - lon: ('lon',), (12007,)
 - lat: ('lat',), (2955,)
 - NDVI.tif: ('lat', 'lon'), (2955, 12007)
Longitude range: -16.96940029570738 to -13.733848305361875
Latitude range: 13.028850470919723 to 13.824937475707381
##################################################
Analyzing file: ../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDWI.nc
Variables in this file:
 - lon: ('lon',), (12007,)
 - lat: ('lat',), (2955,)
 - NDWI.tif: ('lat', 'lon'), (2955, 12007)
Longitude range: 

In [4]:
import xarray as xr
import pandas as pd

def nc_to_dataframe(file_path: str, var_name: str, dataset_name: str) -> pd.DataFrame:
    """
    Convert one variable of a NetCDF file to a pandas DataFrame, print a preview
    and the percentage of NaN values, and publish the frame as a global.

    Args:
    file_path (str): Path to the .nc file.
    var_name (str): Name of the variable in the .nc file to convert.
    dataset_name (str): Descriptive name for the dataset; the DataFrame is also
        stored in this module's globals under the name ``df_<dataset_name>``.

    Returns:
    pd.DataFrame: The variable flattened with ``to_dataframe().reset_index()``,
        i.e. the variable's index levels become ordinary columns next to a
        column named ``var_name``.

    Raises:
    KeyError: If ``var_name`` is not a variable of the dataset.
    """
    # The context manager closes the file even when var_name is missing or the
    # conversion fails; the DataFrame is fully materialised before the file closes.
    with xr.open_dataset(file_path) as ds:
        df = ds[var_name].to_dataframe().reset_index()

    # Calculate and print the percentage of NaN values
    nan_percentage = df[var_name].isna().mean() * 100
    print(f"DataFrame from {file_path} - Variable: {var_name}")
    print(df.head())  # Display the first few rows of the DataFrame
    print(f"Percentage of NaN values in '{var_name}': {nan_percentage:.2f}%")
    print('-' * 50)  # Separator

    # Kept for backward compatibility: later cells read df_<dataset_name> directly.
    globals()[f'df_{dataset_name}'] = df

    # Return the frame as the signature and docstring promise (previously None).
    return df

# One (file path, NetCDF variable name, dataset label) triple per input file.
# The label is passed as dataset_name to nc_to_dataframe.
file_var_pairs = [
    ('../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDMI.nc', 'MNDWI_GAMBIA_landsat.tif', 'ndmi'),
    ('../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDVI.nc', 'NDVI.tif', 'ndvi'),
    ('../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDWI.nc', 'NDWI.tif', 'ndwi'),
    ('../../data/original_data/jay_files/Topography_Products/Curvature.nc', 'Curvatu_tif2', 'curvature'),
    ('../../data/original_data/jay_files/Topography_Products/Drainage_Density.nc', 'Drainage_density', 'drainage_density'),
    ('../../data/original_data/jay_files/Topography_Products/Slope.nc', 'Slope_tif2', 'slope')
]

# Convert every listed file, in order
for file_path, var_name, dataset_name in file_var_pairs:
    nc_to_dataframe(file_path, var_name, dataset_name)

# Once this cell has run, each DataFrame is available by name, e.g., df_ndvi, df_ndwi, etc.

DataFrame from ../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDMI.nc - Variable: MNDWI_GAMBIA_landsat.tif
         lat        lon  MNDWI_GAMBIA_landsat.tif
0  13.824937 -16.969400                  0.437477
1  13.824937 -16.969131                  0.439944
2  13.824937 -16.968861                  0.435235
3  13.824937 -16.968592                  0.461824
4  13.824937 -16.968322                  0.454524
Percentage of NaN values in 'MNDWI_GAMBIA_landsat.tif': 0.00%
--------------------------------------------------
DataFrame from ../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDVI.nc - Variable: NDVI.tif
         lat        lon  NDVI.tif
0  13.824937 -16.969400 -0.128294
1  13.824937 -16.969131 -0.138499
2  13.824937 -16.968861 -0.126083
3  13.824937 -16.968592 -0.126840
4  13.824937 -16.968322 -0.128421
Percentage of NaN values in 'NDVI.tif': 0.00%
--------------------------------------------------
DataFrame from ../../data/original_data/jay_files/NDVI_NDWI_NDMI/NDWI.nc - Variabl

In [5]:
# Inspect the NDVI frame that nc_to_dataframe stored as the global df_ndvi
df_ndvi

Unnamed: 0,lat,lon,NDVI.tif
0,13.824937,-16.969400,-0.128294
1,13.824937,-16.969131,-0.138499
2,13.824937,-16.968861,-0.126083
3,13.824937,-16.968592,-0.126840
4,13.824937,-16.968322,-0.128421
...,...,...,...
35480680,13.028850,-13.734926,0.395745
35480681,13.028850,-13.734657,0.367785
35480682,13.028850,-13.734387,0.337395
35480683,13.028850,-13.734118,0.333245


In [33]:
# Inspect the NDWI frame that nc_to_dataframe stored as the global df_ndwi
df_ndwi

Unnamed: 0,lat,lon,NDWI.tif
0,13.824935,-16.969405,0.177121
1,13.824935,-16.969136,0.179183
2,13.824935,-16.968866,0.180220
3,13.824935,-16.968597,0.220782
4,13.824935,-16.968327,0.204527
...,...,...,...
35480680,13.028855,-13.734923,0.026303
35480681,13.028855,-13.734653,0.005021
35480682,13.028855,-13.734384,-0.019881
35480683,13.028855,-13.734114,-0.020817


In [34]:
# Inspect the frame stored as df_ndmi (its value column is named MNDWI_GAMBIA_landsat.tif)
df_ndmi

Unnamed: 0,lat,lon,MNDWI_GAMBIA_landsat.tif
0,13.824937,-16.969400,0.437477
1,13.824937,-16.969131,0.439944
2,13.824937,-16.968861,0.435235
3,13.824937,-16.968592,0.461824
4,13.824937,-16.968322,0.454524
...,...,...,...
35480680,13.028850,-13.734926,-0.368186
35480681,13.028850,-13.734657,-0.368732
35480682,13.028850,-13.734387,-0.366731
35480683,13.028850,-13.734118,-0.369645


In [35]:
# Inspect the curvature frame stored as df_curvature.
# NOTE(review): the first rows show no value and the last rows are on the order of 1e9 —
# confirm the nodata handling and units of Curvatu_tif2 before using it.
df_curvature

Unnamed: 0,lat,lon,Curvatu_tif2
0,13.874722,-17.078611,
1,13.874722,-17.078333,
2,13.874722,-17.078056,
3,13.874722,-17.077778,
4,13.874722,-17.077500,
...,...,...,...
34219875,13.077222,-13.770278,3.888000e+09
34219876,13.077222,-13.770000,-1.296000e+09
34219877,13.077222,-13.769722,1.296000e+09
34219878,13.077222,-13.769444,-2.592000e+09


In [36]:
# Inspect the drainage-density frame stored as df_drainage_density
df_drainage_density

Unnamed: 0,lat,lon,Drainage_density
0,13.874861,-16.806782,0.000000
1,13.874861,-16.806504,0.000000
2,13.874861,-16.806226,0.000000
3,13.874861,-16.805948,0.000000
4,13.874861,-16.805671,0.000000
...,...,...,...
31421996,13.077083,-13.770115,10.032833
31421997,13.077083,-13.769837,10.017525
31421998,13.077083,-13.769560,9.998237
31421999,13.077083,-13.769282,9.974904


In [39]:
# Show the column names of the slope frame. The original referenced a bare `df`,
# which no cell defines at notebook scope (it only worked through leftover kernel
# state); df_slope is the frame nc_to_dataframe publishes for the 'slope' dataset.
print(df_slope.columns)


Index(['lat', 'lon', 'Slope_tif2'], dtype='object')


In [43]:
# Repeat display of df_ndmi (same expression as an earlier cell)
df_ndmi

Unnamed: 0,lat,lon,MNDWI_GAMBIA_landsat.tif
0,13.824937,-16.969400,0.437477
1,13.824937,-16.969131,0.439944
2,13.824937,-16.968861,0.435235
3,13.824937,-16.968592,0.461824
4,13.824937,-16.968322,0.454524
...,...,...,...
35480680,13.028850,-13.734926,-0.368186
35480681,13.028850,-13.734657,-0.368732
35480682,13.028850,-13.734387,-0.366731
35480683,13.028850,-13.734118,-0.369645


In [None]:
import pandas as pd
import numpy as np

def calculate_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Return the Euclidean distance between the points (lat1, lon1) and (lat2, lon2).

    The coordinates are treated as plain Cartesian values, so the result is in the
    same units as the inputs.
    """
    delta_lat = lat1 - lat2
    delta_lon = lon1 - lon2
    squared_sum = delta_lat**2 + delta_lon**2
    return np.sqrt(squared_sum)

def find_closest_value(df: pd.DataFrame, lat: float, lon: float, var_name: str):
    """
    Find the value of var_name in the row of df closest to the given point.

    Closeness is the Euclidean distance on the raw 'lat'/'lon' column values (the
    same metric as calculate_distance). When several rows are equally close, the
    first one in df's order is used.

    Args:
    df (pd.DataFrame): Frame with 'lat', 'lon' and ``var_name`` columns.
    lat (float): Latitude of the query point.
    lon (float): Longitude of the query point.
    var_name (str): Column whose value is returned.

    Returns:
    The ``var_name`` entry of the closest row.

    Raises:
    ValueError: If df has no rows.
    KeyError: If a required column is missing.
    """
    if df.empty:
        raise ValueError("df has no rows; cannot find a closest point")

    # Compute all distances in one vectorised pass. A row-wise df.apply(axis=1)
    # runs a Python call per row, which is impractical on frames with tens of
    # millions of rows. Casting to float64 matches the precision of the
    # row-by-row computation.
    lat_diff = lat - df['lat'].to_numpy(dtype=float)
    lon_diff = lon - df['lon'].to_numpy(dtype=float)
    distances = pd.Series(np.sqrt(lat_diff**2 + lon_diff**2), index=df.index)

    # idxmin returns the index *label*, so this also works for non-default indexes
    closest_index = distances.idxmin()
    return df.at[closest_index, var_name]

# Load the wells table; it must provide 'Latitude' and 'Longitude' columns
wells_df = pd.read_csv('../../data/processed_data/igrac/wells_gambia.csv')

# Value-column name -> DataFrame holding that column. The frames are the df_*
# globals created earlier by nc_to_dataframe.
nc_dataframes = {
    'MNDWI_GAMBIA_landsat.tif': df_ndmi,
    'NDVI.tif': df_ndvi,
    'NDWI.tif': df_ndwi,
    'Curvatu_tif2': df_curvature,
    'Drainage_density': df_drainage_density,
    'Slope_tif2': df_slope
}

# For every variable, add a column to wells_df holding the value at the grid
# point nearest to each well
for var_name, nc_df in nc_dataframes.items():
    wells_df[var_name] = wells_df.apply(
        lambda well: find_closest_value(
            df=nc_df,
            lat=well['Latitude'],
            lon=well['Longitude'],
            var_name=var_name,
        ),
        axis=1,
    )

print(wells_df)


In [None]:
def save_dataframe_to_csv(df: pd.DataFrame, csv_path: str) -> None:
    """
    Write a DataFrame to a CSV file (without the index) and print a confirmation.

    Args:
    df (pd.DataFrame): The DataFrame to write.
    csv_path (str): Destination path of the CSV file.
    """
    # index=False keeps the row labels out of the file
    df.to_csv(path_or_buf=csv_path, index=False)

    confirmation = f"DataFrame saved to {csv_path}"
    print(confirmation)


# Destination for the enriched wells table.
# NOTE(review): "weells" in the file name looks like a typo for "wells". It is left
# unchanged because other code may read this exact path; confirm before renaming.
csv_path = '../../data/processed_data/igrac/weells_with_jay_info.csv'

# Write the wells DataFrame to the CSV file
save_dataframe_to_csv(wells_df, csv_path)