In [12]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj python-dotenv

In [23]:
import sys
from pathlib import Path

# Resolve the directory two levels above the current working directory
# (presumably the project root that holds the importable scripts - TODO confirm).
scripts_dir = Path("../../").resolve()

# Register that directory on sys.path exactly once so its modules can be imported.
if sys.path.count(str(scripts_dir)) == 0:
    sys.path.append(str(scripts_dir))

import os
import time
import json
import requests
import numpy as np
import xarray as xr
import pandas as pd
import netCDF4 as nc
import geopandas as gpd
from dotenv import load_dotenv
from scipy.spatial import KDTree
from shapely.geometry import Point
from typing import Dict, List, Tuple, Optional, Any
# Load environment variables from .env file
load_dotenv()

True

In [24]:
def analyze_nasadem_file(file_path: str) -> None:
    """
    Print a summary of a NetCDF file: every variable with its dimensions and shape,
    plus the longitude/latitude ranges when 'lon' and 'lat' variables are present.

    Nothing in the body is specific to NASADEM; any file that netCDF4 can open works.

    Args:
        file_path (str): The path to the NetCDF file.

    Returns:
        None. The summary is written to standard output.
    """
    # The context manager closes the file even if reading a variable raises.
    with nc.Dataset(file_path, 'r') as dataset:
        # Print general information about the file
        print('##################################################')
        print(f"Analyzing file: {file_path}")
        print("Variables in this file:")
        for name, variable in dataset.variables.items():
            print(f" - {name}: {variable.dimensions}, {variable.shape}")

        # Coordinate ranges are reported only when both variables exist;
        # files that use other coordinate names simply skip this part.
        if 'lon' in dataset.variables and 'lat' in dataset.variables:
            lon = dataset.variables['lon'][:]
            lat = dataset.variables['lat'][:]
            print(f"Longitude range: {np.min(lon)} to {np.max(lon)}")
            print(f"Latitude range: {np.min(lat)} to {np.max(lat)}")

# NetCDF files to summarize; add more entries to inspect several files in one run.
file_paths = [
    '../../data/final_dataset/original/MOD13Q1.061_250m_aid0001.nc',
]

# Print the variable summary of each listed file in turn.
for path in file_paths:
    analyze_nasadem_file(file_path=path)

##################################################
Analyzing file: ../../data/final_dataset/original/MOD13Q1.061_250m_aid0001.nc
Variables in this file:
 - crs: (), ()
 - time: ('time',), (2,)
 - ydim: ('ydim',), (294,)
 - xdim: ('xdim',), (744,)
 - _250m_16_days_EVI: ('time', 'ydim', 'xdim'), (2, 294, 744)
 - _250m_16_days_MIR_reflectance: ('time', 'ydim', 'xdim'), (2, 294, 744)
 - _250m_16_days_NDVI: ('time', 'ydim', 'xdim'), (2, 294, 744)
 - _250m_16_days_NIR_reflectance: ('time', 'ydim', 'xdim'), (2, 294, 744)
 - _250m_16_days_VI_Quality: ('time', 'ydim', 'xdim'), (2, 294, 744)
 - _250m_16_days_composite_day_of_the_year: ('time', 'ydim', 'xdim'), (2, 294, 744)
 - _250m_16_days_pixel_reliability: ('time', 'ydim', 'xdim'), (2, 294, 744)
 - _250m_16_days_red_reflectance: ('time', 'ydim', 'xdim'), (2, 294, 744)


In [25]:
import xarray as xr
import pandas as pd

def convert_nc_to_dataframe(file_path: str) -> pd.DataFrame:
    """
    Convert a NetCDF file to a long-format Pandas DataFrame.

    Args:
        file_path (str): Path to the NetCDF file.

    Returns:
        pd.DataFrame: DataFrame containing data from the NetCDF file, with the
        dataset dimensions as ordinary columns.
    """
    # Open the .nc file with xarray; the context manager closes the file handle
    # once the data has been copied into pandas.
    with xr.open_dataset(file_path) as ds:
        # to_dataframe() indexes the rows by the dataset dimensions;
        # reset_index() turns that index into regular columns (long table format).
        df = ds.to_dataframe().reset_index()

    return df

# Path to the .nc file
file_path = '../../data/final_dataset/original/MOD13Q1.061_250m_aid0001.nc'

In [29]:
# Convert the NetCDF file to a long-format DataFrame
df = convert_nc_to_dataframe(file_path=file_path)

# Percentage of missing values in each column
percentage_nan = df.isna().mean() * 100
percentage_nan


time                                        0.000000
ydim                                        0.000000
xdim                                        0.000000
crs                                         0.000000
_250m_16_days_EVI                          52.205855
_250m_16_days_MIR_reflectance              52.204941
_250m_16_days_NDVI                         52.519247
_250m_16_days_NIR_reflectance              52.205855
_250m_16_days_VI_Quality                   40.640315
_250m_16_days_composite_day_of_the_year    52.203798
_250m_16_days_pixel_reliability            52.519247
_250m_16_days_red_reflectance              52.204484
dtype: float64

In [30]:
# Drop every row that has a missing value in any column; inplace=True mutates `df` itself.
df.dropna(inplace=True)

In [31]:
# Share of missing values per column, expressed in percent.
percentage_nan = df.isna().mean().mul(100)
percentage_nan

time                                       0.0
ydim                                       0.0
xdim                                       0.0
crs                                        0.0
_250m_16_days_EVI                          0.0
_250m_16_days_MIR_reflectance              0.0
_250m_16_days_NDVI                         0.0
_250m_16_days_NIR_reflectance              0.0
_250m_16_days_VI_Quality                   0.0
_250m_16_days_composite_day_of_the_year    0.0
_250m_16_days_pixel_reliability            0.0
_250m_16_days_red_reflectance              0.0
dtype: float64

In [32]:
# Display the DataFrame; pandas shows only the leading and trailing rows of a long frame.
df

Unnamed: 0,time,ydim,xdim,crs,_250m_16_days_EVI,_250m_16_days_MIR_reflectance,_250m_16_days_NDVI,_250m_16_days_NIR_reflectance,_250m_16_days_VI_Quality,_250m_16_days_composite_day_of_the_year,_250m_16_days_pixel_reliability,_250m_16_days_red_reflectance
194,2023-11-17 00:00:00,1.513527e+06,-1.789661e+06,-127,0.3257,0.0285,0.8015,0.1897,4164.0,331.0,0.0,0.0209
938,2023-11-17 00:00:00,1.513295e+06,-1.789661e+06,-127,0.2549,0.0285,0.7009,0.1627,4164.0,331.0,0.0,0.0286
939,2023-11-17 00:00:00,1.513295e+06,-1.789430e+06,-127,0.3017,0.0285,0.7627,0.1828,4164.0,331.0,0.0,0.0246
940,2023-11-17 00:00:00,1.513295e+06,-1.789198e+06,-127,0.2998,0.0415,0.7746,0.1803,4164.0,331.0,0.0,0.0229
941,2023-11-17 00:00:00,1.513295e+06,-1.788966e+06,-127,0.2818,0.0415,0.6967,0.1908,4164.0,331.0,0.0,0.0341
...,...,...,...,...,...,...,...,...,...,...,...,...
415244,2023-12-03 00:00:00,1.452370e+06,-1.813290e+06,-127,0.4007,0.0828,0.7331,0.2527,4164.0,345.0,0.0,0.0389
415245,2023-12-03 00:00:00,1.452370e+06,-1.813058e+06,-127,0.3688,0.0828,0.6347,0.2900,4164.0,345.0,0.0,0.0648
415246,2023-12-03 00:00:00,1.452370e+06,-1.812827e+06,-127,0.3175,0.0787,0.6362,0.2110,4164.0,338.0,0.0,0.0469
415988,2023-12-03 00:00:00,1.452138e+06,-1.813290e+06,-127,0.3645,0.0828,0.7855,0.1965,4164.0,345.0,0.0,0.0236


In [33]:
import pandas as pd
from pyproj import Proj, Transformer

def convert_modis_to_latlon(df: pd.DataFrame, x_column: str, y_column: str) -> pd.DataFrame:
    """
    Convert MODIS sinusoidal projection coordinates to latitude and longitude.

    The transformation is applied to the whole coordinate columns in a single call.
    The input frame is modified in place: 'latitude' and 'longitude' columns are
    added (or overwritten) and the same DataFrame object is returned.

    Args:
        df (pd.DataFrame): DataFrame containing the MODIS coordinates.
        x_column (str): Name of the column containing x coordinates.
        y_column (str): Name of the column containing y coordinates.

    Returns:
        pd.DataFrame: DataFrame with added latitude and longitude columns.
    """
    # MODIS sinusoidal projection (sphere of radius 6371007.181)
    modis_proj = Proj('+proj=sinu +R=6371007.181 +nadgrids=@null +wktext')

    # WGS84 geographic coordinates
    wgs84_proj = Proj('epsg:4326')

    # always_xy=True pins the axis order to (x, y) in and (longitude, latitude) out,
    # instead of depending on the axis order declared by the EPSG:4326 definition.
    transformer = Transformer.from_proj(modis_proj, wgs84_proj, always_xy=True)

    # Transform both columns at once rather than one row at a time.
    longitude, latitude = transformer.transform(
        df[x_column].to_numpy(), df[y_column].to_numpy()
    )
    df['latitude'] = latitude
    df['longitude'] = longitude

    return df


In [None]:
# Convert the sinusoidal 'xdim'/'ydim' coordinates of `df` to latitude/longitude.
df_converted = convert_modis_to_latlon(df=df, x_column='xdim', y_column='ydim')

# Drop the projected coordinates and the 'crs' column. A non in-place drop returns a
# new frame, so `df` keeps these columns and the cell can be re-run without a KeyError.
df_converted = df_converted.drop(columns=['ydim', 'xdim', 'crs'])

# Move 'latitude' and 'longitude' to the first and second positions. .copy() makes the
# result an independent frame, so later edits do not raise SettingWithCopyWarning.
leading_columns = ['latitude', 'longitude']
df_converted = df_converted[
    leading_columns + [col for col in df_converted.columns if col not in leading_columns]
].copy()

# Show the converted DataFrame
df_converted


In [38]:
# Display the converted DataFrame (latitude/longitude first, then the MODIS layers).
df_converted

Unnamed: 0,latitude,longitude,time,_250m_16_days_EVI,_250m_16_days_MIR_reflectance,_250m_16_days_NDVI,_250m_16_days_NIR_reflectance,_250m_16_days_VI_Quality,_250m_16_days_composite_day_of_the_year,_250m_16_days_pixel_reliability,_250m_16_days_red_reflectance
194,13.611458,-16.559894,2023-11-17 00:00:00,0.3257,0.0285,0.8015,0.1897,4164.0,331.0,0.0,0.0209
938,13.609375,-16.559748,2023-11-17 00:00:00,0.2549,0.0285,0.7009,0.1627,4164.0,331.0,0.0,0.0286
939,13.609375,-16.557604,2023-11-17 00:00:00,0.3017,0.0285,0.7627,0.1828,4164.0,331.0,0.0,0.0246
940,13.609375,-16.555461,2023-11-17 00:00:00,0.2998,0.0415,0.7746,0.1803,4164.0,331.0,0.0,0.0229
941,13.609375,-16.553317,2023-11-17 00:00:00,0.2818,0.0415,0.6967,0.1908,4164.0,331.0,0.0,0.0341
...,...,...,...,...,...,...,...,...,...,...,...
415244,13.061458,-16.740396,2023-12-03 00:00:00,0.4007,0.0828,0.7331,0.2527,4164.0,345.0,0.0,0.0389
415245,13.061458,-16.738257,2023-12-03 00:00:00,0.3688,0.0828,0.6347,0.2900,4164.0,345.0,0.0,0.0648
415246,13.061458,-16.736118,2023-12-03 00:00:00,0.3175,0.0787,0.6362,0.2110,4164.0,338.0,0.0,0.0469
415988,13.059375,-16.740254,2023-12-03 00:00:00,0.3645,0.0828,0.7855,0.1965,4164.0,345.0,0.0,0.0236


In [39]:
# Drop the red-reflectance and composite-day-of-the-year columns.
# Reassigning the result (instead of inplace=True) avoids SettingWithCopyWarning
# when `df_converted` was produced by an indexing operation.
df_converted = df_converted.drop(
    columns=['_250m_16_days_red_reflectance', '_250m_16_days_composite_day_of_the_year']
)

# Show the first rows of the DataFrame
df_converted.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_converted.drop(['_250m_16_days_red_reflectance', '_250m_16_days_composite_day_of_the_year'], axis=1, inplace=True)


Unnamed: 0,latitude,longitude,time,_250m_16_days_EVI,_250m_16_days_MIR_reflectance,_250m_16_days_NDVI,_250m_16_days_NIR_reflectance,_250m_16_days_VI_Quality,_250m_16_days_pixel_reliability
194,13.611458,-16.559894,2023-11-17 00:00:00,0.3257,0.0285,0.8015,0.1897,4164.0,0.0
938,13.609375,-16.559748,2023-11-17 00:00:00,0.2549,0.0285,0.7009,0.1627,4164.0,0.0
939,13.609375,-16.557604,2023-11-17 00:00:00,0.3017,0.0285,0.7627,0.1828,4164.0,0.0
940,13.609375,-16.555461,2023-11-17 00:00:00,0.2998,0.0415,0.7746,0.1803,4164.0,0.0
941,13.609375,-16.553317,2023-11-17 00:00:00,0.2818,0.0415,0.6967,0.1908,4164.0,0.0


In [40]:
# Display the DataFrame with the remaining columns.
df_converted

Unnamed: 0,latitude,longitude,time,_250m_16_days_EVI,_250m_16_days_MIR_reflectance,_250m_16_days_NDVI,_250m_16_days_NIR_reflectance,_250m_16_days_VI_Quality,_250m_16_days_pixel_reliability
194,13.611458,-16.559894,2023-11-17 00:00:00,0.3257,0.0285,0.8015,0.1897,4164.0,0.0
938,13.609375,-16.559748,2023-11-17 00:00:00,0.2549,0.0285,0.7009,0.1627,4164.0,0.0
939,13.609375,-16.557604,2023-11-17 00:00:00,0.3017,0.0285,0.7627,0.1828,4164.0,0.0
940,13.609375,-16.555461,2023-11-17 00:00:00,0.2998,0.0415,0.7746,0.1803,4164.0,0.0
941,13.609375,-16.553317,2023-11-17 00:00:00,0.2818,0.0415,0.6967,0.1908,4164.0,0.0
...,...,...,...,...,...,...,...,...,...
415244,13.061458,-16.740396,2023-12-03 00:00:00,0.4007,0.0828,0.7331,0.2527,4164.0,0.0
415245,13.061458,-16.738257,2023-12-03 00:00:00,0.3688,0.0828,0.6347,0.2900,4164.0,0.0
415246,13.061458,-16.736118,2023-12-03 00:00:00,0.3175,0.0787,0.6362,0.2110,4164.0,0.0
415988,13.059375,-16.740254,2023-12-03 00:00:00,0.3645,0.0828,0.7855,0.1965,4164.0,0.0


In [45]:
# Distribution of the pixel-reliability ranks before filtering
print(df_converted['_250m_16_days_pixel_reliability'].value_counts())

# Keep only rows whose reliability rank is 0 (the "good data" rank according to the
# MOD13Q1 product documentation - confirm against the user guide). .copy() detaches
# the filtered frame so later edits do not raise SettingWithCopyWarning.
df_converted = df_converted[df_converted['_250m_16_days_pixel_reliability'] == 0.0].copy()

# Distribution after filtering: only rank 0 should remain
print(df_converted['_250m_16_days_pixel_reliability'].value_counts())

_250m_16_days_pixel_reliability
0.0    206617
1.0      1030
3.0        66
Name: count, dtype: int64
_250m_16_days_pixel_reliability
0.0    206617
Name: count, dtype: int64


In [47]:
# The quality layers are no longer needed once the rows have been filtered.
# Reassigning the result (instead of inplace=True) avoids SettingWithCopyWarning
# on a frame that came from a boolean filter.
df_converted = df_converted.drop(
    columns=['_250m_16_days_VI_Quality', '_250m_16_days_pixel_reliability']
)
df_converted


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_converted.drop(['_250m_16_days_VI_Quality', '_250m_16_days_pixel_reliability'], axis=1, inplace=True)


Unnamed: 0,latitude,longitude,time,_250m_16_days_EVI,_250m_16_days_MIR_reflectance,_250m_16_days_NDVI,_250m_16_days_NIR_reflectance
194,13.611458,-16.559894,2023-11-17 00:00:00,0.3257,0.0285,0.8015,0.1897
938,13.609375,-16.559748,2023-11-17 00:00:00,0.2549,0.0285,0.7009,0.1627
939,13.609375,-16.557604,2023-11-17 00:00:00,0.3017,0.0285,0.7627,0.1828
940,13.609375,-16.555461,2023-11-17 00:00:00,0.2998,0.0415,0.7746,0.1803
941,13.609375,-16.553317,2023-11-17 00:00:00,0.2818,0.0415,0.6967,0.1908
...,...,...,...,...,...,...,...
415244,13.061458,-16.740396,2023-12-03 00:00:00,0.4007,0.0828,0.7331,0.2527
415245,13.061458,-16.738257,2023-12-03 00:00:00,0.3688,0.0828,0.6347,0.2900
415246,13.061458,-16.736118,2023-12-03 00:00:00,0.3175,0.0787,0.6362,0.2110
415988,13.059375,-16.740254,2023-12-03 00:00:00,0.3645,0.0828,0.7855,0.1965


In [49]:
# Row count before de-duplication
total_rows = len(df_converted)
print("Total number of rows:", total_rows)

# Keep the first occurrence of each (latitude, longitude) pair and discard the rest.
# NOTE(review): the frame holds more than one 'time' value, so the retained row of a
# pixel may come from either date - confirm that mixing dates is intended.
df_converted = df_converted.drop_duplicates(subset=['latitude', 'longitude'], keep='first')

# Row count after de-duplication
total_rows = len(df_converted)
print("Total number of rows:", total_rows)

Total number of rows: 206617
Total number of rows: 104669


In [51]:
# Drop the 'time' column. Reassigning the result (instead of inplace=True) avoids
# SettingWithCopyWarning on a frame returned by drop_duplicates.
df_converted = df_converted.drop(columns=['time'])
df_converted

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_converted.drop(['time'], axis=1, inplace=True)


Unnamed: 0,latitude,longitude,_250m_16_days_EVI,_250m_16_days_MIR_reflectance,_250m_16_days_NDVI,_250m_16_days_NIR_reflectance
194,13.611458,-16.559894,0.3257,0.0285,0.8015,0.1897
938,13.609375,-16.559748,0.2549,0.0285,0.7009,0.1627
939,13.609375,-16.557604,0.3017,0.0285,0.7627,0.1828
940,13.609375,-16.555461,0.2998,0.0415,0.7746,0.1803
941,13.609375,-16.553317,0.2818,0.0415,0.6967,0.1908
...,...,...,...,...,...,...
410777,13.073958,-16.747660,0.3321,0.1650,0.5587,0.2529
411518,13.071875,-16.753935,0.0344,0.1650,0.1055,0.0854
411519,13.071875,-16.751796,0.0344,0.1650,0.1055,0.0854
411520,13.071875,-16.749657,0.2304,0.1650,0.4039,0.2530


In [None]:
# Write the processed table to CSV without the index column. The target directory is
# created first so the export does not fail when it does not exist yet.
output_csv = Path('../../data/final_dataset/processed_data/appears/appears_ndvi_mir_evi.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)
df_converted.to_csv(output_csv, index=False)
