In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics

In [None]:
# Mount Google Drive at /content/drive so files stored in Drive can be read
# through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Base directory inside the mounted Drive; data paths are built from it.
drive_path = '/content/drive/MyDrive/Colab Notebooks/'

Cargamos el dataset de medidas in situ.

In [None]:
#!ls /content/drive/MyDrive/Colab\ Notebooks/

In [None]:
# Sub-folder (appended to drive_path) that contains the CSV files to load.
folder_path = 'COPERNICUS_GBOV_RM7_20253103525/RM7/'

In [None]:
import glob

# Collect every file ending in .csv inside the data folder. glob returns the
# paths in arbitrary, filesystem-dependent order, so they are sorted to keep
# the row order of the combined DataFrame reproducible between runs.
file_list = sorted(glob.glob(drive_path + folder_path + "*.csv"))

Primero recorremos la carpeta y abrimos cada CSV para identificar cuáles cumplen con las columnas requeridas y cuáles no.

In [None]:
# Column names that every in-situ CSV is expected to contain
expected_columns = [
    'GBOV_ID', 'Site', 'GROUND_DATA_PI', 'GROUND_DATA_PIs_Email', 'GBOV_Email', 'Network',
    'Elevation', 'IGBP_class', 'Lat_IS', 'Lon_IS', 'TIME_IS', 'Version', 'up_flag', 'down_flag',
    'LAI_Miller_up', 'LAI_Warren_up', 'LAIe_Miller_up', 'LAIe_Warren_up', 'LAI_Miller_down',
    'LAI_Warren_down', 'LAIe_Miller_down', 'LAIe_Warren_down', 'LAI_Miller_up_err', 'LAI_Warren_up_err',
    'LAIe_Miller_up_err', 'LAIe_Warren_up_err', 'clumping_Miller_up', 'clumping_Warren_up',
    'LAI_Miller_down_err', 'LAI_Warren_down_err', 'LAIe_Miller_down_err', 'LAIe_Warren_down_err',
    'clumping_Miller_down', 'clumping_Warren_down', 'clumping_Miller_up_err', 'clumping_Warren_up_err',
    'clumping_Miller_down_err', 'clumping_Warren_down_err'
]

# Paths of the files whose header does not match expected_columns
invalid_files = []

# Step 1: read the header of each CSV file and compare it with the expected columns
for file in file_list:
    # nrows=0 parses only the header row; the data rows are not needed for this check.
    # The names are stripped the same way the loading step strips them, so a file
    # is not rejected merely because of whitespace padding around a header name.
    header_columns = pd.read_csv(file, sep=';', nrows=0).columns.str.strip()

    missing_columns = [col for col in expected_columns if col not in header_columns]
    extra_columns = [col for col in header_columns if col not in expected_columns]

    # Any missing or unexpected column marks the file as invalid
    if missing_columns or extra_columns:
        invalid_files.append(file)

# Step 2: show the files that failed the check
print("Archivos que no cumplen con las columnas esperadas:")
invalid_files


Archivos que no cumplen con las columnas esperadas:


['/content/drive/MyDrive/Colab Notebooks/COPERNICUS_GBOV_RM7_20253103525/RM7/GBOV_RM7_HARV_HARV_011_20180814T110900Z_20180814T110900Z_021_ACR_1.0.csv',
 '/content/drive/MyDrive/Colab Notebooks/COPERNICUS_GBOV_RM7_20253103525/RM7/GBOV_RM7_HARV_HARV_024_20180814T203500Z_20180814T203500Z_021_ACR_1.0.csv']

In [None]:
# Keep only the files that passed the column check. The rejected paths are
# held in a set so each membership test is a hash lookup instead of a list scan.
_invalid_lookup = set(invalid_files)
valid_files_list = [path for path in file_list if path not in _invalid_lookup]

Creamos un único DataFrame a partir de todos los CSV válidos.

In [None]:
# Load every valid CSV and collect the per-file DataFrames
df_juntos = []
for file in valid_files_list:
    df_invidual = pd.read_csv(file, sep=';')

    # Remove stray whitespace around the column names
    df_invidual.columns = df_invidual.columns.str.strip()

    # Report any file whose column count differs from the expected schema
    # (compared against expected_columns rather than a hard-coded number)
    if len(df_invidual.columns) != len(expected_columns):
        print(len(df_invidual.columns))
        print(file)

    df_juntos.append(df_invidual)

# pd.concat fails with a generic "No objects to concatenate" on an empty list;
# raise the same exception type with a message that points at the real cause.
if not df_juntos:
    raise ValueError("No hay archivos CSV válidos para combinar en df_insitu")

# Stack all per-file DataFrames into one, with a fresh 0..n-1 index
df_insitu = pd.concat(df_juntos, axis=0, ignore_index=True)

# Preview the combined DataFrame
df_insitu.head()

Unnamed: 0,GBOV_ID,Site,GROUND_DATA_PI,GROUND_DATA_PIs_Email,GBOV_Email,Network,Elevation,IGBP_class,Lat_IS,Lon_IS,...,LAI_Miller_down_err,LAI_Warren_down_err,LAIe_Miller_down_err,LAIe_Warren_down_err,clumping_Miller_down,clumping_Warren_down,clumping_Miller_up_err,clumping_Warren_up_err,clumping_Miller_down_err,clumping_Warren_down_err
0,GBOV_RM7_1223,Harvard Forest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,351,Mixed Forest,42.5378,-72.171501,...,0.027,0.04,0.017,0.022,0.75,0.8,0.029,0.04,0.11,0.19
1,GBOV_RM7_1223,Harvard Forest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,351,Mixed Forest,42.5378,-72.171501,...,,,,,,,,,,
2,GBOV_RM7_1235,Harvard Forest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,351,Mixed Forest,42.5378,-72.171501,...,0.11,0.18,0.07,0.11,0.76,0.74,0.033,0.05,0.08,0.17
3,GBOV_RM7_1235,Harvard Forest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,351,Mixed Forest,42.5378,-72.171501,...,,,,,,,,,,
4,GBOV_RM7_1244,Harvard Forest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,351,Mixed Forest,42.5378,-72.171501,...,0.21,0.32,0.14,0.22,0.64,0.67,0.035,0.06,0.07,0.16


Hacemos un inventario de los sitios y las coordenadas de las medidas de campo.

In [None]:
# Unique (site, latitude, longitude) combinations found in the in-situ data
site_columns = ['Site', 'Lat_IS', 'Lon_IS']
df_coord = df_insitu[site_columns].drop_duplicates()

# One plain-Python list per column, zipped row-wise into a tuple of triples
column_values = [df_coord[name].tolist() for name in site_columns]
coordenadas = tuple(zip(*column_values))
coordenadas

(('Harvard Forest', 42.5377998352051, -72.171501159668),
 ('Jones Ecological Research Center', 31.1948394775391, -84.468777),
 ('Jornada', 32.5907, -106.84261),
 ('Konza Prairie Biological Station', 39.110446, -96.612935),
 ('Lajas Experimental Station', 18.02125, -67.0769),
 ('Lenoir Landing', 31.853861, -88.161181),
 ('Moab', 38.24836, -109.38831),
 ('Niwot Ridge Mountain Research Station', 40.0543, -105.58245),
 ('Onaqui Ault', 40.1775894165039, -112.452438354492),
 ('Oak Ridge', 35.9641189575195, -84.282600402832),
 ('Ordway Swisher Biological Station', 29.6839, -81.9934),
 ('Pu u Maka ala Natural Area Reserve', 19.55309, -155.31731),
 ('Smithsonian Conservation Biology Institute',
  38.8929214477539,
  -78.1395034790039),
 ('Smithsonian Environmental Research Center', 38.89016, -76.5601),
 ('Soaproot Saddle', 37.03337, -119.26219),
 ('Santa Rita', 31.91068, -110.83549),
 ('Steigerwaldt Land Services', 45.5089416503906, -89.5863723754883),
 ('North Sterling', 40.461952, -103.02934)

In [None]:
# Print a summary of the combined DataFrame (columns, non-null counts, dtypes)
df_insitu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9406 entries, 0 to 9405
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   GBOV_ID                   9406 non-null   object 
 1   Site                      9406 non-null   object 
 2   GROUND_DATA_PI            9176 non-null   object 
 3   GROUND_DATA_PIs_Email     9176 non-null   object 
 4   GBOV_Email                9406 non-null   object 
 5   Network                   9406 non-null   object 
 6   Elevation                 9406 non-null   int64  
 7   IGBP_class                9406 non-null   object 
 8   Lat_IS                    9406 non-null   float64
 9   Lon_IS                    9406 non-null   float64
 10  TIME_IS                   9406 non-null   object 
 11  Version                   9406 non-null   float64
 12  up_flag                   7713 non-null   float64
 13  down_flag                 7713 non-null   float64
 14  LAI_Mill

Grabamos en un CSV el DataFrame in situ crudo.

In [None]:
#df_insitu_backup = df_insitu.copy()
#df_insitu.to_csv(drive_path + 'NEON_LAI_USA_2017_2022.csv')