# This notebook processes a folder containing all the .csv files taken from AppEEARS with MOD13Q1 (NDVI) data, pre-processes them, and saves them.

### First, we explicitly install the libraries we will use and import them.

In [1]:
!pip3 install -q --upgrade pip
!pip3 install -q pandas numpy matplotlib seaborn openpyxl climateserv requests netCDF4 xarray pyproj

In [2]:
import sys
from pathlib import Path

# Resolve the directory two levels above the working directory to an absolute
# path. It is the folder that contains the `scripts` package imported below.
scripts_dir = Path("../../").resolve()

# Register that directory on sys.path once, so that re-running this cell
# does not append duplicate entries.
if not any(entry == str(scripts_dir) for entry in sys.path):
    sys.path.append(str(scripts_dir))

import pandas as pd
from scripts.preprocessing.appears_ndvi_pre_process import clean_and_save_ndvi_dataset, process_all_csv_in_directory, merge_additional_data

### We use the functions declared in the Python file 'scripts/preprocessing/appears_ndvi_pre_process.py'

### Below we use the function process_all_csv_in_directory to clean the data; basically, we remove the data that has clouds. The value 0b00000000000000000000000000000000 means data without clouds.

In [4]:
# Folder with the original AppEEARS NDVI .csv files.
directory_path = '../../data/original_data/appears/ndvi/'
# Folder handed to the helper as the destination for its output.
output_directory = '../../data/processed_data/appears/ndvi'

# Run the pre-processing helper over every .csv file in the input folder.
process_all_csv_in_directory(
    directory_path=directory_path,
    output_directory=output_directory,
)

Processing file: ../../data/original_data/appears/ndvi/0000024001.csv
Percentage of rows with value '0b00000000000000000000000000000000': 70.81%
Number of rows after cleaning: 131
Data cleaning complete. Number of rows after cleaning and filling: 2923
Finished processing: ../../data/original_data/appears/ndvi/0000024001.csv
Processing file: ../../data/original_data/appears/ndvi/NWP00001.csv
Percentage of rows with value '0b00000000000000000000000000000000': 68.65%
Number of rows after cleaning: 127
Data cleaning complete. Number of rows after cleaning and filling: 2923
Finished processing: ../../data/original_data/appears/ndvi/NWP00001.csv
Processing file: ../../data/original_data/appears/ndvi/0000000001.csv
Percentage of rows with value '0b00000000000000000000000000000000': 68.65%
Number of rows after cleaning: 127
Data cleaning complete. Number of rows after cleaning and filling: 2923
Finished processing: ../../data/original_data/appears/ndvi/0000000001.csv
Processing file: ../../dat

### Then we merge the result with the master dataset, which contains the information about the different wells and their groundwater level over time (between 2015 and 2022). We delete all rows that share the same well and the same day, reducing the dataset from 25406 rows to 6636. Some wells record information more than once on the same day (2, 3 or 4 times), and for this project we use a daily time step.

In [4]:
# Master dataset with the wells and their groundwater levels.
main_df_path = '../../data/processed_data/merged_data_gambia.csv'
# Folder with the processed AppEEARS NDVI files produced earlier in this notebook.
additional_data_dir = '../../data/processed_data/appears/ndvi/'

# Combine the master dataset with the NDVI data; the helper prints the row
# counts before and after the merge.
merged_df = merge_additional_data(
    main_df_path,
    additional_data_dir,
)

Initial number of rows in main dataframe: 25406
Number of rows in final dataframe: 6636


In [15]:
# Coordinate columns carrying the `_y` suffix (presumably duplicates created
# by the merge) that are not needed in the final dataset.
columns_to_drop = ['Latitude_y', 'Longitude_y']

# Remove them. errors="ignore" keeps this cell re-runnable: once the columns
# are gone, running the cell again is a no-op instead of raising a KeyError.
merged_df = merged_df.drop(columns=columns_to_drop, errors="ignore")

In [17]:
# Old name -> new name: strip the `_x` suffix from the coordinate columns.
column_name_changes = {'Latitude_x': 'Latitude', 'Longitude_x': 'Longitude'}

# rename() returns a new frame, which replaces the previous `merged_df`.
merged_df = merged_df.rename(columns=column_name_changes)


In [18]:
# Show the merged frame as the cell output to inspect the final columns and values.
merged_df

Unnamed: 0,ID,GROUNDWATER_LEVEL,Day,Month,Year,Latitude,Longitude,Ground surface elevation,LIS_Soil_Moisture_Combined,LIS_Streamflow,LIS_ET,Date,MOD13Q1_061__250m_16_days_EVI,MOD13Q1_061__250m_16_days_MIR_reflectance,MOD13Q1_061__250m_16_days_NDVI
0,NWP00001,14.26,14,4,2016,13.420804,-16.716666,19.47,0.172124,0.059371,0.219499,2016-04-14,0.2113,0.3972,0.2564
1,NWP00001,13.82,26,12,2015,13.420804,-16.716666,19.47,0.210837,0.136391,1.412334,2015-12-26,0.2443,0.1787,0.4064
3,NWP00001,13.82,25,12,2015,13.420804,-16.716666,19.47,0.211614,0.138738,1.612549,2015-12-25,0.2443,0.1787,0.4064
7,NWP00001,13.83,24,12,2015,13.420804,-16.716666,19.47,0.212448,0.141231,1.522625,2015-12-24,0.2443,0.1787,0.4064
11,NWP00001,13.83,23,12,2015,13.420804,-16.716666,19.47,0.213260,0.143826,1.481390,2015-12-23,0.2443,0.1787,0.4064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25387,NWP21002,16.85,5,1,2015,13.240281,-16.713889,21.68,0.232334,0.085506,1.118654,2015-01-05,0.1795,0.2054,0.3419
25391,NWP21002,16.79,4,1,2015,13.240281,-16.713889,21.68,0.232903,0.086226,1.095831,2015-01-04,0.1795,0.2054,0.3419
25395,NWP21002,16.82,3,1,2015,13.240281,-16.713889,21.68,0.233475,0.086970,1.068498,2015-01-03,0.1795,0.2054,0.3419
25399,NWP21002,16.82,2,1,2015,13.240281,-16.713889,21.68,0.234051,0.087731,1.069310,2015-01-02,0.1795,0.2054,0.3419


In [19]:
# Destination file for the final merged dataset.
csv_path = '../../data/processed_data/final_data_gambia.csv'

# Write the frame to disk; index=False leaves the DataFrame index out of the file.
merged_df.to_csv(
    csv_path,
    index=False,
)

### Finally, we have merged three variables from AppEEARS into our dataset: NDVI, EVI, and MIR reflectance.