In [2]:
from pathlib import Path

import pandas as pd

from modules import preprocess

# Enable autoreload for Jupyter notebooks: with mode 2, IPython reloads all
# imported modules before each cell is executed, so edits to the local
# `modules` package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2


### 1. Define study crop and country

In [4]:
# USER INPUTS
country = "US"  # one of ["US", "BR"]
crop = "wheat"  # one of ["maize", "wheat"]

# Look up the study configuration for the chosen country/crop pair. The helper
# returns four values; the last one is not needed in this notebook.
study_metadata = preprocess.get_study_metadata(country, crop)
(
    shapefile_path,
    crop_season_in_days_of_year,
    crop_season_in_months,
    _,
) = study_metadata

# Echo the resolved configuration so the run is self-documenting.
print(shapefile_path, crop, country, crop_season_in_days_of_year, crop_season_in_months)

../data/shapefiles/US/tl_2023_us_county/tl_2023_us_county.shp wheat US (33, 233) (2, 8)


### 2. Read data

From CY-Bench, we have five predictor datasets. 

| **ID** | **Name**        | **Time** | **Variables**                         | **Steps**                                    | **Notebook**   |
|--------|-----------------|----------|---------------------------------------|----------------------------------------------|----------------|
| 1      | FPAR            | bins     | fpar                                  | filter adm_ids, filter crop season           | _this notebook_ |            
| 2      | METEO           | daily    | tmin, tmax, prec, rad, tavg, et0, cwb | filter adm_ids, filter crop season,<br>resample to 8-day bins  | _3_ecmwf_preprocessing.ipynb_ (tmin, tmax, tavg, prec) <br>_this notebook_ (et0, rad, cwb) |
| 3      | NDVI            | bins     | ndvi                                  | filter adm_ids, filter crop season, pivot           | _this notebook_ |
| 4      | SOIL MOISTURE   | daily    | ssm, rsm                              | filter adm_ids, filter crop season, <br>resample to 8-day bins, pivot  | _this notebook_ |
| 5      | SOIL            | static   | awc, bulk_density, drainage_class     | filter adm_ids, pivot                                | _this notebook_ |


In [121]:
# CY-Bench data
def read_cy_bench_csv(dataset, **read_csv_kwargs):
    """Read one CY-Bench CSV for the currently selected `country` and `crop`.

    Parameters
    ----------
    dataset : str
        File-name prefix of the dataset, e.g. "fpar" or "soil_moisture".
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame
        Contents of
        "../data/CY-Bench/<country>/<crop>/<dataset>_<crop>_<country>.csv".
    """
    # Single definition of the path layout, so it cannot drift between datasets.
    path = "../data/CY-Bench/{}/{}/{}_{}_{}.csv".format(country, crop, dataset, crop, country)
    return pd.read_csv(path, **read_csv_kwargs)


fpar = read_cy_bench_csv("fpar")
# Only rad, et0 and cwb are read from METEO here; per the table above, the
# temperature and precipitation variables are handled in a separate notebook.
meteo = read_cy_bench_csv("meteo", usecols=['crop_name', 'adm_id', 'date', 'rad', 'et0', 'cwb'])
ndvi = read_cy_bench_csv("ndvi")
soil_moisture = read_cy_bench_csv("soil_moisture")
soil = read_cy_bench_csv("soil")
yield_data = read_cy_bench_csv("yield")

### 3. Preprocess

In [122]:
# Administrative units that appear in the yield table.
relevant_adm_ids = yield_data["adm_id"].unique()

# Pass all predictor tables through the adm_id filter in one call, then unpack
# them in the same order they were passed in.
# NOTE(review): presumably this keeps only rows whose adm_id is in
# `relevant_adm_ids` -- confirm in modules/preprocess.
cy_bench_data = preprocess.filter_predictors_by_adm_ids(
    [fpar, soil_moisture, ndvi, soil, meteo],
    relevant_adm_ids,
)
fpar, soil_moisture, ndvi, soil, meteo = cy_bench_data

In [125]:
# Only the time-varying predictors go through temporal preprocessing; the
# static `soil` table is left out. Results are unpacked in the same order they
# were passed in.
# NOTE(review): per the table above this step is expected to restrict data to
# the crop season and bring it to a common layout -- confirm in
# modules/preprocess.
temporal_cy_bench_data = preprocess.preprocess_temporal_data(
    [fpar, soil_moisture, ndvi, meteo],
    crop_season_in_days_of_year,
)
fpar, soil_moisture, ndvi, meteo = temporal_cy_bench_data

In [126]:
# merge
# The temporal tables are inner-joined on (adm_id, harvest_year), starting with
# meteo+fpar and then wrapping ndvi and soil_moisture around the result as the
# left-hand side. The static soil table is then left-joined on adm_id alone.
season_keys = ["adm_id", "harvest_year"]
predictors = (
    meteo.merge(fpar, on=season_keys, how="inner")
    .pipe(lambda meteo_fpar: ndvi.merge(meteo_fpar, on=season_keys, how="inner"))
    .pipe(lambda ndvi_meteo_fpar: soil_moisture.merge(ndvi_meteo_fpar, on=season_keys, how="inner"))
    .merge(soil.drop(columns="crop_name"), on="adm_id", how="left")
)

# one-hot encode drainage_class and drop
# Casting to nullable Int64 first means the indicator columns are named after
# integer class values; the raw column is removed afterwards.
drainage_dummies = pd.get_dummies(
    predictors["drainage_class"].astype('Int64'),
    prefix="drainage_class",
    dtype=int,
)
predictors = predictors.drop(columns="drainage_class").join(drainage_dummies)

In [128]:
# Write the merged predictor table to the per-country output folder. The folder
# is created first because `DataFrame.to_csv` does not create missing parent
# directories and would otherwise raise for a country without an existing
# output folder.
output_path = Path("../data/preprocessed/{}/ndvi_soil_soil_moisture_meteo_fpar_{}_{}.csv".format(country, crop, country))
output_path.parent.mkdir(parents=True, exist_ok=True)
predictors.to_csv(output_path, index=False)