In [1]:
import os
from datetime import datetime

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from modules import preprocess

# Enable autoreload for Jupyter notebooks
%load_ext autoreload
%autoreload 2


### 1. Define study crop and country

In [2]:
# USER INPUTS
country = "BR" # one of ["US", "BR"]
crop = "wheat" # one of ["maize", "wheat"]

# set_crop_season yields five values; the first and last are not needed in this notebook.
# The three middle ones describe the crop season as (start, end) pairs in days of year,
# months and time steps respectively.
crop_season = preprocess.set_crop_season(country, crop)
_, crop_season_in_days_of_year, crop_season_in_months, crop_season_in_time_steps, _ = crop_season

# Echo the selection so the rendered notebook records which configuration was run.
print(crop, country, crop_season_in_days_of_year, crop_season_in_months, crop_season_in_time_steps)

wheat BR (129, 329) (5, 11) (16, 41)


### 2. Read data

From CY-Bench, we have five predictor datasets. 

| **ID** | **Name**        | **Time** | **Variables**                         | **Steps**                                    | **Notebook**   |
|--------|-----------------|----------|---------------------------------------|----------------------------------------------|----------------|
| 1      | FPAR            | bins     | fpar                                  | filter adm_ids, filter crop season           | _on hold because of 10-day bins_ |            
| 2      | METEO           | daily    | tmin, tmax, prec, rad, tavg, et0, cwb | filter adm_ids, filter crop season,<br>resample to 8-day bins  |_3_ecmwf_preprocessing.ipynb_ |       
| 3      | NDVI            | bins     | ndvi                                  | filter adm_ids, filter crop season, pivot           | _this notebook_ |
| 4      | SOIL MOISTURE   | daily    | ssm, rsm                              | filter adm_ids, filter crop season, <br>resample to 8-day bins, pivot  | _this notebook_ |
| 5      | SOIL            | static   | awc, bulk_density, drainage_class     | filter adm_ids, pivot                                | _this notebook_ |


In [3]:
# CY-Bench data
def cy_bench_csv(dataset):
    """Return the path of the CY-Bench CSV file for `dataset`.

    The path is built from the `country` and `crop` chosen in the user-input cell:
    ../data/CY-Bench/<country>/<crop>/<dataset>_<crop>_<country>.csv
    """
    return "../data/CY-Bench/{0}/{1}/{2}_{1}_{0}.csv".format(country, crop, dataset)

# fpar = pd.read_csv(cy_bench_csv("fpar"))  # ignored for now because of 10-day bin logic
meteo = pd.read_csv(cy_bench_csv("meteo"))
ndvi = pd.read_csv(cy_bench_csv("ndvi"))
soil_moisture = pd.read_csv(cy_bench_csv("soil_moisture"))
soil = pd.read_csv(cy_bench_csv("soil"))
yield_data = pd.read_csv(cy_bench_csv("yield"))

In [6]:
# Drop the temperature and precipitation columns from the CY-Bench meteo table.
# NOTE(review): presumably these variables are handled in the ECMWF preprocessing
# notebook instead — confirm.
# errors="ignore" keeps this cell re-runnable: `meteo` is overwritten here, so a second
# run would otherwise raise KeyError because the columns are already gone.
meteo = meteo.drop(columns=["tmin", "tmax", "tavg", "prec"], errors="ignore")

### 3. Preprocess

In [8]:
# Filter every predictor table by the adm_ids that occur in the yield data.
relevant_adm_ids = yield_data["adm_id"].unique()
cy_bench_data = preprocess.filter_predictors_by_adm_ids(
    [soil_moisture, ndvi, soil, meteo],
    relevant_adm_ids,
)
soil_moisture, ndvi, soil, meteo = cy_bench_data

# Only the time-dependent tables go through the temporal preprocessing; it receives the
# first and second entry of the crop season expressed in days of year.
# NOTE(review): this cell overwrites its own inputs, so re-run from the data-loading cell
# when starting over.
season_start_doy = crop_season_in_days_of_year[0]
season_end_doy = crop_season_in_days_of_year[1]
temporal_cy_bench_data = preprocess.preprocess_temporal_data(
    [soil_moisture, ndvi, meteo],
    season_start_doy,
    season_end_doy,
)
soil_moisture, ndvi, meteo = temporal_cy_bench_data

In [9]:
# Inner joins keep only the (adm_id, harvest_year) pairs present in all three temporal
# tables; the left join then attaches the soil columns (without crop_name) per adm_id.
temporal_keys = ["adm_id", "harvest_year"]
ndvi_meteo = ndvi.merge(meteo, on=temporal_keys, how="inner")
soil_features = soil.drop(columns="crop_name")
predictors = (
    soil_moisture
    .merge(ndvi_meteo, on=temporal_keys, how="inner")
    .merge(soil_features, on="adm_id", how="left")
)

In [10]:
# Show the merged predictor table as the cell output.
predictors

Unnamed: 0,adm_id,harvest_year,rsm_16,rsm_17,rsm_18,rsm_19,rsm_20,rsm_21,rsm_22,rsm_23,...,rad_35,rad_36,rad_37,rad_38,rad_39,rad_40,rad_41,awc,bulk_density,drainage_class
0,BR1708205,2003,337.404655,315.760456,319.325008,312.462204,299.460297,292.236225,289.530174,285.399044,...,1.811274e+07,2.003537e+07,2.212762e+07,1.675602e+07,1.972819e+07,2.145100e+07,1.715350e+07,9.649961,1.412437,4
1,BR1708205,2004,382.519207,377.847706,365.390263,357.854755,343.191765,329.884411,317.256702,307.878769,...,2.001343e+07,2.056684e+07,1.956121e+07,2.246551e+07,2.118079e+07,2.029737e+07,1.903996e+07,9.649961,1.412437,4
2,BR1708205,2005,367.647449,353.895252,351.147282,334.032978,322.060635,313.053799,304.589188,297.339233,...,2.287863e+07,2.194676e+07,2.123775e+07,2.252656e+07,1.837911e+07,2.114450e+07,2.039334e+07,9.649961,1.412437,4
3,BR1708205,2006,387.532467,374.339924,361.445862,349.976894,336.856499,324.917526,316.450485,308.504227,...,1.921574e+07,1.730705e+07,1.834763e+07,1.961349e+07,1.916071e+07,2.100401e+07,2.101895e+07,9.649961,1.412437,4
4,BR1708205,2007,328.856918,316.454212,318.881104,312.708534,299.599422,292.405163,287.853958,284.415524,...,2.349868e+07,2.081168e+07,2.184545e+07,2.182931e+07,2.087178e+07,2.073834e+07,2.140030e+07,9.649961,1.412437,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33154,BR5300108,2019,322.708698,324.846794,311.965908,302.782288,298.143192,294.016689,290.031528,286.731789,...,2.414195e+07,2.196093e+07,2.318530e+07,2.307508e+07,2.380996e+07,1.905182e+07,1.671290e+07,12.719206,1.223032,5
33155,BR5300108,2020,346.378040,347.674118,344.329533,332.235607,322.841179,319.498642,314.439663,309.526657,...,2.281819e+07,2.017720e+07,1.783677e+07,1.854395e+07,2.079530e+07,1.793646e+07,1.660241e+07,12.719206,1.223032,5
33156,BR5300108,2021,310.081059,299.494350,291.828041,286.936256,287.716194,288.303307,283.227303,282.154102,...,1.956095e+07,2.044334e+07,2.238928e+07,1.912514e+07,1.257663e+07,1.725934e+07,2.727122e+07,12.719206,1.223032,5
33157,BR5300108,2022,314.599403,312.273285,307.376408,302.051262,298.907875,296.157715,293.513729,290.750195,...,2.560062e+07,2.184794e+07,2.038247e+07,1.672491e+07,2.013616e+07,2.120185e+07,1.860590e+07,12.719206,1.223032,5


In [11]:
# Write the merged predictor table to the per-country preprocessed folder.
output_path = "../data/preprocessed/{}/ndvi_soil_soil_moisture_{}_{}.csv".format(country, crop, country)
# to_csv does not create missing folders, so make sure the country directory exists first
# (e.g. on a fresh checkout or when switching to a new country).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
predictors.to_csv(output_path, index=False)