In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from modules import preprocess

# Enable autoreload for Jupyter notebooks
%load_ext autoreload
%autoreload 2


### 1. Define study crop and country

In [3]:
# USER INPUTS
country = "BR" # one of ["US", "BR"]
crop = "wheat" # one of ["maize", "wheat"]

# set_crop_season returns four values; the first one is not used in this notebook.
# It is bound to a throwaway name other than `_` on purpose: in IPython, `_` is the
# automatic "last output" variable, and assigning to it stops IPython from updating it.
(
    _unused_crop_season_item,
    crop_season_in_days_of_year,
    crop_season_in_months,
    crop_season_in_time_steps,
) = preprocess.set_crop_season(country, crop)

# Echo the chosen configuration so the saved output documents which season was used.
print(crop, country, crop_season_in_days_of_year, crop_season_in_months, crop_season_in_time_steps)

wheat BR (129, 329) (5, 11) (16, 41)


### 2. Read data

From CY-Bench, we have five predictor datasets. 

| **ID** | **Name**        | **Time** | **Variables**                         | **Steps**                                    | **Notebook**   |
|--------|-----------------|----------|---------------------------------------|----------------------------------------------|----------------|
| 1      | FPAR            | bins     | fpar                                  | filter adm_ids, filter crop season           | _on hold because of 10-day bins_ |            
| 2      | METEO           | daily    | tmin, tmax, prec, rad, tavg, et0, cwb | filter adm_ids, filter crop season,<br>resample to 8-day bins  |_3_ecmwf_preprocessing.ipynb_ |       
| 3      | NDVI            | bins     | ndvi                                  | filter adm_ids, filter crop season, pivot                | _this notebook_ |
| 4      | SOIL MOISTURE   | daily    | ssm, rsm                              | filter adm_ids, filter crop season, <br>resample to 8-day bins, pivot  | _this notebook_ |
| 5      | SOIL            | static   | awc, bulk_density, drainage_class     | filter adm_ids, pivot                                     | _this notebook_ |


In [22]:
# CY-Bench data
# Every dataset is stored as ../data/CY-Bench/<country>/<crop>/<dataset>_<crop>_<country>.csv,
# so a single template is filled in per dataset.
cy_bench_csv = "../data/CY-Bench/{}/{}/{}_{}_{}.csv"

# fpar is ignored for now because of its 10-day bin logic:
# fpar = pd.read_csv(cy_bench_csv.format(country, crop, "fpar", crop, country))

# Predictor tables handled in this notebook.
ndvi = pd.read_csv(cy_bench_csv.format(country, crop, "ndvi", crop, country))
soil_moisture = pd.read_csv(cy_bench_csv.format(country, crop, "soil_moisture", crop, country))
soil = pd.read_csv(cy_bench_csv.format(country, crop, "soil", crop, country))

# Yield table; used further down to decide which adm_ids are kept.
yield_data = pd.read_csv(cy_bench_csv.format(country, crop, "yield", crop, country))

### 3. Preprocess

In [23]:
# NOTE(review): this cell rebinds `soil_moisture`, `ndvi` and `soil`, which are also its
# inputs. Re-running it without first re-running the read cell therefore feeds already
# processed frames back into the preprocessing functions.

# Step 1: restrict all predictor tables to the adm_ids that occur in the yield data.
relevant_adm_ids = yield_data["adm_id"].unique()
cy_bench_data = preprocess.filter_predictors_by_adm_ids(
    [soil_moisture, ndvi, soil],
    relevant_adm_ids,
)
soil_moisture, ndvi, soil = cy_bench_data

# Step 2: only the time-varying tables go through the temporal preprocessing,
# bounded by the first and second entry of the crop season (in days of year).
season_first_day = crop_season_in_days_of_year[0]
season_last_day = crop_season_in_days_of_year[1]
temporal_cy_bench_data = preprocess.preprocess_temporal_data(
    [soil_moisture, ndvi],
    season_first_day,
    season_last_day,
)
soil_moisture, ndvi = temporal_cy_bench_data

# Step 3: combine everything into one predictor table.
# Soil moisture and NDVI are inner-joined per (adm_id, harvest_year); the soil table
# (without its crop_name column) is then left-joined on adm_id alone.
soil_without_crop_name = soil.drop(columns="crop_name")
temporal_predictors = soil_moisture.merge(ndvi, on=["adm_id", "harvest_year"], how="inner")
predictors = temporal_predictors.merge(soil_without_crop_name, on="adm_id", how="left")

In [29]:
# Persist the merged predictor table under the per-country preprocessed folder.
# NOTE(review): `index=False` is not passed, so the DataFrame index is written as an
# extra leading column — confirm that downstream readers expect it.
predictors_csv = "../data/preprocessed/{}/ndvi_soil_soil_moisture_{}_{}.csv".format(country, crop, country)
predictors.to_csv(predictors_csv)