In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from models.vanillann import YieldDataset, SimpleModel

ecmwf_path = "data/preprocessed/BR/ecmwf_era_wheat_BR.csv"
predictor_path = "data/preprocessed/BR/ndvi_soil_soil_moisture_wheat_BR.csv"
yield_path = "data/CY-Bench/BR/wheat/yield_wheat_BR.csv"
test_years = [2006, 2015, 2017]

%load_ext autoreload
%autoreload 2

In [2]:
def get_yield_and_predictors(yield_path, predictor_path, ecmwf_path, test_years):
    y = pd.read_csv(yield_path)
    y = y.loc[y["harvest_year"].between(2003, 2023), ["adm_id", "harvest_year", "yield", "harvested_area"]].reset_index(drop=True)

    # Merge predictor data
    x_1 = pd.read_csv(ecmwf_path)
    x_2 = pd.read_csv(predictor_path)
    x = x_1.merge(x_2, on=["adm_id", "harvest_year"], how="left").dropna().reset_index(drop=True)

    # Merge predictor and yield data
    x_y = x.merge(y, on=["adm_id", "harvest_year"], how="inner")

    train_df = x_y[~x_y['harvest_year'].isin(test_years)].reset_index(drop=True)
    test_df = x_y[x_y['harvest_year'].isin(test_years)].reset_index(drop=True)
    
    return train_df, test_df

In [3]:
train_df, test_df = get_yield_and_predictors(yield_path, predictor_path, ecmwf_path, test_years)

In [9]:
train_df.loc[pd.to_datetime(train_df["init_date"]).dt.month == 12, "drainage_class"].unique()

array([4., 5., 6., 2., 3.])

In [None]:
end_of_season_df = train_df[pd.to_datetime(train_df["init_date"]).dt.month == 12].drop(columns=["init_date", "harvested_area"]).set_index("adm_id")

In [None]:
def RMSELoss(yhat,y):
    return 100 * torch.sqrt(torch.mean((yhat-y)**2)) / torch.mean(y)

In [None]:
unique_years = end_of_season_df['harvest_year'].unique()
unique_years.sort()
results = dict.fromkeys(unique_years)

batch_size = 64

for year in unique_years:
    print(f'Validating on year {year}')
    
    # Create training and validation sets for this fold
    train_fold_df = end_of_season_df[end_of_season_df['harvest_year'] != year]
    val_fold_df = end_of_season_df[end_of_season_df['harvest_year'] == year]
    
    train_fold_features = train_fold_df[[c for c in train_fold_df.columns if c not in no_feature]]
    train_fold_target = train_fold_df['yield']
    val_fold_features = val_fold_df[[c for c in val_fold_df.columns if c not in no_feature]]
    val_fold_target = val_fold_df['yield']
    
    #means = train_fold_features.mean()
    #stds = train_fold_features.std()
    #train_fold_features = (train_fold_features - means) / stds
    #val_fold_features = (val_fold_features - means) / stds
    
    train_fold_dataset = YieldDataset(train_fold_features, train_fold_target)
    val_fold_dataset = YieldDataset(val_fold_features, val_fold_target)
    
    train_fold_loader = DataLoader(train_fold_dataset, batch_size=batch_size, shuffle=True)
    val_fold_loader = DataLoader(val_fold_dataset, batch_size=batch_size, shuffle=False)
    
    # Reset the model and optimizer
    model = SimpleModel()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    criterion = RMSELoss
    
    # Early stopping parameters
    num_epochs = 10  # Set the maximum number of epochs you want to train for
    patience = 4  # Number of epochs to wait for improvement before stopping
    best_val_loss = float('inf')  # Initialize the best validation loss
    epochs_no_improve = 0  # Counter for epochs without improvement
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for features, target in train_fold_loader:
            optimizer.zero_grad()
            output = model(features)
            loss = criterion(output, target.unsqueeze(1))
            loss.backward()
            optimizer.step()
        
        # Validation loop
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, target in val_fold_loader:
                output = model(features)
                loss = criterion(output, target.unsqueeze(1))
                val_loss += loss.item()
        
        val_loss /= len(val_fold_loader)  # Compute the average validation loss
        print(f'Epoch {epoch + 1}, Validation Loss for year {year}: {val_loss}')
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0  # Reset the counter if validation loss improves
        else:
            epochs_no_improve += 1  # Increment the counter if validation loss does not improve
        
        if epochs_no_improve >= patience:
            print(f'Early stopping at epoch {epoch + 1}')
            break  # Stop training if no improvement for specified number of epochs
    
    results[year] = best_val_loss
        
# Once cross-validation is done, you can test on the test dataset using test_loader


Validating on year 2003
Epoch 1, Validation Loss for year 2003: 59.422749739426834
Epoch 2, Validation Loss for year 2003: 55.78332490187425
Epoch 3, Validation Loss for year 2003: 43.948916508601265
Epoch 4, Validation Loss for year 2003: 42.30251341599684
Epoch 5, Validation Loss for year 2003: 37.046658736008865
Epoch 6, Validation Loss for year 2003: 34.854401955237755
Epoch 7, Validation Loss for year 2003: 42.19561503483699
Epoch 8, Validation Loss for year 2003: 40.0683112511268
Epoch 9, Validation Loss for year 2003: 32.90455920879658
Epoch 10, Validation Loss for year 2003: 35.09526062011719
Validating on year 2004
Epoch 1, Validation Loss for year 2004: 39.46417958395822
Epoch 2, Validation Loss for year 2004: 34.631209237234934


KeyboardInterrupt: 