# Introduction

After looking at the FutureCrop data, it appears that there's a large variance across locations in the mean yield and also the variance.

Here we will try to model each location independently- we then will try to scale this up to train the many models in parallel on a GPU using pytorch.

The idea is to learn the minimal mapping from time-series data to a prediction of the yield.

**Notes**:
A simple recurrent model just learns to predict the mean across all training batches of a single location. This is true for tiny (1-unit), shallow (100-unit wide) and deep networks (2-4 layers).
However, models taking the entire sequence of weather as a single vector learn to do pattern recognition and generalise.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data_dir = '/kaggle/input/the-future-crop-challenge/'
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-future-crop-challenge/pr_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/tasmax_maize_train.parquet
/kaggle/input/the-future-crop-challenge/sample_submission.csv
/kaggle/input/the-future-crop-challenge/soil_co2_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/tas_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/rsds_maize_train.parquet
/kaggle/input/the-future-crop-challenge/tasmin_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/tasmax_wheat_train.parquet
/kaggle/input/the-future-crop-challenge/rsds_maize_test.parquet
/kaggle/input/the-future-crop-challenge/soil_co2_maize_test.parquet
/kaggle/input/the-future-crop-challenge/train_solutions_maize.parquet
/kaggle/input/the-future-crop-challenge/pr_maize_test.parquet
/kaggle/input/the-future-crop-challenge/tas_wheat_test.parquet
/kaggle/input/the-future-crop-challenge/tasmax_maize_test.parquet
/kaggle/input/the-future-crop-challenge/pr_maize_train.parquet
/kaggle/input/the-fu

In [2]:
#climate data (timeseries)

#soil_co2 and yields

wheat_df = pd.read_parquet('/kaggle/input/the-future-crop-challenge/soil_co2_wheat_train.parquet')
wheat_yield = pd.read_parquet('/kaggle/input/the-future-crop-challenge/train_solutions_wheat.parquet')
wheat_df = wheat_df.join(wheat_yield)

maize_df = pd.read_parquet('/kaggle/input/the-future-crop-challenge/soil_co2_maize_train.parquet')
maize_yield = pd.read_parquet('/kaggle/input/the-future-crop-challenge/train_solutions_maize.parquet')
maize_df = maize_df.join(maize_yield)

mean_df = pd.DataFrame()
for crop_df in [wheat_df,maize_df]:
    temp_df = crop_df.groupby(['crop','lon','lat'], as_index = False).agg({'yield':['mean','std']})
    temp_df.columns = ['crop','lon','lat','yield_mean','yield_std']
    mean_df = pd.concat([mean_df,temp_df])

mean_df

Unnamed: 0,crop,lon,lat,yield_mean,yield_std
0,wheat,-123.25,44.75,4.965216,0.488492
1,wheat,-123.25,45.25,4.985947,0.501791
2,wheat,-123.25,45.75,4.822316,0.423168
3,wheat,-122.75,44.75,4.875486,0.537611
4,wheat,-122.75,45.25,5.379421,0.615185
...,...,...,...,...,...
9298,maize,132.75,46.75,5.738692,1.191923
9299,maize,132.75,47.25,8.622872,1.518404
9300,maize,133.25,45.25,2.470256,0.359327
9301,maize,133.25,47.25,6.932128,1.339325


In [84]:
idx_lon = 132.75
idx_lat = 47.25

static_data = maize_df.query(f'lon=={idx_lon} and lat=={idx_lat}')[['co2','nitrogen','yield']]

crop = 'maize'
mode = 'train'

tasmax = pd.read_parquet(os.path.join(data_dir, f"tasmax_{crop}_{mode}.parquet")).query(f'lon=={idx_lon} and lat=={idx_lat}')
tasmin = pd.read_parquet(os.path.join(data_dir, f"tasmin_{crop}_{mode}.parquet")).query(f'lon=={idx_lon} and lat=={idx_lat}')
pr = pd.read_parquet(os.path.join(data_dir, f"pr_{crop}_{mode}.parquet")).query(f'lon=={idx_lon} and lat=={idx_lat}')
rsds = pd.read_parquet(os.path.join(data_dir, f"rsds_{crop}_{mode}.parquet")).query(f'lon=={idx_lon} and lat=={idx_lat}')

climate_data = np.stack([
        tasmax.iloc[:, 5:].values,
        tasmin.iloc[:, 5:].values,
        pr.iloc[:, 5:].values,
        rsds.iloc[:, 5:].values
    ], axis=2)

static_expanded = np.repeat(static_data.values[:,np.newaxis,:],240,axis=1)

sequenced_data = np.concatenate([climate_data,static_expanded],axis = 2)

sequenced_data.shape

(39, 240, 7)

In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# --- Data Splitting ---
inputs = sequenced_data[:, :, :-1]
targets = sequenced_data[:, -1, -1]

# --- 1. Configuration ---
n_batch = sequenced_data.shape[0] # Using full dataset as one batch
n_seq = sequenced_data.shape[1]
n_features = sequenced_data.shape[2] - 1
n_hidden = 4
n_layers = 1
learning_rate = 1e-1
n_epochs = 1000

# Regularization Hyperparameter (Adjust if needed)
HIDDEN_REG_ALPHA = 1e-4 

# Device configuration (Move all data/model to the device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"--- Configuration Summary ---")
print(f"Data Shape (Batch, Seq, Feat): ({n_batch}, {n_seq}, {n_features})")
print(f"Hidden Size/Layers: {n_hidden}/{n_layers}")
print(f"Learning Rate/Epochs: {learning_rate}/{n_epochs}")
print(f"Using Device: {device}")
print(f"-----------------------------")


# --- 2. PyTorch Dataset & DataLoader ---
class SequenceDataset(Dataset):
    def __init__(self, X, Y):
        # Move inputs to device and set dtype
        self.X = torch.from_numpy(X).to(device=device, dtype=torch.float32)
        # Move targets to device, set dtype, and unsqueeze to [batch, seq, 1] for loss calculation
        self.Y = torch.from_numpy(Y).to(device=device, dtype=torch.float32)
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Since the DataLoader batch_size is the full dataset size, 
        # this will effectively return the entire X and Y tensors on the first call.
        return self.X[idx], self.Y[idx]

# Create DataLoader
dataset = SequenceDataset(inputs, targets)
# DataLoader batch_size is n_batch, so we get one large batch: (1, n_batch, n_seq, n_features)
dataloader = DataLoader(dataset, batch_size=n_batch, shuffle=False) 

# --- 3. Model Definition and Xavier Initialization ---

class SimpleRNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(SimpleRNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Using GRU as in the previous example
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
        self._init_weights()

    def _init_weights(self):
        # Xavier/Glorot Initialization
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 1)
        print("Model weights initialized with Xavier/Glorot.")

    def forward(self, x):
        # out: (batch_size, n_seq, hidden_size) - Sequence of hidden states
        out, _ = self.rnn(x)  
        
        # final_out: (batch_size, n_seq, output_size) - Final prediction
        final_out = self.fc(out)
        
        # Return both prediction and hidden states for regularization
        return final_out, out 

# Instantiate the model
model = SimpleRNNModel(
    input_size=n_features, 
    hidden_size=n_hidden, 
    num_layers=n_layers, 
    output_size=1
).to(device) # Ensure model is also on the device

# --- 4. Loss and Optimizer ---

# SmoothL1Loss for robustness
criterion_primary = nn.MSELoss().to(device) 
# AdamW Optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# --- 5. Training Loop ---

print("-" * 30)
print(f"Starting Training for {n_epochs} epochs...")
model.train() 

for epoch in range(1, n_epochs + 1):
    epoch_loss = 0
    
    for inputs_batch, targets_batch in dataloader:
        # Data is already on the device due to the Dataset implementation
        # Forward pass:
        outputs, hidden_states = model(inputs_batch)
        
        # 1. Primary Loss (Smooth L1 Loss)
        primary_loss = criterion_primary(outputs[:,-1,-1], targets_batch)

        # 2. Hidden State Regularization Loss (L2 Norm Squared)
        hidden_reg_loss = torch.norm(hidden_states, p=2)**2 * HIDDEN_REG_ALPHA
        
        # 3. Total Loss
        loss = primary_loss + hidden_reg_loss
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    # Reporting every 1000 epochs to avoid excessive output
    if epoch % 100 == 0 or epoch == 1:
        avg_loss = epoch_loss / len(dataloader)
        print(f'Epoch [{epoch:05d}/{n_epochs}], Total Loss: {avg_loss:.6f} (Primary Loss: {primary_loss.item():.6f}, Reg Loss: {hidden_reg_loss.item():.6f})')

print("-" * 30)
print("Training Complete!")

--- Configuration Summary ---
Data Shape (Batch, Seq, Feat): (39, 240, 6)
Hidden Size/Layers: 4/1
Learning Rate/Epochs: 0.1/1000
Using Device: cuda
-----------------------------
Model weights initialized with Xavier/Glorot.
------------------------------
Starting Training for 1000 epochs...
Epoch [00001/1000], Total Loss: 60.354610 (Primary Loss: 60.354610, Reg Loss: 0.000000)
Epoch [00100/1000], Total Loss: 3.373423 (Primary Loss: 3.373423, Reg Loss: 0.000000)
Epoch [00200/1000], Total Loss: 2.311594 (Primary Loss: 2.311594, Reg Loss: 0.000000)
Epoch [00300/1000], Total Loss: 2.285419 (Primary Loss: 2.285419, Reg Loss: 0.000000)
Epoch [00400/1000], Total Loss: 2.273779 (Primary Loss: 2.273779, Reg Loss: 0.000000)
Epoch [00500/1000], Total Loss: 2.267073 (Primary Loss: 2.267073, Reg Loss: 0.000000)
Epoch [00600/1000], Total Loss: 2.262702 (Primary Loss: 2.262702, Reg Loss: 0.000000)
Epoch [00700/1000], Total Loss: 2.259630 (Primary Loss: 2.259630, Reg Loss: 0.000000)
Epoch [00800/1000]

## Learning a time-series -> yield function

Here we're very data-limited using only a single location. There are 240x4+1 inputs, but only 39 training examples. 

1. DNN with dropout (for generalisation). With a VAE-like dimension reduction layer-by-layer.

One idea would be to do a convolution over the 240x4 inputs and reduce them to an N dimensional signal, which is then combined with the C02 data to produce a yield estimate. This might not be crazy at all.



In [78]:
# DNN with dropout
# simply map continous data onto the yield with an N-deep network trained with dropout.

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# --- 2. PyTorch Dataset & DataLoader (Updated for 1D target) ---
class FlattenedDataset(Dataset):
    def __init__(self, climate_data, soil_co2, crop_yield):
        """Takes numpy arrays as inputs.
        climate_data shaped as (n_batch, n_seq, n_features)
        soil_co2 and crop_yield both 1d arrays"""
        N, S, F = climate_data.shape
        # X: Flattened to [N, S * F]
        self.X = torch.from_numpy(climate_data).reshape(N, S * F)
        self.X = torch.concat([self.X,torch.from_numpy(soil_co2).unsqueeze(-1)],axis=1).to(device, dtype=torch.float32)
        # Y: Unsqueeze to [N, 1] for consistent loss calculation
        self.Y = torch.from_numpy(crop_yield).unsqueeze(-1).to(device=device, dtype=torch.float32)
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

# --- 3. Flexible Model Definition and Xavier Initialization (DNN) ---

class FlexibleFlattenedDNN(nn.Module):
    def __init__(self, input_size, output_size, num_layers, reduction_ratio, dropout_rate=0.0, leaky_relu_slope =0.01):
        super(FlexibleFlattenedDNN, self).__init__()
        self.leaky_slope = leaky_relu_slope
        layers = []
        current_size = input_size
        
        # Dynamically build the hidden layers
        for i in range(num_layers):
            # Calculate the size of the next layer
            next_size = max(4, int(current_size * reduction_ratio)) # Min size of 4 for stability
            
            # Add Linear Layer
            layers.append(nn.Linear(current_size, next_size))
            # Add Activation
            layers.append(nn.LeakyReLU(self.leaky_slope))
            
            # Add Dropout (only for intermediate layers)
            if dropout_rate > 0 and i < num_layers - 1:
                 layers.append(nn.Dropout(dropout_rate)) 
                
            current_size = next_size
        
        # Add the final output layer (no activation or dropout after this)
        layers.append(nn.Linear(current_size, output_size))
        
        self.fc_stack = nn.Sequential(*layers)
        
        print(f"DNN Architecture built: {input_size} -> {[l.out_features for l in layers if isinstance(l, nn.Linear)]}")
        
        self._init_weights()

    def _init_weights(self, weight_seed = 1):
        # Xavier/Glorot Initialization for all Linear layers
        torch.random.seed = weight_seed
        for m in self.modules():
            if isinstance(m, nn.Linear):
                #nn.init.xavier_uniform_(m.weight)
                nn.init.kaiming_normal_(m.weight, a= self.leaky_slope)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
        print("Model weights initialized with Xavier/Glorot.")

    def forward(self, x):
        return self.fc_stack(x) 

# --- 1. Configuration ---
n_batch = sequenced_data.shape[0] 
n_seq = sequenced_data.shape[1]   
n_features = sequenced_data.shape[2] - 1

# --- MODEL FLEXIBILITY PARAMETERS ---
NUM_HIDDEN_LAYERS = 3     # The number of layers between input and output (e.g., 3 for 4 layers total)
REDUCTION_RATIO = 1/6      # The ratio by which each layer size decreases (e.g., 0.5 means half the size)
DROPOUT_RATE = 2/5        # Dropout rate for intermediate layers (0.0 for no dropout)

# --- CALCULATED DIMENSIONS ---
FLATTENED_INPUT_SIZE = n_seq * 4+1 
TARGET_OUTPUT_SIZE = 1 # Corrected to 1D output

# Hyperparameter
init_LR = 3e-9
max_LR = 9e-3
weight_decay = 1e-5 #suggested to be smaller for 'super-convergence' in OneCycleLR paper.
n_epochs = 1000


# Create DataLoader
dataset = FlattenedDataset(climate_data, static_data.co2.values,static_data['yield'].values)
n_train = int(dataset.X.shape[0]*0.8)
n_test = dataset.X.shape[0]-n_train

train_dataset = torch.utils.data.Subset(dataset, range(n_train))
val_dataset = torch.utils.data.Subset(dataset,range(n_train,n_train+n_test))

train_loader = DataLoader(train_dataset, batch_size=n_batch, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = n_batch, shuffle = False)

# Instantiate the flexible model
model = FlexibleFlattenedDNN(
    input_size=FLATTENED_INPUT_SIZE,
    output_size=TARGET_OUTPUT_SIZE,
    num_layers=NUM_HIDDEN_LAYERS,
    reduction_ratio=REDUCTION_RATIO,
    dropout_rate=DROPOUT_RATE
).to(device, dtype=torch.float32)

n_params = sum([p.numel() for p in model.parameters()])
print(f"Model instantiatied with {n_params} parameters.")
# --- 4. Loss and Optimizer ---

# Switched back to MSELoss as requested in the original code block
criterion_primary = nn.MSELoss().to(device) 
optimizer = optim.AdamW(model.parameters(), lr=init_LR, weight_decay = weight_decay)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_LR, 
                                          steps_per_epoch=1, 
                                          epochs=n_epochs)

# --- 5. Training Loop (Unchanged) ---

print("-" * 30)
print(f"Starting Training for {n_epochs} epochs...")
model.train() 

for epoch in range(1, n_epochs + 1):
    for inputs_batch, targets_batch in train_loader:
        # Backward and optimize
        optimizer.zero_grad()
        
        # Forward pass:
        outputs = model(inputs_batch)
        
        # Calculate loss (outputs: [N, 1], targets: [N, 1])
        loss = criterion_primary(outputs, targets_batch)
        
        
        loss.backward()
        optimizer.step()
        scheduler.step()

    if epoch % 1000 == 0 or epoch == 1:
        avg_loss = loss 
        print(f'Epoch [{epoch:05d}/{n_epochs}], Loss: {avg_loss:.6f}')
        model.eval()
        with torch.no_grad():
            val_losses = []
            for val_inputs, val_targets in val_loader:
                val_preds = model(val_inputs)
                val_losses.append(criterion_primary(val_preds,val_targets).item())

        print(f'Validation loss: {np.mean(val_losses):.6f}')
print("-" * 30)
print("Training Complete!")

DNN Architecture built: 961 -> [160, 26, 4, 1]
Model weights initialized with Xavier/Glorot.
Model instantiatied with 158219 parameters.
------------------------------
Starting Training for 1000 epochs...
Epoch [00001/1000], Loss: 16278.957031
Validation loss: 1314.556519
Epoch [01000/1000], Loss: 0.000000
Validation loss: 5.174515
------------------------------
Training Complete!


In [79]:
# the predictions are no longer constant means for every timepont, but are matching patterns.
# this suggests that information is being pulled out of the sequence.
print(val_preds.T, '\n',val_targets.T)

tensor([[11.6607,  8.7769,  9.5265,  5.4197,  7.2092,  9.7951,  9.5484,  6.9533]],
       device='cuda:0') 
 tensor([[ 9.5290,  7.0030,  8.6340,  7.4670, 11.5860,  8.8950,  7.3440,  8.9260]],
       device='cuda:0')


In [64]:
for name, params in model.named_parameters():
    print(name)

model.fc_stack[(0)].weight[]

fc_stack.0.weight
fc_stack.0.bias
fc_stack.3.weight
fc_stack.3.bias
fc_stack.6.weight
fc_stack.6.bias
fc_stack.8.weight
fc_stack.8.bias


tensor([ 0.0781,  0.0149, -0.0588, -0.0413, -0.0362, -0.0397,  0.0411,  0.0031,
        -0.0608, -0.0208, -0.0488,  0.0084,  0.0646, -0.0818, -0.0814, -0.0193,
        -0.0359, -0.0204,  0.0859, -0.0047, -0.0289, -0.0616, -0.0333,  0.0642,
         0.0704, -0.0371,  0.0234,  0.0768,  0.0118, -0.0679,  0.0206, -0.0010,
        -0.0605, -0.0812, -0.0428, -0.0416, -0.0118, -0.0483,  0.0239, -0.0181,
        -0.0075, -0.0717,  0.0292,  0.0408, -0.0871, -0.0020, -0.0265, -0.0811,
        -0.0300,  0.0800, -0.0514,  0.0386, -0.0690, -0.0437,  0.0354, -0.0230,
        -0.0456,  0.0232, -0.0515, -0.0029, -0.0225,  0.0355,  0.0393, -0.0589,
         0.0884, -0.0167, -0.0537,  0.0057, -0.0103, -0.0388,  0.0207,  0.0920,
        -0.0072,  0.0334, -0.0164,  0.0446, -0.0403, -0.0480, -0.0126,  0.0709,
        -0.0071, -0.0423, -0.0876,  0.0138, -0.0053,  0.0341,  0.0070, -0.0136,
        -0.0116, -0.0288,  0.0078, -0.0319, -0.0338, -0.0095, -0.0179,  0.0280,
         0.0325, -0.0699, -0.0515, -0.00

In [84]:
print(val_preds.T, '\n',val_targets.T)

tensor([[8.6046, 8.6046, 8.6046, 8.6046, 8.6046, 8.6046, 8.6046, 8.6046]]) 
 tensor([[ 9.5290,  7.0030,  8.6340,  7.4670, 11.5860,  8.8950,  7.3440,  8.9260]])


## submission

Let's try running this to submission.

Start with a dataframe that we can index out by crop and position, then train on all train data and evaluate on the test data.



In [3]:
submission_csv = pd.read_csv(data_dir+'sample_submission.csv')

submission_csv #pandas dataframe with 'ID' and 'yield' column
submission_csv.index = submission_csv['ID']
submission_csv

Unnamed: 0_level_0,ID,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
349719,349719,1.586834
349720,349720,16.060235
349721,349721,3.783943
349722,349722,16.078949
349723,349723,18.245241
...,...,...
1873717,1873717,13.696562
1873718,1873718,18.601414
1873719,1873719,7.305126
1873720,1873720,12.617616


In [7]:
# we want to end up with a dataframe that is flattened anyways. So we can still use a pandas dataframe that is just concatenated
import gc

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Device configuration
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using torch on  ', DEVICE)

data_dict = {} #containing pytorch dataframes on device - for training, the yield is appended as the last index
idx_dict = {} #containing location to idx dictionaries
ID_dict = {} #containing location to ID dictionaries (primarily for submission alignment)

for test_or_train in ['train','test']:
    data_dict[test_or_train] = {}
    idx_dict[test_or_train] = {}
    ID_dict[test_or_train] = {}
    for each_crop in ['maize','wheat']:
        #Load all the data first:
        static_data = pd.read_parquet(data_dir+f'soil_co2_{each_crop}_{test_or_train}.parquet')[['year','lon','lat','co2','nitrogen']]
        if test_or_train == 'train':
            yield_df = pd.read_parquet(data_dir+f'train_solutions_{each_crop}.parquet')
            static_data['yield'] = yield_df['yield']
            print('appended yield')
            del yield_df
        
        climate_data = []
        for data_type in ['tasmax','tasmin','pr','rsds']:
            climate_df = pd.read_parquet(data_dir+f'{data_type}_{each_crop}_{test_or_train}.parquet')
            climate_df = climate_df.drop(columns = ['crop','year','lon','lat','variable'])
            climate_df.columns = [f'{data_type}_{x}' for x in climate_df.keys()]
            climate_data.append(climate_df)

        # this is a useful dataframe with all the data:
        df = pd.concat(climate_data+[static_data],axis=1) 
        X = torch.from_numpy(df.drop(columns = ['year','lon','lat']).values).to(device=DEVICE, dtype = torch.float32) #(n_years, n_features)
        data_dict[test_or_train][each_crop] = X
        ID_dict[test_or_train][each_crop] = df.groupby(['lon','lat']).groups
        df.index = range(len(df)) #reindex to get a dictionary over indices
        idx_dict[test_or_train][each_crop] = df.groupby(['lon','lat']).groups
        #manage memory
        del static_data, climate_data, climate_df, df, X
        gc.collect()



Using torch on   cuda
appended yield
appended yield


In [29]:
## self-contained code here for training 

## define model

class FlexibleFlattenedDNN(nn.Module):
    def __init__(self, input_size, output_size, num_layers, reduction_ratio, dropout_rate=0.0, leaky_relu_slope =0.01):
        super(FlexibleFlattenedDNN, self).__init__()
        self.leaky_slope = leaky_relu_slope
        layers = []
        current_size = input_size
        
        # Dynamically build the hidden layers
        for i in range(num_layers):
            # Calculate the size of the next layer
            next_size = max(4, int(current_size * reduction_ratio)) # Min size of 4 for stability
            
            # Add Linear Layer
            layers.append(nn.Linear(current_size, next_size))
            # Add Activation
            layers.append(nn.LeakyReLU(self.leaky_slope))
            
            # Add Dropout (only for intermediate layers)
            if dropout_rate > 0 and i < num_layers - 1:
                 layers.append(nn.Dropout(dropout_rate)) 
                
            current_size = next_size
        
        # Add the final output layer (no activation or dropout after this)
        layers.append(nn.Linear(current_size, output_size))
        
        self.fc_stack = nn.Sequential(*layers)
        
        #print(f"DNN Architecture built: {input_size} -> {[l.out_features for l in layers if isinstance(l, nn.Linear)]}")
        
        self._init_weights()

    def _init_weights(self, weight_seed = 1):
        # Xavier/Glorot Initialization for all Linear layers
        torch.random.seed = weight_seed
        for m in self.modules():
            if isinstance(m, nn.Linear):
                #nn.init.xavier_uniform_(m.weight)
                nn.init.kaiming_normal_(m.weight, a= self.leaky_slope)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
        #print("Model weights initialized with Xavier/Glorot.")

    def forward(self, x):
        return self.fc_stack(x) 

# --- MODEL FLEXIBILITY PARAMETERS ---
NUM_HIDDEN_LAYERS = 3     # The number of layers between input and output (e.g., 3 for 4 layers total)
REDUCTION_RATIO = 1/6      # The ratio by which each layer size decreases (e.g., 0.5 means half the size)
DROPOUT_RATE = 2/5        # Dropout rate for intermediate layers (0.0 for no dropout)

# Hyperparameters
init_LR = 3e-9
max_LR = 9e-3
weight_decay = 1e-5 #suggested to be smaller for 'super-convergence' in OneCycleLR paper.
n_epochs = 1000


#
def train_and_predict(train_data, test_data):
    # move data to device
    train_X = train_data[:,:-1] #(n_years, n_features)
    train_Y = train_data[:,-1].unsqueeze(-1)
    test_X = test_data
    
    # Instantiate the flexible model
    model = FlexibleFlattenedDNN(
        input_size=train_X.shape[1],
        output_size=1,
        num_layers=NUM_HIDDEN_LAYERS,
        reduction_ratio=REDUCTION_RATIO,
        dropout_rate=DROPOUT_RATE
    ).to(DEVICE, dtype=torch.float32)
    
    # Switched back to MSELoss as requested in the original code block
    criterion_primary = nn.MSELoss().to(DEVICE) 
    optimizer = optim.AdamW(model.parameters(), lr=init_LR, weight_decay = weight_decay)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_LR,epochs = n_epochs, steps_per_epoch=1)
             
    model.train() 
    
    for epoch in range(1, n_epochs + 1):
            # Backward and optimize
            optimizer.zero_grad() 
            # Forward pass:
            outputs = model(train_X)
            # Calculate loss (outputs: [N, 1], targets: [N, 1])
            loss = criterion_primary(outputs, train_Y)
            # backprop losses
            loss.backward()
            optimizer.step()
            scheduler.step()
    #print(f'Finished training {n_epochs}')
    model.eval()
    with torch.no_grad():
        predictions = model(test_X)

    return predictions.cpu().numpy()

In [None]:
from tqdm import tqdm
# with pandas dataframes it takes ~2x40 minutes just to go through all the data.
# not ideal
for each_crop in ['maize','wheat']:
    unique_coords = idx_dict['test'][each_crop].keys()
    for lon_lat in tqdm(unique_coords):
        #load the train_data and train a model
        train_data = data_dict['train'][each_crop][idx_dict['train'][each_crop][lon_lat]]
        test_data = data_dict['test'][each_crop][idx_dict['test'][each_crop][lon_lat]]
        #run code to train and predict
        predictions = train_and_predict(train_data,test_data)
        #append to the submission csv
        location_indices = ID_dict['test'][each_crop][lon_lat]        
        submission_csv.loc[location_indices,'yield'] = predictions

# I think a better / ideal indexing approach is one that loads all the data onto pytorch tensors and indexes out from there.
# we just need some kind of location to index dictionary/look up table to index out of.

  1%|          | 91/9303 [02:20<3:55:39,  1.53s/it]

In [57]:
test_data= pandas_to_torch(data_dict['train'][each_crop])

In [59]:
test_data.X.shape

torch.Size([278747, 965])

In [56]:
data_dict['test'][each_crop]

Unnamed: 0_level_0,year,lon,lat,co2,nitrogen,tasmax_0,tasmax_1,tasmax_2,tasmax_3,tasmax_4,...,rsds_230,rsds_231,rsds_232,rsds_233,rsds_234,rsds_235,rsds_236,rsds_237,rsds_238,rsds_239
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1319737,420.0,-123.25,44.75,418.06,102.824997,31.621307,31.037140,27.053986,27.319489,30.649078,...,174.14244,150.82764,190.36435,194.51694,100.82088,193.686020,242.53871,173.61038,158.46439,111.906140
1319738,420.0,-123.25,45.25,418.06,102.824997,29.953460,30.332184,24.672333,26.461731,29.035553,...,201.55307,153.14207,190.76508,193.85138,107.75983,198.955750,227.22021,172.88612,167.09686,101.487076
1319739,420.0,-123.25,45.75,418.06,102.824997,26.864685,26.516113,21.986877,23.700531,25.674530,...,202.85130,158.17982,185.61537,199.27208,100.38560,208.031400,212.48820,165.25601,158.21384,136.892970
1319740,420.0,-122.75,44.75,418.06,102.824997,31.807922,31.897370,26.641174,27.302338,30.102540,...,162.90735,153.26567,192.76870,201.41548,99.03610,199.094280,248.47937,175.88953,170.87689,99.091990
1319741,420.0,-122.75,45.25,418.06,102.824997,31.803009,31.503723,26.558655,28.035553,31.034210,...,178.90230,155.62582,188.06500,201.15889,94.03270,212.796590,232.24763,183.09230,176.92282,135.018230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873717,497.0,152.25,-29.25,1107.89,40.074001,23.115448,22.860565,22.565948,23.216522,24.566772,...,357.75784,236.90488,146.83035,221.42387,157.71758,59.072372,250.26380,257.25482,290.07028,269.646880
1873718,497.0,152.25,-28.75,1107.89,40.074001,23.991821,23.926483,23.554413,24.288849,25.432434,...,357.93820,250.80669,174.62518,233.58775,169.67963,74.146860,250.39180,293.86765,284.58230,241.260910
1873719,497.0,152.25,-28.25,1107.89,40.074001,24.289520,24.710602,24.661255,25.235870,26.482025,...,357.58700,267.46405,187.97968,231.53940,172.55563,72.186110,245.47177,301.01040,272.35388,226.484740
1873720,497.0,152.25,-27.75,1107.89,40.074001,28.348694,29.666810,29.559906,29.040924,28.703827,...,192.38712,73.39178,185.05464,165.53415,81.64291,276.906770,284.31403,275.35962,283.39215,240.531940


In [43]:
train_df

Unnamed: 0_level_0,year,lon,lat,co2,nitrogen,tasmax_0,tasmax_1,tasmax_2,tasmax_3,tasmax_4,...,rsds_230,rsds_231,rsds_232,rsds_233,rsds_234,rsds_235,rsds_236,rsds_237,rsds_238,rsds_239
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,381.0,-122.25,48.25,340.79,186.110992,13.40979,12.907227,13.133698,21.76529,21.10147,...,26.62301,24.254791,31.233927,25.50236,6.170411,20.630627,47.028236,66.11088,13.543945,17.31382
9047,382.0,-122.25,48.25,342.2,186.110992,16.964447,16.890717,17.446442,10.807312,11.372833,...,33.766327,14.601289,9.326722,7.1168,5.677502,43.132854,32.291283,33.324802,58.5558,45.64549
17991,383.0,-122.25,48.25,343.78,186.110992,11.186615,9.690308,10.266022,13.817993,13.680969,...,18.248117,24.222399,42.42252,21.283678,11.998741,43.81776,22.971113,8.032106,14.891586,62.369316
26895,384.0,-122.25,48.25,345.28,186.110992,13.29126,12.943298,14.364929,15.389404,14.212402,...,24.87617,20.465246,46.87201,76.88193,56.377876,30.735733,31.278101,30.236311,25.51701,10.375072
35906,385.0,-122.25,48.25,346.8,186.110992,6.532745,6.931732,9.670837,13.058441,12.940735,...,21.27492,22.775616,24.240604,8.968406,38.302475,70.83286,73.30237,8.344541,14.387097,13.694097
44909,386.0,-122.25,48.25,348.65,186.110992,11.285065,9.157837,12.725403,14.007843,12.776703,...,28.517231,40.69449,69.11553,71.98554,72.28799,73.70672,72.545456,68.65303,58.677704,18.61732
53767,387.0,-122.25,48.25,350.74,186.110992,14.029297,14.028046,13.040009,11.844055,8.548218,...,30.182318,27.349802,19.028444,56.45831,72.35992,67.21317,14.359828,65.496796,17.43724,11.649876
62770,388.0,-122.25,48.25,352.49,186.110992,9.260345,9.119843,13.236603,11.942352,11.586243,...,23.948435,57.682865,50.954594,78.83086,52.05917,11.51858,24.375566,34.58879,64.26975,71.74892
71761,389.0,-122.25,48.25,353.86,186.110992,8.13385,11.753265,11.375,11.365082,10.81012,...,80.35047,89.69652,88.458954,76.85661,10.951805,23.442772,10.872301,20.814053,17.406881,36.43731
80796,390.0,-122.25,48.25,355.02,186.110992,12.79895,13.013062,12.79599,13.321564,15.811218,...,26.063366,35.390804,15.406098,31.865671,48.451645,59.080566,53.679123,57.162918,52.388596,60.22134


##  train_locs.groupby(['lon','lat']).groupsDoes our vector-MLP work across regions?

My expectation is no, but let's give it a go.

Testing fit the expectation. Very hard to get cross-region validation loss below 1.5 .

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


## We need to redefine the dataset, but otherwise the model architecture is the same (with slight input size difference)
DATA_DIR = r'/kaggle/input/the-future-crop-challenge'
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import gc

class VectorisedData(Dataset):
    """
    A pytorch dataset class that transfers all data to GPU, minimising memory transfers during training.
    Expands static data () to the length of the sequence.
    """
    def __init__(self, crop: str, mode: str, data_dir: str, device=DEVICE):
        # Read all data files
        self.prepare_data(crop, mode, data_dir, device)
    
    def prepare_data(self, crop,mode,data_dir,device):
        tasmax = pd.read_parquet(os.path.join(data_dir, f"tasmax_{crop}_{mode}.parquet"))
        tasmin = pd.read_parquet(os.path.join(data_dir, f"tasmin_{crop}_{mode}.parquet"))
        pr = pd.read_parquet(os.path.join(data_dir, f"pr_{crop}_{mode}.parquet"))
        rsds = pd.read_parquet(os.path.join(data_dir, f"rsds_{crop}_{mode}.parquet"))
        soil_co2 = pd.read_parquet(os.path.join(data_dir, f"soil_co2_{crop}_{mode}.parquet"))
        
        # Load yield data if in training mode
        if mode == 'train':
            self.yield_data = pd.read_parquet(os.path.join(data_dir, f"{mode}_solutions_{crop}.parquet"))
            self.yield_data['crop'] = crop
        else:
            self.yield_data = pd.DataFrame(tasmax['crop'])
            self.yield_data['yield'] = 0
            
        # Preprocess climate data in bulk
        climate_data = np.concatenate([
            tasmax.iloc[:, 5:].values,
            tasmin.iloc[:, 5:].values,
            pr.iloc[:, 5:].values,
            rsds.iloc[:, 5:].values
        ], axis=1).astype(np.float32) #(n_samples,240x4)

        
        # Preprocess soil data in bulk
        soil_continuous = soil_co2[['lon', 'lat', 'co2']].values.astype(np.float32) #(n_samples, 3)
        yield_expanded = self.yield_data['yield'].values.reshape(-1,1) #(n_samples, 1)
    
        # Combine climate and soil features (shape: num_samples × 240 × 21)
        full_input = np.concatenate([climate_data, soil_continuous], axis=1) #(n_samples, 240x4+3)
        
        # Move entire dataset to device in one operation
        self.inputs = torch.tensor(full_input, device=device,dtype=torch.float32)
        self.targets = torch.tensor(yield_expanded,device=device,dtype=torch.float32)
        #memory management
        del tasmax, tasmin, pr, rsds, soil_co2, climate_data, soil_continuous, yield_expanded, full_input

        gc.collect()
        
        return None
        
    def __getitem__(self, index):
        # Return precomputed tensors
        return self.inputs[index], self.targets[index]

    def __len__(self):
        return len(self.inputs)

print('Loading training data.,. (this may take a while)')

train_maize = VectorisedData('maize','train',DATA_DIR)
#train_wheat = VectorisedData('wheat','train', DATA_DIR)

print(train_maize.inputs.shape)

print('Finished loading')




Loading training data.,. (this may take a while)
torch.Size([349719, 963])
Finished loading


In [None]:
## just pasting the training loop from above:


# --- 3. Flexible Model Definition and Xavier Initialization (DNN) ---

class FlexibleFlattenedDNN(nn.Module):
    def __init__(self, input_size, output_size, num_layers, reduction_ratio, dropout_rate=0.0, leaky_relu_slope =0.01):
        super(FlexibleFlattenedDNN, self).__init__()
        self.leaky_slope = leaky_relu_slope
        layers = []
        current_size = input_size
        
        # Dynamically build the hidden layers
        for i in range(num_layers):
            # Calculate the size of the next layer
            next_size = max(4, int(current_size * reduction_ratio)) # Min size of 4 for stability
            
            # Add Linear Layer
            layers.append(nn.Linear(current_size, next_size))
            # Add Activation
            layers.append(nn.LeakyReLU(self.leaky_slope))
            
            # Add Dropout (only for intermediate layers)
            if dropout_rate > 0 and i < num_layers - 1:
                 layers.append(nn.Dropout(dropout_rate)) 
                
            current_size = next_size
        
        # Add the final output layer (no activation or dropout after this)
        layers.append(nn.Linear(current_size, output_size))
        
        self.fc_stack = nn.Sequential(*layers)
        
        print(f"DNN Architecture built: {input_size} -> {[l.out_features for l in layers if isinstance(l, nn.Linear)]}")
        
        self._init_weights()

    def _init_weights(self):
        # Xavier/Glorot Initialization for all Linear layers
        for m in self.modules():
            if isinstance(m, nn.Linear):
                #nn.init.xavier_uniform_(m.weight)
                nn.init.kaiming_normal_(m.weight, a= self.leaky_slope)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
        print("Model weights initialized with Xavier/Glorot.")

    def forward(self, x):
        return self.fc_stack(x) 


# --- MODEL FLEXIBILITY PARAMETERS ---
NUM_HIDDEN_LAYERS = 8    # The number of layers between input and output (e.g., 3 for 4 layers total)
REDUCTION_RATIO = 1/2      # The ratio by which each layer size decreases (e.g., 0.5 means half the size)
DROPOUT_RATE = 2/5        # Dropout rate for intermediate layers (0.0 for no dropout)

# --- CALCULATED DIMENSIONS ---
FLATTENED_INPUT_SIZE = 963
TARGET_OUTPUT_SIZE = 1 # Corrected to 1D output

# Hyperparameter
init_LR = 1e-4
max_LR = 1e-3
weight_decay = 1e-5 #suggested to be smaller for 'super-convergence' in OneCycleLR paper.
n_epochs = 1000
n_batch = 512


# Create DataLoader
n_train = int(train_maize.inputs.shape[0]*0.8)
n_test = train_maize.inputs.shape[0]-n_train

train_dataset = torch.utils.data.Subset(train_maize, range(n_train))
val_dataset = torch.utils.data.Subset(train_maize,range(n_train,n_train+n_test))

train_loader = DataLoader(train_dataset, batch_size=n_batch, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = n_batch, shuffle = False)

# Instantiate the flexible model
model = FlexibleFlattenedDNN(
    input_size=FLATTENED_INPUT_SIZE,
    output_size=TARGET_OUTPUT_SIZE,
    num_layers=NUM_HIDDEN_LAYERS,
    reduction_ratio=REDUCTION_RATIO,
    dropout_rate=DROPOUT_RATE
).to(DEVICE, dtype=torch.float32)

n_params = sum([p.numel() for p in model.parameters()])
print(f"Model instantiatied with {n_params} parameters.")
# --- 4. Loss and Optimizer ---

# Switched back to MSELoss as requested in the original code block
criterion_primary = nn.MSELoss().to(DEVICE) 
optimizer = optim.AdamW(model.parameters(), lr=init_LR, weight_decay = weight_decay)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_LR, 
                                          steps_per_epoch=len(train_loader), 
                                          epochs=n_epochs)

# --- 5. Training Loop (Unchanged) ---

print("-" * 30)
print(f"Starting Training for {n_epochs} epochs...")
model.train() 

for epoch in range(1, n_epochs + 1):
    for inputs_batch, targets_batch in train_loader:
        # Backward and optimize
        optimizer.zero_grad()
        
        # Forward pass:
        outputs = model(inputs_batch)
        
        # Calculate loss (outputs: [N, 1], targets: [N, 1])
        loss = criterion_primary(outputs, targets_batch)
        
        
        loss.backward()
        optimizer.step()
        scheduler.step()

    if epoch % 10 == 0 or epoch == 1:
        avg_loss = loss 
        print(f'Epoch [{epoch:05d}/{n_epochs}], Loss: {avg_loss:.6f}')
        model.eval()
        with torch.no_grad():
            val_losses = []
            for val_inputs, val_targets in val_loader:
                val_preds = model(val_inputs)
                val_losses.append(criterion_primary(val_preds,val_targets).item())

        print(f'Validation loss: {np.mean(val_losses):.6f}')
print("-" * 30)

DNN Architecture built: 963 -> [481, 240, 120, 60, 30, 15, 7, 4, 1]
Model weights initialized with Xavier/Glorot.
Model instantiatied with 617988 parameters.
------------------------------
Starting Training for 1000 epochs...
Epoch [00001/1000], Loss: 1552.913696
Validation loss: 18.472941
Epoch [00010/1000], Loss: 6.316211
Validation loss: 5.749801
Epoch [00020/1000], Loss: 4.706806
Validation loss: 3.751177
Epoch [00030/1000], Loss: 3.517475
Validation loss: 3.020003
Epoch [00040/1000], Loss: 2.530788
Validation loss: 2.806074
Epoch [00050/1000], Loss: 1.581303
Validation loss: 2.971395
Epoch [00060/1000], Loss: 0.988292
Validation loss: 2.739290
Epoch [00070/1000], Loss: 1.183306
Validation loss: 2.796676
Epoch [00080/1000], Loss: 1.374236
Validation loss: 2.763794
Epoch [00090/1000], Loss: 1.179546
Validation loss: 2.700817
Epoch [00100/1000], Loss: 1.116433
Validation loss: 2.594989
Epoch [00110/1000], Loss: 0.805130
Validation loss: 2.718849
Epoch [00120/1000], Loss: 0.829813
Val