Download `agri_challenge_data.zip` from this [folder](https://drive.google.com/drive/folders/1CmbccQ7JG3a8IQ_U73806v9iCqLPIlL_?usp=sharing) and upload it to the notebook session.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
#from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
import requests
from sklearn.preprocessing import OneHotEncoder
import os
import cv2 as cv
import rasterio
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [4]:
# Per-band normalization statistics: `norm` looks up minmax[band_index]['min']
# and minmax[band_index]['max'].
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# this from the trusted challenge archive.
# The context manager closes the file even if unpickling fails.
with open('bandwise_minmax.pkl', 'rb') as f:
    minmax = pickle.load(f)

def makedir(path):
    """Create directory *path* (including missing parents) if it is absent.

    Calling this on a directory that already exists is a no-op.
    ``exist_ok=True`` replaces the former exists-then-create check, which
    could fail if another process created the directory between the check
    and the ``makedirs`` call.

    Raises:
        FileExistsError: if *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def norm(band, index):
  """Min-max scale *band* using the stored statistics for band *index*.

  The minimum and maximum are read from the module-level ``minmax`` lookup
  (``minmax[index]['min']`` / ``minmax[index]['max']``); the result is
  ``(band - min) / (max - min)``.
  """
  lower = minmax[index]['min']
  upper = minmax[index]['max']
  shifted = band - lower
  span = upper - lower
  return shifted / span

def get_image(image_path):
  """Load a satellite raster and return its first six bands, normalized.

  Each of bands 0-5 is cast to float32, min-max normalized with ``norm``
  using the statistics for that band index, and the six results are stacked
  with ``np.dstack`` (bands end up on the last axis).

  Args:
    image_path: path to a raster file readable by rasterio.

  Returns:
    The stacked, normalized bands as a numpy array.

  Raises:
    IndexError: if the raster has fewer than six bands.
  """
  # The context manager closes the dataset even if read() raises.
  with rasterio.open(image_path, 'r') as src:
    S_images = src.read()
  # An explicit index tuple (not the builtin range) is used on purpose: the
  # notebook assigns a module-level variable named `range` in its loops.
  band_indices = (0, 1, 2, 3, 4, 5)
  channels = [norm(S_images[i].astype(np.float32), i) for i in band_indices]
  return np.dstack(channels)

def process_image(satellite_image):
  """Trim, orient and resize a 3-D (height, width, channels) image.

  Steps:
    1. Drop a one-pixel border on every side.
    2. If the trimmed image is taller than it is wide, swap its first two
       axes so that width >= height.
    3. Resize to 25 x 25 using OpenCV cubic interpolation.
  """
  trimmed = satellite_image[1:-1, 1:-1, :]
  height, width, _ = trimmed.shape
  if width < height:
    trimmed = trimmed.transpose(1, 0, 2)
  return cv.resize(trimmed, (25, 25), interpolation=cv.INTER_CUBIC)

def get_age(year, location, time_point, planting_date):
  """Return the whole days between planting and image collection.

  The collection date is looked up in the module-level ``doc`` mapping as
  ``doc[str(year)][location][time_point]``; both dates are parsed with
  ``pd.to_datetime`` before subtracting.
  """
  raw_collection_date = doc[str(year)][location][time_point]
  planted_on = pd.to_datetime(planting_date)
  collected_on = pd.to_datetime(raw_collection_date)
  elapsed = collected_on - planted_on
  return elapsed.days

In [16]:
# Load the 2023 train and validation split tables (in that order).
train_split, val_split = (
    pd.read_csv(csv_path)
    for csv_path in ('data_processed/train_split_from_2023.csv',
                     'data_processed/val_split_from_2023.csv')
)

In [20]:
# Load the per-year lookups (later indexed as [location][time_point] to get
# an image collection date) and the 2023 data split.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# these from the trusted challenge archive.
# Each context manager closes its file even if unpickling fails.
with open('splits/doc_2022.pkl', 'rb') as f:
    doc_2022 = pickle.load(f)
with open('splits/doc_2023.pkl', 'rb') as f:
    doc_2023 = pickle.load(f)
with open('data_processed/2023_data_split.pkl', 'rb') as f:
    data_split = pickle.load(f)

In [22]:
# Year (as a string) -> per-location lookup used by get_age.
doc = {'2022': doc_2022, '2023': doc_2023}
# Make the 2022 'Missouri Valley' entry also reachable under the short
# 'MOValley' name (this adds the key to doc_2022 itself, as doc['2022'] is
# the same object).
doc['2022']['MOValley'] = doc['2022']['Missouri Valley']

Steps in data preprocessing:


*   Remove Scottsbluff from training set (scottsbluff plot length is different from all other data)
*   Remove rows with nan
*   Remove genotype (2023 val set has two genotypes that are new) and irrigationProvided
*   Divide yield by max yield. Store max yield in metadata
*   Convert location, N_level into one hot encoded form
*   Get the satellite images for each row. Transpose the image if its width is shorter than its height, resize it to 25 by 25, and min-max normalize each channel of the image.
*   For each satellite image, calculate the age as the difference between the planting date and the time point. Divide the age by 365.
*   Make a list of data samples and another list of yields
*   Divide the list into training vs validation
*   Save as numpy array or pickle file



In [25]:
# Pool the provided train and validation splits into one table; a fresh
# train/validation split is drawn from it later in the notebook.
all_data = pd.concat([train_split, val_split])

In [27]:
# Preview the first rows of the pooled table.
all_data.head()

Unnamed: 0,year,location,irrigationProvided,experiment,range,row,N_level,genotype,planting_date,yield
0,2023,Lincoln,0,150,5,20,Medium,C.I. 540 X I159,5/16/23,22.62
1,2023,Lincoln,0,225,31,18,High,B73 X PHM49,5/16/23,102.38
2,2023,Lincoln,0,150,5,12,Medium,B73 X PHN82,5/16/23,119.66
3,2023,MOValley,0,2125,14,11,Medium,LH198 X PHB47,5/2/23,182.09
4,2023,MOValley,0,2125,28,20,Medium,CI 3A X I159,5/2/23,67.4


In [29]:
# Preprocessing constants saved alongside the arrays (the maximum yield used
# for target scaling is stored here under 'max_yield').
metadata = {}

In [31]:
# Exclude Scottsbluff plots and drop the columns that are not used as
# features.
# NOTE(review): the step list in this notebook mentions removing rows with
# NaN, but no dropna happens here — confirm the split CSVs are NaN-free.
all_data = (
    all_data
    .loc[all_data['location'] != 'Scottsbluff']
    .drop(columns=['genotype', 'irrigationProvided'])
)
# Scale yield by its maximum; the maximum is kept in metadata so the scaling
# can be undone later.
metadata['max_yield'] = all_data['yield'].max()
all_data['yield'] = all_data['yield'] / metadata['max_yield']

In [33]:
# One-hot encode the nitrogen level and append the indicator columns.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data = encoder.fit_transform(all_data[['N_level']])
n_level_columns = encoder.get_feature_names_out(['N_level'])
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_data, columns=n_level_columns
).reset_index(drop=True)
# Both frames get a fresh 0..n-1 index so the column-wise concat lines the
# rows up by position.
all_data = all_data.reset_index(drop=True)
all_data_encoded = pd.concat([all_data, encoded_df], axis=1)

In [35]:
# The categorical column is redundant once its one-hot columns exist.
all_data_encoded = all_data_encoded.drop(columns=['N_level'])
all_data_encoded.head()

Unnamed: 0,year,location,experiment,range,row,planting_date,yield,N_level_High,N_level_Low,N_level_Medium
0,2023,Lincoln,150,5,20,5/16/23,0.081232,0.0,0,1.0
1,2023,Lincoln,225,31,18,5/16/23,0.367665,1.0,0,0.0
2,2023,Lincoln,150,5,12,5/16/23,0.429721,0.0,0,1.0
3,2023,MOValley,2125,14,11,5/2/23,0.653918,0.0,0,1.0
4,2023,MOValley,2125,28,20,5/2/23,0.242046,0.0,0,1.0


In [37]:
# 80/20 split with a fixed seed. The split is made on table rows, before
# rows are expanded into one sample per time point, so all images of a row
# land in the same split.
train_df, val_df = train_test_split(all_data_encoded, test_size=0.2, random_state=42)

In [39]:
# Build the training arrays: one sample per (table row, time point) whose
# satellite image exists on disk. Paths of missing images are printed.
train_images = []
train_numerical_features = []
train_targets = []

# iterrows() has no length, so pass total explicitly to get a real progress bar.
for index, row in tqdm(train_df.iterrows(), total=len(train_df)):

  location = row['location']
  year = row['year']
  if year == 2023:
    # 2023 images are read from 'satellite_images/2023_train'.
    year = '2023_train'
  experiment = row['experiment']
  if experiment == 'Hyrbrids':
    # Map the misspelled experiment name to the spelling used in file names.
    experiment = 'Hybrids'
  # Deliberately not named `range`: assigning to that name at notebook level
  # would replace the builtin range() for the rest of the session.
  plot_range = row['range']
  row_ = row['row']
  planting_date = row['planting_date']
  time_points = sorted(os.listdir(os.path.join('satellite_images', str(year), location)))

  for time_point in time_points:
    folder_path = os.path.join('satellite_images', str(year), location, time_point)
    image_name = f'{location}-{time_point}-{experiment}_{plot_range}_{row_}.TIF'
    image_path = os.path.join(folder_path, image_name)
    if os.path.exists(image_path):
      satellite_image = get_image(image_path)
      processed_image = process_image(satellite_image)
      # get_age receives the original year value, not the '2023_train'
      # folder name.
      age = get_age(row['year'], location, time_point, planting_date)
      age = age/365
      train_images.append(processed_image)
      train_numerical_features.append([row['N_level_High'],
                     row['N_level_Low'],
                     row['N_level_Medium'],
                     age])
      train_targets.append(row['yield'])
    else:
      print(image_path)

train_images = np.array(train_images)
train_numerical_features = np.array(train_numerical_features)
train_targets = np.array(train_targets)

514it [00:10, 48.54it/s]


In [41]:
# Build the validation arrays: one sample per (table row, time point) whose
# satellite image exists on disk. Paths of missing images are printed.
val_images = []
val_numerical_features = []
val_targets = []

# iterrows() has no length, so pass total explicitly to get a real progress bar.
for index, row in tqdm(val_df.iterrows(), total=len(val_df)):

  location = row['location']
  year = row['year']
  if year == 2023:
    # 2023 images are read from 'satellite_images/2023_train'.
    year = '2023_train'
  experiment = row['experiment']
  if experiment == 'Hyrbrids':
    # Map the misspelled experiment name to the spelling used in file names.
    experiment = 'Hybrids'
  # Deliberately not named `range`: assigning to that name at notebook level
  # would replace the builtin range() for the rest of the session.
  plot_range = row['range']
  row_ = row['row']
  planting_date = row['planting_date']
  time_points = sorted(os.listdir(os.path.join('satellite_images', str(year), location)))

  for time_point in time_points:
    folder_path = os.path.join('satellite_images', str(year), location, time_point)
    image_name = f'{location}-{time_point}-{experiment}_{plot_range}_{row_}.TIF'
    image_path = os.path.join(folder_path, image_name)
    if os.path.exists(image_path):
      satellite_image = get_image(image_path)
      processed_image = process_image(satellite_image)
      # get_age receives the original year value, not the '2023_train'
      # folder name.
      age = get_age(row['year'], location, time_point, planting_date)
      age = age/365
      val_images.append(processed_image)
      val_numerical_features.append([row['N_level_High'],
                     row['N_level_Low'],
                     row['N_level_Medium'],
                     age])
      val_targets.append(row['yield'])
    else:
      print(image_path)

val_images = np.array(val_images)
val_numerical_features = np.array(val_numerical_features)
val_targets = np.array(val_targets)

129it [00:02, 47.79it/s]


In [43]:
# Bundle every array plus the preprocessing metadata into one object so it
# can be saved as a single file.
data_dict = dict(
    train_images=train_images,
    train_numerical_features=train_numerical_features,
    train_targets=train_targets,
    val_images=val_images,
    val_numerical_features=val_numerical_features,
    val_targets=val_targets,
    metadata=metadata,
)

In [45]:
# Persist the processed dataset. The context manager flushes and closes the
# file even if pickling fails part-way through.
with open('processed_data_minmax_normalized_trimmed_2023.pkl', 'wb') as f:
    pickle.dump(data_dict, f)