In [None]:
#from hybridpredictmaize22.hybridpredictmaize22.snpCompression import *
from hybridpredictmaize22.GEMlearn import *
from hybridpredictmaize22.GEMdataset import *
from hybridpredictmaize22.snpCompression import *

from pathlib import Path
import os

import allel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm as tqdm
from sklearn.decomposition import PCA



from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim

import fastcore.all as fc
from collections.abc import Mapping
from pathlib import Path
from operator import attrgetter,itemgetter
from functools import partial
from copy import copy
from contextlib import contextmanager
from warnings import warn


In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler


#| export
class newGemDataset():
    """
    Map-style dataset (``__len__`` / ``__getitem__``) usable with a PyTorch DataLoader.

    Parameters
    ----------
    W : sequence ``(env_keys, weather)``
        ``weather`` is indexed along its first axis by the position of an
        environment key inside ``env_keys``.
    Y : sequence ``(targets, env_keys, hybrid_keys, ...)``
        All entries are indexed by sample; ``targets`` must expose ``.shape``.
    G : sequence ``(hybrid_keys, snp_matrix)``
        ``snp_matrix`` is indexed along its second axis (``[:, col]``) by the
        position of a hybrid key inside ``hybrid_keys``.
    def_device : str
        Stored on ``self.device``; not used by the dataset itself.
    """
    def __init__(self,W,Y,G, def_device='cpu'):
        self.W = W
        self.SNP = G
        self.Y = Y
        self.device = def_device

    def __len__(self): return self.Y[0].shape[0]

    @staticmethod
    def _first_match(keys, key, what):
        """Return the position of the first entry of ``keys`` equal to ``key``."""
        hits = np.where(np.asarray(keys) == key)[0]
        if hits.size == 0:
            # IndexError is what the bare ``[0][0]`` lookup raised before; keep the
            # type but say which key is missing.
            raise IndexError(f"{what} {key!r} not found")
        return hits[0]

    def __getitem__(self,idx):
        """Return ``(y, g, w)``: target, SNP column and weather block for sample ``idx``."""
        y = self.Y[0][idx]
        e = self.Y[1][idx]
        h = self.Y[2][idx]

        #weather
        w = self.W[1][self._first_match(self.W[0], e, 'environment')]

        #snp -- read from the G passed to __init__, not from a notebook global
        g = self.SNP[1][:, self._first_match(self.SNP[0], h, 'hybrid')]
        return y,g,w


#| export
class ST():
    """
    Holds the trait data for the entire dataset, split into train/test by year,
    min-max scaled and converted to NumPy arrays.

    init
        yield_data -> pandas table with 'Env', 'Hybrid', 'Date_Planted' and the trait columns
        testYear -> e.g. 2019. rows whose 'Env' string contains this year become the Test Set

    After construction ``Tr`` and ``Te`` are tuples
    ``(traits, env, hybrid, date_planted)`` of NumPy arrays, where ``traits`` has
    one column per entry of ``secondary_traits``.
    """
    def __init__(self, yield_data, testYear):
        year_tag = str(testYear)
        in_test_year = [year_tag in env for env in yield_data['Env']]

        self.Te = yield_data.iloc[in_test_year, :].reset_index()
        self.Tr = yield_data.iloc[[not flag for flag in in_test_year], :].reset_index()

        # Trait columns that are scaled and exported; the commented-out names are
        # other columns of the table that are currently disabled.
        self.secondary_traits = [
               # 'Stand_Count_plants',
               # 'Pollen_DAP_days',
               # 'Silk_DAP_days',
               # 'Plant_Height_cm',
               # 'Ear_Height_cm',
                #'Root_Lodging_plants',
                #'Stalk_Lodging_plants',
               # 'Twt_kg_m3',
                'Yield_Mg_ha',
                #'Date_Harvested'
                ]

        # The scaler is fitted on the training rows only and then applied to both splits.
        self.setup_scaler()
        self.scale_data(self.Tr)
        self.scale_data(self.Te)

        # From here on Tr / Te are tuples of arrays, no longer DataFrames.
        self.make_arrays(self.Tr)
        self.make_arrays(self.Te, False)

    def setup_scaler(self):
        """Fit a MinMaxScaler on the training trait columns and store it on ``self.scaler``."""
        ss = MinMaxScaler()
        ss.fit(np.array(self.Tr[self.secondary_traits]))
        self.scaler = ss

    def scale_data(self,df):
        """Overwrite the trait columns of ``df`` in place with their scaled values."""
        scaled_secondary = self.scaler.transform(np.array(df[self.secondary_traits]))
        for c,i in enumerate(self.secondary_traits):
            df[i] = scaled_secondary[:,c]

    def _trait_columns(self):
        """Yield ``(trait_name, train_values, test_values)`` for every exported trait.

        Reads column ``c`` of the trait matrices stored as element 0 of the
        ``Tr`` / ``Te`` tuples built by ``make_arrays``.
        """
        for c,i in enumerate(self.secondary_traits):
            yield i, self.Tr[0][:,c], self.Te[0][:,c]

    def plot_yields(self):
        """Overlay train/test histograms of every exported (scaled) trait."""
        # Tr / Te are tuples of arrays after __init__, so the trait values are
        # taken by column position rather than by DataFrame column name.
        for i, train_values, test_values in self._trait_columns():
            plt.hist(train_values,density=True, label='Train',alpha=.5,bins=50)
            plt.hist(test_values,density=True, label='Test',alpha=.5,bins=50)
            plt.legend()
            plt.title(i)
            plt.show()

    def make_arrays(self,df,train=True):
        """Store ``df`` as ``(traits, env, hybrid, date_planted)`` arrays on ``Tr`` or ``Te``."""
        df = np.array(df[self.secondary_traits]), np.array(df['Env']) , np.array(df['Hybrid']), np.array(df['Date_Planted'])
        if train:
            self.Tr = df
        else:
            self.Te= df

#| export
class newWT():
    """
    Holds the weather data for the entire dataset, split into train/test by year,
    min-max scaled and stacked into one array per split.

    init
        weather_data -> pandas table with 'Env' and a string 'Year' column
        testYear -> e.g. 2019. rows whose 'Year' string contains this year become the Test Set

    After construction ``Tr`` and ``Te`` are tuples ``(env_names, weather)`` where
    ``weather[k]`` is the block of rows belonging to ``env_names[k]``.
    """
    def __init__(self, weather_data, testYear):

        self.Te = weather_data.iloc[([str(testYear) in x for x in weather_data['Year']]),:].reset_index()
        self.Tr = weather_data.iloc[([str(testYear) not in x for x in weather_data['Year']]),:].reset_index()

        # The scaler is fitted on the training rows only and then applied to both splits.
        self.setup_scaler()
        self.scale_data(self.Tr)
        self.scale_data(self.Te)

        # From here on Tr / Te are tuples of arrays, no longer DataFrames.
        self.make_array(self.Tr)
        self.make_array(self.Te,False)

    def setup_scaler(self):
        """Fit a MinMaxScaler on the float columns of the training frame."""
        ss = MinMaxScaler()
        ss.fit(self.Tr.select_dtypes('float'))
        self.scaler = ss

    def scale_data(self, df):
        """Overwrite the float columns of ``df`` in place with their scaled values."""
        fd = df.select_dtypes('float')
        fs = self.scaler.transform(fd)
        df[fd.columns] = fs

    def make_array(self, df,train = True):
        """Stack the per-environment rows of ``df`` and store the result on ``Tr`` or ``Te``.

        Environments are taken in order of first appearance in ``df['Env']`` so the
        layout is reproducible between runs. Every environment must contribute the
        same number of rows, otherwise ``np.stack`` raises ValueError.
        """
        env_order = list(df['Env'].unique())

        # Columns 4..-2 are treated as the weather features.
        # NOTE(review): assumes the first four columns (after reset_index) and the
        # last column are non-feature columns -- confirm against the CSV layout.
        blocks = [np.array(df[df['Env'] == env].iloc[:,4:-1]) for env in env_order]

        if blocks:
            # One stack at the end instead of re-copying the array for each environment.
            weather_array = np.stack(blocks, axis=0)
        else:
            # No rows for this split: keep the tuple layout with zero environments.
            weather_array = np.empty((0,) + np.array(df.iloc[:,4:-1]).shape)

        result = (np.array(env_order), weather_array)
        if train:
            self.Tr = result
        else:
            self.Te = result

In [None]:
# ---- configuration -------------------------------------------------------
test_split = 2019
test_year=2019

path_snps = Path('./data/snpCompress/')
data_path = Path('./data/Training_Data/')
path_train_weatherTable = data_path/'4_Training_Weather_Data_2014_2021.csv'
path_train_yieldTable = data_path/'1_Training_Trait_Data_2014_2021.csv'
# NOTE(review): this says 'PCS_100' but the SNP profiles below are read from the
# hard-coded 'PCS_50' directory; neither path_snps nor snp_compression is used
# in this cell -- confirm which compression level is intended.
snp_compression = 'PCS_100'
batch_size = 64

# ---- load ----------------------------------------------------------------
# SNP profiles; element 0 is used below as the collection of hybrid names.
snp_data = collect_snps(Path('./data/snpCompress/PCS_50/'))

# Trait table, keeping only plots that have a recorded yield.
yield_data = pd.read_csv(path_train_yieldTable)
yield_data = yield_data.loc[yield_data['Yield_Mg_ha'].notna()]

# Weather table; the year is the second '_'-separated token of 'Env'.
weather_data = pd.read_csv(path_train_weatherTable)
weather_data['Year'] = [env.split('_')[1] for env in weather_data['Env']]

# ---- drop yield rows whose environment has no weather data ----------------
setYield = set(yield_data['Env'])
setWeather = set(weather_data['Env'])
only_yield = setYield.difference(setWeather)
only_weather = setWeather.difference(setYield)
has_weather = [env not in only_yield for env in yield_data['Env']]
yield_data = yield_data.iloc[has_weather, :]

# ---- drop yield rows whose hybrid has no genotype data --------------------
setSNP = set(snp_data[0])
setYield = set(yield_data['Hybrid'])
only_yield = setYield.difference(setSNP)
has_genotype = [hybrid not in only_yield for hybrid in yield_data['Hybrid']]
yield_data = yield_data.iloc[has_genotype, :]



In [None]:
class EfficientNet2(nn.Module):
    """
    1-D convolutional feature extractor: five Conv1d -> BatchNorm1d -> ReLU stages.

    The first three stages widen the channels (in_chan -> 32 -> 64 -> 128); the
    last two are depthwise (``groups=128``) and keep 128 channels. Stages 2, 3
    and 5 use stride 2.

    ``fc`` (128 -> num_classes) is created but is not applied in ``forward``, so
    ``num_classes`` does not influence the output.
    """
    def __init__(self, in_chan = 1 , num_classes=100):
        super().__init__()

        # Settings shared by every convolution in the network.
        shared = dict(kernel_size=3, padding=1, bias=False)

        # Standard convolutions.
        self.conv1 = nn.Conv1d(in_chan, 32, stride=1, **shared)
        self.bn1 = nn.BatchNorm1d(32)
        self.conv2 = nn.Conv1d(32, 64, stride=2, **shared)
        self.bn2 = nn.BatchNorm1d(64)
        self.conv3 = nn.Conv1d(64, 128, stride=2, **shared)
        self.bn3 = nn.BatchNorm1d(128)

        # Depthwise convolutions: one filter per channel.
        self.dwconv1 = nn.Conv1d(128, 128, stride=1, groups=128, **shared)
        self.bn4 = nn.BatchNorm1d(128)
        self.dwconv2 = nn.Conv1d(128, 128, stride=2, groups=128, **shared)
        self.bn5 = nn.BatchNorm1d(128)

        # Classification head (currently unused by forward).
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the feature maps of the last stage, shape ``(batch, 128, length_out)``.

        A 2-D input ``(batch, length)`` gets a channel axis inserted at position 1.
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)

        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.dwconv1, self.bn4),
            (self.dwconv2, self.bn5),
        )
        for conv, norm in stages:
            x = F.relu(norm(conv(x)))

        # The feature maps are returned as-is; the fc head is intentionally not applied.
        return x
    
class EfficientNet3(nn.Module):
    """
    1-D convolutional feature extractor: five Conv1d -> BatchNorm1d -> ReLU stages.

    Channels go in_chan -> 32 -> 64 -> 128 -> 128 -> 128; stages 2, 3 and 5 use
    stride 2. Although the last two layers are named ``dwconv*``, they are
    ordinary full convolutions here (no ``groups`` argument).

    ``fc`` (128 -> num_classes) is created but is not applied in ``forward``, so
    ``num_classes`` does not influence the output.
    """
    def __init__(self, in_chan = 1 , num_classes=100):
        super().__init__()

        # Settings shared by every convolution in the network.
        shared = dict(kernel_size=3, padding=1, bias=False)

        # Channel-widening convolutions.
        self.conv1 = nn.Conv1d(in_chan, 32, stride=1, **shared)
        self.bn1 = nn.BatchNorm1d(32)
        self.conv2 = nn.Conv1d(32, 64, stride=2, **shared)
        self.bn2 = nn.BatchNorm1d(64)
        self.conv3 = nn.Conv1d(64, 128, stride=2, **shared)
        self.bn3 = nn.BatchNorm1d(128)

        # Constant-width convolutions (full, not grouped).
        self.dwconv1 = nn.Conv1d(128, 128, stride=1, **shared)
        self.bn4 = nn.BatchNorm1d(128)
        self.dwconv2 = nn.Conv1d(128, 128, stride=2, **shared)
        self.bn5 = nn.BatchNorm1d(128)

        # Classification head (currently unused by forward).
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the feature maps of the last stage, shape ``(batch, 128, length_out)``.

        A 2-D input ``(batch, length)`` gets a channel axis inserted at position 1.
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)

        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.dwconv1, self.bn4),
            (self.dwconv2, self.bn5),
        )
        for conv, norm in stages:
            x = F.relu(norm(conv(x)))

        # The feature maps are returned as-is; the fc head is intentionally not applied.
        return x

class NNEnsemble3(torch.nn.Module):
    """
    Two-branch ensemble: encode ``g`` and ``w`` with separate models, flatten,
    concatenate and map to a single output.

    Parameters
    ----------
    hidden_list : sequence of int
        ``hidden_list[0..2]`` size the ``lin1`` / ``lin2`` layers. Those layers
        are created but are not part of ``self.layers`` and are therefore not
        applied in ``forward``.
        NOTE(review): the hidden stack looks unfinished -- confirm whether
        lin1/lin2 were meant to be appended to ``self.layers``.
    models_list : sequence
        ``models_list[0]`` encodes ``g`` and ``models_list[1]`` encodes ``w``.
    alpha : float or None
        If given, Xavier-normal gain applied to the weights of every module in
        ``self.layers`` (which is empty at present).
    """
    def __init__(self, hidden_list, models_list, alpha=None):
        super().__init__()

        # Register the encoders as sub-modules so that parameters(), .to(device),
        # train()/eval() and state_dict() reach them. A plain list hides them.
        if all(isinstance(m, nn.Module) for m in models_list):
            self.models = nn.ModuleList(models_list)
        else:
            # Plain callables cannot be registered; keep them as they were passed.
            self.models = list(models_list)

        self.layers = nn.ModuleList()
        self.lin1 = torch.nn.Linear(hidden_list[0], hidden_list[1])
        self.lin2 = torch.nn.Linear(hidden_list[1], hidden_list[2])

        if alpha is not None:
            for l in self.layers:
                torch.nn.init.xavier_normal_(l.weight,gain=alpha)

        # Input width is inferred from the first batch, because it depends on the
        # flattened size of the two encoder outputs.
        self.out = nn.LazyLinear(1)

    def forward(self,x):
        """``x`` is a pair ``(g, w)``; returns a tensor of shape ``(batch, 1)``."""
        g,w = x
        g = self.models[0](g)
        w = self.models[1](w)

        # Collapse (batch, channels, length) encoder outputs to (batch, features).
        if w.dim() == 3:
            w = w.flatten(start_dim=1)
        if g.dim() == 3:
            g = g.flatten(start_dim=1)

        x = torch.cat((g,w),dim=1)
        for layer in self.layers:
            x = torch.nn.functional.relu(layer(x))
        return self.out(x)

def moving_average(arr, window_size):
    """Unweighted sliding-window mean of a sequence.

    Parameters:
    arr (np.ndarray): Input values, shape (n_samples,).
    window_size (int): Number of consecutive samples averaged per output value.

    Returns:
    np.ndarray: float array of shape (n_samples - window_size + 1,); element k
    is the mean of ``arr[k:k + window_size]``.
    """
    n_windows = len(arr) - window_size + 1
    averages = np.zeros(n_windows)

    # One mean per window start position.
    for start in range(n_windows):
        stop = start + window_size
        averages[start] = np.mean(arr[start:stop])

    return averages


In [None]:

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_steps = 2000  # number of optimisation steps (mini-batches), not epochs

# Encoders: gm receives g and wm receives w (positions 0 and 1 of the list below).
gm = EfficientNet3(num_classes=1500).to(device)
wm = EfficientNet3(300, num_classes=1500).to(device)

# The model and both encoders must live on the same device as the batches.
model = NNEnsemble3([2000,1000,1000], [gm,wm],alpha=2).to(device)

# Optimise the ensemble head and both encoders. Parameters are de-duplicated by
# identity so each tensor is handed to the optimizer exactly once, whether or
# not the ensemble exposes the encoders through model.parameters().
trainable = list({id(p): p for module in (model, gm, wm) for p in module.parameters()}.values())
opt = optim.Adamax(trainable, lr= .00005, weight_decay=.0001)
#scheduler = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.1, patience=200, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-06)
loss_func = torch.nn.functional.huber_loss

tr_loss = []
te_loss = []
te_MA = []
predicts = []
targets = []

# NOTE(review): tr_dataloader is not created in this cell; it must already exist
# in the kernel (a DataLoader yielding (y, g, w) batches).
# Keep ONE iterator alive and restart it when exhausted. Calling
# next(iter(loader)) every step rebuilds the iterator (and its worker
# processes) each time and, without shuffling, always returns the first batch.
batch_iter = iter(tr_dataloader)

for i in tqdm(range(n_steps)):
    model.train()

    try:
        y,g,w = next(batch_iter)
    except StopIteration:
        batch_iter = iter(tr_dataloader)
        y,g,w = next(batch_iter)

    # The last column of the target matrix is the value being regressed.
    y = y[:,-1].type(torch.float32).to(device)
    g = g.type(torch.float32).to(device)
    w = w.type(torch.float32).to(device)

    opt.zero_grad()
    preds = model((g,w)).squeeze(1)

    # huber_loss(input, target): predictions first, targets second.
    loss = loss_func(preds, y)
    loss.backward()
    opt.step()

    tr_loss.append(loss.item())




X


  0%|                                                                                         | 0/2000 [00:00<?, ?it/s]


IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/burbank/miniconda3/envs/fastai/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/burbank/miniconda3/envs/fastai/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/burbank/miniconda3/envs/fastai/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_6602/2669084330.py", line 21, in __getitem__
    d = self.Y[3][idx]
IndexError: list index out of range
