In [22]:
import pandas as pd
import numpy as np
import pickle
import Bio   

from Bio.PDB import *
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
import ipywidgets
import plotly.express as px
import plotly.graph_objects as go
from jproperties import Properties
#from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModel, TrainingArguments, Trainer
from motif_utils import seq2kmer # Sourced from https://github.com/jerryji1993/DNABERT
import torch
#from datasets import Dataset
#import evaluate
import json
#from load_data import create_dataset, explode_dna
import wandb

from transformers.onnx import FeaturesManager
import transformers
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import torchvision
from torchvision import datasets, transforms
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import natsort
from PIL import Image
from tqdm.auto import tqdm
import torch.nn as nn
from transformers import AdamW
from torch.utils.data.sampler import SubsetRandomSampler
import pprint
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

In [23]:
# Report whether PyTorch can see a CUDA device; the recorded output of this cell is False.
torch.cuda.is_available()

False

In [24]:
# Image preprocessing shared by every sample: resize, then convert the PIL image to a tensor.
# NOTE(review): Resize is given a single int here; confirm that this yields identically
# shaped tensors for all source images (batching needs equal shapes) and matches the
# input size the ResNet backbone is expected to receive.
transform = transforms.Compose(
    [
        transforms.Resize(255),
        transforms.ToTensor(),
    ]
)
#dataset = datasets.ImageFolder('./data/MouseImages/', transform=transform)

In [25]:
# Load the two pickled tables the rest of the notebook works from:
#   protein -> table read from gene_symbol_protein_sequences.pkl
#   effect  -> table read from the body-weight effect-size analysis pickle
# NOTE(review): unpickling executes arbitrary code; only load these files from a trusted source.
protein = pd.read_pickle("./data/gene_symbol_protein_sequences.pkl")
effect = pd.read_pickle("./data/capstone_body_weight_Statistical_effect_size_analysis_genotype_early_adult_scaled_13022023_gene_symbol_harmonized.pkl")

def filter_data(effect, protein):
    '''
    Join the effect-size table to the protein table and keep only the columns used downstream.

    effect  -> dataframe providing "gene_symbol_harmonized", "est_f_ea" and "p_f_ea"
    protein -> dataframe providing "gene_symbol_harmonized" and "UniqueIdentifier"

    Returns a dataframe with the columns "gene_symbol_harmonized", "UniqueIdentifier"
    and "est_f_ea". The join is a left join on the gene symbol, so effect rows without
    a matching protein row are kept with a missing "UniqueIdentifier".
    '''
    join_key = "gene_symbol_harmonized"
    effect_columns = [join_key, 'est_f_ea', 'p_f_ea']
    output_columns = [join_key, "UniqueIdentifier", "est_f_ea"]

    trimmed_effect = effect[effect_columns].copy()
    merged = trimmed_effect.merge(protein, how="left", left_on=join_key, right_on=join_key)
    return merged[output_columns]
# Merged table (gene symbol, UniqueIdentifier, est_f_ea) that the dataset class below reads its targets from.
df = filter_data(effect, protein)
   
        

In [26]:
class CustomDataSet(Dataset):
    '''
    PyTorch dataset pairing each image file with the effect estimate of its accession.

    Each item is a dict:
    x -> image tensor produced by ``transform``
    y -> float32 scalar tensor holding the ``est_f_ea`` value for the image's accession
    The accession of item ``i`` is kept in ``self.name[i]`` and its path in ``self.x[i]``.
    '''
    def __init__(self, main_dir, transform, effect_df):
        '''
        main_dir  -> directory containing the image files; the accession is taken to be the
                     second "-"-separated token of each file name
        transform -> callable applied to the RGB PIL image in ``__getitem__``
        effect_df -> dataframe with "UniqueIdentifier" and "est_f_ea" columns; when an
                     accession occurs more than once, the first row wins

        Files whose name has no "-" or whose accession is absent from ``effect_df`` are skipped.
        '''
        self.main_dir = main_dir
        self.transform = transform

        # Build the accession -> estimate lookup once instead of rescanning the dataframe
        # for every file. keep="first" reproduces the "first matching row" semantics.
        unique_rows = effect_df.drop_duplicates(subset="UniqueIdentifier", keep="first")
        estimate_by_accession = dict(zip(unique_rows.UniqueIdentifier, unique_rows.est_f_ea.values))

        self.x =[]
        self.y =[]
        self.name=[]
        # os.listdir returns entries in arbitrary order; sorting keeps sample indices stable
        # between runs, which matters for index-based splitting.
        for img in sorted(os.listdir(main_dir)):
            parts = img.split("-")
            if len(parts) < 2:
                # Not named like "<prefix>-<accession>-...": ignore stray files instead of crashing.
                continue
            accenssion = parts[1]
            if accenssion in estimate_by_accession:
                self.x.append(os.path.join(self.main_dir, img))
                self.name.append(accenssion)
                self.y.append(estimate_by_accession[accenssion])
        print(len(self.x))
        print(len(self.y))

    def __len__(self):
        '''Number of images that were matched to an effect estimate.'''
        return len(self.x)

    def __mean__(self):
        '''Mean of the current target values (not a Python protocol method; call it explicitly).'''
        return np.mean(self.y)

    def __getitem__(self, idx):
        '''Load image ``idx`` as RGB, apply the transform and return {"x": tensor, "y": target}.'''
        # The context manager closes the underlying file once the pixels have been converted.
        with Image.open(self.x[idx]) as raw_image:
            image = raw_image.convert("RGB")
        tensor_image = self.transform(image)
        # Force float32: np.float64 targets would otherwise become float64 tensors, while the
        # model in this notebook produces float32 outputs.
        return {"x":tensor_image, "y": torch.tensor(self.y[idx], dtype=torch.float32)}
# Dataset over ./data/MouseImages/ using the transform and merged dataframe defined in earlier cells;
# the constructor prints the number of matched images and targets.
image_dataset = CustomDataSet(r'./data/MouseImages/', transform=transform, effect_df = df)

5313
5313


In [27]:
class CustomModel(nn.Module):
    '''
    ResNet backbone loaded through torch.hub whose final fully connected layer is
    replaced by a single-output linear layer. The forward pass applies tanh to that
    output, so predictions are confined to (-1, 1), the range the targets are scaled to.

    base_model -> name of the torch.hub entry point to load (e.g. "resnet18")
    pretrained -> forwarded to torch.hub.load
    frozen     -> when True, every backbone parameter loaded from the hub is frozen;
                  the replacement output layer is created afterwards and stays trainable
    '''
    def __init__(self, base_model="resnet18", pretrained=False, frozen=False):
        super().__init__()
        self.model = torch.hub.load('pytorch/vision:v0.10.0', base_model, pretrained=pretrained)
        head_in_features = self.model.fc.in_features

        if frozen:
            for backbone_param in self.model.parameters():
                backbone_param.requires_grad = False

        # dropout and relu are registered but not used by forward().
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        # Swap the classification head for a one-unit regression head.
        self.model.fc = nn.Linear(head_in_features, 1)

    def forward(self, x):
        '''Return tanh(backbone(x)).'''
        backbone_output = self.model(x)
        return self.tanh(backbone_output)
 

In [28]:
class epochMetrics():
    '''
    Running L1 / L2 error for one epoch, logged to wandb under a name prefix.

    Sums are accumulated per sample, so ``l1_avg`` / ``l2_avg`` are the mean absolute and
    mean squared error over every sample seen so far, regardless of how the samples were
    batched (a shorter final batch is not over-weighted).

    Attributes:
    name   -> prefix used in the logged metric names
    count  -> number of samples accumulated so far
    l1_sum / l1_avg -> summed / mean absolute error
    l2_sum / l2_avg -> summed / mean squared error
    avg, r2_sum, r2_avg -> initialised to 0 and not updated by this class
    '''
    def __init__(self, name):
        self.name = name
        self.avg = 0

        self.l1_sum = 0
        self.l1_avg = 0

        self.l2_sum = 0
        self.l2_avg = 0

        self.r2_sum = 0
        self.r2_avg = 0
        self.count = 0

    def update(self, outputs, y):
        '''
        Add one batch of predictions and targets to the running statistics.

        outputs -> tensor of predictions
        y       -> tensor of targets with the same shape as ``outputs``
        Empty batches are ignored.
        '''
        with torch.no_grad():
            n_samples = y.numel()
            if n_samples == 0:
                return

            # The functional losses run on whatever device the tensors already live on,
            # so no device has to be hard-coded here.
            self.count += n_samples
            self.l1_sum += nn.functional.l1_loss(outputs, y, reduction="sum").item()
            self.l2_sum += nn.functional.mse_loss(outputs, y, reduction="sum").item()

            self.l1_avg = self.l1_sum/self.count
            self.l2_avg = self.l2_sum/self.count

    def log(self):
        '''Send the current epoch averages (L1, L2 and RMSE = sqrt(L2)) to wandb.'''
        wandb.log({f"{self.name} Epoch L1 Loss": self.l1_avg})
        wandb.log({f"{self.name} Epoch L2 Loss": self.l2_avg})
        wandb.log({f"{self.name} Epoch RMSE Loss": np.sqrt(self.l2_avg)})
        #wandb.log({f"{self.name} R2": self.r2_avg })



    

    
    


In [29]:
# Module-level counters; no code in the visible cells of this notebook reads or updates them.
train_count = 0
val_count = 0
def train(train_loader, model, criterion, optimizer, config):
    '''
    Run one optimisation pass over ``train_loader`` and log the epoch metrics.

    train_loader -> iterable of batches shaped like {"x": inputs, "y": targets}
    model        -> network to optimise; put into train mode here
    criterion    -> loss called as criterion(predictions, targets)
    optimizer    -> optimizer stepped once per batch
    config       -> mapping providing the "device" the batch tensors are moved to
    '''
    model.train()
    epoch_metrics = epochMetrics("Train")

    with tqdm(train_loader) as progress:
        for batch in progress:
            optimizer.zero_grad()

            inputs = batch['x'].to(config["device"])
            targets = batch['y'].to(config["device"])

            # The model emits one value per sample in a trailing dimension of size 1;
            # drop it so predictions line up with the 1-D targets.
            predicted = model(inputs).squeeze(1)

            loss = criterion(predicted, targets)
            loss.backward()
            optimizer.step()

            epoch_metrics.update(predicted, targets)

    epoch_metrics.log()
    
    
            
def val(validate_loader, model, criterion, config):
    '''
    Evaluate ``model`` on ``validate_loader`` without updating its weights and log the metrics.

    validate_loader -> iterable of batches shaped like {"x": inputs, "y": targets}
    model           -> network to evaluate; put into eval mode here
    criterion       -> accepted for signature parity with ``train``; not used
    config          -> mapping providing the "device" the batch tensors are moved to.
                       It is read with item access, like ``train`` does, so that a plain
                       dict works as well as a wandb config object.
    '''
    model.eval()
    epoch_metrics = epochMetrics("Val")

    with torch.no_grad():
        with tqdm(validate_loader) as _tqdm:
            for batch in _tqdm:
                x = batch['x'].to(config["device"])
                y = batch['y'].to(config["device"])
                predicted = model(x).squeeze(1)

                # The standard deviation of a single sample is undefined (NaN), so the
                # per-batch spread is only logged when the batch has at least two samples.
                if y.numel() > 1:
                    wandb.log({"Val Actual STD":torch.std(y)})
                    wandb.log({"Val Predicted STD":torch.std(predicted)})

                epoch_metrics.update(predicted, y)

    epoch_metrics.log()

In [30]:
# These are our wandb configs.
# Use config for a single run.
# Use sweep_config for multiple runs in a sweep.

# Use the GPU when PyTorch can see one and fall back to the CPU otherwise, so the
# notebook still runs on machines where torch.cuda.is_available() is False.
_default_device = "cuda" if torch.cuda.is_available() else "cpu"

config = {
    "device": _default_device,
    "learning_rate": .001252,
    "pretrained":False,
    "frozen":False,
    "min_lr":.0003761,       # multiplied by learning_rate to obtain the scheduler's eta_min
    "warmup_steps":93,       # used as T_max of the cosine annealing scheduler
    "loss":"l1",
    # "epochs" used to appear twice in this dict (12, then 55); the later value is the
    # one Python kept, so only that one is listed.
    "epochs":55,
    "batch_size":9,
    "base_model":"resnet34",
}


sweep_config = {
    'method': 'bayes',
    "parameters": {
        "device":{
            "value": _default_device
            },
        "learning_rate":{
            "max":1e-2,
            "min":1e-6
            },
        "pretrained":{
            "value": True
            },
        "frozen":{
            "value":False
            },
        "min_lr":{
            "min":0.0,
            "max":1e-3
            },
        "warmup_steps":{
            "min":0,
            "max":100
        },
        "loss":{
            "value":"l1"
            }, #Smooth L1 with beta
        "epochs":{
            "value":50,
        },
        "batch_size":{
            "min":2,
            "max":16,
            },
        "base_model":{
            "values":["resnet18","resnet34","resnet50"]
            },
    },
}

# The sweep minimises the validation L1 loss; the name must match the key logged for
# the "Val" epoch metrics.
metric = {
    'name': 'Val Epoch L1 Loss',
    'goal': 'minimize'
}


sweep_config['metric'] = metric

#wandb.init(config = config, project="Protein-Weight", entity="pcoady")
#sweep_id = wandb.sweep(sweep_config, project="Protein-Weight", entity="pcoady")
#sweep_id
#pprint.pprint(sweep_config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: fmzpw0ii
Sweep URL: https://wandb.ai/pcoady/Protein-Weightt/sweeps/fmzpw0ii


In [32]:
class _ScaledTargetView(Dataset):
    '''Presents a split whose samples carry a pre-scaled "y"; the wrapped dataset is left untouched.'''
    def __init__(self, subset, scaled_targets):
        self.subset = subset
        self.scaled_targets = scaled_targets

    def __len__(self):
        return len(self.subset)

    def __getitem__(self, idx):
        # Copy the sample dict and replace only its target with the scaled float32 value.
        sample = dict(self.subset[idx])
        sample["y"] = torch.tensor(self.scaled_targets[idx], dtype=torch.float32)
        return sample


def split_data(image_dataset, config = None):
    '''
    Randomly split the dataset 80/10/10 into train/val/test, scale the targets to
    [-1, 1] with a scaler fitted on the training targets only, and compute constant-mean
    baselines on the scaled validation targets.

    image_dataset -> dataset exposing its raw targets as ``image_dataset.y`` and returning
                     dict samples with a "y" entry
    config        -> mapping providing "batch_size" (required)

    Returns (train_loader, val_loader, baselines) where ``baselines`` has the keys
    "l1_baseline", "l2_baseline", "RMSE_baseline" and "r2_balseline".

    The scaled targets are served through a wrapper instead of being written back into
    ``image_dataset.y``. Writing them back made repeated calls (one per sweep run) rescale
    values that had already been scaled by an earlier call with a different random split.
    The test split is drawn but neither scaled nor returned.

    Raises ValueError when ``config`` is None.
    '''
    if config is None:
        raise ValueError('split_data needs a config providing "batch_size"')

    train_data,val_data,test_data = torch.utils.data.random_split(image_dataset, [.8, .1, .1 ])

    # Snapshot of the raw targets; everything below is derived from this copy.
    raw_targets = np.asarray(image_dataset.y, dtype=np.float64)
    train_raw = raw_targets[train_data.indices].reshape(-1, 1)
    val_raw = raw_targets[val_data.indices].reshape(-1, 1)

    # Fit on the training targets only so the validation targets do not influence the scaling.
    scaler = MinMaxScaler((-1,1))
    scaler.fit(train_raw)
    train_scaled = scaler.transform(train_raw).ravel().tolist()
    val_scaled = scaler.transform(val_raw).ravel().tolist()

    # Baseline: predict the validation mean for every validation sample.
    val_y = torch.tensor(val_scaled)
    val_means = torch.full_like(val_y, torch.mean(val_y).item())

    l1 = nn.L1Loss()
    l2 = nn.MSELoss()
    l2_baseline = l2(val_means, val_y)

    baselines = {"l1_baseline":l1(val_means, val_y),
                 "l2_baseline": l2_baseline,
                 "RMSE_baseline": torch.sqrt(l2_baseline),
                 "r2_balseline": r2_score(val_y.numpy(), val_means.numpy())
                 }

    train_loader = DataLoader(_ScaledTargetView(train_data, train_scaled), batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(_ScaledTargetView(val_data, val_scaled), batch_size=config["batch_size"], shuffle=True)
    #test_loader = DataLoader(test_data , batch_size=3, shuffle=True)

    return train_loader, val_loader, baselines


In [33]:

def train_loop(config = None):
    '''
    Run one complete wandb run: a single run when called directly with a config, or one
    trial when invoked by a sweep agent.

    config -> initial configuration handed to wandb.init; the values actually used are
              read back from wandb.config once the run has started
    '''
    with wandb.init(project="Protein-Weight", entity="pcoady", config = config):
        # During a sweep wandb supplies the hyperparameters, so read them from the run.
        config = wandb.config
        train_loader, val_loader, baselines = split_data(image_dataset, config)

        model = CustomModel(
            base_model=config.base_model,
            pretrained=config.pretrained,
            frozen=config.frozen,
        ).to(config.device)

        criterion = nn.L1Loss().to(config.device)
        # Let wandb track the model automatically.
        wandb.watch(model, criterion=criterion)

        optimizer = AdamW(model.parameters(), lr=config.learning_rate)
        # Cosine annealing stepped once per epoch: T_max comes from config.warmup_steps and
        # the floor is learning_rate * min_lr.
        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=config.warmup_steps,
            eta_min=config.learning_rate*config.min_lr,
        )

        # (chart name, key in ``baselines``), logged after every epoch to make graphing easier.
        baseline_charts = (
            ("L1 Baseline", "l1_baseline"),
            ("L2 Baseline", "l2_baseline"),
            ("RMSE Baseline", "RMSE_baseline"),
            ("R2 Baseline", "r2_balseline"),
        )

        for epoch in range(config.epochs):
            train(train_loader, model, criterion, optimizer, config)
            val(val_loader, model, criterion, config)
            scheduler.step()
            wandb.log({"Epoch Step":epoch})
            for chart_name, baseline_key in baseline_charts:
                wandb.log({chart_name: baselines[baseline_key]})



In [34]:
def run_k_fold(init_config = None):
    '''
    Train and validate a fresh model on each of 10 shuffled folds (random_state=42),
    with one wandb run per fold.

    init_config -> configuration handed to wandb.init for every fold; the values used are
                   read back from wandb.config

    Side effect: ``image_dataset.y`` is replaced by the targets min-max scaled to [-1, 1]
    (the scaler is fitted on all targets). The baselines are those of a constant-mean
    predictor over the whole dataset.
    '''
    # Scaling and baselines do not depend on the fold, so compute them once up front.
    # .tolist() stores plain Python floats: np.float64 elements would turn into float64
    # target tensors while the model produces float32 outputs.
    scaler = MinMaxScaler((-1,1))
    scaled_targets = scaler.fit_transform(np.array(image_dataset.y).reshape(-1, 1)).ravel()
    image_dataset.y = scaled_targets.tolist()

    l1 = nn.L1Loss()
    l2 = nn.MSELoss()
    real_y = torch.tensor(image_dataset.y)
    y_mean = float(np.mean(image_dataset.y))
    dummy_regressor = torch.full_like(real_y, y_mean)
    l2_baseline = l2(dummy_regressor, real_y)
    baselines = {"l1_baseline":l1(dummy_regressor, real_y),
                 "l2_baseline": l2_baseline,
                 "RMSE_baseline": torch.sqrt(l2_baseline),
                 "r2_balseline": r2_score(real_y.numpy(), dummy_regressor.numpy())
                 }

    splits=KFold(n_splits=10,shuffle=True,random_state=42)
    for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(image_dataset)))):
        print('Fold {}'.format(fold + 1))
        # Same pattern as train_loop: the context manager closes the run even when a fold
        # raises, so a failure does not leave a wandb run open.
        with wandb.init(project="Protein-Weight", entity="pcoady", config = init_config):
            config = wandb.config

            train_sampler = SubsetRandomSampler(train_idx)
            test_sampler = SubsetRandomSampler(val_idx)
            train_loader = DataLoader(image_dataset, batch_size=config["batch_size"], sampler=train_sampler)
            val_loader = DataLoader(image_dataset, batch_size=config["batch_size"], sampler=test_sampler)

            model = CustomModel(base_model=config.base_model, pretrained=config.pretrained, frozen=config.frozen).to(config.device)
            criterion = nn.L1Loss().to(config.device)
            wandb.watch(model, criterion=criterion)
            optimizer = AdamW(model.parameters(), lr=config.learning_rate)
            # Cosine annealing stepped once per epoch; floor is learning_rate * min_lr.
            scheduler = CosineAnnealingLR(optimizer, T_max=config.warmup_steps, eta_min=config.learning_rate*config.min_lr)
            for epoch in range(config.epochs):
                train(train_loader, model, criterion, optimizer, config)
                val(val_loader, model, criterion, config)
                scheduler.step()
                wandb.log({"Epoch Step":epoch})
                wandb.log({"L1 Baseline": baselines["l1_baseline"]})
                wandb.log({"L2 Baseline": baselines["l2_baseline"]})
                wandb.log({"RMSE Baseline": baselines["RMSE_baseline"]})
                wandb.log({"R2 Baseline": baselines["r2_balseline"]})

In [35]:

#run_k_fold(init_config = config) #For k-fold run
#wandb.agent(sweep_id, train_loop, count=5) # For sweep Run
#train_loop(config) # for single run

[34m[1mwandb[0m: Agent Starting Run: kdo16bgf with config:
[34m[1mwandb[0m: 	base_model: resnet18
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	device: cpu
[34m[1mwandb[0m: 	epochs: 75
[34m[1mwandb[0m: 	frozen: False
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: l1
[34m[1mwandb[0m: 	min_lr: 0.1
[34m[1mwandb[0m: 	pretrained: True
[34m[1mwandb[0m: 	warmup_steps: 10
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cache found in C:\Users\zeusg/.cache\torch\hub\pytorch_vision_v0.10.0


  0%|          | 0/266 [00:00<?, ?it/s]

VBox(children=(Label(value='7.290 MB of 10.866 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.67095…

0,1
Epoch Step,▁
Train Actual STD,▁
Train Predicted STD,▁

0,1
Epoch Step,0.0
Train Actual STD,0.52815
Train Predicted STD,0.27113


Run kdo16bgf errored: IndexError('Dimension out of range (expected to be in range of [-1, 0], but got 1)')
[34m[1mwandb[0m: [32m[41mERROR[0m Run kdo16bgf errored: IndexError('Dimension out of range (expected to be in range of [-1, 0], but got 1)')
[34m[1mwandb[0m: Agent Starting Run: 2np7amjf with config:
[34m[1mwandb[0m: 	base_model: resnet34
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	device: cpu
[34m[1mwandb[0m: 	epochs: 75
[34m[1mwandb[0m: 	frozen: False
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: l1
[34m[1mwandb[0m: 	min_lr: 0.1
[34m[1mwandb[0m: 	pretrained: True
[34m[1mwandb[0m: 	warmup_steps: 10
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cache found in C:\Users\zeusg/.cache\torch\hub\pytorch_vision_v0.10.0


  0%|          | 0/266 [00:00<?, ?it/s]

VBox(children=(Label(value='9.743 MB of 10.862 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.89705…

0,1
Epoch Step,▁
Train Actual STD,▁
Train Predicted STD,▁

0,1
Epoch Step,0.0
Train Actual STD,0.43764
Train Predicted STD,0.43541


Run 2np7amjf errored: IndexError('Dimension out of range (expected to be in range of [-1, 0], but got 1)')
[34m[1mwandb[0m: [32m[41mERROR[0m Run 2np7amjf errored: IndexError('Dimension out of range (expected to be in range of [-1, 0], but got 1)')
[34m[1mwandb[0m: Agent Starting Run: mfms6xgk with config:
[34m[1mwandb[0m: 	base_model: resnet110
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	device: cpu
[34m[1mwandb[0m: 	epochs: 75
[34m[1mwandb[0m: 	frozen: False
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: l1
[34m[1mwandb[0m: 	min_lr: 0.1
[34m[1mwandb[0m: 	pretrained: True
[34m[1mwandb[0m: 	warmup_steps: 10
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cache found in C:\Users\zeusg/.cache\torch\hub\pytorch_vision_v0.10.0


Run mfms6xgk errored: RuntimeError('Cannot find callable resnet110 in hubconf')
[34m[1mwandb[0m: [32m[41mERROR[0m Run mfms6xgk errored: RuntimeError('Cannot find callable resnet110 in hubconf')
Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true
