In [None]:
#!pip install biopython
#!pip install jproperties
#!pip install transformers
#!pip install datasets
#!pip install evaluate
#!pip install wandb
#!pip install --upgrade pandas

In [1]:
import pandas as pd
import numpy as np
import pickle
import Bio   
import plotly.express as px
import plotly.graph_objects as go
from jproperties import Properties
#import wandb
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModel, TrainingArguments, Trainer, AutoConfig, DataCollatorWithPadding
from transformers import AdamW,get_scheduler
from transformers.modeling_outputs import SequenceClassifierOutput
from motif_utils import seq2kmer # Sourced from https://github.com/jerryji1993/DNABERT
import torch
from datasets import Dataset
import evaluate
import json
from load_data import create_dataset, explode_dna
import wandb
from tqdm.auto import tqdm
from transformers.onnx import FeaturesManager
import transformers
from pathlib import Path
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score
import pprint
#from google.colab import drive
#!pip install cloud-tpu-client==0.10 torch==1.13.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.13-cp38-cp38-linux_x86_64.whl

OSError: [WinError 127] The specified procedure could not be found. Error loading "c:\Users\zeusg\Envs\real-fast\lib\site-packages\torch\lib\caffe2_detectron_ops_gpu.dll" or one of its dependencies.

In [None]:
'''
1. Normalize dataset
2. Try with smaller networks?
3. Try with very few training examples to try and get overfitting
4. Expand model size
5. Hyper Param Search https://huggingface.co/blog/ray-tune

'''

In [None]:
#train, val, test = create_dataset()

In [None]:
colab=False
if colab:
    drive.mount('/content/gdrive/', force_remount=True)
    %cd gdrive/MyDrive/milestone_data/
    with open("capstone_body_weight_Statistical_effect_size_analysis_genotype_early_adult_scaled_13022023_gene_symbol_harmonized.pkl", 'rb') as file:
        effect =  pickle.load(file)
        
        
    with open("gene_symbol_dna_sequence_exon.pkl", 'rb') as file:
            exon =  pickle.load(file)
def seq2kmer(seq, k):
    """
    Split a sequence into overlapping k-mers (stride 1) joined by single spaces.

    Arguments:
    seq -- str, original sequence.
    k -- int, length of every k-mer.

    Returns:
    kmers -- str, the k-mers in order of position, separated by spaces
             (empty string when the sequence is shorter than k)
    """
    # Number of window start positions; zero or negative yields no k-mers.
    window_count = len(seq) + 1 - k
    return " ".join(seq[start:start + k] for start in range(window_count))

In [None]:
# Load the three pickled inputs from ./data.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.

# Gene-level records (a `Sequence` column is read from this later)
with open("./data/gene_symbol_dna_sequence.pkl", "rb") as file:
    gene = pickle.load(file)

# Effect-size records
with open("./data/capstone_body_weight_Statistical_effect_size_analysis_genotype_early_adult_scaled_13022023_gene_symbol_harmonized.pkl", "rb") as file:
    effect = pickle.load(file)

# Exon-level records (a `Sequence` column is read from this later)
with open("./data/gene_symbol_dna_sequence_exon.pkl", "rb") as file:
    exon = pickle.load(file)
        

In [None]:

# Sequence-length histograms, one trace per source: exon first, then gene.
fig = go.Figure(data=[
    go.Histogram(x = exon.Sequence.str.len()),
    go.Histogram(x = gene.Sequence.str.len()),
])
fig.show()

In [None]:
def get_longest_sequence(seq_list):
    """Return the longest item of seq_list; on a tie the earliest one wins."""
    longest = max(seq_list, key=len)
    return longest

def filter_data(effect, exon, min_seq_len = 0, max_seq_len = 512, longest_only = False):
    '''
    Join effect sizes to the exon sequences whose length falls in a given range.

    Arguments:
    effect -- DataFrame with 'gene_symbol', 'est_f_ea' and 'p_f_ea' columns
    exon -- DataFrame with 'Gene name' and 'Sequence' columns
    min_seq_len -- exclusive lower bound on sequence length
    max_seq_len -- inclusive upper bound on sequence length
    longest_only -- when True, keep only the longest in-range sequence per gene

    Returns:
    DataFrame with columns "Gene name", "est_f_ea", "Sequence" (inner join on gene)
    '''
    effect_cols = effect[['gene_symbol','est_f_ea','p_f_ea']].copy()

    # Keep sequences with min_seq_len < length <= max_seq_len
    seq_len = exon.Sequence.str.len()
    in_range = (seq_len > min_seq_len) & (seq_len <= max_seq_len)
    candidates = exon[in_range].copy()

    if longest_only:
        # Collect each gene's sequences into a list, then pick the longest one
        per_gene = candidates.groupby(by=["Gene name"])["Sequence"].apply(list)
        longest = per_gene.apply(get_longest_sequence)
        candidates = pd.DataFrame({"Gene name": longest.index, 'Sequence': longest.values})

    merged = pd.merge(effect_cols, candidates, left_on="gene_symbol", right_on="Gene name")
    return merged[["Gene name", "est_f_ea", "Sequence"]]
# One sequence per gene: the longest exon sequence with 75 < length <= 512
df = filter_data(effect, exon, min_seq_len=75, max_seq_len=512, longest_only=True)
    
    

In [None]:
# Display the filtered frame produced by filter_data above
df

In [None]:
def data_prep(df, size = None, seed = None):
    '''
    Preps the data for the ML pipeline
    1. Randomizes the order of the data (optionally keeping only `size` rows)
    2. Renames the columns to what's required by Hugging Face
    3. Reduces the columns to just what's needed and converts sequences to 6-mers
    4. Splits into Train/val/test sets (80/10/10)
    5. Scales the labels with a MinMaxScaler fit on train only, so train labels
       span [-1, 1] (the range of Tanh); val/test labels may fall slightly outside

    Arguments:
    df -- DataFrame with "Sequence" and "est_f_ea" columns
    size -- int or None, number of randomly chosen rows to keep (None or 0 keeps all)
    seed -- int or None, random_state for the shuffle; pass an int for a reproducible split

    Returns:
    (train, val, test) -- DataFrames with "dna_seq" and "label" columns
    '''
    # One shuffle is enough: the first `size` rows of a shuffled frame are a
    # uniform random subsample, and the result is already in random order.
    shuffled = df.sample(frac=1, random_state=seed)
    if size:
        shuffled = shuffled[:size]

    # .copy() gives an independent frame, so the column assignment below never
    # writes into a slice of the caller's df (avoids chained-assignment warnings).
    prepared = shuffled.rename({"Sequence":"dna_seq", "est_f_ea":"label"}, axis=1)[["dna_seq", "label"]].copy()
    prepared["dna_seq"] = prepared["dna_seq"].astype(str).apply(lambda x: seq2kmer(x, 6))

    df_len = len(prepared)
    train_end = int(np.round(df_len*.8))
    val_end = int(np.round(df_len*.9))
    train = prepared[:train_end].copy()
    val = prepared[train_end:val_end].copy()
    test = prepared[val_end:].copy()

    # Fit on train only so no information from val/test leaks into the scaling
    scaler = MinMaxScaler((-1,1))
    scaler.fit(train["label"].values.reshape(-1, 1))
    for split in (train, val, test):
        # transform returns an (n, 1) array; ravel back to one value per row
        split["label"] = scaler.transform(split["label"].values.reshape(-1, 1)).ravel()

    return train, val, test
    
# Work on a random 100-row subset
train, val, test = data_prep(df, size=100)

In [None]:
'''Confirm the distributions of the 3 sets is similar'''
def show_dist():
    """Plot overlaid, probability-normalised label histograms for val, test and train."""
    fig = go.Figure()
    # Trace order matches the legend order: val, test, train
    for split_name, split_frame in (("val", val), ("test", test), ("train", train)):
        fig.add_trace(go.Histogram(x = split_frame["label"], histnorm='probability', name = split_name))

    fig.update_layout(barmode="overlay", bargap=0.1)
    fig.show()
show_dist()

In [None]:
# Mean label of each split; val_mean is reused later as a constant-prediction baseline
train_mean = train.label.mean()
val_mean = val.label.mean()
test_mean = test.label.mean()

# Report each split under its own label (the Train slot previously printed test_mean)
print(f"Train: {train_mean}, Val: {val_mean}, Test: {test_mean}")

In [None]:
'''Tokenize the data sets, convert them to torch format, and build the padding collator used by the dataloaders'''


tokenizer = AutoTokenizer.from_pretrained('zhihan1996/DNA_bert_6')
def tokenize_function(df):
    """Tokenize the "dna_seq" field of a batch, padding and truncating to at most 512 tokens."""
    kmer_text = df["dna_seq"]
    return tokenizer(kmer_text, max_length=512, truncation=True, padding=True)


# Convert each pandas split to a Dataset and tokenize it (train, then val, then test).
# NOTE: this rebinds train/val/test from DataFrames to Datasets.
train, val, test = (
    Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    for frame in (train, val, test)
)

# Expose only the listed columns, as torch tensors
for tokenized in (train, val, test):
    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Collator passed as collate_fn to the DataLoaders in train_model
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




In [None]:
class CustomModel(nn.Module):
    """
    Frozen pretrained encoder followed by a small trainable MLP regression head.

    Every parameter of the encoder loaded from `checkpoint` has requires_grad
    set to False, so only the head (fc1 -> fc2 -> fc3 -> output) is trained.
    The head reads the first-token embedding of the encoder's last hidden state
    and ends in Tanh, so predictions lie in (-1, 1).
    """
    def __init__(self, checkpoint):
        """
        Arguments:
        checkpoint -- str, model name or path passed to AutoConfig/AutoModel.from_pretrained
        """
        super(CustomModel, self).__init__()
        encoder_config = AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(checkpoint, config=encoder_config)
        # Freeze the encoder; only the head below receives gradients
        for param in self.model.parameters():
            param.requires_grad = False

        # Size the head from the encoder config instead of assuming 768
        hidden_size = getattr(self.model.config, "hidden_size", 768)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size, 384)
        self.fc2 = nn.Linear(384, 192)
        self.fc3 = nn.Linear(192,8)
        self.output = nn.Linear(8,1)
        self.loss_fct = nn.MSELoss()

    def forward(self, input_ids=None, attention_mask=None,labels=None):
        """
        Run the encoder and the regression head.

        Arguments:
        input_ids, attention_mask -- forwarded unchanged to the encoder
        labels -- optional targets, one per sequence; when given, the MSE loss is computed

        Returns:
        SequenceClassifierOutput with loss (None when labels is None), logits of
        shape (batch, 1), and the encoder's hidden_states and attentions
        """
        original_output = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Only the first token feeds the head, so slice it out before fc1
        # rather than projecting every token and discarding all but one.
        first_token = original_output[0][:, 0, :]
        outputs = self.relu(self.dropout(self.fc1(first_token)))
        outputs = self.relu(self.dropout(self.fc2(outputs)))
        outputs = self.relu(self.dropout(self.fc3(outputs)))
        logits  = self.tanh(self.output(outputs))
        predictions = logits.squeeze(1)

        loss = None
        if labels is not None:
            # Match the prediction dtype so MSELoss never sees mixed float types
            loss = self.loss_fct(predictions, labels.to(predictions.dtype))

        # Log only inside an active wandb run, and never log missing labels;
        # detach so the logged tensor does not hold on to the autograd graph.
        if wandb.run is not None:
            if labels is not None:
                wandb.log({'True Values': labels.detach()})
            wandb.log({'Predicted Values': predictions.detach()})

        return SequenceClassifierOutput(loss = loss, logits=logits, hidden_states=original_output.hidden_states, attentions=original_output.attentions)
        
        
        
        
        

In [None]:
# Minimal grid sweep over the learning rate only.
# NOTE: a later cell assigns sweep_config again, replacing this value.
sweep_config = dict(
    method='grid',
    parameters=dict(
        learning_rate=dict(values=[1e-3, 1e-4, 1e-5, 1e-6, 1e-7]),
    ),
)



In [None]:
'''
Training Hyper Parameters
'''
num_epochs = 25
learning_rate = 5e-5
custom_head = True

# Grid sweep definition. In a wandb sweep, 'values' lists the options to search
# over, while 'value' pins a single constant for every run.
sweep_config = {
   'method': 'grid',
   'parameters': {
        'learning_rate':{
                'values':[1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
        },
        "warmup_steps":{
                'value':0},
        "epochs": {
                'value':num_epochs},
        # 'values' (not 'value'): with 'value' every run would receive the whole
        # list as its batch size. The grid is now 5 learning rates x 5 batch
        # sizes = 25 combinations; the agent's `count` limits how many are run.
        "batch_size": {
                'values':[4,6,8,10,20]},
        "custom_head": {
                'value':custom_head},
        "train_size": {
                'value':len(train)},
        "device":{
                'value':"cuda"},
   }
}
# Metric the sweep optimises; the name must match a key logged by train_model
metric = {
    'name': 'Validation Epoch Loss',
    'goal': 'minimize'
    }


sweep_config['metric'] = metric

sweep_id = wandb.sweep(sweep_config, project="DNA-Weight", entity="pcoady")













In [None]:
# Pretty-print the sweep definition to check it before launching the agent
pprint.pprint(sweep_config)

In [None]:
def train_model(config = None):
    '''
    Train and validate CustomModel for one wandb run (one sweep configuration).

    Reads batch_size, learning_rate, warmup_steps and device from wandb.config,
    trains for `num_epochs` epochs on the module-level `train` dataset and
    evaluates on `val` after every epoch. Per-batch losses, per-epoch mean
    losses and R2 scores, and a constant-prediction baseline (MSE of always
    predicting the train label mean) are logged to wandb.

    Arguments:
    config -- optional dict of defaults passed to wandb.init; a sweep agent
              supplies the actual values through wandb.config
    '''
    with wandb.init(project="DNA-Weight", entity="pcoady",tags = ["custom_head"], config = config):

        config = wandb.config
        device = config.device

        train_dataloader = DataLoader(train, shuffle=True, batch_size=config.batch_size, collate_fn=data_collator)
        # Evaluation order does not change the metrics, so validation is not shuffled
        val_dataloader = DataLoader(val, shuffle=False, batch_size=config.batch_size, collate_fn=data_collator)

        model=CustomModel(checkpoint='zhihan1996/DNA_bert_6').to(device)
        optimizer = AdamW(model.parameters(), lr=config.learning_rate)
        loss_fct = nn.MSELoss()

        # The scheduler is stepped once per batch, so its horizon is the number
        # of batches (len(dataloader)), not the number of samples.
        num_training_steps = num_epochs * len(train_dataloader)
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=config.warmup_steps,
            num_training_steps=num_training_steps,
        )

        progress_bar_train = tqdm(range(num_training_steps))
        progress_bar_eval = tqdm(range(num_epochs * len(val_dataloader)))

        # Baseline: MSE obtained by always predicting the train label mean.
        # The label column is looked up by name rather than by position.
        train_labels = torch.tensor(np.array(train.data.column("label")))
        val_labels = torch.tensor(np.array(val.data.column("label")))
        train_label_mean = train_labels.mean().item()
        train_baseline = loss_fct(torch.full_like(train_labels, train_label_mean), train_labels).item()
        val_baseline = loss_fct(torch.full_like(val_labels, train_label_mean), val_labels).item()

        val_step = 0
        wandb.watch(model, log_freq=100)

        for epoch in range(num_epochs):
            running_train_loss = 0.0
            running_val_loss = 0.0

            model.train()
            train_pred, train_real = [], []
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                batch_loss = loss.item()
                wandb.log({'Train Baseline': train_baseline, 'Train Batch Loss': batch_loss})
                running_train_loss += batch_loss
                progress_bar_train.update(1)

                # Collected for the epoch-level R2 (predictions are made in train mode)
                train_pred.append(outputs.logits.detach().cpu().numpy().flatten())
                train_real.append(batch["labels"].cpu().numpy())

            model.eval()
            val_pred, val_real = [], []
            for batch in val_dataloader:
                val_step += 1
                progress_bar_eval.update(1)
                batch = {k: v.to(device) for k, v in batch.items()}
                with torch.no_grad():
                    outputs = model(**batch)

                batch_loss = outputs.loss.item()
                wandb.log({'Validation Baseline': val_baseline, 'Validation Batch Loss': batch_loss, "Val Step": val_step})
                running_val_loss += batch_loss

                val_pred.append(outputs.logits.detach().cpu().numpy().flatten())
                val_real.append(batch["labels"].cpu().numpy())

            # running_*_loss sums per-batch mean losses, so average over batches.
            # r2_score expects (y_true, y_pred) in that order.
            wandb.log({
                "Train Epoch Loss": running_train_loss / max(len(train_dataloader), 1),
                "Validation Epoch Loss": running_val_loss / max(len(val_dataloader), 1),
                "Train R2": r2_score(np.concatenate(train_real), np.concatenate(train_pred)),
                "Val R2": r2_score(np.concatenate(val_real), np.concatenate(val_pred)),
                "Epoch Count": epoch,
            })

# Launch the sweep agent only when the custom-head model is selected;
# `count` caps the number of runs this agent executes.
if custom_head:
    wandb.agent(sweep_id, function=train_model, count=5)
        
        
    

In [None]:
mse_metric = evaluate.load("mse")
def compute_metrics(eval_pred):
    """
    Metric callback passed to Trainer: logs a baseline and raw values to wandb and returns the MSE.

    Arguments:
    eval_pred -- pair (logits, labels)

    Returns:
    the result of mse_metric.compute for the flattened logits against labels
    """
    logits, labels = eval_pred
    flat_predictions = logits.reshape(-1)

    # Baseline: predict the module-level val_mean for every example
    constant_guess = np.array([val_mean] * len(labels))
    baseline_mse = mse_metric.compute(predictions=constant_guess, references=labels)

    wandb.log({'Base Line': baseline_mse})
    wandb.log({'True Values': labels })
    wandb.log({'Predicted Values': flat_predictions})

    return mse_metric.compute(predictions=flat_predictions, references=labels)

In [None]:
# Alternative to the custom head: the stock sequence-classification model with
# a single output, trained through the Hugging Face Trainer.
if not custom_head:

    model = AutoModelForSequenceClassification.from_pretrained(
        'zhihan1996/DNA_bert_6',
        num_labels=1,
        ignore_mismatched_sizes=True,
    )
    model = model.to("cuda")

    training_args = TrainingArguments(
        output_dir='weight_model',
        evaluation_strategy='epoch',
        per_device_train_batch_size = 5,
        num_train_epochs=1000,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=val,
        compute_metrics=compute_metrics,
    )
    trainer.train()

In [None]:
# Compare the distribution of Trainer predictions on the train set with the true labels.
# (The `if` header was missing its colon, which made the whole cell a SyntaxError.)
if not custom_head:
    results = trainer.predict(train)
    fig = go.Figure()
    # results[1] holds the reference labels, results[0] the model predictions
    fig.add_trace(go.Histogram(x=results[1], histnorm='probability', name = "actual"))
    fig.add_trace(go.Histogram(x=results[0].reshape(-1, 1)[:,0], histnorm='probability', name = "predictions"))

    fig.update_layout(
        barmode="overlay",
        bargap=0.1)

    fig.show()

In [None]:
def visualize_model():
        """
        Export the module-level `model` to ONNX at "pretrained-model.onnx".

        Uses the module-level `model` and `tokenizer`. NOTE(review): `model` is
        only assigned in the `custom_head == False` branch of this notebook, so
        this presumably must run after that branch - confirm.
        """
        feature = "sequence-classification"
        # Raises if `model` does not support the requested feature; model_kind is not used afterwards
        model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model, feature=feature)
        onnx_config = model_onnx_config(model.config)
        # The returned input/output names are captured but not used in the lines visible here
        onnx_inputs, onnx_outputs = transformers.onnx.export(
                preprocessor=tokenizer,
                model=model,
                config=onnx_config,
                opset=13,
                output=Path("pretrained-model.onnx"))