<a href="https://colab.research.google.com/github/conniaren/GenotypeImputationProject/blob/master/2-Models/autoencoder_model_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qqq wandb pytorch-lightning requests

In [None]:
import scipy
from scipy import sparse 
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision import transforms
from torchvision.transforms import Compose
from torch import nn, optim 
import wandb
import pytorch_lightning as pl
import torch.nn.functional as F
import io
import requests
from sklearn.model_selection import KFold

In [None]:
# Download the serialized sparse genotype matrix and keep the raw bytes in an
# in-memory file object; later cells read it with sparse.load_npz(buf).
r = requests.get(
    "https://drive.google.com/uc?export=download&id=1UraCevZUlKeCHOtYd5PeMX2-hwNVb93O",
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail here with a clear HTTP error instead of later, when load_npz would try
# to parse an error page as an .npz archive.
r.raise_for_status()
buf = io.BytesIO(r.content)
#wandb.login()

In [None]:
class autoencoder_model_1(pl.LightningModule):
  """Autoencoder with one bottleneck layer and a log-probability output head.

  The encoder is ``Linear(input_dim, n_hidden)`` + ReLU. The decoder is
  ``Linear(n_hidden, input_dim)`` + ReLU, then ``Linear(input_dim, 3*input_dim)``
  whose output is handed to ``ReshapeLogSoftmax``. All parameters are cast to
  float64 via ``self.double()``.

  Args:
    input_dim: number of input features per sample.
    n_hidden: width of the bottleneck layer.
    lr: learning rate given to Adam in ``configure_optimizers``.
    combine: passed through unchanged to ``ReshapeLogSoftmax``.
  """

  def __init__(self, input_dim, n_hidden, lr = 1e-3, combine = False):
    super().__init__()
    # Layers are created in this order: encoder first, then the decoder layers.
    self.encoder = nn.Sequential(
        nn.Linear(input_dim, n_hidden),
        nn.ReLU(),
    )
    self.decoder = nn.Sequential(
        nn.Linear(n_hidden, input_dim),
        nn.ReLU(),
        nn.Linear(input_dim, 3*input_dim),
        ReshapeLogSoftmax(n_snps = input_dim, combine = combine),
    )
    self.double()
    self.save_hyperparameters()
    self.learning_rate = lr
    # Optional class weights for nll_loss; None leaves the loss unweighted.
    self.weights = None

  def forward (self, features):
    """Encode ``features`` and return the decoder output for them."""
    return self.decoder(self.encoder(features))

  def training_step(self, batch, batch_idx):
    """Compute the NLL reconstruction loss for one batch and log it.

    Only ``batch[0]`` is used; it is flattened to ``(batch, -1)`` and serves
    both as the network input and, cast to integers, as the class target.
    """
    samples = batch[0]
    flat = samples.view(samples.size(0), -1)
    log_probs = self.decoder(self.encoder(flat))
    loss = F.nll_loss(log_probs, flat.to(int), weight = self.weights)
    # Reported twice: through Lightning's logger and directly to wandb.
    self.log("train_loss", loss, on_epoch = True)
    wandb.log({ "loss": loss})
    return loss

  def configure_optimizers(self):
    """Return an Adam optimizer over all parameters using ``self.learning_rate``."""
    return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

  '''
  def train_dataloader(self):
    genotype_dataset = TensorDataset(torch.tensor(dataset, dtype=torch.float64))
    dataloader = DataLoader(genotype_dataset,batch_size=10)
    return dataloader
  '''

class ReshapeLogSoftmax(nn.Module):
    """Turn flat logits of width ``3 * n_snps`` into per-position log-probabilities.

    With ``combine=False`` the input is viewed as ``(batch, 3, n_snps)`` and a
    log-softmax is taken over the 3-class axis.

    With ``combine=True`` the input row is split into two equal halves, each
    viewed as ``(batch, 3, n_snps // 2)``. The six logits per position (two
    halves x three classes) are normalised jointly, and the probability mass
    of the two halves is then added per class, giving ``(batch, 3, n_snps // 2)``.

    NOTE(review): the original ``combine`` branch could not execute
    (``x.shape(1)`` is not callable, ``/`` produced float sizes, the halves
    were unequal, and the reduction result was discarded). Adding the
    probabilities of the two halves is this rewrite's reading of the intent;
    confirm it against the model design.

    Args:
        n_snps: number of positions encoded in the input row.
        combine: merge the two halves of the row as described above.

    Raises:
        ValueError: if ``combine`` is true and ``n_snps`` is odd, because the
            row cannot then be split into two equal halves.
    """

    def __init__(self, n_snps, combine = False):
        super().__init__()
        if combine and n_snps % 2:
            raise ValueError("combine=True requires an even n_snps, got %r" % (n_snps,))
        self.combine = combine
        self.n_snps = n_snps

    def forward(self, x):
        if self.combine:
            half_snps = self.n_snps // 2
            # Axis 1 indexes the half (first / second part of the row) and
            # axis 2 the class, matching a split of the row at its midpoint.
            halves = x.reshape(-1, 2, 3, half_snps)
            # Joint log-softmax over all six (half, class) logits per position.
            joint = F.log_softmax(halves.reshape(-1, 6, half_snps), dim=1)
            # log(p_half0 + p_half1) per class. A plain sum of log-probabilities
            # would multiply the probabilities and leave them unnormalised.
            return torch.logsumexp(joint.reshape(-1, 2, 3, half_snps), dim=1)
        x = x.view(-1, 3, self.n_snps)
        return F.log_softmax(x, dim=1)

In [None]:
# Train child autoencoder 1 on one block of columns of the genotype matrix and
# save its weights to ./model-1.pth.

#Configurations
# Each value is defined once and used both for the run and for the config
# logged to W&B, so the two cannot drift apart.
epochs = 50
batch_size = 10
n_hidden = 16
learning_rate = 1e-3
block_start, block_end = 200000, 201000  # column range used by this model
n_snps = block_end - block_start

print("-----------------------------------")

# buf is an in-memory file shared by several cells; rewind it so loading does
# not depend on where an earlier read left the position.
buf.seek(0)
dataset = sparse.load_npz(buf).todense()
dataset = dataset[:, block_start:block_end]


print("Training Process Begin")
print("-----------------------------------")

genotype_dataset = TensorDataset(torch.tensor(dataset, dtype=torch.float64))

train_loader = DataLoader(genotype_dataset, batch_size=batch_size)
wandb_logger = pl.loggers.WandbLogger(project="Imputation Autoencoder Project")
model = autoencoder_model_1(n_snps, n_hidden, lr=learning_rate)
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=1,    # log on every training step
    gpus=-1,                # use all GPUs
    max_epochs=epochs,      # number of epochs
    deterministic=False,    # deterministic mode is NOT requested
    # NOTE(review): in Lightning releases that accept this flag the LR finder
    # is run by trainer.tune(); tune() is never called here, so this
    # presumably has no effect -- confirm against the installed version.
    auto_lr_find=True,
)
args = {
    "learning_rate": learning_rate,
    "architecture": "Autoencoder",
    "dataset": "1000 Genome Project",
    "layer_1": n_hidden,
    "activation_1": "relu",
    "optimizer": "Adam",
    "loss": "NLL",
    "metric": "CV Accuracy",
    "epoch": epochs,
    "batch_size": batch_size,
    "n_hidden_layers":1}
group_name = "child_bin_group_1000"
# The run name doubles as the W&B run id, so it carries a random suffix.
name=group_name+'_seed_'+str(np.random.randint(100000000))
run=wandb.init(project="Imputation Autoencoder Project",save_code=False,
              group=group_name,entity="connia",name=name,
              mode="online",id=name,config=args,allow_val_change=True)
trainer.fit(model, train_loader)
wandb.finish()
print('Training process has finished. Saving trained model.')
print('Starting testing')

#Save model
path = "./model-1.pth"
torch.save(model.state_dict(), path)

-----------------------------------


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Training Process Begin
-----------------------------------


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 16.0 K
1 | decoder | Sequential | 3.0 M 
---------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
12.144    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▅▃▄▃▂▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▃▄▃▂▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
epoch,49.0
loss,0.00429
train_loss_epoch,0.01547
train_loss_step,0.00429
trainer/global_step,2549.0


Training process has finished. Saving trained model.
Starting testing


In [None]:
# Train child autoencoder 2 on the next block of columns of the genotype matrix
# and save its weights to ./model-2.pth.

#Configurations
# Each value is defined once and used both for the run and for the config
# logged to W&B, so the two cannot drift apart.
epochs = 50
batch_size = 10
n_hidden = 16
learning_rate = 1e-3
block_start, block_end = 201000, 202000  # column range used by this model
n_snps = block_end - block_start

print("-----------------------------------")

# buf is an in-memory file shared by several cells; rewind it so loading does
# not depend on where an earlier read left the position.
buf.seek(0)
dataset = sparse.load_npz(buf).todense()
dataset = dataset[:, block_start:block_end]


print("Training Process Begin")
print("-----------------------------------")

genotype_dataset = TensorDataset(torch.tensor(dataset, dtype=torch.float64))

train_loader = DataLoader(genotype_dataset, batch_size=batch_size)
wandb_logger = pl.loggers.WandbLogger(project="Imputation Autoencoder Project")
model = autoencoder_model_1(n_snps, n_hidden, lr=learning_rate)
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=1,    # log on every training step
    gpus=-1,                # use all GPUs
    max_epochs=epochs,      # number of epochs
    deterministic=False,    # deterministic mode is NOT requested
    # NOTE(review): in Lightning releases that accept this flag the LR finder
    # is run by trainer.tune(); tune() is never called here, so this
    # presumably has no effect -- confirm against the installed version.
    auto_lr_find=True,
)
args = {
    "learning_rate": learning_rate,
    "architecture": "Autoencoder",
    "dataset": "1000 Genome Project",
    "layer_1": n_hidden,
    "activation_1": "relu",
    "optimizer": "Adam",
    "loss": "NLL",
    "metric": "CV Accuracy",
    "epoch": epochs,
    "batch_size": batch_size,
    "n_hidden_layers":1}
group_name = "child_bin_group_1000"
# Generate a fresh run name here: it is also passed as the W&B run id, and
# reusing the name left over from an earlier cell would give this run the same
# id as that one (and fail outright if no earlier cell had defined it).
name=group_name+'_seed_'+str(np.random.randint(100000000))
run=wandb.init(project="Imputation Autoencoder Project",save_code=False,
              group=group_name,entity="connia",name=name,
              mode="online",id=name,config=args,allow_val_change=True)
trainer.fit(model, train_loader)
wandb.finish()
print('Training process has finished. Saving trained model.')
print('Starting testing')

#Save model
path = "./model-2.pth"
torch.save(model.state_dict(), path)

-----------------------------------


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
[34m[1mwandb[0m: Currently logged in as: [33mconnia[0m (use `wandb login --relogin` to force relogin)


Training Process Begin
-----------------------------------


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 16.0 K
1 | decoder | Sequential | 3.0 M 
---------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
12.144    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Training: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▆▃▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,█▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▃▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
epoch,49.0
loss,0.00189
train_loss_epoch,0.01069
train_loss_step,0.00189
trainer/global_step,2549.0


Training process has finished. Saving trained model.
Starting testing


In [None]:
# Train child autoencoder 3 on the third block of columns of the genotype
# matrix and save its weights to ./model-3.pth.

#Configurations
# Each value is defined once and used both for the run and for the config
# logged to W&B, so the two cannot drift apart.
epochs = 50
batch_size = 10
n_hidden = 16
learning_rate = 1e-3
block_start, block_end = 202000, 203000  # column range used by this model
n_snps = block_end - block_start

print("-----------------------------------")

# buf is an in-memory file shared by several cells; rewind it so loading does
# not depend on where an earlier read left the position.
buf.seek(0)
dataset = sparse.load_npz(buf).todense()
dataset = dataset[:, block_start:block_end]


print("Training Process Begin")
print("-----------------------------------")

genotype_dataset = TensorDataset(torch.tensor(dataset, dtype=torch.float64))

train_loader = DataLoader(genotype_dataset, batch_size=batch_size)
wandb_logger = pl.loggers.WandbLogger(project="Imputation Autoencoder Project")
model = autoencoder_model_1(n_snps, n_hidden, lr=learning_rate)
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=1,    # log on every training step
    gpus=-1,                # use all GPUs
    max_epochs=epochs,      # number of epochs
    deterministic=False,    # deterministic mode is NOT requested
    # NOTE(review): in Lightning releases that accept this flag the LR finder
    # is run by trainer.tune(); tune() is never called here, so this
    # presumably has no effect -- confirm against the installed version.
    auto_lr_find=True,
)
args = {
    "learning_rate": learning_rate,
    "architecture": "Autoencoder",
    "dataset": "1000 Genome Project",
    "layer_1": n_hidden,
    "activation_1": "relu",
    "optimizer": "Adam",
    "loss": "NLL",
    "metric": "CV Accuracy",
    "epoch": epochs,
    "batch_size": batch_size,
    "n_hidden_layers":1}
group_name = "child_bin_group_1000"
# Generate a fresh run name here: it is also passed as the W&B run id, and
# reusing the name left over from an earlier cell would give this run the same
# id as that one (and fail outright if no earlier cell had defined it).
name=group_name+'_seed_'+str(np.random.randint(100000000))
run=wandb.init(project="Imputation Autoencoder Project",save_code=False,
              group=group_name,entity="connia",name=name,
              mode="online",id=name,config=args,allow_val_change=True)
trainer.fit(model, train_loader)
wandb.finish()
print('Training process has finished. Saving trained model.')
print('Starting testing')

#Save model
path = "./model-3.pth"
torch.save(model.state_dict(), path)

-----------------------------------


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Training Process Begin
-----------------------------------


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 16.0 K
1 | decoder | Sequential | 3.0 M 
---------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
12.144    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Training: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,█▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
epoch,49.0
loss,0.00565
train_loss_epoch,0.01781
train_loss_step,0.00565
trainer/global_step,2549.0


Training process has finished. Saving trained model.
Starting testing


In [None]:
class ParentModel(pl.LightningModule):
    """Feeds two inputs through separate sub-models and decodes their joined output.

    ``modelA`` and ``modelB`` each process one input; their outputs are
    concatenated along dim 1 and passed to ``modelC``. All parameters are cast
    to float64.

    Args:
        modelA: module applied to the first input.
        modelB: module applied to the second input.
        modelC: module applied to the concatenated outputs of A and B. Its
            output is used as the log-probability input of ``F.nll_loss``.
    """

    def __init__(self, modelA, modelB, modelC):
        super().__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.modelC = modelC
        self.double()
        # Optional class weights for nll_loss; None leaves the loss unweighted.
        self.weights = None

    def forward(self, x1, x2):
        """Return ``modelC`` applied to the concatenated outputs for ``x1`` and ``x2``."""
        joined = torch.cat((self.modelA(x1), self.modelB(x2)), dim=1)
        return self.modelC(joined)

    def training_step(self, batch, batch_idx):
        """Compute NLL loss and accuracy for one batch and log both.

        ``batch[0]`` and ``batch[1]`` are flattened to ``(batch, -1)``. Their
        concatenation along dim 1 is the target, cast to integers for the loss
        and compared against the argmax over dim 1 of the prediction for the
        accuracy.
        """
        first = batch[0].view(batch[0].size(0), -1)
        second = batch[1].view(batch[1].size(0), -1)
        latent = torch.cat((self.modelA(first), self.modelB(second)), dim=1)
        log_probs = self.modelC(latent)
        target = torch.concat((first, second), dim=1)
        loss = F.nll_loss(log_probs, target.to(int), weight = self.weights)
        hits = target == log_probs.argmax(dim=1)
        # Mean over positions within each sample, then mean over the batch.
        accuracy = hits.to(float).mean(dim=1).mean()
        # Reported twice: through Lightning's logger and directly to wandb.
        self.log("train_loss", loss, on_epoch = True)
        self.log("accuracy",accuracy, on_epoch = True)
        wandb.log({"loss": loss,"accuracy":accuracy})
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with a fixed lr of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    
  
# Build a parent model from the trained encoders of child models 1 and 2 plus
# a freshly initialised decoder head, train it on the two corresponding column
# blocks, and save it to ./model-1-2.pth.
batch_size = 10

modela = autoencoder_model_1(1000, 16)
modela.load_state_dict(torch.load("./model-1.pth"))
modelb = autoencoder_model_1(1000, 16)
modelb.load_state_dict(torch.load("./model-2.pth"))
# Only the (untrained) decoder layers of this 2000-wide autoencoder are used;
# they are placed behind a new 32 -> 16 layer that takes the two concatenated
# 16-unit encoder outputs.
modelc = autoencoder_model_1(2000, 16)
extra_layer = nn.Sequential(nn.Linear(32,16), nn.ReLU())
modelc = nn.Sequential(*(list(extra_layer.children())+ list(modelc.decoder.children())))
print(modelc)

parent = ParentModel(modela.encoder, modelb.encoder, modelc)
# buf is an in-memory file shared by several cells; rewind it so loading does
# not depend on where an earlier read left the position.
buf.seek(0)
dataset = sparse.load_npz(buf).todense()
dataset1 = dataset[:,200000:201000]
dataset2 = dataset[:,201000:202000]
genotype_dataset = TensorDataset(torch.tensor(dataset1, dtype=torch.float64), torch.tensor(dataset2,dtype=torch.float64))
print(genotype_dataset[:][1].shape)

train_loader = DataLoader(genotype_dataset, batch_size=batch_size)
wandb_logger = pl.loggers.WandbLogger(project="Imputation Autoencoder Project")
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=1,    # log on every training step
    gpus=-1,                # use all GPUs
    max_epochs=epochs,      # number of epochs (set in an earlier cell)
    deterministic=False,    # deterministic mode is NOT requested
  )
args = {
    "learning_rate": 0.001,
    "architecture": "Autoencoder",
    "dataset": "1000 Genome Project",
    "layer_1": 16,
    "activation_1": "relu",
    "layer_2": 32,
    "activation_output": "log-softmax",
    "optimizer": "Adam",
    "loss": "NLL",
    "metric": "CV Accuracy",
    "epoch": epochs,
    "batch_size": batch_size,
    "n_hidden_layers":3}
group_name = "parent_block_group"
# Generate a fresh run name here: it is also passed as the W&B run id, and
# reusing the name left over from an earlier cell would give this run the same
# id as that one.
name=group_name+'_seed_'+str(np.random.randint(100000000))
run=wandb.init(project="Imputation Autoencoder Project",save_code=False,
              group=group_name,entity="connia",name=name,
              mode="online",id=name,config=args,allow_val_change=True)
trainer.fit(parent, train_loader)
wandb.finish()
#Save model
path = "./model-1-2.pth"
torch.save(parent.state_dict(), path)

Sequential(
  (0): Linear(in_features=32, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=2000, bias=True)
  (3): ReLU()
  (4): Linear(in_features=2000, out_features=6000, bias=True)
  (5): ReshapeLogSoftmax()
)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


torch.Size([503, 1000])


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | modelA | Sequential | 16.0 K
1 | modelB | Sequential | 16.0 K
2 | modelC | Sequential | 12.0 M
--------------------------------------
12.1 M    Trainable params
0         Non-trainable params
12.1 M    Total params
48.290    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▄▅▅▆▇▇▇▇▇▇▇▇███████████████████████████
accuracy_epoch,▁▃▄▅▆▆▇▇▇▇▇▇▇███████████████████████████
accuracy_step,▁▄▅▅▆▇▇▇▇▇▇▇▇███████████████████████████
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▅▄▄▃▃▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,█▆▅▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▄▄▃▃▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
accuracy,0.99983
accuracy_epoch,0.99979
accuracy_step,0.99983
epoch,49.0
loss,0.00071
train_loss_epoch,0.00149
train_loss_step,0.00071
trainer/global_step,2549.0


In [None]:
# Build a parent model from the trained encoders of child models 2 and 3 plus
# a freshly initialised decoder head, train it on the two corresponding column
# blocks, and save it to ./model-2-3.pth.
batch_size = 10

modela = autoencoder_model_1(1000, 16)
modela.load_state_dict(torch.load("./model-2.pth"))
modelb = autoencoder_model_1(1000, 16)
modelb.load_state_dict(torch.load("./model-3.pth"))
# Only the (untrained) decoder layers of this 2000-wide autoencoder are used;
# they are placed behind a new 32 -> 16 layer that takes the two concatenated
# 16-unit encoder outputs.
modelc = autoencoder_model_1(2000, 16)
extra_layer = nn.Sequential(nn.Linear(32,16), nn.ReLU())
modelc = nn.Sequential(*(list(extra_layer.children())+ list(modelc.decoder.children())))
print(modelc)

parent = ParentModel(modela.encoder, modelb.encoder, modelc)
# buf is an in-memory file shared by several cells; rewind it so loading does
# not depend on where an earlier read left the position.
buf.seek(0)
dataset = sparse.load_npz(buf).todense()
dataset1 = dataset[:,201000:202000]
dataset2 = dataset[:,202000:203000]
genotype_dataset = TensorDataset(torch.tensor(dataset1, dtype=torch.float64), torch.tensor(dataset2,dtype=torch.float64))
print(genotype_dataset[:][1].shape)

train_loader = DataLoader(genotype_dataset, batch_size=batch_size)
wandb_logger = pl.loggers.WandbLogger(project="Imputation Autoencoder Project")
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=1,    # log on every training step
    gpus=-1,                # use all GPUs
    max_epochs=epochs,      # number of epochs (set in an earlier cell)
    deterministic=False,    # deterministic mode is NOT requested
  )
args = {
    "learning_rate": 0.001,
    "architecture": "Autoencoder",
    "dataset": "1000 Genome Project",
    "layer_1": 16,
    "activation_1": "relu",
    "layer_2": 32,
    "activation_output": "log-softmax",
    "optimizer": "Adam",
    "loss": "NLL",
    "metric": "CV Accuracy",
    "epoch": epochs,
    "batch_size": batch_size,
    "n_hidden_layers":3}
group_name = "parent_block_group"
# Generate a fresh run name here: it is also passed as the W&B run id, and
# reusing the name left over from an earlier cell would give this run the same
# id as that one.
name=group_name+'_seed_'+str(np.random.randint(100000000))
run=wandb.init(project="Imputation Autoencoder Project",save_code=False,
              group=group_name,entity="connia",name=name,
              mode="online",id=name,config=args,allow_val_change=True)
trainer.fit(parent, train_loader)
wandb.finish()
#Save model
path = "./model-2-3.pth"
torch.save(parent.state_dict(), path)

Sequential(
  (0): Linear(in_features=32, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=2000, bias=True)
  (3): ReLU()
  (4): Linear(in_features=2000, out_features=6000, bias=True)
  (5): ReshapeLogSoftmax()
)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


torch.Size([503, 1000])


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | modelA | Sequential | 16.0 K
1 | modelB | Sequential | 16.0 K
2 | modelC | Sequential | 12.0 M
--------------------------------------
12.1 M    Trainable params
0         Non-trainable params
12.1 M    Total params
48.290    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Training: 0it [00:00, ?it/s]

  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▅▆▇▇▇▇█▇█▇█████████████████████████████
accuracy_epoch,▁▄▅▆▇▇▇▇▇▇▇█████████████████████████████
accuracy_step,▁▅▆▇▇▇▇█▇█▇█████████████████████████████
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,█▅▄▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
accuracy,0.999
accuracy_epoch,0.99936
accuracy_step,0.999
epoch,49.0
loss,0.00328
train_loss_epoch,0.0027
train_loss_step,0.00328
trainer/global_step,2549.0


# New Section

Root Model Prototype and Training


In [None]:
class RootModel(pl.LightningModule):
    """Feeds two inputs through two sub-models and decodes their joined output.

    ``modelC1`` and ``modelC2`` each process one input; their outputs are
    concatenated along dim 1 and passed to ``modelD``. All parameters are cast
    to float64.

    Args:
        modelC1: module applied to the first input.
        modelC2: module applied to the second input.
        modelD: module applied to the concatenated outputs. Its output is used
            as the log-probability input of ``F.nll_loss``.
    """

    def __init__(self, modelC1, modelC2, modelD ):
        super().__init__()
        # Sub-modules are registered in this order: C1, D, C2.
        self.modelC1 = modelC1
        self.modelD = modelD
        self.modelC2 = modelC2
        self.double()
        # Optional class weights for nll_loss; None leaves the loss unweighted.
        self.weights = None

    def forward(self, x1, x2, x3):
        """Return ``modelD`` applied to the joined outputs for ``x1`` and ``x2``.

        ``x3`` is accepted but not used.
        """
        joined = torch.cat((self.modelC1(x1), self.modelC2(x2)), dim=1)
        return self.modelD(joined)

    def training_step(self, batch, batch_idx):
        """Compute NLL loss and accuracy for one batch and log both.

        Only ``batch[0]`` and ``batch[1]`` are read. Both are flattened to
        ``(batch, -1)``; their concatenation along dim 1 is the target for the
        loss and for the accuracy.
        """
        first = batch[0].view(batch[0].size(0), -1)
        second = batch[1].view(batch[1].size(0), -1)
        latent = torch.cat((self.modelC1(first), self.modelC2(second)), dim =1)
        log_probs = self.modelD(latent)
        target = torch.concat((first, second), dim=1)
        loss = F.nll_loss(log_probs, target.to(int), weight = self.weights)
        hits = target == log_probs.argmax(dim=1)
        # Mean over positions within each sample, then mean over the batch.
        accuracy = hits.to(float).mean(dim=1).mean()
        # Reported twice: through Lightning's logger and directly to wandb.
        self.log("train_loss", loss, on_epoch = True)
        self.log("accuracy",accuracy, on_epoch = True)
        wandb.log({"loss": loss,"accuracy":accuracy})
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with a fixed lr of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)
  
# Root model prototype: rebuild the two saved parent models, assemble a
# RootModel from their layers plus a new decoder head, then run a training job.
#
# Untrained templates; they only provide modules of the right shapes so the
# saved parent state dicts can be loaded into them.
modela = autoencoder_model_1(1000, 16)
modelb = autoencoder_model_1(1000, 16)
modelc = autoencoder_model_1(2000, 16)
extra_layer = nn.Sequential(nn.Linear(32,16), nn.ReLU())
modelc = nn.Sequential(*(list(extra_layer.children())+ list(modelc.decoder.children())))
# NOTE(review): parent1 and parent2 wrap the SAME module objects
# (modela.encoder, modelb.encoder, modelc), so the second load_state_dict below
# overwrites the weights loaded by the first and both parents end up holding
# the model-2-3 weights. Separate module instances per parent look intended.
parent1 = ParentModel(modela.encoder, modelb.encoder, modelc)
parent2 = ParentModel(modela.encoder, modelb.encoder, modelc)
parent1.load_state_dict(torch.load(f"./model-1-2.pth"))
parent2.load_state_dict(torch.load(f"./model-2-3.pth"))
# NOTE(review): this prints `parent`, which is not assigned in this cell (it is
# whatever an earlier cell left behind), not parent1 / parent2.
print(parent)

# Decoder head for the root: only the decoder layers of a 4000-wide autoencoder
# (built with combine=True) are kept, placed behind a new 32 -> 16 layer.
modeld = autoencoder_model_1(4000, 16, combine = True)
extra_layer = nn.Sequential(nn.Linear(32,16), nn.ReLU())
modeld = nn.Sequential(*(list(extra_layer.children())+ list(modeld.decoder.children())))
print(modeld)

# NOTE(review): nn.Sequential applies its layers one after another, so these
# chain Linear(1000, 16) -> ReLU -> Linear(1000, 16) -> ReLU -> Linear(32, 16)
# -> ReLU. The second layer expects 1000 inputs but would receive 16, so these
# stacks presumably cannot run a forward pass as built -- confirm the intended
# wiring (ParentModel runs A and B in parallel and concatenates their outputs).
parent1 = nn.Sequential(*(list(parent1.modelA.children())+
                          list(parent1.modelB.children())+
                          list(parent1.modelC[0:2])))
parent2 = nn.Sequential(*(list(parent2.modelA.children())+
                          list(parent2.modelB.children())+
                          list(parent2.modelC[0:2])))
root = RootModel(parent1, parent2, modeld)
print(root)

# Three consecutive 1000-column blocks of the genotype matrix, one tensor each.
dataset = sparse.load_npz(buf).todense()
dataset0 = dataset[:,200000:201000]
dataset1 = dataset[:,201000:202000]
dataset2 = dataset[:,202000:203000]
genotype_dataset = TensorDataset(torch.tensor(dataset0, dtype=torch.float64), 
                                 torch.tensor(dataset1,dtype=torch.float64),
                                 torch.tensor(dataset2,dtype=torch.float64))
#print(genotype_dataset[:][1].shape)

train_loader = DataLoader(genotype_dataset, batch_size = 10)
wandb_logger = pl.loggers.WandbLogger(project="Imputation Autoencoder Project")
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=1,    # set the logging frequency
    gpus=-1,                # use all GPUs
    max_epochs=epochs,      # number of epochs
    deterministic=False,     # deterministic mode is NOT requested
  )
# Run configuration recorded in W&B; these values are written out by hand and
# are not read back by the training code.
args = {
    "learning_rate": 0.001,
    "architecture": "Autoencoder",
    "dataset": "1000 Genome Project",
    "layer_1": 16,
    "activation_1": "relu",
    "layer_2": 32,
    "activation_output": "log-softmax",
    "optimizer": "Adam",
    "loss": "NLL",
    "metric": "Accuracy",
    "epoch": 50,
    "batch_size": 10,
    "n_hidden_layers":5}
group_name = "root_block_group"
# NOTE(review): `name` is not assigned in this cell; it comes from an earlier
# cell and is reused here as the W&B run id, so this run shares that id.
run=wandb.init(project="Imputation Autoencoder Project",save_code=False,
              group=group_name,entity="connia",name=name,
              mode="online",id=name,config=args,allow_val_change=True)
# NOTE(review): this trains `parent` (left over from an earlier cell), not
# `root`, and the state dict saved below is also `parent`'s. `root` is built
# above but never trained or saved -- almost certainly `root` was intended.
trainer.fit(parent, train_loader)
wandb.finish()
#Save model 
path = f"./model-1-2-3-{name}.pth"
torch.save(parent.state_dict(), path)


ParentModel(
  (modelA): Sequential(
    (0): Linear(in_features=1000, out_features=16, bias=True)
    (1): ReLU()
  )
  (modelB): Sequential(
    (0): Linear(in_features=1000, out_features=16, bias=True)
    (1): ReLU()
  )
  (modelC): Sequential(
    (0): Linear(in_features=32, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=2000, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2000, out_features=6000, bias=True)
    (5): ReshapeLogSoftmax()
  )
)
Sequential(
  (0): Linear(in_features=32, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=4000, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4000, out_features=12000, bias=True)
  (5): ReshapeLogSoftmax()
)
RootModel(
  (modelC1): Sequential(
    (0): Linear(in_features=1000, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1000, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=16, bias=Tr

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | modelA | Sequential | 16.0 K
1 | modelB | Sequential | 16.0 K
2 | modelC | Sequential | 12.0 M
--------------------------------------
12.1 M    Trainable params
0         Non-trainable params
12.1 M    Total params
48.290    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Training: 0it [00:00, ?it/s]

  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▃▅▄▅▆▆▆▇▇▇▇▇▇█▇████████████████████████
accuracy_epoch,▁▃▄▅▆▆▆▇▇▇▇▇▇▇▇█████████████████████████
accuracy_step,▁▃▅▄▅▆▆▆▇▇▇▇▇▇█▇████████████████████████
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▆▅▅▄▃▃▃▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,█▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▅▅▄▃▃▃▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
accuracy,1.0
accuracy_epoch,0.99993
accuracy_step,1.0
epoch,49.0
loss,0.00018
train_loss_epoch,0.00083
train_loss_step,0.00018
trainer/global_step,2549.0
