## Loading the HDF5 file into PyTorch




In [1]:
#!cp /content/drive/MyDrive/data/learn_cd4_91K.h5 /content
!cp /content/drive/MyDrive/data/learn_H7-hESC.h5 /content

In [None]:
# INPUT: 
# file_path: Path to the folder containing the hdf5 dataset, one or multiple files
# recursive: If True, searches for other h5 files in subdirectories 
# load_data: If True, loads all the data immediately into RAM, Use this if the dataset is
# fit into the memory. Otherwise, leave it False the data will load lazily.
# data_cache_size: Number of h5 files that can be cached in the cache ??
# transform: PyTorch transform to apply on the the dataset, like ToTensor, data augmentation etc.

## Discussion:
The number of epochs you require will depend on the size of your model and the variation in your dataset.<br>

The size of your model can be a rough proxy for the complexity that it is able to express (or learn). So a huge model can represent produce more nuanced models for datasets with higher diversity in the data, however would probably take longer to train i.e. more epochs.<br>

In [2]:
import h5py
from torch.utils import data
from pathlib import Path
import numpy as np

class HDF5Dataset(data.Dataset):

  def __init__(self,file_path,types,transform=None):
    super().__init__() #1
    self.type=types
    self.transform=transform

    # search for all h5 files
    p=Path(file_path)
    files=sorted(p.glob('*.h5'))#2
    print("FILE:",files)
    self.h5_file=h5py.File(files[0],'r')
    if len(files) < 1:
      raise RuntimeError('No hdf5 dataset found')#3
    
    if not type(self.type)==str:
      raise RuntimeError('Input type: \'train\',\'test\',\'valid\'')
    
    # Either load the file here and store all the data in some local variable
    # or can be differed to the actual call via getitem, in that case the file pointer
    # will be constantly open and close in each batch?? call. Lets implement the later one
  def __getitem__(self,index):
    data_tag=self.type+'_in' 
    label_tag=self.type+'_out'
    if torch.is_tensor(index):
      index=index.tolist()

    data=np.array(self.h5_file[data_tag][index,:]) #4
    label=np.array(self.h5_file[label_tag][index,:]) #5

    if self.transform:
      data=self.transform(data)
      label=self.transform(label)
    else:
      data=torch.from_numpy(data).squeeze(1)
      data=data.type(torch.FloatTensor)
      label=torch.from_numpy(label)
      label=label.type(torch.FloatTensor)

    return (data,label)

  def __len__(self):
    data_name=self.type+'_in' #6
    #return len(self.h5_file[data_name])
    if data_name=='train_in':
      return 80000
    else:
      return 4500


  def _close(self):
    h5_file.close()

In [3]:
import torch
CLASSES=1
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [28]:
# WEIGHT INITIALIZATION CHANGED
import torch.nn as nn
import torch.optim as optim
import itertools

class Basset_cnn(nn.Module):

  def __init__(self,num_classes=CLASSES):
    super().__init__()
    self.net=nn.Sequential(
        nn.Conv1d(in_channels=4,out_channels=300,kernel_size=19,stride=1),
        nn.BatchNorm1d(300),
        nn.ReLU(),
        nn.MaxPool1d(kernel_size=3,stride=2),
        nn.Conv1d(in_channels=300,out_channels=200,kernel_size=11,stride=1),
        nn.BatchNorm1d(200),
        nn.ReLU(),
        nn.MaxPool1d(kernel_size=4,stride=2),
        nn.Conv1d(in_channels=200,out_channels=200,kernel_size=7,stride=1),
        nn.BatchNorm1d(200),
        nn.ReLU(),
        nn.MaxPool1d(kernel_size=4,stride=2),
    )

    self.fc_layer=nn.Sequential(
        nn.Linear(in_features=200*65,out_features=1000),
        nn.ReLU(),
        nn.Dropout(p=0.3,inplace=True),
        nn.Linear(in_features=1000,out_features=1000),
        nn.ReLU(),
        nn.Dropout(p=0.3,inplace=True),
        nn.Linear(in_features=1000,out_features=CLASSES),
        #nn.Sigmoid(),
    )
    self.init_weights()

  def init_weights(self):
    for net,lin in itertools.zip_longest(self.net,self.fc_layer):
      if isinstance(net,nn.Conv1d):
        nn.init.xavier_normal_(net.weight)#,mode='fan_out',nonlinearity='relu')
        if net.bias is not None:
          nn.init.constant_(net.bias,0)
      elif isinstance(net,nn.BatchNorm1d):
        nn.init.constant_(net.weight,1.0)
        nn.init.constant_(net.bias,0.0)
      elif isinstance(lin,nn.Linear):
        nn.init.normal_(lin.weight,mean=0.0,std=0.01)
        if lin.bias is not None:
          nn.init.constant_(lin.bias,0)

  def forward(self,x):
    x=self.net(x)
    x=torch.flatten(x,1)
    return self.fc_layer(x)
      

In [29]:
basset_model=Basset_cnn()

train_dataset=HDF5Dataset('/content','train')
valid_dataset=HDF5Dataset('/content','valid')

train_dataloader=torch.utils.data.DataLoader(train_dataset,batch_size=32,shuffle=True)
valid_dataloader=torch.utils.data.DataLoader(valid_dataset,batch_size=32,shuffle=True)

#optimizer=optim.AdamW(params=basset_model.parameters(),lr=0.00005)
optimizer=optim.Adam(params=basset_model.parameters(),lr=0.0007)
#optimizer=optim.RMSprop(params=basset_model.parameters(),lr=0.002,momentum=0.98)
loss_fn=nn.BCEWithLogitsLoss().cuda()
#loss_fn=nn.BCELoss().cuda()

FILE: [PosixPath('/content/learn_H7-hESC.h5')]
FILE: [PosixPath('/content/learn_H7-hESC.h5')]


In [None]:
for data,label in train_dataloader:
  print(data.shape,data.dtype,label.shape)
  print(data)
  break

In [30]:
loss_list=[]
accuracy_list=[]
def train():
  basset_model.to(device)
  basset_model.train()
  loss_train=0.0
  counter_train=0.0
  for epoch in range(5):
    for data,label in train_dataloader:
      data=data.to(device)
      label=label.to(device)       
      output=basset_model(data)
      loss=loss_fn(output,label)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      loss_train+=loss.item()
      #loss_list.append(loss.data())
      counter_train+=1
      #if counter_train % 800 ==0:
        #print("Training Loss:",loss.item())
    print("Training loss for epoch:",epoch,":-",loss_train/counter_train)

    correct=0
    loss_valid=0.0
    counter_valid=0.0
    basset_model.eval()
    with torch.no_grad():
      for data,label in valid_dataloader:
        data=data.to(device)
        label=label.to(device)
        y_hat=basset_model(data)
        #y_pred=(y_hat>0.0).float()
        y_pred=torch.round(torch.sigmoid(y_hat))
        loss=loss_fn(y_hat,label)
        loss_valid+=loss.item()        
        correct+=(y_pred==label).sum().item()
        counter_valid+=1
        #if counter_valid % 50 ==0:
          #print("Valid Loss:",loss.item())
        
      print("validation loss for epoch:",epoch,":-",loss_valid/counter_valid)
      print("Correct",correct,"length",len(valid_dataset))
      accuracy=correct/len(valid_dataset)
      accuracy_list.append(accuracy)
      print(accuracy)
    basset_model.train()

In [32]:
# If accuracy decreases with increased epochs, do one or combination of the following:
# 1. Reduce the learning rate to suppose 0.001 or even 0.0001.
# 2. Set Dropout to 0.2/0.3, keep it uniform across layers.
# 3. Try Decreasing the batch size.
# 4. Change the optimizer. 

train()

Training loss for epoch: 0 :- 0.5541865759730339
validation loss for epoch: 0 :- 0.5991834693766654
Correct 3032 length 4500
0.6737777777777778
Training loss for epoch: 1 :- 0.5357631686627865
validation loss for epoch: 1 :- 0.6103051030044014
Correct 2971 length 4500
0.6602222222222223
Training loss for epoch: 2 :- 0.5117158401648203
validation loss for epoch: 2 :- 0.9054735645334772
Correct 2706 length 4500
0.6013333333333334
Training loss for epoch: 3 :- 0.48290564197003844
validation loss for epoch: 3 :- 1.0166817053835442
Correct 2591 length 4500
0.5757777777777778
Training loss for epoch: 4 :- 0.44986986310660837
validation loss for epoch: 4 :- 1.3161077047070713
Correct 2399 length 4500
0.5331111111111111


In [12]:
test_dataset=HDF5Dataset('/content','test')
test_dataloader=torch.utils.data.DataLoader(test_dataset,batch_size=64,shuffle=True)

predicted=[]
labels=[]
correct=0
basset_model.eval()
with torch.no_grad():
  for data,label in test_dataloader:
    data=data.to(device)
    label=label.to(device)
    y_hat=basset_model(data)
    y_pred=(y_hat>0.0).float()
    correct+=(y_pred==label).sum().item()
    predicted.append(y_pred.to('cpu'))
    labels.append(label.to('cpu'))
accuracy=correct/len(test_dataset)
print(accuracy)


FILE: [PosixPath('/content/learn_cd4_91K.h5')]
0.721433253222635


In [38]:
predicted_list=[]
labels_list=[]
for i in range(len(predicted)):
  temp=predicted[i].numpy()
  temp1=labels[i].numpy()
  for j in range(len(temp)):
    predicted_list.append(temp[j])
    labels_list.append(temp1[j])

len(predicted_list)==len(labels_list)

True

In [None]:
true=0
false=0
true_positive=0
true_negative=0
false_negative=0
false_positive=0
for i in range(len(labels_list)):
  if labels_list[i]==1.0:
    true+=1
  else:
    false+=1
  if labels_list[i]==1.0 and predicted_list[i]==1.0:
    true_positive+=1
  if labels_list[i]==0.0 and predicted_list[i]==0.0:
    true_negative+=1
  if labels_list[i]==1.0 and predicted_list[i]==0.0:
    false_negative+=1
  if labels_list[i]==0.0 and predicted_list[i]==1.0:
    false_positive+=1

print("True: ",true,"False: ",false)
print("True positive: ",true_positive,"True negative: ",true_negative)
print("False negative: ",false_negative,"False positive: ",false_positive)
print("precision: ",true_positive/(true_positive+false_positive))
print("recall: ",true_positive/(true_positive+false_negative))
print("F1 Score: ",true_positive/(true_positive+0.5*(false_positive+false_negative)))

In [25]:
import numpy as np
ll = np.squeeze(np.asarray(labels_list, dtype=np.int),axis=1)
pl=np.squeeze(np.asarray(predicted_list,dtype=np.int),axis=1)
ll.shape, pl.shape

((4577,), (4577,))

In [26]:
from sklearn.metrics import roc_auc_score
roc_auc_score(ll,pl)

0.7203572051681728

In [41]:
import pickle
pickle.dump(predicted_list,open('/content/predicted_list.pkl','wb'))
pickle.dump(labels_list,open('/content/labels_list.pkl','wb'))

In [42]:
!cp /content/predicted_list.pkl /content/drive/MyDrive/data
!cp /content/labels_list.pkl /content/drive/MyDrive/data

In [2]:
import pickle
predicted_list=pickle.load(open('/content/drive/MyDrive/data/predicted_list.pkl','rb'))
labels_list=pickle.load(open('/content/drive/MyDrive/data/labels_list.pkl','rb'))
len(predicted_list)==len(labels_list)

True

In [None]:
torch.save(basset_model.state_dict(),'/content/drive/MyDrive/models/basset_adamw_lr:0007,batch:16.pt')

In [None]:
#Load
'''basset_model=Basset_cnn()
basset_model.load_state_dict(torch.load('/content/drive/MyDrive/models/basset_adamw_lr:0007,batch:16.pt'))
basset_model.eval()
'''


In [None]:
import torch

valid_dataset=HDF5Dataset('/content','valid')
validloader=torch.utils.data.DataLoader(valid_dataset,batch_size=10,shuffle=True)

In [None]:
for x,y in validloader:
  print(x.dtype,y.dtype)
  break

torch.bool torch.int8


In [None]:
valid_dataset._close()

In [None]:
from pathlib import Path

p=Path('/content')
files=sorted(p.glob('*.h5'))

In [None]:
str(files[0])==str(files[0].resolve())

True

In [None]:
import h5py
import numpy as np
dat=None
with h5py.File(files[0],'r') as h5_file:
  for gname,group in h5_file.items():
    dat=np.array(h5_file['target_labels'])

dat=[str(t,'utf8') for t in dat]
dat

['CD4+']

In [None]:
import torch
import h5py
import numpy as np

h5=h5py.File(files[0],'r')
temp=torch.from_numpy(np.array(h5['valid_in']))

In [None]:
import torch.nn as nn
import torch

conv1=nn.Conv1d(4,300,19,stride=1)
input=torch.randn(3,4,600)
conv1_out=conv1(input)
print("Conv1:",conv1_out.shape)

pool1=nn.MaxPool1d(3,stride=2)
pool1_out=pool1(conv1_out)
print("Pool1:",pool1_out.shape)

conv2=nn.Conv1d(300,200,11,stride=1)
conv2_out=conv2(pool1_out)
print("Conv2:",conv2_out.shape)

pool2=nn.MaxPool1d(4,stride=2)
pool2_out=pool2(conv2_out)
print("Pool2:",pool2_out.shape)

conv3=nn.Conv1d(200,200,7,stride=1)
conv3_out=conv3(pool2_out)
print("Conv3:",conv3_out.shape)

pool3=nn.MaxPool1d(4,stride=2)
pool3_out=pool3(conv3_out)
print("Pool3:",pool3_out.shape)

linear1=nn.Linear(in_features=200*65,out_features=1000)
flat_data=torch.flatten(pool3_out,1)
linear1_out=linear1(flat_data)
print("Linear1:",linear1_out.shape)

linear2=nn.Linear(in_features=1000,out_features=1000)
linear2_out=linear2(linear1_out)
print("Linear2:",linear2_out.shape)

linear3=nn.Linear(in_features=1000,out_features=1)
linear3_out=linear3(linear2_out)
print("Linear3:",linear3_out.shape)

Conv1: torch.Size([3, 300, 582])
Pool1: torch.Size([3, 300, 290])
Conv2: torch.Size([3, 200, 280])
Pool2: torch.Size([3, 200, 139])
Conv3: torch.Size([3, 200, 133])
Pool3: torch.Size([3, 200, 65])
Linear1: torch.Size([3, 1000])
Linear2: torch.Size([3, 1000])
Linear3: torch.Size([3, 1])


In [None]:
temp2

['CD4+']

In [None]:
temp3=[str(t,'utf8') for t in h5['test_headers']]