In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import gc
import gzip
import re
import shutil
import glob
import multiprocessing as mp
import errno
from Bio import SeqIO

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn
GPU_PRESENT = [torch.cuda.device (i) for i in range (torch.cuda.device_count ())]!=[]
print("GPU Detected?: "+str(GPU_PRESENT))
if GPU_PRESENT:
    torch.set_default_device('cuda')
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")
    #import cupy as cp


fastqs = glob.glob("http:\\/home/grant/NNRNA/fastqs/*/*.fastq.gz")

PAD_SIZE=1000
BATCH_SIZE=10000

GPU Detected?: True
Using cuda device


In [5]:
fastqs = glob.glob("\\\\wsl.localhost\\Debian\\home\\grant\\NNRNA\\fastqs\\*\\*.fastq.gz")

In [23]:
enc.fit_transform(np.array(["A","T", "C", "G", "N"]).reshape(-1,1))

def get_seqios(file):
    seqs = []
    with gzip.open(file, 'rt') as fastq:
        for index, record in enumerate(SeqIO.parse(fastq, 'fastq')):
            seqs.append(str(record.seq))

    return seqs

def parse_reads(record, pad_size=PAD_SIZE):
    x_in = np.array(list(record))
    arr = enc.fit_transform(x_in.reshape(-1,1)).toarray()
    delta = len(arr)-pad_size

    if delta>0:
        #random crop
        shift=np.random.randint(0,delta)
        x_out = arr[shift:shift+pad_size]

    else:
        arr.resize((pad_size, 4), refcheck=False)
        x_out=arr

    return x_out

                  
class Dataset(torch.utils.data.Dataset):
    def __init__(self, paths):
        self.data = paths

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        path = self.data[idx]
        out = parse_reads(path)
        out = out.reshape(PAD_SIZE, 4)
        out = torch.tensor(out).to(torch.float)
        return out
                  
class AE(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.encoder = torch.nn.Sequential(
            #torch.nn.Linear(PAD_SIZE, 400, 1),
            #torch.nn.ReLU(),
            
            torch.nn.Conv1d(PAD_SIZE, 800, 4),
            torch.nn.ReLU(),
            torch.nn.Conv1d(800, 400, 1),
            torch.nn.ReLU(),
            torch.flatten(),
            torch.nn.Linear(400, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16)
        )
         
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 400),
            torch.nn.ReLU(),
            torch.nn.Linear(400, 800),
            torch.nn.ReLU(),
            torch.nn.Linear(800, (4*PAD_SIZE)),
            torch.nn.Sigmoid()
            
            #torch.nn.Linear(400, PAD_SIZE, 1, 1),
            #torch.nn.Sigmoid()
        )
 
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
                  
def train_encoder(model, dataset, epochs, steps, batch_size, lr=0.1, decay=1e-9):
    num_epochs = epochs
    loss_function = torch.nn.MSELoss()

    optimizer = torch.optim.Adam(model.parameters(),lr = lr ,weight_decay = decay)

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=0,
                                             generator=torch.Generator(device='cuda'))

    losses=[]
    for epoch in range(num_epochs):
        print("epoch"+str(epoch))
        data = next(iter(dataloader))
        for batch_index, doc in enumerate(data):
            recon = model(doc)
            #Loss function
            loss = loss_function(recon, doc)
            if batch_index%10==0:
                print("Batch: "+str(batch_index))
                print("loss"+str(loss))

            # Gradients are set to zero,
            # Gradient is computed and stored.
            # .step() performs parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Storing the losses
            losses.append(loss)


            
            
dataset = Dataset(fastqs)
model = AE()
epochs = 10
steps = 10
batch_size=100
lr = 0.1
decay = 1e-7

train_encoder(model,
             dataset,
             epochs,
             steps,
             batch_size,
              lr,
              decay
             )
                  

TypeError: flatten() received an invalid combination of arguments - got (), but expected one of:
 * (Tensor input, int start_dim, int end_dim, name out_dim)
 * (Tensor input, int start_dim, int end_dim)
 * (Tensor input, name start_dim, name end_dim, name out_dim)
 * (Tensor input, tuple of names dims, name out_dim)


In [63]:
torch.save(model.state_dict(), "./RNA_Autoencoder.state_dict")

In [10]:
torch.nn.Linear

torch.nn.modules.linear.Linear

In [24]:
torch.flatten

<function torch._VariableFunctionsClass.flatten>