In [1]:
colab = True
# In case in Colab activate this cell by passing the corect path to the images
if colab==True: 
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

In [3]:
path_phen = ('/content/drive/MyDrive/projectGenomicsAiMaster/phenotSub.csv')
path_exp= ('/content/drive/MyDrive/projectGenomicsAiMaster/expreSub.csv')

In [4]:
def load_tcga_dataset():
  expr_df = pd.read_csv(path_exp, sep = ",", index_col="sig_id")

  # Load the list of gene names separately
  #gene_list = list(np.load(gene_list_file))
  #expr_df.columns = gene_list

  phenotype_df = pd.read_csv(path_phen, sep = ",", index_col="sig_id")

  return expr_df, phenotype_df

In [5]:
expr_df, phenotype_df = load_tcga_dataset()

In [6]:
print(expr_df.shape,phenotype_df.shape)

(144, 978) (144, 8)


In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [8]:
class Dataset(Dataset):
  """
  Dataset for binary classification Tumor/Normal
  """
  def __init__(self):
    
     # Compute categorical embedding
 
    self.labels = MultiColumnLabelEncoder(columns = ['cell_id']).fit_transform(phenotype_df)

    # Select the required feature

    self.labels = self.labels.loc[:,[ "cell_id"]]

    # Get corresponding gene expression profiles
    self.X = expr_df.loc[self.labels.index]## .index

  def __getitem__(self, index):
    sample = np.array(self.X.iloc[index], dtype=np.float32)
    label = np.array(self.labels.iloc[index], dtype=np.float32)

    ##series1=df.iloc[0,:]


    return sample, label

  def __len__(self):
    return len(self.labels)

In [9]:
dataset= Dataset()

In [10]:
train_set_size = int(len(dataset) * 0.7)
test_set_size = len(dataset) - train_set_size

In [11]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, 
                                                            lengths=[train_set_size, test_set_size], 
                                                            generator=torch.Generator().manual_seed(0))

In [12]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers=2)### 11629/32 = 364
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=2, shuffle=True, num_workers=2)### 4985/32 = 156

In [13]:
num_examples, num_genes = dataset.X.shape
print("Dataset for tumor/normal classification created with", num_examples, 
      "number of samples. Each sample contains the expression levels of", num_genes, "genes.")

Dataset for tumor/normal classification created with 144 number of samples. Each sample contains the expression levels of 978 genes.


In [23]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 500),
            nn.ReLU(),
            nn.Linear(500, 250),
            nn.ReLU(),
            nn.Linear(250, 10),

            )
        self.decoder = nn.Sequential(
            nn.Linear(10, 250),
            nn.ReLU(),
            nn.Linear(250, 500),
            nn.ReLU(),
            nn.Linear(500, input_dim),
            #nn.ReLU()
            )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [24]:
model = Autoencoder(num_genes)

In [25]:
model

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=978, out_features=500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=500, out_features=250, bias=True)
    (3): ReLU()
    (4): Linear(in_features=250, out_features=10, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=10, out_features=250, bias=True)
    (1): ReLU()
    (2): Linear(in_features=250, out_features=500, bias=True)
    (3): ReLU()
    (4): Linear(in_features=500, out_features=978, bias=True)
  )
)

In [41]:
criterion = nn.MSELoss( reduction="sum")

In [36]:
optimizer = optim.Adam(model.parameters(), lr=0.00005)

In [43]:
for epoch in range(5):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
      
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        #print(outputs)
        loss = criterion(outputs[:, 0], inputs[:, 0])
        loss.backward()
        optimizer.step()  # Update the parameters of the model

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:    # print every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))
            running_loss = 0.0

print('Finished Training')

[1,    50] loss: 0.917
[2,    50] loss: 0.643
[3,    50] loss: 0.379
[4,    50] loss: 0.254
[5,    50] loss: 0.144
Finished Training


In [44]:
trained_model_file = F'/content/drive/MyDrive/projectGenomicsAiMaster/trained_model_AutoenV1.pth'
torch.save(model.state_dict(), trained_model_file)