In [19]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pandas as pd
from transformers import AdamW, get_scheduler
from datasets import load_metric

from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from saveAndLoad import *

from torch.utils.data import DataLoader, Subset, Dataset
from sklearn.model_selection import train_test_split


class MLPClassifier(nn.Module):
    def __init__(self, config):
        super(MLPClassifier, self).__init__()

        self.input_dim = config.input_dim 
        self.num_labels = config.n_labels # number of labels for classifier
        self.linear1 = nn.Linear(config.input_dim,256)
        self.linear2 = nn.Linear(256,256)
        self.classifier = nn.Linear(256, config.n_labels) # FC Layer
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        logits = self.classifier(x)
        return logits
    
class LRClassifier(nn.Module):
    def __init__(self, config):
        super(LRClassifier, self).__init__()
        self.input_dim = config.input_dim 
        self.num_labels = config.n_labels # number of labels for classifier
        self.classifier = nn.Linear(config.input_dim,config.n_labels)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(self, x):
        logits = self.classifier(x)
        return logits

In [15]:
from custom_dataset import Dataset_Binary

# LOAD DATA
data_dir = '../labeled_data/'
labeled_data = os.listdir(data_dir)
for ni,i in enumerate(labeled_data):print(ni,i)


data_emb = labeled_data[1]
print('\n',data_emb)
data_df = pd.read_csv(data_dir+data_emb)
data_emb = data_df['idxs'].values

data_bin = labeled_data[0]


labels = torch.tensor(data_df['int_label'].values,dtype=torch.long)


nlabels = len(data_df['int_label'].unique())
device = 'cuda:1'

# Create dataset
dataset = Dataset_Binary(data, labels, device)

# Create DataLoader
# dataloader = DataLoader(dataset, batch_size=100, shuffle=False, collate_fn=custom_collate)

## TEST/TRAIN SPLIT
test_size = .2
random_state = 42
batch_size = 20
indices = list(range(len(dataset)))

train_indices, test_indices = train_test_split(
    indices, 
    test_size=test_size, 
    random_state=random_state
)

train_dataset = Subset(dataset, train_indices)
test_dataset = Subset(dataset, test_indices)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

0 BINARYdata_CANCER_TYPE_3MinMutations_1696MinCancerType.csv
1 data_CANCER_TYPE_DETAILED_3MinMutations_1696MinCancerType.csv
2 BINARYdata_CANCER_TYPE_DETAILED_3MinMutations_1696MinCancerType.csv
3 data_CANCER_TYPE_DETAILED_0MinMutations_1696MinCancerType.csv
4 BINARYdata_CANCER_TYPE_DETAILED_0MinMutations_1696MinCancerType.csv
5 data_CANCER_TYPE_3MinMutations_169MinCancerType.csv
6 data_CANCER_TYPE_0MinMutations_1696MinCancerType.csv
7 BINARYdata_CANCER_TYPE_0MinMutations_169MinCancerType.csv
8 data_CANCER_TYPE_0MinMutations_169MinCancerType.csv
9 BINARYdata_CANCER_TYPE_DETAILED_0MinMutations_169MinCancerType.csv
10 data_CANCER_TYPE_DETAILED_0MinMutations_169MinCancerType.csv
11 BINARYdata_CANCER_TYPE_DETAILED_3MinMutations_169MinCancerType.csv
12 BINARYdata_CANCER_TYPE_3MinMutations_169MinCancerType.csv
13 BINARYdata_CANCER_TYPE_0MinMutations_1696MinCancerType.csv
14 data_CANCER_TYPE_3MinMutations_1696MinCancerType.csv
15 data_CANCER_TYPE_DETAILED_3MinMutations_169MinCancerType.csv

 

In [16]:
dataset[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:1'),
 tensor(13, device='cuda:1'))

In [20]:
import torch.optim as optim
from tqdm import tqdm

class Config:
    input_dim: int = 1448
    bias: bool = False
    n_labels: int = 17

print('n labels:',nlabels)
config = Config()
config.n_labels = nlabels

model = MLPClassifier(config)
model.to(device)

num_epochs = 2
learning_rate = 0.001

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    with tqdm(enumerate(train_loader), total=len(train_loader),desc='TRAINING') as pbar:
        for batch_idx, (data, target) in pbar:
            optimizer.zero_grad()
            output = model(data)
            # assert False
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            pbar.set_postfix({'Epoch':f'{epoch+1}/{num_epochs}, Loss: {loss.item():.4f}'})
            if batch_idx % 20000 == 0:
                print('')

        # Evaluation
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for data, target in tqdm(test_loader,desc='TESTING'):
                output = model(data)
                _, predicted = torch.max(output.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

        accuracy = 100 * correct / total
        print(f'Test Accuracy: {accuracy:.2f}%, ({correct} of {total})')

n labels: 20


TRAINING:   0%|          | 0/4097 [00:00<?, ?it/s]

TRAINING:   1%|          | 24/4097 [00:00<00:17, 238.26it/s, Epoch=1/2, Loss: 1.9484]




TRAINING: 100%|██████████| 4097/4097 [00:08<00:00, 491.76it/s, Epoch=1/2, Loss: 0.2482]
TESTING: 100%|██████████| 1025/1025 [00:00<00:00, 1288.66it/s]


Test Accuracy: 54.18%, (11096 of 20481)


TRAINING:   1%|          | 47/4097 [00:00<00:08, 469.84it/s, Epoch=2/2, Loss: 1.0022]




TRAINING: 100%|██████████| 4097/4097 [00:08<00:00, 509.46it/s, Epoch=2/2, Loss: 3.3877]
TESTING: 100%|██████████| 1025/1025 [00:00<00:00, 1291.90it/s]

Test Accuracy: 53.53%, (10963 of 20481)





In [18]:
import torch.optim as optim
from tqdm import tqdm

class Config:
    input_dim: int = 1448
    n_labels: int = 17

print('n labels:',nlabels)
config = Config()
config.n_labels = nlabels

model = LRClassifier(config)
model.to(device)

num_epochs = 2
learning_rate = 0.001

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    with tqdm(enumerate(train_loader), total=len(train_loader),desc='TRAINING') as pbar:
        for batch_idx, (data, target) in pbar:
            optimizer.zero_grad()
            output = model(data)
            # assert False
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            pbar.set_postfix({'Epoch':f'{epoch+1}/{num_epochs}, Loss: {loss.item():.4f}'})
            if batch_idx % 20000 == 0:
                print('')

        # Evaluation
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for data, target in tqdm(test_loader,desc='TESTING'):
                output = model(data)
                _, predicted = torch.max(output.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

        accuracy = 100 * correct / total
        print(f'Test Accuracy: {accuracy:.2f}%, ({correct} of {total})')

n labels: 20


TRAINING:   1%|▏         | 54/4097 [00:00<00:07, 533.14it/s, Epoch=1/2, Loss: 2.8442]




TRAINING: 100%|██████████| 4097/4097 [00:07<00:00, 574.63it/s, Epoch=1/2, Loss: 0.7962]
TESTING: 100%|██████████| 1025/1025 [00:00<00:00, 1364.04it/s]


Test Accuracy: 53.29%, (10914 of 20481)


TRAINING:   1%|▏         | 58/4097 [00:00<00:07, 576.63it/s, Epoch=2/2, Loss: 1.5100]




TRAINING: 100%|██████████| 4097/4097 [00:07<00:00, 578.03it/s, Epoch=2/2, Loss: 0.1916]
TESTING: 100%|██████████| 1025/1025 [00:00<00:00, 1347.74it/s]

Test Accuracy: 54.88%, (11240 of 20481)



