<a href="https://colab.research.google.com/github/chandan9t8/UnivProjects/blob/main/Sequence%20Classification%20using%20CNN/CNN_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import torch.nn as nn

In [None]:
# Load the training sequences and their labels from CSV files in the working directory.
features = pd.read_csv('./train_features.csv')
labels = pd.read_csv('./train_labels.csv')

In [None]:
# Attach each sequence to its label through the shared `ids` column (inner join),
# keeping only the columns that are used downstream.
data = features[['ids', 'data']].merge(labels[['ids', 'labels']], how='inner', on='ids')
data.head(5)

Unnamed: 0,ids,data,labels
0,0,-------------------------------------------GCT...,663
1,1,TACATTATATTTTATTCTTGGATTATGATCAGGAATAGTAGGAACT...,862
2,2,AATATTATATTTTATCTTTGCTATATGATCTGGAATAATTGGATCC...,650
3,3,TACCTTATATTTTATTTTTGGAATTTGATCAGGTATAGTGGGAACT...,300
4,4,AACATTATACTTCATTTTTGGAATATGATCCGGTATAATTGGTACT...,785


In [None]:
#perform one-hot encoding

# Five channels, in order: A, C, G, T, gap ('-').
one_hot_encoding = {
    'A': [1, 0, 0, 0, 0],
    'C': [0, 1, 0, 0, 0],
    'G': [0, 0, 1, 0, 0],
    'T': [0, 0, 0, 1, 0],
    '-': [0, 0, 0, 0, 1],
}

def encode_sequence(seq):
    """One-hot encode a nucleotide string.

    Parameters
    ----------
    seq : iterable of str
        Sequence symbols. Any symbol that is not a key of ``one_hot_encoding``
        (for example ``'N'``) is encoded the same way as the gap symbol ``'-'``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(seq), 5)``. An empty sequence yields an
        array of shape ``(0, 5)`` so that it can still be stacked/padded.
    """
    fallback = one_hot_encoding['-']
    rows = [one_hot_encoding.get(nuc, fallback) for nuc in seq]
    # Explicit dtype and shape: without them an empty input would come back as a
    # 1-D float array of shape (0,), which cannot be vstacked with padding rows.
    return np.array(rows, dtype=int).reshape(len(rows), 5)

In [None]:
# One-hot encode every raw sequence string; each cell holds a 2-D array.
data['encoded_data'] = data['data'].apply(encode_sequence)

In [None]:
#check if the DNA sequences are of the same length

# Per-row string length; a single distinct value means every sequence is equally long.
lengths = data['data'].map(len)
are_lengths_same = (len(lengths.unique()) == 1)

print(are_lengths_same)

False


In [None]:
# Keep the raw (unpadded) length of each sequence as its own column.
data['sequence_length'] = data['data'].map(len)

In [None]:
# Preview the first 15 rows, now including the encoded arrays and the per-row sequence length.
data.head(15)

Unnamed: 0,ids,data,labels,encoded_data,sequence_length
0,0,-------------------------------------------GCT...,663,"[[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0...",658
1,1,TACATTATATTTTATTCTTGGATTATGATCAGGAATAGTAGGAACT...,862,"[[0, 0, 0, 1, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0...",658
2,2,AATATTATATTTTATCTTTGCTATATGATCTGGAATAATTGGATCC...,650,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 0, 0, 1...",658
3,3,TACCTTATATTTTATTTTTGGAATTTGATCAGGTATAGTGGGAACT...,300,"[[0, 0, 0, 1, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0...",658
4,4,AACATTATACTTCATTTTTGGAATATGATCCGGTATAATTGGTACT...,785,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0...",658
5,5,GGAGCATGATCTGGAATAATTGGAACTTCTCTAAGAATTTTAATTC...,723,"[[0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [1, 0, 0, 0...",639
6,6,AACTTTATACTTTATTTTTGGAGCTTGAGCTGGAATAGTAGGTACT...,1059,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0...",658
7,7,AACATTATATTTTATTTTCGGAATTTGATCAGGAATAGTAGGAACA...,476,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0...",658
8,8,AACTTTATATTTTATTTTTGGAGCTTGATCTAGAATAGTGGGAACT...,378,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0...",658
9,9,AACTTTATATTTCCTATTTGGTAGATGAGCAGGAATAGTGGGAACT...,1194,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0...",658


In [None]:
#max length of the sequence
# NOTE: the value displayed here (1058) is the padding target that pad_sequence_to_max
# uses as its default `target_length`.

data['sequence_length'].max()

1058

In [None]:
#padding all sequences to max length

def pad_sequence_to_max(sequence, target_length=1058, pad_value=(0, 0, 0, 0, 0)):
    """Right-pad an encoded sequence with ``pad_value`` rows up to ``target_length``.

    Parameters
    ----------
    sequence : numpy.ndarray
        Encoded sequence of shape ``(length, channels)``. An empty array
        (including the 1-D ``shape == (0,)`` form) is treated as zero rows.
    target_length : int, default 1058
        Number of rows in the returned array.
    pad_value : sequence of numbers, default all zeros
        Row appended for every missing position; its length sets the channel
        count used when ``sequence`` is empty.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(target_length, channels)``: the original rows followed
        by the padding rows.

    Raises
    ------
    ValueError
        If ``sequence`` has more than ``target_length`` rows.
    """
    pad_row = np.asarray(pad_value)
    sequence = np.asarray(sequence)

    # An empty encoding may arrive as a 1-D float array; give it the row width
    # and dtype of the padding so that vstack accepts it.
    if sequence.size == 0:
        sequence = sequence.reshape(0, pad_row.shape[0]).astype(pad_row.dtype)

    current_length = sequence.shape[0]
    padding_needed = target_length - current_length
    if padding_needed < 0:
        # Previously surfaced as numpy's opaque "negative dimensions" ValueError.
        raise ValueError(
            f"sequence length {current_length} exceeds target_length {target_length}"
        )

    padding_array = np.tile(pad_row, (padding_needed, 1))
    return np.vstack([sequence, padding_array])

In [None]:
# Pad every encoded sequence to the common length (pad_sequence_to_max's default target).
data['padded_encoded_data'] = data['encoded_data'].apply(pad_sequence_to_max)

In [None]:
#reverify if all DNA lengths are equal

# Row count of each padded array; one distinct value means padding succeeded.
lengths = data['padded_encoded_data'].map(len)
are_lengths_same = (len(lengths.unique()) == 1)

print(are_lengths_same)

True


In [None]:
#adding an extra dimension to fit the CNN
# Each padded array becomes a tensor with a new leading axis of size 1.
data['padded_encoded_data_with_channel'] = data['padded_encoded_data'].apply(
    lambda arr: torch.unsqueeze(torch.tensor(arr), dim=0)
)

In [None]:
#no of unique labels
# The value displayed here (1213) is the class count later passed to CNN1D.

data['labels'].nunique()

1213

In [None]:
# Stack the per-row tensors into one batch, drop the singleton axis added in the
# previous step, then swap the last two axes so the 5 one-hot channels come
# before the sequence positions (the layout Conv1d consumes).
data_tensor = (
    torch.stack(data['padded_encoded_data_with_channel'].tolist())
    .squeeze(dim=1)
    .transpose(1, 2)
    .long()
)

# making labels 0-indexed as crossentropy() works with 0-indexed classes
labels_tensor = torch.tensor(data['labels'].values).long() - 1

# Create a dataset and dataloader
dataset = TensorDataset(data_tensor, labels_tensor)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

In [None]:
# Hold out 20% of the samples for validation; random_state pins the split so it is reproducible.
train_data, val_data, train_labels, val_labels = train_test_split(data_tensor, labels_tensor, test_size=0.2, random_state=42)

# Create TensorDatasets for training and validation sets
train_dataset = TensorDataset(train_data, train_labels)
val_dataset = TensorDataset(val_data, val_labels)

# Create DataLoaders for training and validation sets
# (only the training loader shuffles; validation order stays fixed)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
#model

class CNN1D(nn.Module):
    """Two-layer 1-D CNN classifier over one-hot encoded sequences.

    Architecture: Conv1d(5 -> 32) -> ReLU -> MaxPool(2) -> Conv1d(32 -> 64) -> ReLU
    -> MaxPool(2) -> flatten -> Linear(128) -> ReLU -> Linear(num_classes).

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    seq_len : int, default 1058
        Length of the input sequences. It determines the size of the flattened
        feature vector fed to the first fully connected layer; the default
        reproduces the original hard-coded ``64 * 264``.

    Raises
    ------
    ValueError
        If ``seq_len`` is too short to survive the two pooling steps.
    """

    def __init__(self, num_classes, seq_len=1058):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=5, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(2)

        # Both convolutions keep the length (kernel 3, stride 1, padding 1); each
        # pooling step floor-halves it, e.g. 1058 -> 529 -> 264.
        pooled_len = (seq_len // 2) // 2
        if pooled_len < 1:
            raise ValueError(f"seq_len={seq_len} is too short for two pooling layers")

        # fully connected layer
        self.fc1 = nn.Linear(64 * pooled_len, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a float tensor of shape (batch, 5, seq_len) to logits of shape (batch, num_classes)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # keep the batch axis, flatten channels x positions
        x = F.relu(self.fc1(x))
        x = self.fc2(x)          # raw logits; no softmax here
        return x

In [None]:
# One output logit per class; 1213 is the number of unique labels in the training data.
model = CNN1D(num_classes=1213)

# The model's forward pass returns the final linear layer's output unchanged, and the
# targets are the 0-indexed integer labels.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [None]:
num_epochs = 50

# Train for a fixed number of epochs, reporting the mean per-batch loss on the
# training and validation loaders after each one.
for epoch in range(num_epochs):

    #training step
    model.train()
    train_loss = 0.0
    # The per-batch label tensor is called `targets` (not `labels`) so that this
    # loop does not overwrite the `labels` DataFrame loaded at the top of the
    # notebook, which would break re-running the earlier merge cell.
    for inputs, targets in train_loader:
        inputs = inputs.float()  # inputs are stored as int64; the conv layers need float
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    #Validation step
    model.eval()
    val_loss = 0.0
    with torch.no_grad():  # no gradients needed while evaluating
        for inputs, targets in val_loader:
            inputs = inputs.float()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()

    print(f'Epoch {epoch+1}, Train Loss: {train_loss / len(train_loader)}, Val Loss: {val_loss / len(val_loader)}')


Epoch 1, Train Loss: 7.0269373759101414, Val Loss: 6.889943492746799
Epoch 2, Train Loss: 6.584162736780503, Val Loss: 6.348572022446962
Epoch 3, Train Loss: 5.797284573947682, Val Loss: 5.4317348114798
Epoch 4, Train Loss: 4.493812922870411, Val Loss: 4.025169847167541
Epoch 5, Train Loss: 2.868033738977769, Val Loss: 2.6534746517644865
Epoch 6, Train Loss: 1.7624635564579683, Val Loss: 1.8199627232328754
Epoch 7, Train Loss: 1.1614666982258068, Val Loss: 1.3672675267558232
Epoch 8, Train Loss: 0.8266467222045449, Val Loss: 1.0655569219700645
Epoch 9, Train Loss: 0.6181210953347823, Val Loss: 0.9159337751497733
Epoch 10, Train Loss: 0.4843182124285137, Val Loss: 0.7767045513213238
Epoch 11, Train Loss: 0.38669941174633365, Val Loss: 0.6927062959593033
Epoch 12, Train Loss: 0.32438663277117646, Val Loss: 0.619993849648772
Epoch 13, Train Loss: 0.26604408322011724, Val Loss: 0.5495841948785515
Epoch 14, Train Loss: 0.22839366389548077, Val Loss: 0.5121436112812746
Epoch 15, Train Loss: 

In [None]:
# Load the unlabeled test sequences from the working directory and preview them.
test_features = pd.read_csv('./test_features.csv')
test_features.head()

Unnamed: 0,ids,data
0,0,--------------------GATTTTGATCTGGAATNCTAGGATTN...
1,1,AACTTTATATTTTATCTTCGGATTTTGATCAGGTATACTAGGATTA...
2,2,AACACTATATTTTATTTTTGGAATTTGAGCAGGAATAATAGGATTA...
3,3,AATTCTATATTTTATCTTTGCTATTTGATCAGGAATAATTGGATCC...
4,4,AATACTTTATTTTATTTTTGCTATATGGGCAGGAATATTAGGATCT...


In [None]:
# Number of test sequences to predict.
len(test_features)

4243

In [None]:
# Encode the test sequences with the same one-hot scheme used for training.
test_features['encoded_data'] = test_features['data'].apply(encode_sequence)

In [None]:
# Preview the test frame with the new encoded_data column.
test_features.head()

Unnamed: 0,ids,data,encoded_data
0,0,--------------------GATTTTGATCTGGAATNCTAGGATTN...,"[[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0..."
1,1,AACTTTATATTTTATCTTCGGATTTTGATCAGGTATACTAGGATTA...,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0..."
2,2,AACACTATATTTTATTTTTGGAATTTGAGCAGGAATAATAGGATTA...,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0..."
3,3,AATTCTATATTTTATCTTTGCTATTTGATCAGGAATAATTGGATCC...,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 0, 0, 1..."
4,4,AATACTTTATTTTATTTTTGCTATATGGGCAGGAATATTAGGATCT...,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 0, 0, 1..."


In [None]:
# Sanity check: the row count is unchanged after adding the encoded column.
len(test_features)

4243

In [None]:
# Longest encoded test sequence; the displayed value (862) fits within the 1058 padding target.
test_features['encoded_data'].apply(len).max()

862

In [None]:
# Pad test sequences to 1058 rows, the same length the training data was padded to.
test_features['padded_encoded_data'] = test_features['encoded_data'].apply(
    lambda seq: pad_sequence_to_max(seq, target_length=1058)
)

In [None]:
# Confirm that every padded test array has the same number of rows.
lengths = test_features['padded_encoded_data'].map(len)
are_lengths_same = (len(lengths.unique()) == 1)

print(are_lengths_same)

True


In [None]:
#adding an extra dimension to fit the CNN
# Each padded array becomes a tensor with a new leading axis of size 1.
test_features['padded_encoded_data_with_channel'] = test_features['padded_encoded_data'].apply(
    lambda arr: torch.unsqueeze(torch.tensor(arr), dim=0)
)

In [None]:
# Same layout as the training tensor: stack rows, drop the singleton axis, and
# put the 5 one-hot channels ahead of the sequence positions.
test_data_tensor = (
    torch.stack(test_features['padded_encoded_data_with_channel'].tolist())
    .squeeze(dim=1)
    .transpose(1, 2)
    .long()
)

In [None]:
# The dataset wraps only the input tensor (no labels), so each batch from the
# loader is a one-element sequence holding the inputs.
test_dataset = TensorDataset(test_data_tensor)
# shuffle=False keeps predictions in the same order as the rows of test_features.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
model.eval()  # inference mode for the trained network
predictions = []
with torch.no_grad():
    # Each batch is a one-element sequence because the dataset has inputs only.
    for (inputs,) in test_loader:
        inputs = inputs.float()
        outputs = model(inputs)
        # Index of the largest logit along the class axis = predicted 0-indexed class.
        predicted = torch.max(outputs, dim=1).indices
        predictions.extend(predicted.tolist())

#adjusting for the one off predictions
# (training labels were shifted down by one, so shift predictions back up)
predictions = [class_idx + 1 for class_idx in predictions]

In [None]:
# Build the submission table. ids are positional (0..n-1), one per prediction;
# the count comes from `predictions` itself rather than a hard-coded test-set
# size, so the cell keeps working when the number of test rows changes.
df = pd.DataFrame({'ids': list(range(len(predictions))), 'labels': predictions})

# To save the DataFrame to a text file
# (space-separated, with both the header row and the index column written out)
df.to_csv('predictions.txt', sep=' ', index=True, header=True)

In [None]:
# google.colab is only importable inside a Colab runtime. Outside Colab the
# predictions file is already on disk in the working directory, so report that
# instead of failing the final cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'predictions.txt' was left in the working directory.")
else:
    files.download('predictions.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>