In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [2]:
def encode_dna(sequence):
    """Encodes DNA sequences to one-hot integer representation."""
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    return np.array([mapping[base] for base in sequence])

In [70]:
# Example DNA data
sequences = [
    "ATCGTACG",
    "CGTACGTA",
    "TACGTACG",
    "GCGTACGT",
    "ATCGATCG"
]
labels = [0, 1, 0, 1, 0] 

In [71]:
type(sequences)

list

In [68]:
# 2. Encode the sequences
encoded_sequences = np.array([encode_dna(seq) for seq in sequences])

In [69]:
encoded_sequences

array([[0, 3, 1, 2, 3, 0, 1, 2],
       [1, 2, 3, 0, 1, 2, 3, 0],
       [3, 0, 1, 2, 3, 0, 1, 2],
       [2, 1, 2, 3, 0, 1, 2, 3],
       [0, 3, 1, 2, 0, 3, 1, 2]])

In [6]:
# 3. Padding sequences to make them uniform
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = max(len(seq) for seq in encoded_sequences)
encoded_sequences = pad_sequences(encoded_sequences, maxlen=max_length, padding='post')

In [7]:
encoded_sequences

array([[0, 3, 1, 2, 3, 0, 1, 2],
       [1, 2, 3, 0, 1, 2, 3, 0],
       [3, 0, 1, 2, 3, 0, 1, 2],
       [2, 1, 2, 3, 0, 1, 2, 3],
       [0, 3, 1, 2, 0, 3, 1, 2]], dtype=int32)

In [15]:
# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, labels, test_size=0.2, random_state=42)

In [16]:
y_train

[0, 0, 0, 1]

In [17]:
# Convert labels to categorical (one-hot encoding for multiclass)
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [18]:
y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [19]:
# 5. Define the CNN model
model = Sequential([
    Embedding(input_dim=4, output_dim=8, input_length=max_length),  # 4 nucleotides -> 8-d embedding
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')  # 2 classes (binary classification)
])

2025-01-31 22:37:01.778462: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
# 6. Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
# 7. Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=4, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# 8. Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Test Loss: 0.7827053666114807
Test Accuracy: 0.0


In [23]:
import re
def readFastaFile(Fasta): ## Working fine
    handle=open(Fasta,'r')
    lines=[]
    for line in handle:
        if('>' not in line):
            lineRead = re.split("\n",line)
            lines.append(lineRead[0])
            
    handle.close()
    
    return lines

In [133]:
tcdd_ahr=readFastaFile('TCDD_vs_DMSO_AHR.fasta')
dim_ahr=readFastaFile('DIM_vs_DMSO_AHR.fasta')
dim_era=readFastaFile('DIM_vs_DMSO_ERa.fasta')
e2_era=readFastaFile('E2_vs_DMSO_ERa.fasta')
res_era=readFastaFile('RES_vs_DMSO_ERa.fasta')

In [134]:
len(tcdd_ahr) ## 7879
len(dim_ahr) ## 42719

15054

In [135]:
tcdd_ahr = tcdd_ahr[0:5000]
dim_ahr = dim_ahr[0:5000]

In [136]:
len(tcdd_ahr) ## 5000
len(dim_ahr) ## 5000

5000

In [137]:
tcdd_ahr

['CCAGAAATAGCTGGCCAGAGGCCAGCAGGAGGGAAACACCAACCCGAGGAAAGAGAGACGGGGATTGGGAGAGAAATTCAGAAGAGACTGAGGcacgcacacagacagacacacccacccacacacaGATACGGATTCAAAGAGACATGCACACTCTGAGTTTCTGAGAGTAAGCCACTGTCAGTTCCTGGGGTGAGCCACCAGCCACATGGACACAATTTCC',
 'TGGCCAGAGGCCAGCAGGAGGGAAACACCGACCCGAGGAAAGAGAGACGGGGATTGGGAGAGAAATTCAGAAAAGATTGAGGcacgcacacagacagacacacccacccacacacagatacggattcaaagagacacgcacacTCTGAGTTTCTGAGAGTAAGCCACTGTCAGTTCCTGGGGTGAGCCACCAGCCACATGGACACAATTTCCTCTTTTTGG',
 'GCTTCCTGGTGCACATGAGTCAGAGCGATCCTGGGGGCTTCCCGCTGTCAGCGCTGACGCAAGGGTGGGACGAGGCGCAGGGCTCAGGCCGCCAGCCACAGGTCACGCACATCATGACTCACTCCCAGGTGGGAGGGGGCGGCGAGCTGGGGCGGCCTCTGGGAAGGGCGGGCGGCCTTGGC',
 'TGCCCCGCCCCACCCCTGACTTGCCAGTGAGTCCCAGACAGGCTGGCGGGATGACACAGGTCACTGTGACCACCTGAGTCACACGCCGTCACTGTGAGGCCGTGAGTGCCCCAGGCACCGGGACCTGGGGACTGTGCTCTGCGGCCTGTGTACCCCACAGAACCGGTTCCTTGGCACGAGGCCCCACCCCTCCACG',
 'TCAGAGGGTGGGCTGGGAGGTGCCTGTCTGTGTCTGGCCTGGCCTCCAGACCCTGCCTGGCTGGACCTGCTGTTCGTGCCTGTTTCCGATTTTATCCTCCAAACCAGACGCCCAGCCTGGTGCAAATGCAGGAGTGGAGTTTCCAGGGGGATGTGG

In [138]:
ahr_tcdd_label = [0] * 5000
ahr_dim_label = [1] * 5000

In [139]:
tcdd_ahr.extend(dim_ahr)

In [140]:
len(ahr_tcdd_label) ## 10000

5000

In [141]:
ahr_tcdd_label.extend(ahr_dim_label)

In [142]:
## sequence list is tcdd_ahr
## sequence label is ahr_label
ahr_tcdd_label[1:100]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [143]:
def encode_dna(sequence):
    """Encodes DNA sequences to one-hot integer representation."""
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3,'a': 0, 'c': 1, 'g': 2, 't': 3,'N': 4}
    return np.array([mapping[base] for base in sequence])

In [144]:
len(tcdd_ahr)

10000

In [145]:
# 2. Encode the sequences
encoded_sequences = np.array([encode_dna(seq) for seq in tcdd_ahr])

  encoded_sequences = np.array([encode_dna(seq) for seq in tcdd_ahr])


In [146]:
encoded_sequences[1:10]

array([array([3, 2, 2, 1, 1, 0, 2, 0, 2, 2, 1, 1, 0, 2, 1, 0, 2, 2, 0, 2, 2, 2,
              0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, 2, 0, 0, 0, 2, 0, 2,
              0, 2, 0, 1, 2, 2, 2, 2, 0, 3, 3, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 3,
              3, 1, 0, 2, 0, 0, 0, 0, 2, 0, 3, 3, 2, 0, 2, 2, 1, 0, 1, 2, 1, 0,
              1, 0, 1, 0, 2, 0, 1, 0, 2, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
              1, 0, 1, 0, 1, 0, 2, 0, 3, 0, 1, 2, 2, 0, 3, 3, 1, 0, 0, 0, 2, 0,
              2, 0, 1, 0, 1, 2, 1, 0, 1, 0, 1, 3, 1, 3, 2, 0, 2, 3, 3, 3, 1, 3,
              2, 0, 2, 0, 2, 3, 0, 0, 2, 1, 1, 0, 1, 3, 2, 3, 1, 0, 2, 3, 3, 1,
              1, 3, 2, 2, 2, 2, 3, 2, 0, 2, 1, 1, 0, 1, 1, 0, 2, 1, 1, 0, 1, 0,
              3, 2, 2, 0, 1, 0, 1, 0, 0, 3, 3, 3, 1, 1, 3, 1, 3, 3, 3, 3, 3, 2,
              2])                                                              ,
       array([2, 1, 3, 3, 1, 1, 3, 2, 2, 3, 2, 1, 0, 1, 0, 3, 2, 0, 2, 3, 1, 0,
              2, 0, 2, 1, 2, 0, 3, 1, 1

In [147]:
# 3. Padding sequences to make them uniform
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = max(len(seq) for seq in encoded_sequences)
encoded_sequences = pad_sequences(encoded_sequences, maxlen=max_length, padding='post')

In [148]:
encoded_sequences[1:100]

array([[3, 2, 2, ..., 0, 0, 0],
       [2, 1, 3, ..., 0, 0, 0],
       [3, 2, 1, ..., 0, 0, 0],
       ...,
       [1, 3, 3, ..., 0, 0, 0],
       [2, 2, 2, ..., 0, 0, 0],
       [1, 3, 2, ..., 0, 0, 0]], dtype=int32)

In [150]:
len(ahr_tcdd_label)

10000

In [151]:
len(encoded_sequences)

10000

In [153]:
# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, ahr_tcdd_label, test_size=0.2, random_state=42)

In [154]:
X_train

array([[3, 3, 1, ..., 0, 0, 0],
       [2, 3, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 2, ..., 0, 0, 0],
       [0, 1, 3, ..., 0, 0, 0],
       [2, 3, 1, ..., 0, 0, 0]], dtype=int32)

In [155]:
# Convert labels to categorical (one-hot encoding for multiclass)
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [156]:
y_test

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [177]:
# 5. Define the CNN model
model = Sequential([
    Embedding(input_dim=5, output_dim=10, input_length=max_length),  # 4 nucleotides -> 8-d embedding
    Conv1D(filters=128, kernel_size=3, activation='sigmoid'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')  # 2 classes (binary classification)
])

In [130]:
? Sequential

In [178]:
# 6. Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [175]:
len(y_train)

8000

In [176]:
len(X_train)

8000

In [169]:
X_train

array([[3, 3, 1, ..., 0, 0, 0],
       [2, 3, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 2, ..., 0, 0, 0],
       [0, 1, 3, ..., 0, 0, 0],
       [2, 3, 1, ..., 0, 0, 0]], dtype=int32)

In [179]:
# 7. Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=16, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [171]:
len(X_train) ## 8000
len(y_train)

8000

In [119]:
? model.fit

In [180]:
# 8. Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Test Loss: 0.6933157444000244
Test Accuracy: 0.49399998784065247


In [None]:
## Filters was 32 initiually
## Training for 10 epochs with batch size of 4 and validation split of 0.2 gives test accu of 64%
## Training for 10 epochs with batch size of 16 and validation split of 0.2 gives test accuracy of 63.74%
## Training for 30 epochs with batch size of 4 and validation split of 0.2 gives test accuracy of 62.95%

## Now trying to increase the filters to 64
## Training for 30 epochs with batch size of 4 and validation split of 0.2 gives test accuracy of 63.95%
## Training for 30 epochs with batch size of 16 and validation split of 0.2 gives test accuracy of 63.4%


In [117]:
X_test

array([[3, 3, 2, ..., 0, 0, 0],
       [3, 2, 3, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 2, ..., 0, 0, 0],
       [2, 2, 3, ..., 0, 0, 0],
       [2, 2, 3, ..., 0, 0, 0]], dtype=int32)

In [118]:
y_test

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)