# 1. Read and parse the training data

In [1]:
import pandas as pd

# Tab-separated training table (pdbtm.dat) from the dgront/chem-ml GitHub repository
url = "https://raw.githubusercontent.com/dgront/chem-ml/refs/heads/main/INPUTS/membrane_proteins/pdbtm.dat"

try:
  df = pd.read_csv(url, sep='\t')
except Exception as e:
  # Report the problem, then re-raise: every later cell needs `df`, so
  # carrying on would only fail further down with a confusing NameError.
  print(f"An error occurred: {e}")
  raise
print(df.head())  # Display the first few rows of the DataFrame


  pdb_id chain_code                                           sequence  \
0   1a0b          _  TTEENSKSEALLDIPMLEQYLELVGPKLITDGLAVFEKMMPGYVSV...   
1   2a00          A  MGHHHHHHHHHHSSGHGGRHNRQASEFIPAQGVDEKTLADAAQLAS...   
2   7a0w          A  MKHLHRFFSSDASGGIILIIAAALAMLMANMGATSGWYHDFLETPV...   
3   7a0w          B  MKHLHRFFSSDASGGIILIIAAALAMLMANMGATSGWYHDFLETPV...   
4   2a06          A  TATYAQALQSVPETQVSQLDNGLRVASEQSSQPTCTVGVWIDAGSR...   

                                        region_types  
0  UUUUUU1111111111111111111111111111111111111111...  
1  UUUUUUUUUUUUUUUUUUUU11111111111111111111111111...  
2  UUUUUUUUUUU111HHHHHHHHHHHHHH222222222222222222...  
3  UUUUUUUUUU1111HHHHHHHHHHHHHH222222222222222222...  
4  2222222222222222222222222222222222222222222222...  


In [2]:
# Pull the two string columns out of the frame as plain Python lists
# (one entry per row, in row order).
sequences = df.loc[:, 'sequence'].to_list()
region_types = df.loc[:, 'region_types'].to_list()
print(len(sequences), len(region_types))

65705 65705


# 2. One-hot encode both inputs and outputs

In [3]:
import numpy as np

def one_hot_encode(char_list, input_string):
    """
    Turn a string into a one-hot matrix over a given alphabet.

    Parameters:
        char_list (list or str): The alphabet; position in this sequence
            determines the column assigned to each character.
        input_string (str): The text to encode, one row per character.

    Returns:
        np.ndarray: Integer array of shape (len(input_string), len(char_list))
            with a single 1 in every row.

    Raises:
        ValueError: If input_string contains a character absent from char_list.
    """
    column_of = {symbol: position for position, symbol in enumerate(char_list)}

    # Resolve every character to its column first, failing on the first unknown one
    columns = []
    for symbol in input_string:
        column = column_of.get(symbol)
        if column is None:
            raise ValueError(f"Character '{symbol}' not found in the provided character list.")
        columns.append(column)

    encoded = np.zeros((len(input_string), len(char_list)), dtype=int)
    # One vectorised assignment: row k gets a 1 in column columns[k]
    encoded[np.arange(len(columns)), np.asarray(columns, dtype=np.intp)] = 1
    return encoded

In [4]:
# Alphabet used to one-hot encode the amino-acid sequences (20 one-letter codes)
aa_letters = list("ACDEFGHIKLMNPQRSTVWY")
# Alphabet used to one-hot encode the region_types strings (9 symbols)
tm_letters = ['1', '2', 'H', 'U', 'L', '.', 'F', 'I', 'B']

In [5]:
# Encode the region-type string of the first record with the label alphabet
r0 = one_hot_encode(tm_letters, region_types[0])

Solution with padded tensors

In [8]:
%%script false --no-raise-error
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The %%script magic above hands this cell body to the `false` program,
# so the Python below is not executed by the kernel.
N = len(region_types)
N = 1000  # overrides the full length with a fixed subset
labels = [one_hot_encode(tm_letters, region_types[k]) for k in range(N)]

# Pad after the end of each label matrix and wrap the result as a constant tensor
padded_labels = tf.constant(pad_sequences(labels, padding='post', dtype='float32'))
padded_labels.shape


TensorShape([1000, 1596, 9])

Solution with separate tensors

In [9]:
#N = len(region_types)
N = 10000
labels = []
features = []
skipped = []  # indices of records that could not be encoded
for i in range(N):
  try:
    li = one_hot_encode(tm_letters, region_types[i])
    fi = one_hot_encode(aa_letters, sequences[i])
  except (ValueError, TypeError):
    # ValueError: one_hot_encode met a symbol outside its alphabet.
    # TypeError: the entry is not a string (e.g. a missing value).
    # Anything else (including KeyboardInterrupt) is allowed to propagate.
    skipped.append(i)
    continue
  # Append only once BOTH encodings succeeded, so the two lists stay aligned
  labels.append(li)
  features.append(fi)

print(f"Encoded {len(features)} of {N} records; skipped {len(skipped)}")


In [10]:
# Number of encoded examples and the length of the first one, for features and then labels
print(len(features),len(features[0]),len(labels),len(labels[0]))

9628 125 9628 125


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the examples for validation; the fixed random_state makes the split repeatable
features_train, features_val, labels_train, labels_val = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)
print([len(part) for part in (features_train, features_val, labels_train, labels_val)])

[6739, 2889, 6739, 2889]


In [12]:

# TensorFlow is otherwise imported only in the cell guarded by `%%script false`,
# so import it here to keep a fresh Restart & Run All working.
import tensorflow as tf

def create_dataset(features_list, labels_list):
    """
    Wrap paired variable-length examples in a tf.data.Dataset.

    Parameters:
        features_list (list): Per-example arrays of shape (length, 20).
        labels_list (list): Per-example arrays of shape (length, 9), in the
            same order as features_list.

    Returns:
        tf.data.Dataset: Yields (features, labels) float32 pairs in batches
            of one, so examples of different lengths need no padding.

    Raises:
        ValueError: If the two lists differ in length (zip would otherwise
            silently drop the unmatched tail).
    """
    if len(features_list) != len(labels_list):
        raise ValueError(
            f"features_list and labels_list differ in length: "
            f"{len(features_list)} vs {len(labels_list)}"
        )
    return tf.data.Dataset.from_generator(
        # A callable is required so the generator can be restarted every epoch
        lambda: zip(features_list, labels_list),
        output_signature=(
            tf.TensorSpec(shape=(None, 20), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 9), dtype=tf.float32)
        )
    ).batch(1)

training_dataset = create_dataset(features_train, labels_train)
validation_dataset = create_dataset(features_val, labels_val)

In [13]:
# prompt: describe shape of training_dataset and validation_dataset

import numpy as np

# Show the feature/label shapes of the first batch of each dataset.
# The validation heading carries a leading newline to separate the two reports.
for heading, dataset in (
    ("Training Dataset:", training_dataset),
    ("\nValidation Dataset:", validation_dataset),
):
    for ft, lb in dataset.take(1):  # a single batch is enough to see the layout
        print(heading)
        print("Features shape:", np.array(ft).shape)
        print("Labels shape:", np.array(lb).shape)


Training Dataset:
Features shape: (1, 164, 20)
Labels shape: (1, 164, 9)

Validation Dataset:
Features shape: (1, 32, 20)
Labels shape: (1, 32, 9)


The CNN network

In [14]:
# Per-position classifier: the input is a sequence of 20-channel vectors
# (one channel per entry of aa_letters) of unspecified length.
input_layer = tf.keras.Input(shape=(None, 20))  # variable-length sequences

# Two 32-filter convolutions (windows of 11 and 7 positions); padding='same'
# is used so that the output has one vector per input position.
x = tf.keras.layers.Conv1D(32, kernel_size=11, padding='same', activation='relu')(input_layer)
x = tf.keras.layers.Conv1D(32, kernel_size=7, padding='same', activation='relu')(x)

# Kernel-size-1 convolution: a 9-way softmax at every position
# (one class per entry of tm_letters).
output = tf.keras.layers.Conv1D(9, kernel_size=1, activation='softmax')(x)

model = tf.keras.Model(inputs=input_layer, outputs=output)

# Labels are one-hot encoded, hence the categorical loss and accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # use sparse_categorical_crossentropy if labels are integers
    metrics=['categorical_accuracy']
)

In [15]:
# Train for 10 epochs with validation; the object returned by fit() is kept in `hist`.
# NOTE(review): the saved output below reads "Epoch 1/100" and ends in a
# KeyboardInterrupt, so it appears to come from an earlier run with a different
# `epochs` value - re-run this cell to refresh it.
hist = model.fit(training_dataset, validation_data=validation_dataset, epochs=10)

Epoch 1/100
   6731/Unknown [1m31s[0m 4ms/step - categorical_accuracy: 0.4632 - loss: 1.2146



[1m6739/6739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 5ms/step - categorical_accuracy: 0.4632 - loss: 1.2145 - val_categorical_accuracy: 0.5073 - val_loss: 1.0829
Epoch 2/100
[1m6739/6739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - categorical_accuracy: 0.5299 - loss: 1.0573 - val_categorical_accuracy: 0.5466 - val_loss: 1.0331
Epoch 3/100
[1m6739/6739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 6ms/step - categorical_accuracy: 0.5597 - loss: 1.0065 - val_categorical_accuracy: 0.5613 - val_loss: 1.0101
Epoch 4/100
[1m6739/6739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 6ms/step - categorical_accuracy: 0.5729 - loss: 0.9797 - val_categorical_accuracy: 0.5675 - val_loss: 0.9980
Epoch 5/100
[1m6739/6739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 5ms/step - categorical_accuracy: 0.5798 - loss: 0.9633 - val_categorical_accuracy: 0.5686 - val_

KeyboardInterrupt: 

In [None]:
# Show which quantities were recorded during training, then their values
print(hist.history.keys())
print(hist.history.values())

In [None]:
# prompt: plot training progress stored in hist

import matplotlib.pyplot as plt

# Leading epochs to leave out of the plots
SKIP_EPOCHS = 5

n_epochs = len(hist.history['loss'])
# A run of SKIP_EPOCHS epochs or fewer would otherwise give empty plots,
# so fall back to showing everything.
first = SKIP_EPOCHS if n_epochs > SKIP_EPOCHS else 0
# Explicit 1-based epoch numbers, so the x-axis matches the "Epoch k/N" lines
# of the training log instead of restarting at 0 after the slice.
epochs = range(first + 1, n_epochs + 1)

for metric, title, ylabel in (
    ('categorical_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    plt.plot(epochs, hist.history[metric][first:], label='train')
    plt.plot(epochs, hist.history['val_' + metric][first:], label='validation')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend()  # without it the two curves cannot be told apart
    plt.show()
