In [1]:
# Install BioPython package
!pip install biopython



In [2]:
# Do imports
import pandas as pd
import numpy as np
import Bio.PDB

In [3]:
# Define onehot encoding functions
def onehot_encode_aa(sequence):
    """Convert a string-format amino acid sequence to a one-hot encoded array.

    Every residue is looked up with ``Bio.PDB.Polypeptide.one_to_index``. A
    residue the lookup rejects with ``KeyError`` is assigned to an extra,
    final column (index 20) reserved for non-standard amino acids.

    Args:
        sequence: Iterable of one-letter residue codes, typically a ``str``.

    Returns:
        A float ``numpy.ndarray`` of shape ``(number of residues, 21)`` in
        which each row contains a single ``1``.

    Raises:
        Exception: Any error other than ``KeyError`` raised by the residue
            lookup is propagated instead of being treated as "non-standard".
    """
    unknown_index = 20  # extra column that collects non-standard residues
    aa_indices = []
    for aa in sequence:
        try:
            aa_index = Bio.PDB.Polypeptide.one_to_index(aa)
        except KeyError:
            # Only a failed lookup means "non-standard residue". A bare
            # ``except`` would also hide real faults (for example a missing
            # Bio import) by silently encoding every residue as unknown.
            # NOTE(review): assumes one_to_index reports an unknown letter
            # with KeyError - confirm against the installed Biopython version.
            aa_index = unknown_index
        aa_indices.append(aa_index)

    # One row per residue, one column per class plus the "unknown" column
    sequence_onehot = np.zeros((len(aa_indices), unknown_index + 1))
    sequence_onehot[np.arange(len(aa_indices)), aa_indices] = 1
    return sequence_onehot

def onehot_encode_ss(sequence):
    """Convert a string-format secondary structure sequence to a one-hot array.

    The five recognised symbols are mapped to columns as
    ``"-": 0, "E": 1, "T": 2, "S": 3, "H": 4``. Any other symbol is encoded
    as an all-zero row, so the output width stays at five columns and the
    encoding of recognised symbols is unaffected.

    Args:
        sequence: Iterable of one-character secondary structure symbols,
            typically a ``str``.

    Returns:
        A float ``numpy.ndarray`` of shape ``(number of symbols, 5)``. Rows
        for recognised symbols contain a single ``1``; rows for unrecognised
        symbols contain only zeros.
    """
    ss_dict = {"-": 0, "E": 1, "T": 2, "S": 3, "H": 4}

    # ``None`` marks a symbol that is not in ss_dict. Looking the symbol up
    # with .get() avoids an except branch that never assigns the index, which
    # would either fail on a leading unknown symbol or silently repeat the
    # class of the previous position.
    ss_indices = [ss_dict.get(ss) for ss in sequence]

    sequence_onehot = np.zeros((len(ss_indices), len(ss_dict)))
    for position, ss_index in enumerate(ss_indices):
        if ss_index is not None:
            sequence_onehot[position, ss_index] = 1
    return sequence_onehot

In [4]:
# Read the training set and discard its first column, which is not needed
data = pd.read_csv("single_muts_train.csv")
unused_column = data.columns[0]
data = data.drop(unused_column, axis=1)

# Add one-hot encoded versions of the amino acid and secondary structure sequences
aa_onehot = data["sequence"].apply(onehot_encode_aa)
data.insert(2, "sequence_onehot", aa_onehot)
ss_onehot = data["secondary_structure"].apply(onehot_encode_ss)
data.insert(4, "secondary_structure_onehot", ss_onehot)

# Show one sequence in string form and in one-hot encoded form
print("This is a string-format protein sequence:", data["sequence"].iloc[0], sep="\n")
print("\n")
print("This is a one-hot encoded protein sequence:", data["sequence_onehot"].iloc[0], sep="\n")

This is a string-format protein sequence:
GSRHVKVNGTTYEATTDEEAKKYAKKAGAKTVKVENGELQSHG


This is a one-hot encoded protein sequence:
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0.

In [5]:
# Optional: add flattened (1-D) copies of the one-hot encodings.
# Each entry is (insert position, new column name, source column name). The
# position is only used the first time a column is created; when the cell is
# re-run the existing column is overwritten instead, because DataFrame.insert
# raises ValueError for a column name that already exists.
flat_columns = [
    (3, "sequence_onehot_flat", "sequence_onehot"),
    (6, "secondary_structure_onehot_flat", "secondary_structure_onehot"),
]
for position, flat_name, source_name in flat_columns:
    flattened = data[source_name].apply(lambda x: x.flatten())
    if flat_name in data.columns:
        data[flat_name] = flattened
    else:
        data.insert(position, flat_name, flattened)

# Print an example of a flat one-hot encoding. The label names the secondary
# structure because that is the column being printed.
print("This is a flat one-hot encoded secondary structure sequence:")
print(data["secondary_structure_onehot_flat"].iloc[0])

This is a flat one-hot encoded protein sequence:
[1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.]
