In [1]:
import json

import pandas as pd

# Load the uploaded CSV file to inspect its structure and contents.
file_path = 'uniprot_sequences.csv'
data = pd.read_csv(file_path)

# Summarise the columns first. info() prints its report and returns None, so it
# is called as a plain statement instead of being bundled into the displayed
# value (where it would only contribute a stray "None").
data.info()

# Leave head() as the cell's final expression so the first rows are rendered
# as a table.
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        1000 non-null   int64 
 1   Sequence  1000 non-null   object
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


(   ID                                           Sequence
 0   0  MERLSAAPVKGQTGPERPSPFSQLVYTNNDSYVIHHGDLRKIHKAA...
 1   1  MERLSAAPVKGQTGPERPSPFSQLVYTNNDSYVIHHGDLRKIHKAA...
 2   2  MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...
 3   3  MLIGKGSLVMEGQKHLNSKKKGLKASFSLSLTFTSRLAPDPSLVIY...
 4   4  MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,
 None)

In [2]:
import numpy as np

# Alphabet used by the encoders: the twenty standard one-letter residue codes
# in alphabetical order, with 'X' in the last position.
amino_acids = list('ACDEFGHIKLMNPQRSTVWYX')

# Column index of each letter in the alphabet above.
amino_acid_to_index = dict(zip(amino_acids, range(len(amino_acids))))

# Helper function to one-hot encode a single sequence
def one_hot_encode_sequence(sequence, max_length):
    """Encode ``sequence`` as a flattened, zero-padded one-hot vector.

    A ``(max_length, len(amino_acids))`` integer matrix is filled so that row
    ``i`` holds a single 1 in the column of the i-th letter. Letters missing
    from ``amino_acid_to_index`` use the 'X' column, and rows beyond the end of
    the sequence stay all-zero.

    Args:
        sequence: Iterable of one-letter codes (typically a string).
        max_length: Number of rows in the matrix before flattening.

    Returns:
        1-D integer array of length ``max_length * len(amino_acids)``,
        flattened row by row.

    Raises:
        IndexError: If ``sequence`` has more than ``max_length`` letters.
    """
    encoded = np.zeros((max_length, len(amino_acids)), dtype=int)
    column_of = [
        amino_acid_to_index.get(residue, amino_acid_to_index['X'])
        for residue in sequence
    ]
    positions = np.arange(len(column_of))
    # One (row, column) pair per residue, written in a single indexed assignment.
    encoded[positions, np.asarray(column_of, dtype=np.intp)] = 1
    return encoded.flatten()

# Helper function to calculate letter composition
def calculate_letter_composition(sequence):
    """Return the relative frequency of each amino acid in ``sequence``.

    Letters missing from ``amino_acid_to_index`` are counted under 'X'.

    Args:
        sequence: Sized iterable of one-letter codes (typically a string).

    Returns:
        1-D float array of length ``len(amino_acids)`` whose entries are
        counts divided by ``len(sequence)``. For an empty sequence the
        all-zero vector is returned.
    """
    composition = np.zeros(len(amino_acids), dtype=float)
    total_letters = len(sequence)
    if total_letters == 0:
        # Dividing by zero here would yield an all-NaN vector (and a
        # RuntimeWarning); an empty sequence simply has no composition.
        return composition
    for letter in sequence:
        composition[amino_acid_to_index.get(letter, amino_acid_to_index['X'])] += 1
    return composition / total_letters

# The longest sequence in the dataset sets the padded length shared by every
# one-hot vector.
max_sequence_length = data['Sequence'].str.len().max()

# Derive both feature columns from the raw sequences (one array per row).
data['OneHotEncoded'] = data['Sequence'].map(
    lambda seq: one_hot_encode_sequence(seq, max_sequence_length)
)
data['LetterComposition'] = data['Sequence'].map(calculate_letter_composition)

# Show the first rows with the new columns attached.
data.head()


Unnamed: 0,ID,Sequence,OneHotEncoded,LetterComposition
0,0,MERLSAAPVKGQTGPERPSPFSQLVYTNNDSYVIHHGDLRKIHKAA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.10465116279069768, 0.023255813953488372, 0...."
1,1,MERLSAAPVKGQTGPERPSPFSQLVYTNNDSYVIHHGDLRKIHKAA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.11952191235059761, 0.0398406374501992, 0.05..."
2,2,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.06258596973865199, 0.0171939477303989, 0.04..."
3,3,MLIGKGSLVMEGQKHLNSKKKGLKASFSLSLTFTSRLAPDPSLVIY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.0695742471443406, 0.017653167185877467, 0.0..."
4,4,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.08181818181818182, 0.023376623376623377, 0...."


In [5]:
# Prepare final data structure
features = pd.DataFrame({
    'ID': data['ID'],
    'OneHotEncoded': data['OneHotEncoded'],
    'LetterComposition': data['LetterComposition'],
})


def _array_to_json(values):
    """Return a NumPy array as a compact JSON list string."""
    return json.dumps(values.tolist(), separators=(',', ':'))


# Save to CSV.
# Both feature columns hold NumPy arrays. Written as-is, to_csv() stores each
# array's str() form, and NumPy abbreviates large arrays with "..." -- the
# one-hot vectors would be saved truncated and could not be recovered. The
# written copy therefore carries every array as a JSON list; `features` itself
# keeps the arrays. Read a cell back with np.array(json.loads(cell)).
features.assign(
    OneHotEncoded=features['OneHotEncoded'].map(_array_to_json),
    LetterComposition=features['LetterComposition'].map(_array_to_json),
).to_csv('processed_uniprot_sequences.csv', index=False)
data


Unnamed: 0,ID,Sequence,OneHotEncoded,LetterComposition
0,0,MERLSAAPVKGQTGPERPSPFSQLVYTNNDSYVIHHGDLRKIHKAA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.10465116279069768, 0.023255813953488372, 0...."
1,1,MERLSAAPVKGQTGPERPSPFSQLVYTNNDSYVIHHGDLRKIHKAA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.11952191235059761, 0.0398406374501992, 0.05..."
2,2,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.06258596973865199, 0.0171939477303989, 0.04..."
3,3,MLIGKGSLVMEGQKHLNSKKKGLKASFSLSLTFTSRLAPDPSLVIY...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.0695742471443406, 0.017653167185877467, 0.0..."
4,4,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.08181818181818182, 0.023376623376623377, 0...."
...,...,...,...,...
995,995,MYIKMATLANGQADNASLSTNGLGSSPGSAGHMNGLSHSPGNPSTI...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.12698412698412698, 0.013888888888888888, 0...."
996,996,MYIKMATLANGQADNASLSTNGLGSSPGSAGHMNGLSHSPGNPSTI...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.13131313131313133, 0.012121212121212121, 0...."
997,997,MYIKMATLANGQADNASLSTNGLGSSPGSAGHMNGLSHSPGNPSTI...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.11042944785276074, 0.012269938650306749, 0...."
998,998,MRRSRSSAAAKLRGQKRSGASGASAAPAASAAAALAPSATRTRRSA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.11636363636363636, 0.02909090909090909, 0.0..."
