## Examples using Biopython

In [122]:
from Bio.Seq import Seq
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd

In [123]:
# Example protein sequence written in one-letter amino-acid codes.
# NOTE(review): the name suggests it was randomly generated; provenance is not shown here.
random_seq = "MKFTKPDRTCEVELFTAAFGLLRWAMQFINIPEAHGEPRLDILGKDEVQPQTQRWRRNIAECKFHVWKSPIVSRCQCKDFLGARFALKKSHILAQLVALELKVPTAAKKRQPVLLESPHFILDNCGTDVVGIIVAIVIDLPVVVAQLAKFKCDTCTDGSESWGGLITMYLVKPGLVKDPLKIRSDYEEVILTAGYDGVQTFSDASSGFDGLEHKYLIENVDNNDFLRAPIMLRLIRKYGGPVLQMLIRPSFLAAGAAKGKLIGQERMIRGRCLEKDFIWSNDGVHSLDCAFCRVLRFGMRYAWNGLMVEIERRLISITDLDMNIQSEKPERKYDLGHISRMNLSPFDGAKGKNLWTRDDAEFKLAQYNDADFSAREGRVSLLSEDKGDSPKLAQRRNEMGNSIAICDENACRLLHMAYLTEKEGTNAKDDVPKVSKGDPPEKHMTNEFKALLVLQLVNDYISGCLVDAKIPIPDVGQFPHFIKMPNKAAGSTLREVHIKI"

In [124]:
# Wrap the raw string in a Biopython Seq object; later cells pass this object around.
protein_sequence = Seq(random_seq)

In [125]:
for residue in protein_sequence:
    # One line per position: the residue there and how often it occurs overall.
    print(residue, protein_sequence.count(residue))

M 14
K 38
F 22
T 17
K 38
P 24
D 36
R 31
T 17
C 13
E 29
V 31
E 29
L 52
F 22
T 17
A 37
A 37
F 22
G 34
L 52
L 52
R 31
W 7
A 37
M 14
Q 16
F 22
I 34
N 19
I 34
P 24
E 29
A 37
H 11
G 34
E 29
P 24
R 31
L 52
D 36
I 34
L 52
G 34
K 38
D 36
E 29
V 31
Q 16
P 24
Q 16
T 17
Q 16
R 31
W 7
R 31
R 31
N 19
I 34
A 37
E 29
C 13
K 38
F 22
H 11
V 31
W 7
K 38
S 25
P 24
I 34
V 31
S 25
R 31
C 13
Q 16
C 13
K 38
D 36
F 22
L 52
G 34
A 37
R 31
F 22
A 37
L 52
K 38
K 38
S 25
H 11
I 34
L 52
A 37
Q 16
L 52
V 31
A 37
L 52
E 29
L 52
K 38
V 31
P 24
T 17
A 37
A 37
K 38
K 38
R 31
Q 16
P 24
V 31
L 52
L 52
E 29
S 25
P 24
H 11
F 22
I 34
L 52
D 36
N 19
C 13
G 34
T 17
D 36
V 31
V 31
G 34
I 34
I 34
V 31
A 37
I 34
V 31
I 34
D 36
L 52
P 24
V 31
V 31
V 31
A 37
Q 16
L 52
A 37
K 38
F 22
K 38
C 13
D 36
T 17
C 13
T 17
D 36
G 34
S 25
E 29
S 25
W 7
G 34
G 34
L 52
I 34
T 17
M 14
Y 10
L 52
V 31
K 38
P 24
G 34
L 52
V 31
K 38
D 36
P 24
L 52
K 38
I 34
R 31
S 25
D 36
Y 10
E 29
E 29
V 31
I 34
L 52
T 17
A 37
G 34
Y 10
D 36
G 34
V 31
Q 16
T 17
F 22

In [126]:
# Share of "G" and "C" letters in the sequence (%)

def get_gc_content(protein_sequence: str) -> float:
    """Return the percentage of sequence positions that are "G" or "C".

    Note: in one-letter amino-acid codes these letters stand for glycine and
    cysteine, so on a protein this is a G/C residue share, not nucleotide
    GC content, despite the function name.

    Args:
        protein_sequence: Sequence supporting ``len`` and ``count``.

    Returns:
        ``100 * (count("G") + count("C")) / len``; ``0.0`` for an empty
        sequence.
    """
    length = len(protein_sequence)
    if length == 0:
        # An empty sequence has no residues; avoid dividing by zero.
        return 0.0
    gc_count = protein_sequence.count("G") + protein_sequence.count("C")
    return (gc_count / length) * 100

In [127]:
def find_overlap(protein_sequence: str, top_bound: int) -> dict:
    """Count every overlapping window of length 1 through ``top_bound``.

    Args:
        protein_sequence: The sequence to scan. Anything convertible with
            ``str()`` is accepted; it is normalised to a plain string first.
        top_bound: Largest window size to count (inclusive).

    Returns:
        Dict mapping each window (as a ``str``) to the number of times it
        occurs, in first-seen order. Window sizes longer than the sequence
        contribute nothing.
    """
    # Slicing a sequence object such as Bio.Seq.Seq returns another object of
    # that type, which would then be used as the dict key. Converting once up
    # front guarantees plain-string keys regardless of the input type.
    sequence = str(protein_sequence)
    overlap_counts = {}
    for window_size in range(1, top_bound + 1):
        for start in range(len(sequence) - window_size + 1):
            window = sequence[start:start + window_size]
            overlap_counts[window] = overlap_counts.get(window, 0) + 1
    return overlap_counts

In [128]:
def get_amino_acid_composition(protein_sequence: str) -> dict:
    """Return the result of ``ProteinAnalysis(protein_sequence).count_amino_acids()``."""
    return ProteinAnalysis(protein_sequence).count_amino_acids()

In [129]:
def check_motif_presence(protein_sequence: str, motif: str) -> int:
    """Return 1 when ``motif`` occurs in ``protein_sequence``, otherwise 0."""
    if motif in protein_sequence:
        return 1
    return 0

In [130]:
protein_analysis_features = ["molecular_weight", "aromaticity", "instability_index", "isoelectric_point"]

def add_protein_analysis_features(protein_sequence, protein_analysis_features):
    """Evaluate the named ProteinAnalysis methods on a sequence.

    Args:
        protein_sequence: Sequence handed to ``ProteinAnalysis``.
        protein_analysis_features: Iterable of method names to look up on the
            ``ProteinAnalysis`` instance; each is called with no arguments.

    Returns:
        Dict mapping each name to ``float(result)`` when the result converts
        to a float, otherwise to the unconverted result.

    Raises:
        AttributeError: If a name is not an attribute of the
            ``ProteinAnalysis`` instance.
    """
    protein_analysis = ProteinAnalysis(protein_sequence)
    data = {}
    for feature_name in protein_analysis_features:
        result = getattr(protein_analysis, feature_name)()
        try:
            data[feature_name] = float(result)
        except (TypeError, ValueError):
            # float() raises TypeError for non-scalar results (tuple, list,
            # dict) and ValueError for unparsable strings; keep those as-is.
            data[feature_name] = result
    return data

In [131]:
# Percentage of "G"/"C" letters in the sequence
gc_content = get_gc_content(protein_sequence)

# Occurrence counts for every window of length 1..top_bound
top_bound = 3
overlap_counts = find_overlap(protein_sequence, top_bound)

# Feature record: GC content first, then one entry per window
data = {"GC_content": gc_content}
data.update(overlap_counts)

# Per-residue counts from ProteinAnalysis
amino_acid_composition = get_amino_acid_composition(protein_sequence)

# Single-row frame holding the features plus the sequence and its length
df = pd.DataFrame(data, index=[0])
sequence_text = str(protein_sequence)
df["protein_sequence"] = sequence_text
df["protein_length"] = len(sequence_text)

# One count_<residue> column per amino acid
for amino_acid in amino_acid_composition:
    df[f"count_{amino_acid}"] = amino_acid_composition[amino_acid]

In [132]:
# Motif indicator columns

# Motifs (or functional-domain fragments) to look for in the sequence
motif1, motif2, motif3, motif4 = "RCE", "LKA", "GKDE", "KPTAAK"

# Add one 0/1 contains_<motif> column per motif
for candidate in (motif1, motif2, motif3, motif4):
    column_name = f"contains_{candidate}"
    df[column_name] = check_motif_presence(protein_sequence, candidate)

In [133]:
# Columns derived from the ProteinAnalysis methods listed in protein_analysis_features

features_data_protein_analysis_features = add_protein_analysis_features(
    protein_sequence,
    protein_analysis_features,
)

print(features_data_protein_analysis_features)

# Store each value as a numeric column; anything non-numeric becomes NaN
for name, raw_value in features_data_protein_analysis_features.items():
    df[name] = pd.to_numeric(raw_value, errors='coerce')

{'molecular_weight': 56295.65200000005, 'aromaticity': 0.078, 'instability_index': 43.242200000000004, 'isoelectric_point': 8.299804878234863}


In [134]:
# Display the assembled single-row feature table.
df

Unnamed: 0,GC_content,(M),(K),(F),(T),(P),(D),(R),(C),(E),...,count_W,count_Y,contains_RCE,contains_LKA,contains_GKDE,contains_KPTAAK,molecular_weight,aromaticity,instability_index,isoelectric_point
0,9.4,14,38,22,17,24,36,31,13,29,...,7,10,0,0,1,0,56295.652,0.078,43.2422,8.299805


In [135]:
# Write the feature table to protein_data.csv (path relative to the current
# working directory), leaving out the index column.

df.to_csv('protein_data.csv', index=False)