In [1]:
! pip install biopython pandas numpy propy3
from google.colab import files
from Bio import SeqIO
from propy import CTD
import pandas as pd
import numpy as np
import glob
import itertools

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting propy3
  Downloading propy3-1.1.1-py3-none-any.whl.metadata (5.6 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading propy3-1.1.1-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.3/290.3 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: propy3, biopython
Successfully installed biopython-1.85 propy3-1.1.1


In [2]:
def compute_aac_from_faa(faa_file, output_csv='aac_results.csv'):
    """
    Compute Amino Acid Composition (AAC) from a .faa FASTA file and save as CSV.

    For every record the sequence is upper-cased and the fraction of each of
    the 20 standard amino acids is computed as ``count / len(sequence)``.
    Characters outside the 20-letter alphabet (e.g. ``X`` or ``*``) are not
    counted but still contribute to the length, so a row may sum to less
    than 1. An empty sequence yields an all-zero row.

    Args:
        faa_file (str): Path to the input .faa file.
        output_csv (str): Path to the output CSV file.

    Returns:
        None. A table indexed by ``ID`` with one column per amino acid is
        written to ``output_csv`` and a confirmation line is printed.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    def calculate_aac(seq):
        seq = str(seq).upper()
        length = len(seq)
        if length == 0:
            return {aa: 0 for aa in amino_acids}
        return {aa: seq.count(aa) / length for aa in amino_acids}

    # Stream the records; there is no need to hold the parsed file in memory.
    aac_data = []
    for record in SeqIO.parse(faa_file, "fasta"):
        aac = calculate_aac(record.seq)
        aac['ID'] = record.id
        aac_data.append(aac)

    # Explicit columns keep the schema stable and make a FASTA file without
    # any record produce a header-only CSV instead of a KeyError on 'ID'.
    df_aac = pd.DataFrame(aac_data, columns=list(amino_acids) + ['ID'])
    df_aac = df_aac.set_index('ID')

    # Save to CSV
    df_aac.to_csv(output_csv)
    print(f"AAC features saved to {output_csv}")

In [3]:
def compute_dipeptide_composition_from_faa(faa_file, output_csv='dipeptide_results.csv'):
    """
    Compute Dipeptide Composition from a .faa FASTA file and save as CSV.
    Normalized by (L-1) where L is sequence length.

    Args:
        faa_file (str): Path to the input .faa file.
        output_csv (str): Path to the output CSV file.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    dipeptides = [''.join(pair) for pair in itertools.product(amino_acids, repeat=2)]

    def calculate_dipeptide(seq):
        seq = str(seq).upper()
        length = len(seq)
        counts = {dp: 0 for dp in dipeptides}

        for i in range(len(seq) - 1):
            dipep = seq[i:i+2]
            if dipep in counts:
                counts[dipep] += 1

        # Normalize by total number of dipeptides (L-1)
        norm_counts = {dp: counts[dp] / (length - 1) if length > 1 else 0 for dp in dipeptides}
        return norm_counts

    # Parse FASTA records
    records = list(SeqIO.parse(faa_file, "fasta"))

    # Compute dipeptide composition for each sequence
    dipep_data = []
    for record in records:
        dipep = calculate_dipeptide(record.seq)
        dipep['ID'] = record.id
        dipep_data.append(dipep)

    # Convert to DataFrame
    df_dipep = pd.DataFrame(dipep_data)
    df_dipep = df_dipep.set_index('ID')

    # Save to CSV
    df_dipep.to_csv(output_csv)
    print(f"Dipeptide features saved to {output_csv}")

In [4]:
def compute_pseaac_from_faa(faa_file, output_csv='pseaac_results.csv', lamda=5, weight=0.05):
    """
    Compute Pseudo-Amino Acid Composition (PseAAC) from a .faa FASTA file and save as CSV.

    For each sequence the output row holds 20 normalized amino-acid
    frequencies (``AAC_<aa>``) followed by ``lamda`` weighted sequence-order
    correlation factors (``Theta_<k>``). Both groups are divided by
    ``1 + weight * sum(theta)``.

    Each physicochemical scale is standardized to zero mean and unit
    (population) standard deviation over the 20 amino acids before the
    correlation factors are computed, following Chou (2001). Without this
    step the side-chain mass, whose raw squared differences reach the
    thousands, dominates theta and drives the AAC components towards zero.

    Residues outside the 20-letter alphabet contribute 0 (the scale mean
    after standardization) to the correlation terms, and they count towards
    the sequence length used for the frequencies.

    Args:
        faa_file (str): Path to the input .faa file.
        output_csv (str): Path to the output CSV file.
        lamda (int): The number of correlation tiers (λ), default 5.
        weight (float): The weight factor for the sequence-order effect, default 0.05.

    Returns:
        None. A table indexed by ``ID`` is written to ``output_csv`` and a
        confirmation line is printed.
    """
    # Hydrophobicity, Hydrophilicity, and Side-chain mass from Chou (2001)
    properties = {
        'hydrophobicity': {'A': 0.62, 'C': 0.29, 'D': -0.90, 'E': -0.74, 'F': 1.19, 'G': 0.48,
                           'H': -0.40, 'I': 1.38, 'K': -1.50, 'L': 1.06, 'M': 0.64, 'N': -0.78,
                           'P': 0.12, 'Q': -0.85, 'R': -2.53, 'S': -0.18, 'T': -0.05, 'V': 1.08,
                           'W': 0.81, 'Y': 0.26},
        'hydrophilicity': {'A': -0.50, 'C': -1.00, 'D': 3.00, 'E': 3.00, 'F': -2.50, 'G': 0.00,
                           'H': -0.50, 'I': -1.80, 'K': 3.00, 'L': -1.80, 'M': -1.30, 'N': 0.20,
                           'P': 0.00, 'Q': 0.20, 'R': 3.00, 'S': 0.30, 'T': -0.40, 'V': -1.50,
                           'W': -3.40, 'Y': -2.30},
        'side_mass': {'A': 15.0, 'C': 47.0, 'D': 59.0, 'E': 73.0, 'F': 91.0, 'G': 1.0,
                      'H': 82.0, 'I': 57.0, 'K': 73.0, 'L': 57.0, 'M': 75.0, 'N': 58.0,
                      'P': 42.0, 'Q': 72.0, 'R': 101.0, 'S': 31.0, 'T': 45.0, 'V': 43.0,
                      'W': 130.0, 'Y': 107.0}
    }

    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    def standardize(scale):
        # Zero mean / unit population standard deviation over the 20 residues.
        values = [scale[aa] for aa in amino_acids]
        mean = sum(values) / len(values)
        std = (sum((v - mean) ** 2 for v in values) / len(values)) ** 0.5
        return {aa: (scale[aa] - mean) / std for aa in amino_acids}

    std_props = [standardize(scale) for scale in properties.values()]

    def calculate_theta(seq, lamda):
        # One standardized property vector per residue; unknown residues map
        # to 0.0, i.e. the mean of every standardized scale.
        vectors = [[prop.get(res, 0.0) for prop in std_props] for res in seq]
        n = len(vectors)
        theta = []
        for lag in range(1, lamda + 1):
            n_pairs = n - lag
            if n_pairs <= 0:
                # Sequence too short for this tier.
                theta.append(0)
                continue
            total = 0.0
            for i in range(n_pairs):
                left, right = vectors[i], vectors[i + lag]
                total += sum((a - b) ** 2 for a, b in zip(left, right)) / len(std_props)
            theta.append(total / n_pairs)
        return theta

    def calculate_pseaac(seq, lamda, weight):
        seq = str(seq).upper()
        n = len(seq)
        aac = {aa: seq.count(aa) / n if n > 0 else 0 for aa in amino_acids}
        theta = calculate_theta(seq, lamda)
        denom = 1 + weight * sum(theta)
        # Normalize
        features = {f"AAC_{aa}": aac[aa] / denom for aa in amino_acids}
        for i, t in enumerate(theta):
            features[f"Theta_{i+1}"] = (weight * t) / denom
        return features

    pseaac_data = []
    for record in SeqIO.parse(faa_file, "fasta"):
        features = calculate_pseaac(record.seq, lamda, weight)
        features['ID'] = record.id
        pseaac_data.append(features)

    # Explicit columns keep the schema stable and make a FASTA file without
    # any record produce a header-only CSV instead of a KeyError on 'ID'.
    columns = ([f"AAC_{aa}" for aa in amino_acids]
               + [f"Theta_{k}" for k in range(1, lamda + 1)]
               + ['ID'])
    df_pseaac = pd.DataFrame(pseaac_data, columns=columns).set_index('ID')
    df_pseaac.to_csv(output_csv)
    print(f"PseAAC features (λ={lamda}, weight={weight}) saved to {output_csv}")

In [5]:
def compute_physicochem_from_faa(faa_file, output_csv='physicochem_results.csv'):
    """
    Compute physicochemical properties (averaged) from a .faa FASTA file and save as CSV.

    For every record and every property scale, the mean scale value over the
    residues of the upper-cased sequence is reported as ``Avg_<property>``.
    Only residues present in the scale enter the mean; other characters
    (e.g. ``X`` or ``*``) are skipped rather than scored as 0, so they no
    longer pull the averages towards zero. A sequence without any recognized
    residue yields 0 for every property.

    Args:
        faa_file (str): Path to the input .faa file.
        output_csv (str): Path to the output CSV file.

    Returns:
        None. A table indexed by ``ID`` is written to ``output_csv`` and a
        confirmation line is printed.
    """
    # Property dictionaries
    properties = {
        'Hydrophobicity': {'A':1.8, 'C':2.5, 'D':-3.5, 'E':-3.5, 'F':2.8, 'G':-0.4, 'H':-3.2, 'I':4.5,
                           'K':-3.9, 'L':3.8, 'M':1.9, 'N':-3.5, 'P':-1.6, 'Q':-3.5, 'R':-4.5, 'S':-0.8,
                           'T':-0.7, 'V':4.2, 'W':-0.9, 'Y':-1.3},
        'Hydrophilicity': {'A':-0.5, 'C':-1.0, 'D':3.0, 'E':3.0, 'F':-2.5, 'G':0.0, 'H':-0.5, 'I':-1.8,
                           'K':3.0, 'L':-1.8, 'M':-1.3, 'N':0.2, 'P':0.0, 'Q':0.2, 'R':3.0, 'S':0.3,
                           'T':-0.4, 'V':-1.5, 'W':-3.4, 'Y':-2.3},
        'Side_mass': {'A':15.0, 'C':47.0, 'D':59.0, 'E':73.0, 'F':91.0, 'G':1.0, 'H':82.0, 'I':57.0,
                      'K':73.0, 'L':57.0, 'M':75.0, 'N':58.0, 'P':42.0, 'Q':72.0, 'R':101.0, 'S':31.0,
                      'T':45.0, 'V':43.0, 'W':130.0, 'Y':107.0},
        'Vdw_volume': {'A':67, 'C':86, 'D':91, 'E':109, 'F':135, 'G':48, 'H':118, 'I':124,
                       'K':135, 'L':124, 'M':124, 'N':96, 'P':90, 'Q':114, 'R':148, 'S':73,
                       'T':93, 'V':105, 'W':163, 'Y':141},
        'Polarity': {'A':8.1, 'C':5.5, 'D':13.0, 'E':12.3, 'F':5.2, 'G':9.0, 'H':10.4, 'I':5.2,
                     'K':11.3, 'L':4.9, 'M':5.7, 'N':11.6, 'P':8.0, 'Q':10.5, 'R':10.5, 'S':9.2,
                     'T':8.6, 'V':5.9, 'W':5.4, 'Y':6.2},
        'Polarizability': {'A':0.046, 'C':0.128, 'D':0.105, 'E':0.151, 'F':0.290, 'G':0.000,
                           'H':0.230, 'I':0.186, 'K':0.219, 'L':0.186, 'M':0.221, 'N':0.134,
                           'P':0.131, 'Q':0.180, 'R':0.291, 'S':0.062, 'T':0.108, 'V':0.140,
                           'W':0.409, 'Y':0.298},
        'Solvent_Accessibility': {'A':0.74, 'C':0.91, 'D':0.63, 'E':0.62, 'F':0.88, 'G':0.72, 'H':0.78, 'I':0.88,
                                  'K':0.52, 'L':0.85, 'M':0.85, 'N':0.63, 'P':0.64, 'Q':0.62, 'R':0.64, 'S':0.66,
                                  'T':0.70, 'V':0.86, 'W':0.85, 'Y':0.76},
        'Flexibility': {'A':0.357, 'C':0.346, 'D':0.511, 'E':0.497, 'F':0.314, 'G':0.544, 'H':0.323,
                        'I':0.462, 'K':0.466, 'L':0.365, 'M':0.295, 'N':0.463, 'P':0.509, 'Q':0.493,
                        'R':0.529, 'S':0.507, 'T':0.444, 'V':0.386, 'W':0.305, 'Y':0.420},
        'Isoelectric_Point': {'A':6.00, 'C':5.07, 'D':2.77, 'E':3.22, 'F':5.48, 'G':5.97, 'H':7.59,
                              'I':6.02, 'K':9.74, 'L':5.98, 'M':5.74, 'N':5.41, 'P':6.30, 'Q':5.65,
                              'R':10.76, 'S':5.68, 'T':5.60, 'V':5.96, 'W':5.89, 'Y':5.66},
        'Bulkiness': {'A':11.5, 'C':13.46, 'D':11.68, 'E':13.57, 'F':19.80, 'G':3.40, 'H':13.69,
                      'I':21.40, 'K':15.71, 'L':21.40, 'M':16.25, 'N':12.82, 'P':17.43, 'Q':14.45,
                      'R':14.28, 'S':9.47, 'T':15.77, 'V':21.57, 'W':21.67, 'Y':18.03},
    }

    results = []
    for record in SeqIO.parse(faa_file, "fasta"):
        seq = str(record.seq).upper()
        props = {'ID': record.id}
        for pname, pscale in properties.items():
            # Average over residues the scale actually defines; a 0 stand-in
            # for unknown characters would be a meaningless data point.
            known = [pscale[aa] for aa in seq if aa in pscale]
            props[f"Avg_{pname}"] = sum(known) / len(known) if known else 0
        results.append(props)

    # Explicit columns keep the schema stable and make a FASTA file without
    # any record produce a header-only CSV instead of a KeyError on 'ID'.
    columns = ['ID'] + [f"Avg_{pname}" for pname in properties]
    df = pd.DataFrame(results, columns=columns).set_index('ID')
    df.to_csv(output_csv)
    print(f"Physicochemical property averages saved to {output_csv}")

In [6]:
def compute_shannon_entropy_faa(faa_file, output_csv='shannon_entropy_results.csv'):
    """
    Compute Shannon entropy per sequence from a .faa FASTA file and save as CSV.

    The entropy (in bits) is taken over the relative frequencies of the
    distinct characters of the upper-cased sequence. Every character that
    occurs in the sequence counts as a symbol, including non-standard ones.
    Empty sequences get an entropy of 0.

    Args:
        faa_file (str): Path to the input .faa file.
        output_csv (str): Path to the output CSV file.

    Returns:
        None. A table indexed by ``ID`` with a ``Shannon_Entropy`` column is
        written to ``output_csv`` and a confirmation line is printed.
    """
    results = []

    for record in SeqIO.parse(faa_file, "fasta"):
        seq = str(record.seq).upper()
        length = len(seq)
        if length == 0:
            entropy = 0.0
        else:
            aa_counts = {}
            for aa in seq:
                aa_counts[aa] = aa_counts.get(aa, 0) + 1
            probs = np.array([count / length for count in aa_counts.values()])
            # abs() only normalizes the IEEE negative zero that "-sum" yields
            # for single-symbol sequences; entropy itself is never negative.
            entropy = abs(float(-np.sum(probs * np.log2(probs))))

        results.append({'ID': record.id, 'Shannon_Entropy': entropy})

    # Explicit columns make a FASTA file without any record produce a
    # header-only CSV instead of a KeyError on 'ID'.
    df = pd.DataFrame(results, columns=['ID', 'Shannon_Entropy']).set_index('ID')
    df.to_csv(output_csv)
    print(f"Shannon entropy values saved to {output_csv}")

In [7]:
def compute_ctd_features(faa_file, output_csv='ctd_results.csv'):
    """
    Compute CTD (Composition, Transition, Distribution) features from a .faa FASTA file and save as CSV.

    Each upper-cased sequence is passed to ``CTD.CalculateCTD``; the feature
    names in the output are whatever keys that call returns.

    Args:
        faa_file (str): Path to the input .faa file.
        output_csv (str): Path to the output CSV file.

    Returns:
        None. A table indexed by ``ID`` is written to ``output_csv`` and a
        confirmation line is printed.
    """
    ctd_data = []

    for record in SeqIO.parse(faa_file, "fasta"):
        seq = str(record.seq).upper()
        # NOTE(review): behavior of CalculateCTD on empty sequences or on
        # non-standard residues is not visible here; confirm against propy.
        features = CTD.CalculateCTD(seq)
        features['ID'] = record.id
        ctd_data.append(features)

    if ctd_data:
        df_ctd = pd.DataFrame(ctd_data)
    else:
        # No records: the feature names are unknown without calling the
        # library, so emit an ID-only header rather than fail in set_index.
        df_ctd = pd.DataFrame(columns=['ID'])

    df_ctd = df_ctd.set_index('ID')
    df_ctd.to_csv(output_csv)
    print(f"CTD features saved to {output_csv}")

In [8]:
# Z-scale values from Sandberg et al., 1998
# (J. Med. Chem. 41:2481-2491): five principal-property descriptors
# [z1, z2, z3, z4, z5] per standard amino acid, keyed by one-letter code.
# NOTE(review): the previous table matched the publication only for A and C;
# the rows below were re-entered from the published table. Please spot-check
# them against the paper before relying on downstream features.
zscale = {
    'A': [0.24, -2.32, 0.60, -0.14, 1.30],
    'C': [0.84, -1.67, 3.71, 0.18, -2.65],
    'D': [3.98, 0.93, 1.93, -2.46, 0.75],
    'E': [3.11, 0.26, -0.11, -3.04, -0.25],
    'F': [-4.22, 1.94, 1.06, 0.54, -0.62],
    'G': [2.05, -4.06, 0.36, -0.82, -0.38],
    'H': [2.47, 1.95, 0.26, 3.90, 0.09],
    'I': [-3.89, -1.73, -1.71, -0.84, 0.26],
    'K': [2.29, 0.89, -2.49, 1.49, 0.31],
    'L': [-4.28, -1.30, -1.49, -0.72, 0.84],
    'M': [-2.85, -0.22, 0.47, 1.94, -0.98],
    'N': [3.05, 1.62, 1.04, -1.15, 1.61],
    'P': [-1.66, 0.27, 1.84, 0.70, 2.00],
    'Q': [1.75, 0.50, -1.44, -1.34, 0.66],
    'R': [3.52, 2.50, -3.50, 1.99, -0.17],
    'S': [2.39, -1.07, 1.15, -1.39, 0.67],
    'T': [0.75, -2.18, -1.12, -1.46, -0.40],
    'V': [-2.59, -2.64, -1.54, -0.85, -0.02],
    'W': [-4.36, 3.94, 0.59, 3.44, -1.59],
    'Y': [-2.54, 2.44, 0.43, 0.04, -1.47]
}

def compute_zscale_from_faa(faa_file, output_csv='zscale_results.csv'):
    """
    Compute Z-Scale (Z1-Z5) averages for protein sequences in a .faa file.

    For each record the five descriptors from the module-level ``zscale``
    table are averaged over the residues of the upper-cased sequence that
    appear in that table; other characters are ignored. A sequence without
    any recognized residue yields five zeros.

    Args:
        faa_file (str): Path to the input .faa FASTA file.
        output_csv (str): Path to the output CSV file.

    Returns:
        None. A table indexed by ``ID`` with columns ``Z1``..``Z5`` is
        written to ``output_csv`` and a confirmation line is printed.
    """
    n_scales = 5
    zscale_data = []

    for record in SeqIO.parse(faa_file, "fasta"):
        seq = str(record.seq).upper()
        # Residues missing from the table are skipped, not scored as zero.
        known = [zscale[aa] for aa in seq if aa in zscale]

        if known:
            avg_z = np.mean(np.array(known), axis=0).tolist()
        else:
            avg_z = [0] * n_scales

        features = {f"Z{i+1}": avg for i, avg in enumerate(avg_z)}
        features['ID'] = record.id
        zscale_data.append(features)

    # Explicit columns keep the schema stable and make a FASTA file without
    # any record produce a header-only CSV instead of a KeyError on 'ID'.
    columns = [f"Z{i+1}" for i in range(n_scales)] + ['ID']
    df_zscale = pd.DataFrame(zscale_data, columns=columns).set_index('ID')
    df_zscale.to_csv(output_csv)
    print(f"Z-Scale features saved to {output_csv}")

In [9]:
def merge_feature_csvs(csv_files, output_csv="merged_features.csv"):
    """
    Merge multiple CSV feature files by 'ID' column.

    The files are read in order and combined with successive outer joins on
    ``ID``, so an ID missing from one file keeps its row with empty cells
    for that file's features. A file without an ``ID`` column gets one
    synthesized from its row positions.

    Args:
        csv_files (list): List of CSV file paths to merge. A single path
            given as a string is treated as a one-element list.
        output_csv (str): Path for the final merged CSV.

    Returns:
        None. The merged table is written to ``output_csv`` (without an
        index column) and a confirmation line is printed.

    Raises:
        ValueError: If ``csv_files`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(csv_files, str):
        csv_files = [csv_files]
    csv_files = list(csv_files)
    if not csv_files:
        raise ValueError("csv_files must contain at least one CSV path")

    merged_df = None

    for file in csv_files:
        df = pd.read_csv(file)

        if 'ID' not in df.columns:
            # Fall back to the row position as identifier.
            df = df.reset_index().rename(columns={'index': 'ID'})

        if merged_df is None:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on='ID', how='outer')

    merged_df.to_csv(output_csv, index=False)
    print(f"Merged features saved to: {output_csv}")

In [None]:
# ============================================
# Example Usage (Google Colab)
# ============================================

# 1️⃣ Upload your .faa file in Colab:
# from google.colab import files
# uploaded = files.upload()
# faa_file = list(uploaded.keys())[0]

# 2️⃣ Run feature extraction functions:
# compute_aac_from_faa(faa_file, output_csv='aac_results.csv')
# compute_dipeptide_composition_from_faa(faa_file, output_csv='dipeptide_results.csv')
# compute_pseaac_from_faa(faa_file, output_csv='pseaac_results.csv', lamda=5, weight=0.05)
# compute_physicochem_from_faa(faa_file, output_csv='physicochem_results.csv')
# compute_shannon_entropy_faa(faa_file, output_csv='shannon_entropy_results.csv')
# compute_ctd_features(faa_file, output_csv='ctd_results.csv')
# compute_zscale_from_faa(faa_file, output_csv='zscale_results.csv')

# 3️⃣ Merge all CSVs into a single feature file:
# csv_files = [
#     'aac_results.csv',
#     'dipeptide_results.csv',
#     'pseaac_results.csv',
#     'physicochem_results.csv',
#     'shannon_entropy_results.csv',
#     'ctd_results.csv',
#     'zscale_results.csv'
# ]
# merge_feature_csvs(csv_files, output_csv="merged_features.csv")

# 4️⃣ Download the merged features file:
# files.download("merged_features.csv")

# ============================================