In [1]:
!git clone https://github.com/Superzchen/iFeature.git

fatal: destination path 'iFeature' already exists and is not an empty directory.


In [2]:
%pip install rdkit



In [3]:
!pip install biopython==1.84



In [4]:
!pip install pandas numpy



In [5]:
!pip install scikit-learn==1.4.2



In [6]:
def convert_to_fasta(sequence, filename="peptide.fasta", header=">sequence"):
    lines = sequence.strip().split("\n")
    if lines[0].startswith(">"):
        fasta_content = sequence.strip()
    else:
        fasta_content = f"{header}\n{sequence.strip()}"
    with open(filename, "w") as fasta_file:
        fasta_file.write(fasta_content)

    return filename
user_sequence = input("Enter the sequence: ").strip()
saved_file = convert_to_fasta(user_sequence, "peptide.fasta")
print(f"FASTA file saved as: {saved_file}")


Enter the sequence: MLCYPRADQ
FASTA file saved as: peptide.fasta


In [7]:
import subprocess
import pandas as pd
import os
from Bio import SeqIO
from rdkit.Chem import Descriptors, MolFromSmiles
from rdkit import Chem
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO

def extract_ifeatures(input_file: str):
    feature_types = ['AAC', 'DPC', 'CTDC', 'CTDD', 'CTDT']
    feature_dfs = []

    for feature_type in feature_types:
        csv_output_file = f"{os.path.splitext(input_file)[0]}_{feature_type.lower()}.csv"
        command = f"python3 /content/iFeature/iFeature.py --file {input_file} --type {feature_type} --out {csv_output_file}"
        subprocess.run(command, shell=True, check=True)

        try:
            feature_df = pd.read_csv(csv_output_file, header=0, index_col=False, sep="\t")
        except pd.errors.ParserError:
            feature_df = pd.read_csv(csv_output_file, header=0, index_col=False, sep=",")

        feature_dfs.append(feature_df)
        os.remove(csv_output_file)

    final_df = feature_dfs[0]
    for df in feature_dfs[1:]:
        final_df = pd.merge(final_df, df, how='inner', left_on=final_df.columns[0], right_on=df.columns[0])

    final_df = final_df.drop(columns=[final_df.columns[0]])
    return final_df

def extract_peptide_descriptors(input_fasta):
    sequences = [str(record.seq) for record in SeqIO.parse(input_fasta, "fasta")]
    all_descriptors = pd.DataFrame()

    for sequence in sequences:
        try:
            # Convert peptide sequence to SMILES string
            smiles_string = Chem.MolToSmiles(Chem.MolFromSequence(sequence))
            mol = MolFromSmiles(smiles_string)

            # Calculate RDKit descriptors
            desc_values = {desc_name: descriptor(mol) if mol else None
                           for desc_name, descriptor in Descriptors._descList}
            all_descriptors = pd.concat([all_descriptors, pd.DataFrame(desc_values, index=[0])], ignore_index=True)
        except Exception as e:
            print(f"Error processing sequence {sequence}: {e}")

    return all_descriptors

def generate_protparam_features(fasta_file):
    peptides = [str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")]
    results = []

    for peptide in peptides:
        analysis = ProteinAnalysis(peptide)
        results.append({
            'Number of Amino Acids': len(peptide),
            'Molecular Weight': analysis.molecular_weight(),
            'Aromaticity': analysis.aromaticity(),
            'GRAVY': analysis.gravy(),
            'Isoelectric Point': analysis.isoelectric_point(),
            'Charge at pH 7': analysis.charge_at_pH(pH=7),
            'Alpha-Helix Fraction': analysis.secondary_structure_fraction()[0],
            'Beta-Sheet Fraction': analysis.secondary_structure_fraction()[2],
            'Coil Fraction': analysis.secondary_structure_fraction()[1],
            'Molar Extinction Coefficient (Reduced Cysteines)': analysis.molar_extinction_coefficient()[0],
            'Molar Extinction Coefficient (Oxidized Cysteines)': analysis.molar_extinction_coefficient()[1]
        })

    return pd.DataFrame(results)

def calculate_atomic_composition(fasta_file):
    def atomic_composition(protein):
        atomic_weights = {
            'A': (3, 7, 2, 1, 0), 'R': (6, 14, 2, 4, 0), 'N': (4, 8, 3, 2, 0), 'D': (4, 7, 4, 1, 0),
            'C': (3, 7, 2, 1, 1), 'Q': (5, 10, 3, 2, 0), 'E': (5, 9, 4, 1, 0), 'G': (2, 5, 2, 1, 0),
            'H': (6, 9, 2, 3, 0), 'I': (6, 13, 2, 1, 0), 'L': (6, 13, 2, 1, 0), 'K': (6, 14, 2, 2, 0),
            'M': (5, 11, 2, 1, 1), 'F': (9, 11, 2, 1, 0), 'P': (5, 9, 2, 1, 0), 'S': (3, 7, 3, 1, 0),
            'T': (4, 9, 3, 1, 0), 'W': (11, 12, 2, 2, 0), 'Y': (9, 11, 3, 1, 0), 'V': (5, 11, 2, 1, 0)
        }
        c = h = o = n = s = 0
        for aa in protein:
            if aa in atomic_weights:
                c_add, h_add, o_add, n_add, s_add = atomic_weights[aa]
                c += c_add; h += h_add; o += o_add; n += n_add; s += s_add
        return c, h, o, n, s

    results = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = str(record.seq)
        composition = atomic_composition(sequence)
        results.append(list(composition))

    return pd.DataFrame(results, columns=["C", "H", "O", "N", "S"])

def generate_combined_features(input_fasta):
    # Extract individual features
    feature_df1 = extract_ifeatures(input_fasta)
    feature_df2 = extract_peptide_descriptors(input_fasta)  # This is the corrected line
    feature_df3 = generate_protparam_features(input_fasta)
    feature_df4 = calculate_atomic_composition(input_fasta)

    # Merge the feature dataframes
    final_df = pd.concat([feature_df1, feature_df2, feature_df3, feature_df4], axis=1)

    # Save the final combined dataframe to an Excel file
    base_name = os.path.splitext(os.path.basename(input_fasta))[0]
    output_file = f"{base_name}_combinedfeatures.xlsx"
    final_df.to_excel(output_file, index=False)

    print(f"Combined features saved to {output_file}")

# Example usage:
generate_combined_features("peptide.fasta")


Combined features saved to peptide_combinedfeatures.xlsx


In [8]:
import pandas as pd
import os

def filter_columns_by_headers(file_path, headers_to_keep, output_file):
    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'.")
        return

    # Read the Excel file
    df = pd.read_excel(file_path)

    # Retain only the columns in the provided list
    filtered_df = df[headers_to_keep]

    # Save the filtered DataFrame to a new Excel file
    filtered_df.to_excel(output_file, index=False)
    print(f"Filtered file saved as: {output_file}")

# Example usage
file_path = "peptide_combinedfeatures.xlsx"  # Correct your file path

headers_to_keep = ['Number of Amino Acids', 'Molecular Weight', 'Aromaticity', 'Isoelectric Point', 'Charge at pH 7', 'Alpha-Helix Fraction', 'Beta-Sheet Fraction', 'Coil Fraction', 'Molar Extinction Coefficient (Reduced Cysteines)', 'Molar Extinction Coefficient (Oxidized Cysteines)', 'C', 'H', 'O', 'N', 'S', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA4', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SlogP_VSA1', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP', 'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_Ar_COO', 'fr_Ar_N', 'fr_C_O', 'fr_C_O_noCOO', 'fr_NH1', 'fr_NH2', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_benzene', 'fr_ester', 'fr_guanido', 'fr_nitro', 'fr_nitro_arom', 'fr_sulfide', 'fr_sulfonamd', 'A', 'C.1', 'D', 'E', 'F', 'G', 'H.1', 'I', 'K', 'L', 'M', 'N.1', 'P', 'Q', 'R', 'S.1', 'T', 'V', 'W', 'Y', 'AK', 'AV', 'AW', 'CA', 'CC', 'CE', 'CG', 'CR', 'CS', 'CT', 'CV', 'CY', 'DC', 'DG', 'DY', 'EC', 'EI', 'ER', 'FI', 'FK', 'FL', 'GC', 'GG', 'GR', 'GS', 'GW', 'HC', 'IF', 'IR', 'IY', 'KA', 'KI', 'KK', 'KL', 'KV', 'KW', 'LA', 'LC', 'LF', 'LK', 'LL', 'LR', 'MF', 'MK', 'MS', 'MV', 'NI', 'NR', 'PC', 'PE', 'PG', 'RC', 'RG', 'RM', 'RP', 'RR', 'RW', 'SE', 'SG', 'SI', 'SN', 'ST', 'VA', 'VT', 'WK', 'WQ', 'WR', 'YG', 'YQ', 'YS', 'hydrophobicity_PRAM900101.G1', 'hydrophobicity_PRAM900101.G2', 'hydrophobicity_PRAM900101.G3', 'hydrophobicity_ARGP820101.G1', 'hydrophobicity_ARGP820101.G2', 'hydrophobicity_ARGP820101.G3', 'hydrophobicity_ZIMJ680101.G1', 'hydrophobicity_ZIMJ680101.G2', 'hydrophobicity_ZIMJ680101.G3', 'hydrophobicity_PONP930101.G1', 'hydrophobicity_PONP930101.G2', 'hydrophobicity_PONP930101.G3', 'hydrophobicity_CASG920101.G1', 'hydrophobicity_CASG920101.G2', 'hydrophobicity_CASG920101.G3', 'hydrophobicity_ENGD860101.G1', 'hydrophobicity_ENGD860101.G2', 'hydrophobicity_ENGD860101.G3', 'hydrophobicity_FASG890101.G1', 'hydrophobicity_FASG890101.G2', 'hydrophobicity_FASG890101.G3', 'normwaalsvolume.G1', 'normwaalsvolume.G2', 'normwaalsvolume.G3', 'polarity.G1', 'polarity.G2', 'polarity.G3', 'polarizability.G1', 'polarizability.G2', 'polarizability.G3', 'charge.G1', 'charge.G2', 'charge.G3', 'secondarystruct.G1', 'secondarystruct.G2', 'secondarystruct.G3', 'solventaccess.G1', 'solventaccess.G2', 'solventaccess.G3', 'hydrophobicity_PRAM900101.1.residue0', 'hydrophobicity_PRAM900101.1.residue25', 'hydrophobicity_PRAM900101.1.residue50', 'hydrophobicity_PRAM900101.1.residue75', 'hydrophobicity_PRAM900101.1.residue100', 'hydrophobicity_PRAM900101.2.residue0', 'hydrophobicity_PRAM900101.2.residue25', 'hydrophobicity_PRAM900101.2.residue50', 'hydrophobicity_PRAM900101.2.residue75', 'hydrophobicity_PRAM900101.2.residue100', 'hydrophobicity_PRAM900101.3.residue0', 'hydrophobicity_PRAM900101.3.residue25', 'hydrophobicity_PRAM900101.3.residue50', 'hydrophobicity_PRAM900101.3.residue75', 'hydrophobicity_PRAM900101.3.residue100', 'hydrophobicity_ARGP820101.1.residue0', 'hydrophobicity_ARGP820101.1.residue25', 'hydrophobicity_ARGP820101.1.residue50', 'hydrophobicity_ARGP820101.1.residue75', 'hydrophobicity_ARGP820101.1.residue100', 'hydrophobicity_ARGP820101.2.residue0', 'hydrophobicity_ARGP820101.2.residue25', 'hydrophobicity_ARGP820101.2.residue50', 'hydrophobicity_ARGP820101.2.residue75', 'hydrophobicity_ARGP820101.2.residue100', 'hydrophobicity_ARGP820101.3.residue0', 'hydrophobicity_ARGP820101.3.residue25', 'hydrophobicity_ARGP820101.3.residue50', 'hydrophobicity_ARGP820101.3.residue75', 'hydrophobicity_ARGP820101.3.residue100', 'hydrophobicity_ZIMJ680101.1.residue0', 'hydrophobicity_ZIMJ680101.1.residue25', 'hydrophobicity_ZIMJ680101.1.residue50', 'hydrophobicity_ZIMJ680101.1.residue75', 'hydrophobicity_ZIMJ680101.1.residue100', 'hydrophobicity_ZIMJ680101.2.residue0', 'hydrophobicity_ZIMJ680101.2.residue25', 'hydrophobicity_ZIMJ680101.2.residue50', 'hydrophobicity_ZIMJ680101.2.residue75', 'hydrophobicity_ZIMJ680101.2.residue100', 'hydrophobicity_ZIMJ680101.3.residue0', 'hydrophobicity_ZIMJ680101.3.residue25', 'hydrophobicity_ZIMJ680101.3.residue50', 'hydrophobicity_ZIMJ680101.3.residue75', 'hydrophobicity_ZIMJ680101.3.residue100', 'hydrophobicity_PONP930101.1.residue0', 'hydrophobicity_PONP930101.1.residue25', 'hydrophobicity_PONP930101.1.residue50', 'hydrophobicity_PONP930101.1.residue75', 'hydrophobicity_PONP930101.1.residue100', 'hydrophobicity_PONP930101.2.residue0', 'hydrophobicity_PONP930101.2.residue25', 'hydrophobicity_PONP930101.2.residue50', 'hydrophobicity_PONP930101.2.residue75', 'hydrophobicity_PONP930101.2.residue100', 'hydrophobicity_PONP930101.3.residue0', 'hydrophobicity_PONP930101.3.residue25', 'hydrophobicity_PONP930101.3.residue50', 'hydrophobicity_PONP930101.3.residue75', 'hydrophobicity_PONP930101.3.residue100', 'hydrophobicity_CASG920101.1.residue0', 'hydrophobicity_CASG920101.1.residue25', 'hydrophobicity_CASG920101.1.residue50', 'hydrophobicity_CASG920101.1.residue75', 'hydrophobicity_CASG920101.1.residue100', 'hydrophobicity_CASG920101.2.residue0', 'hydrophobicity_CASG920101.2.residue25', 'hydrophobicity_CASG920101.2.residue50', 'hydrophobicity_CASG920101.2.residue75', 'hydrophobicity_CASG920101.2.residue100', 'hydrophobicity_CASG920101.3.residue0', 'hydrophobicity_CASG920101.3.residue25', 'hydrophobicity_CASG920101.3.residue50', 'hydrophobicity_CASG920101.3.residue75', 'hydrophobicity_CASG920101.3.residue100', 'hydrophobicity_ENGD860101.1.residue0', 'hydrophobicity_ENGD860101.1.residue25', 'hydrophobicity_ENGD860101.1.residue50', 'hydrophobicity_ENGD860101.1.residue75', 'hydrophobicity_ENGD860101.1.residue100', 'hydrophobicity_ENGD860101.2.residue0', 'hydrophobicity_ENGD860101.2.residue25', 'hydrophobicity_ENGD860101.2.residue50', 'hydrophobicity_ENGD860101.2.residue75', 'hydrophobicity_ENGD860101.2.residue100', 'hydrophobicity_ENGD860101.3.residue0', 'hydrophobicity_ENGD860101.3.residue25', 'hydrophobicity_ENGD860101.3.residue50', 'hydrophobicity_ENGD860101.3.residue75', 'hydrophobicity_ENGD860101.3.residue100', 'hydrophobicity_FASG890101.1.residue0', 'hydrophobicity_FASG890101.1.residue25', 'hydrophobicity_FASG890101.1.residue50', 'hydrophobicity_FASG890101.1.residue75', 'hydrophobicity_FASG890101.1.residue100', 'hydrophobicity_FASG890101.2.residue0', 'hydrophobicity_FASG890101.2.residue25', 'hydrophobicity_FASG890101.2.residue50', 'hydrophobicity_FASG890101.2.residue75', 'hydrophobicity_FASG890101.2.residue100', 'hydrophobicity_FASG890101.3.residue0', 'hydrophobicity_FASG890101.3.residue25', 'hydrophobicity_FASG890101.3.residue50', 'hydrophobicity_FASG890101.3.residue75', 'hydrophobicity_FASG890101.3.residue100', 'normwaalsvolume.1.residue0', 'normwaalsvolume.1.residue25', 'normwaalsvolume.1.residue50', 'normwaalsvolume.1.residue75', 'normwaalsvolume.1.residue100', 'normwaalsvolume.2.residue0', 'normwaalsvolume.2.residue25', 'normwaalsvolume.2.residue50', 'normwaalsvolume.2.residue75', 'normwaalsvolume.2.residue100', 'normwaalsvolume.3.residue0', 'normwaalsvolume.3.residue25', 'normwaalsvolume.3.residue50', 'normwaalsvolume.3.residue75', 'normwaalsvolume.3.residue100', 'polarity.1.residue0', 'polarity.1.residue25', 'polarity.1.residue50', 'polarity.1.residue75', 'polarity.1.residue100', 'polarity.2.residue0', 'polarity.2.residue25', 'polarity.2.residue50', 'polarity.2.residue75', 'polarity.2.residue100', 'polarity.3.residue0', 'polarity.3.residue25', 'polarity.3.residue50', 'polarity.3.residue75', 'polarity.3.residue100', 'polarizability.1.residue0', 'polarizability.1.residue25', 'polarizability.1.residue50', 'polarizability.1.residue75', 'polarizability.1.residue100', 'polarizability.2.residue0', 'polarizability.2.residue25', 'polarizability.2.residue50', 'polarizability.2.residue75', 'polarizability.2.residue100', 'polarizability.3.residue0', 'polarizability.3.residue25', 'polarizability.3.residue50', 'polarizability.3.residue75', 'polarizability.3.residue100', 'charge.1.residue0', 'charge.1.residue25', 'charge.1.residue50', 'charge.1.residue75', 'charge.1.residue100', 'charge.2.residue0', 'charge.2.residue25', 'charge.2.residue50', 'charge.2.residue75', 'charge.2.residue100', 'charge.3.residue0', 'charge.3.residue25', 'charge.3.residue50', 'charge.3.residue75', 'charge.3.residue100', 'secondarystruct.1.residue0', 'secondarystruct.1.residue25', 'secondarystruct.1.residue50', 'secondarystruct.1.residue75', 'secondarystruct.1.residue100', 'secondarystruct.2.residue0', 'secondarystruct.2.residue25', 'secondarystruct.2.residue50', 'secondarystruct.2.residue75', 'secondarystruct.2.residue100', 'secondarystruct.3.residue0', 'secondarystruct.3.residue25', 'secondarystruct.3.residue50', 'secondarystruct.3.residue75', 'secondarystruct.3.residue100', 'solventaccess.1.residue0', 'solventaccess.1.residue25', 'solventaccess.1.residue50', 'solventaccess.1.residue75', 'solventaccess.1.residue100', 'solventaccess.2.residue0', 'solventaccess.2.residue25', 'solventaccess.2.residue50', 'solventaccess.2.residue75', 'solventaccess.2.residue100', 'solventaccess.3.residue0', 'solventaccess.3.residue25', 'solventaccess.3.residue50', 'solventaccess.3.residue75', 'solventaccess.3.residue100', 'hydrophobicity_PRAM900101.Tr1221', 'hydrophobicity_PRAM900101.Tr1331', 'hydrophobicity_PRAM900101.Tr2332', 'hydrophobicity_ARGP820101.Tr1221', 'hydrophobicity_ARGP820101.Tr1331', 'hydrophobicity_ARGP820101.Tr2332', 'hydrophobicity_ZIMJ680101.Tr1221', 'hydrophobicity_ZIMJ680101.Tr1331', 'hydrophobicity_ZIMJ680101.Tr2332', 'hydrophobicity_PONP930101.Tr1221', 'hydrophobicity_PONP930101.Tr1331', 'hydrophobicity_PONP930101.Tr2332', 'hydrophobicity_CASG920101.Tr1221', 'hydrophobicity_CASG920101.Tr1331', 'hydrophobicity_CASG920101.Tr2332', 'hydrophobicity_ENGD860101.Tr1221', 'hydrophobicity_ENGD860101.Tr1331', 'hydrophobicity_ENGD860101.Tr2332', 'hydrophobicity_FASG890101.Tr1221', 'hydrophobicity_FASG890101.Tr1331', 'hydrophobicity_FASG890101.Tr2332', 'normwaalsvolume.Tr1221', 'normwaalsvolume.Tr1331', 'normwaalsvolume.Tr2332', 'polarity.Tr1221', 'polarity.Tr1331', 'polarity.Tr2332', 'polarizability.Tr1221', 'polarizability.Tr1331', 'polarizability.Tr2332', 'charge.Tr1221', 'charge.Tr1331', 'charge.Tr2332', 'secondarystruct.Tr1221', 'secondarystruct.Tr1331', 'secondarystruct.Tr2332', 'solventaccess.Tr1221', 'solventaccess.Tr1331', 'solventaccess.Tr2332']
# Replace with the headers you want to retain
output_file = "peptide_eliminated.xlsx"

filter_columns_by_headers(file_path, headers_to_keep, output_file)
print(f"Selected features saved to {output_file}")

Filtered file saved as: peptide_eliminated.xlsx
Selected features saved to peptide_eliminated.xlsx


In [11]:
import pandas as pd
import os

def filter_columns_by_headers(file_path, headers_to_keep, output_file):
    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'.")
        return

    # Read the Excel file
    df = pd.read_excel(file_path)

    # Retain only the columns in the provided list
    filtered_df = df[headers_to_keep]

    # Save the filtered DataFrame to a new Excel file
    filtered_df.to_excel(output_file, index=False)
    print(f"Filtered file saved as: {output_file}")

# Example usage
file_path = "peptide_eliminated.xlsx"  # Correct your file path
headers_to_keep = ['SPS', 'FpDensityMorgan1', 'FpDensityMorgan2', 'BCUT2D_MWLOW', 'BCUT2D_CHGLO', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'Chi1', 'Chi2n', 'Chi2v', 'Kappa2', 'Kappa3', 'PEOE_VSA6', 'PEOE_VSA7', 'SMR_VSA4', 'SMR_VSA5', 'SlogP_VSA1', 'SlogP_VSA4', 'SlogP_VSA5', 'EState_VSA10', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA9', 'VSA_EState2', 'VSA_EState4', 'FractionCSP3', 'NumRotatableBonds', 'MolLogP', 'fr_NH2', 'A', 'C.1', 'D', 'E', 'F', 'G', 'H.1', 'I', 'K', 'L', 'M', 'N.1', 'P', 'Q', 'R', 'S.1', 'T', 'V', 'W', 'Y', 'hydrophobicity_PRAM900101.G1', 'hydrophobicity_PRAM900101.G2', 'hydrophobicity_PONP930101.G1', 'hydrophobicity_PONP930101.G3', 'hydrophobicity_CASG920101.G3', 'normwaalsvolume.G1', 'solventaccess.G3', 'hydrophobicity_PRAM900101.3.residue75', 'hydrophobicity_ARGP820101.3.residue0', 'hydrophobicity_ZIMJ680101.1.residue50', 'hydrophobicity_ZIMJ680101.1.residue75', 'hydrophobicity_ZIMJ680101.1.residue100', 'hydrophobicity_PONP930101.2.residue25', 'hydrophobicity_PONP930101.3.residue25', 'hydrophobicity_PONP930101.3.residue50', 'hydrophobicity_PONP930101.3.residue75', 'hydrophobicity_PONP930101.3.residue100', 'hydrophobicity_CASG920101.3.residue0', 'hydrophobicity_CASG920101.3.residue25', 'hydrophobicity_CASG920101.3.residue50', 'hydrophobicity_CASG920101.3.residue100', 'hydrophobicity_ENGD860101.2.residue100', 'hydrophobicity_ENGD860101.3.residue0', 'hydrophobicity_ENGD860101.3.residue50', 'hydrophobicity_ENGD860101.3.residue75', 'hydrophobicity_FASG890101.3.residue0', 'polarity.1.residue75', 'polarity.1.residue100', 'polarizability.1.residue0', 'charge.1.residue75', 'charge.2.residue75', 'secondarystruct.2.residue75', 'secondarystruct.3.residue75', 'solventaccess.1.residue50', 'solventaccess.1.residue75', 'solventaccess.2.residue0', 'hydrophobicity_PRAM900101.Tr1331', 'hydrophobicity_PRAM900101.Tr2332', 'hydrophobicity_ARGP820101.Tr2332', 'hydrophobicity_PONP930101.Tr1331', 'hydrophobicity_PONP930101.Tr2332', 'hydrophobicity_CASG920101.Tr1221', 'hydrophobicity_CASG920101.Tr2332', 'hydrophobicity_ENGD860101.Tr2332', 'hydrophobicity_FASG890101.Tr1331', 'normwaalsvolume.Tr1331', 'normwaalsvolume.Tr2332', 'polarity.Tr1331', 'polarizability.Tr2332', 'secondarystruct.Tr1221']# Replace with the headers you want to retain
output_file = "peptide.xlsx"

filter_columns_by_headers(file_path, headers_to_keep, output_file)


Filtered file saved as: peptide.xlsx


In [12]:
!pip install scikit-learn==1.6.1

Collecting scikit-learn==1.6.1
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.2
    Uninstalling scikit-learn-1.4.2:
      Successfully uninstalled scikit-learn-1.4.2
Successfully installed scikit-learn-1.6.1


Loading models

In [14]:
import joblib

gnb = joblib.load("gnb_model.pkl")
rf_model = joblib.load("rf_model.pkl")
scaler_rf = joblib.load("scaler_rf.pkl")
ab_model = joblib.load("adaboost_model.pkl")
scaler_ab = joblib.load("scaler_ab.pkl")
et_model = joblib.load("extratrees_model.pkl")
scaler_et = joblib.load("scaler_et.pkl")

In [15]:
import torch
import torch.nn as nn

class EnhancedANN(nn.Module):
    def __init__(self, input_dim):
        super(EnhancedANN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

# Define the model path
model_path = "ann_model.pth"  # Ensure this is the correct path

input_dim = 100

# Initialize the model
ann_model = EnhancedANN(input_dim)

# Load the saved model weights
ann_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Set the model to evaluation mode
ann_model.eval()

print("Model successfully loaded!")


Model successfully loaded!


  ann_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


In [16]:
import torch
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, input_channels, input_dim):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(32, 16, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(16 * input_dim, 64)
        self.fc2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = self.dropout(x)
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

# Define the model path in Google Drive
model_path = "cnn_model.pth"  # Adjust the path as needed

# Specify the input dimensions (must match the trained model's input)
input_channels = 1
input_dim = 100  # Replace with the actual number of features used during training

# Initialize the model with the correct input shape
cnn_model = CNN(input_channels, input_dim)

# Load the saved weights
cnn_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
cnn_model.eval()  # Set model to evaluation mode

print("Model successfully loaded!")


Model successfully loaded!


  cnn_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


Prediction using various models

In [18]:
df = pd.read_excel("peptide.xlsx")

In [26]:
X_gnb = df.values
gnb_prob = gnb.predict_proba(X_gnb)[:, 1]  # Probability of class 1
gnb_pred = gnb.predict(X_gnb)
print("Predictions from Gaussian Naive Bayes:", gnb_pred)

X_rf = scaler_rf.transform(df)
rf_prob = rf_model.predict_proba(X_rf)[:, 1]
rf_pred = rf_model.predict(X_rf)
print("Predictions from Random Forest:", rf_pred)


X_ab = scaler_ab.transform(df)
ab_prob = ab_model.predict_proba(X_ab)[:, 1]
ab_pred = ab_model.predict(X_ab)
print("Predictions from AdaBoost:", ab_pred)


X_et = scaler_et.transform(df)
et_prob = et_model.predict_proba(X_et)[:, 1]
et_pred = et_model.predict(X_et)
print("Predictions from Extra Trees:", et_pred)

X_ann = torch.tensor(df.values, dtype=torch.float32)
with torch.no_grad():
    ann_prob = ann_model(X_ann).numpy().flatten()  # Already sigmoid output
ann_pred = (ann_prob > 0.5).astype(int)

print("Predictions from ANN:", ann_pred.flatten())


X_cnn = torch.tensor(df.values, dtype=torch.float32).unsqueeze(1)
with torch.no_grad():
    cnn_prob = cnn_model(X_cnn).numpy().flatten()  # Already sigmoid output
cnn_pred = (cnn_prob > 0.5).astype(int)

print("Predictions from CNN:", cnn_pred.flatten())

import numpy as np
from scipy.stats import mode

pred_ann = ann_pred
pred_cnn = cnn_pred
pred_gnb = gnb_pred
pred_rf = rf_pred
pred_ab = ab_pred
pred_et = et_pred

prob_ann = ann_prob
prob_cnn = cnn_prob
prob_gnb = gnb_prob
prob_rf = rf_prob
prob_ab = ab_prob
prob_et = et_prob

predictions = np.vstack([pred_ann, pred_cnn, pred_gnb, pred_rf, pred_ab, pred_et])


final_pred, _ = mode(predictions, axis=0)
final_pred = final_pred.flatten()

probabilities = np.vstack([prob_ann, prob_cnn, prob_gnb, prob_rf, prob_ab, prob_et])

final_prob = np.mean(probabilities, axis=0)

for i in range(len(final_pred)):
    if final_pred[i] == 1:
        print(f"Sample {i+1}: ABCP (Probability: {final_prob[i]:.4f})")
    else:
        print(f"Sample {i+1}: NON-ABCP (Probability: {final_prob[i]:.4f})")



Predictions from Gaussian Naive Bayes: [1]
Predictions from Random Forest: [1]
Predictions from AdaBoost: [0]
Predictions from Extra Trees: [1]
Predictions from ANN: [1]
Predictions from CNN: [1]
Sample 1: ABCP (Probability: 0.7690)


