# In [130]:  (Jupyter cell marker left over from the notebook export)
"""
References
1. Biopython handbook for more PDB parsing functions: https://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ
2. Deep-learning, structure-based solubility prediction: https://jcheminf.biomedcentral.com/articles/10.1186/s13321-021-00488-1
3. Features used by another ML-based prediction paper: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-134/tables/3
4. SolArt GitHub (note: this link points to the PremPS repository): https://github.com/minghuilab/PremPS/blob/e7cf74a467677bad50e20161777ca609362de1e3/v1.0.0/PremPS.py#L304
5. SOLart paper: https://academic.oup.com/bioinformatics/article/36/5/1445/5585748
"""

# from Bio.PDB.PDBParser import PDBParser
# from Bio.PDB import parse_pdb_header

from Bio import SeqIO
import csv

# Residue classes, each a list of one-letter amino-acid codes.
hydrophobic = list("VILFMWYC")
negatively_charged = list("DE")
positively_charged = list("RK")

# Three-letter codes of the 20 standard amino acids.
normal_format_pro = (
    "CYS GLN ILE SER VAL MET ASN PRO LYS THR "
    "PHE ALA HIS GLY ASP LEU ARG TRP GLU TYR"
).split()

# Lookup table: three-letter residue name -> one-letter code.
map_three_one = {
    # The 20 standard residues.
    "GLY": "G", "ALA": "A", "SER": "S", "THR": "T",
    "CYS": "C", "VAL": "V", "LEU": "L", "ILE": "I",
    "MET": "M", "PRO": "P", "PHE": "F", "TYR": "Y",
    "TRP": "W", "ASP": "D", "GLU": "E", "ASN": "N",
    "GLN": "Q", "HIS": "H", "LYS": "K", "ARG": "R",
    # Other recognised names collapse to the placeholder "X".
    "ASX": "X", "GLX": "X", "CSO": "X", "HIP": "X",
    "MSE": "X", "UNK": "X", "SEC": "X", "PYL": "X",
    "SEP": "X", "TPO": "X", "PTR": "X", "XLE": "X",
    "XAA": "X",
    # These three names are read as "H" (presumably histidine naming
    # variants used by simulation packages -- TODO confirm).
    "HSD": "H", "HID": "H", "HSE": "H",
}

# Lookup table: one-letter code -> three-letter residue name
# (20 standard residues only).
map_one_three = dict(zip(
    "GASTCVLIMPFYWDENQHKR",
    (
        "GLY ALA SER THR CYS "
        "VAL LEU ILE MET PRO "
        "PHE TYR TRP ASP GLU "
        "ASN GLN HIS LYS ARG"
    ).split(),
))

# SASA_sol: per-residue surface-area value keyed by one-letter code.
# Units are not stated in this file (presumably square angstroms -- TODO
# confirm). The placeholder 'X' carries the same value as 'G'.
map_surface = {
    'A': 118.1, 'R': 256.0, 'N': 165.5,
    'D': 158.7, 'C': 146.1, 'Q': 193.2,
    'E': 186.2, 'G': 88.1, 'H': 202.5,
    'I': 181.0, 'L': 193.1, 'K': 225.8,
    'M': 203.4, 'F': 222.8, 'P': 146.8,
    'S': 129.8, 'T': 152.5, 'W': 266.3,
    'Y': 236.8, 'V': 164.5, 'X': 88.1,
}

# Module-level accumulators that the pipeline functions below fill in place.
#   pdb_list       -- entries read from the PDB list file
#   pdb_matrix     -- one feature dict per parsed sequence record
#   solubility_map -- PDB name -> solubility score
pdb_list, pdb_matrix, solubility_map = [], [], {}

def generate_pdb_list():
    """Append the entries of ``pdb_list.txt`` to the module-level ``pdb_list``.

    The file is read from the current working directory, one entry per
    line, and surrounding whitespace is stripped from each entry.

    Blank lines are skipped. An empty entry would later be joined onto the
    structure directory path, yielding the directory itself, which cannot
    be opened as a PDB file.

    Raises:
        OSError: if ``pdb_list.txt`` cannot be opened.
    """
    with open("pdb_list.txt", "r") as file:
        stripped = (line.strip() for line in file)
        pdb_list.extend(name for name in stripped if name)

def generate_solubility_mapping():
    """Load PDB-name -> solubility-score pairs into ``solubility_map``.

    Reads ``data/training/crystal_structs/solubility_values.csv``. The first
    row is treated as a header and skipped. In every following row the first
    column is the PDB name and the second is an integer score. A name that
    appears more than once keeps its last score.

    Raises:
        OSError: if the CSV file cannot be opened.
        ValueError: if a score cannot be parsed as an integer.
        IndexError: if a non-blank row has fewer than two columns.
    """
    # newline="" is what the csv module expects of the file objects it reads.
    with open("data/training/crystal_structs/solubility_values.csv", "r",
              newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(csvreader, None)

        for row in csvreader:
            # Blank (or whitespace-only) lines carry no data; skip them
            # rather than failing on row[0] / row[1].
            if not any(cell.strip() for cell in row):
                continue
            pdb_name = row[0].strip()
            solubility_score = int(row[1].strip())
            solubility_map[pdb_name] = solubility_score
            
def pdb_data_curation():
    """Build one feature row per sequence record and append it to ``pdb_matrix``.

    For every entry in ``pdb_list`` the file of that name under
    ``data/training/crystal_structs/`` is parsed with ``SeqIO.parse`` in
    ``'pdb-atom'`` format. Each record produced yields one dict holding the
    PDB name, its solubility score, the sequence, its length, and the
    residue-composition ratios from ``calculate_aa_combos``. A file that
    yields several records therefore contributes several rows.

    Prints a message and returns without doing anything when ``pdb_list``
    is empty.

    Raises:
        OSError: if a structure file cannot be opened.
        KeyError: if a PDB name has no entry in ``solubility_map``.
    """
    if not pdb_list:
        print("Pdb list is empty")
        return

    suffix = ".pdb"
    for pdb in pdb_list:
        # Remove the extension as a suffix. str.rstrip(".pdb") strips any
        # trailing '.', 'p', 'd' or 'b' characters, so "1abd.pdb" would
        # become "1a" and miss its solubility entry.
        pdb_name = pdb[:-len(suffix)] if pdb.endswith(suffix) else pdb
        pdb_path = "data/training/crystal_structs/" + pdb

        with open(pdb_path, 'r') as pdb_file:
            for record in SeqIO.parse(pdb_file, 'pdb-atom'):
                seq = str(record.seq)
                LysArg, AspGlu, AspGluLysArg, PheTyrTrp = calculate_aa_combos(seq)
                pdb_matrix.append({
                    "PDB File": pdb_name,
                    "Solubility Score": solubility_map[pdb_name],
                    # Use the sequence just parsed. The previous code read an
                    # undefined name `sequence` here.
                    "Sequence": seq,
                    "Length": len(seq),
                    "Lys+Arg/Len": LysArg,
                    "Asp+Glu/Len": AspGlu,
                    "Asp+Glu+Lys+Arg/Len": AspGluLysArg,
                    "Phe+Tyr+Trp/Len": PheTyrTrp,
                })
    
def calculate_aa_combos(sequence):
    """Return residue-composition fractions of a one-letter protein sequence.

    Args:
        sequence: amino-acid sequence as a string of upper-case one-letter
            codes (``str.count`` is case-sensitive).

    Returns:
        A 4-tuple of floats, each rounded to 3 decimal places:
        ``(Lys+Arg, Asp+Glu, Asp+Glu+Lys+Arg, Phe+Tyr+Trp)``, each divided
        by the sequence length. An empty sequence yields all zeros.
    """
    seq_length = len(sequence)
    if seq_length == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0, 0.0, 0.0, 0.0

    positive = sequence.count("K") + sequence.count("R")
    negative = sequence.count("D") + sequence.count("E")
    aromatic = sequence.count("F") + sequence.count("Y") + sequence.count("W")

    lys_arg = round(positive / seq_length, 3)
    asp_glu = round(negative / seq_length, 3)
    # Computed from the raw counts: adding the two already-rounded fractions
    # compounds the rounding error (1/3 + 1/3 would give 0.666, not 0.667).
    asp_glu_lys_arg = round((positive + negative) / seq_length, 3)
    phe_tyr_trp = round(aromatic / seq_length, 3)

    return lys_arg, asp_glu, asp_glu_lys_arg, phe_tyr_trp

def pdb_to_csv():
    """Write the rows collected in ``pdb_matrix`` to ``pdb2csv_v2.csv``.

    The column order is taken from the keys of the first row. The file is
    created in the current working directory and overwritten if it exists.
    When ``pdb_matrix`` is empty nothing is written and a message is
    printed. An I/O failure is reported with a printed message rather than
    raised.
    """
    if not pdb_matrix:
        # Without a first row there are no column names to take.
        print("No PDB records to write")
        return

    csv_columns = list(pdb_matrix[0].keys())
    csv_file = "pdb2csv_v2.csv"
    try:
        # newline='' lets the csv module control line endings; without it,
        # an extra blank row appears after every record on Windows.
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            writer.writerows(pdb_matrix)
    except IOError:
        print("I/O error")

""" Note: placeholders are left below for features that still need to be added.
If you have the time and bandwidth, please add each feature as its own function. """

# def isoelectric_point():
# def steric parameters():
# def hydrophobicity():
# def volume ():
# def polarizability():
# def isoelectric point():
# def helix probability():
# def sheet probability():

def main():
    """Run the whole pipeline, one stage after another.

    Stages, in order: read the PDB list, load the solubility scores, build
    the per-record feature rows, and write them to CSV.
    """
    stages = (
        generate_pdb_list,
        generate_solubility_mapping,
        pdb_data_curation,
        pdb_to_csv,
    )
    for stage in stages:
        stage()

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

