In [104]:
!pip install --user freesasa

Collecting freesasa
Installing collected packages: freesasa
Successfully installed freesasa-2.1.0
[33mYou are using pip version 18.1, however version 21.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
ROOT_DIR = !pwd

In [17]:
print('a'.upper())

A


In [18]:
""" 
References
1. Biopython handbook for more pdb parsing functions: https://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ
2. Deep Learning. structure based solubility prediction: https://jcheminf.biomedcentral.com/articles/10.1186/s13321-021-00488-1
3. Features used by another ML based prediction paper: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-134/tables/3
4. SolArt github: https://github.com/minghuilab/PremPS/blob/e7cf74a467677bad50e20161777ca609362de1e3/v1.0.0/PremPS.py#L304
5. SOLart paper: https://academic.oup.com/bioinformatics/article/36/5/1445/5585748
"""

# from Bio.PDB.PDBParser import PDBParser
# from Bio.PDB import parse_pdb_header

from Bio import SeqIO
import csv
import freesasa
import os
from os.path import join as join_p
import predict

ROOT_DIR = os.getcwd()
hydrophobic = ['V', 'I', 'L', 'F', 'M', 'W', 'Y', 'C']
negatively_charged = ['D', 'E']
positively_charged = ['R', 'K']
normal_format_pro = ['CYS','GLN','ILE','SER','VAL','MET','ASN','PRO','LYS','THR','PHE','ALA','HIS','GLY','ASP','LEU',
                     'ARG','TRP','GLU','TYR']

# map residue name three letters to one
map_three_one = {"GLY": "G", "ALA": "A", "SER": "S", "THR": "T", "CYS": "C",
                 "VAL": "V", "LEU": "L", "ILE": "I", "MET": "M", "PRO": "P",
                 "PHE": "F", "TYR": "Y", "TRP": "W", "ASP": "D", "GLU": "E",
                 "ASN": "N", "GLN": "Q", "HIS": "H", "LYS": "K", "ARG": "R",
                 "ASX": "X", "GLX": "X", "CSO": "X", "HIP": "X", "MSE": "X",
                 "UNK": "X", "SEC": "X", "PYL": "X", "SEP": "X", "TPO": "X",
                 "PTR": "X", "XLE": "X", "XAA": "X", "HSD": "H", "HID": "H",
                 "HSE": "H"}

# map residue name one letter to three
map_one_three = {"G": "GLY", "A": "ALA", "S": "SER", "T": "THR", "C": "CYS",
                 "V": "VAL", "L": "LEU", "I": "ILE", "M": "MET", "P": "PRO",
                 "F": "PHE", "Y": "TYR", "W": "TRP", "D": "ASP", "E": "GLU",
                 "N": "ASN", "Q": "GLN", "H": "HIS", "K": "LYS", "R": "ARG"}

# SASA_sol
map_surface = {'A':118.1,'R':256.0,'N':165.5,'D':158.7,'C':146.1,'Q':193.2,
               'E':186.2,'G':88.1,'H':202.5,'I':181.0,'L':193.1,'K':225.8,
               'M':203.4,'F':222.8,'P':146.8,'S':129.8,'T':152.5,'W':266.3,
               'Y':236.8,'V':164.5,'X':88.1}

global pdb_list, pdb_matrix, solubility_map
pdb_list = []
pdb_matrix = []
solubility_map = {}

def generate_pdb_list ():
    with open ("pdb_list.txt", "r") as file:
        for line in file:
            pdb_list.append(line.strip())

def generate_solubility_mapping ():
    with open("data/training/crystal_structs/solubility_values.csv", "r") as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)

        for row in csvreader:
            pdb_name = row[0].strip()
            solubility_score = int(row[1].strip())
            solubility_map[pdb_name] = solubility_score
            
def pdb_data_curation():
    if pdb_list:
        for pdb in pdb_list:
            pdb_path = join_p(ROOT_DIR,"data/training/crystal_structs/" + pdb)
            structure = freesasa.Structure(pdb_path)
            result = freesasa.calc(structure)
            area_classes = freesasa.classifyResults(result, structure)

            with open(pdb_path, 'r') as pdb_file:
                
                #compute DSSP and related features.
                try:
                    sec_str_based_features=predict.compute_dssp_based(pdb_path)
                except Exception as e:
                    print(pdb_path+' failed to extract secondary structure features')
                    print(e)

                for record in SeqIO.parse(pdb_file, 'pdb-atom'):
                    seq = str(record.seq)
                    LysArg, AspGlu, AspGluLysArg, PheTyrTrp = calculate_aa_combos(seq)
                    pdb_name = pdb.rstrip(".pdb")
                    new_feats = {"PDB File": pdb_name, "Solubility Score": solubility_map[pdb_name] , "Sequence": seq, "Length": len(seq) , 
                                       "Lys+Arg/Len":LysArg, "Asp+Glu/Len":AspGlu, "Asp+Glu+Lys+Arg/Len":AspGluLysArg, 
                                       "Phe+Tyr+Trp/Len": PheTyrTrp,
                                       "Polar": area_classes['Polar'],
                                       "Apolar": area_classes['Apolar']}
                    new_feats.update(sec_str_based_features)
                    pdb_matrix.append(new_feats)
    else:
        print("Pdb list is empty")
    
def calculate_aa_combos (sequence):
    seq_length = len(sequence)
    
    lys_arg = round((sequence.count("K") + sequence.count("R")) /seq_length, 3)
    asp_glu = round((sequence.count("D") + sequence.count("E")) /seq_length, 3)
    asp_glu_lys_arg = round(lys_arg + asp_glu, 3)           
    phe_tyr_trp = round((sequence.count("F")+ sequence.count("Y") + sequence.count("W")) /seq_length, 3)
            
    return lys_arg, asp_glu, asp_glu_lys_arg, phe_tyr_trp

def pdb_to_csv ():
    csv_columns = list(pdb_matrix[0].keys())
    csv_file = join_p(ROOT_DIR,"pdb2csv_v2.csv")
    try:
        with open(csv_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for data in pdb_matrix:
                writer.writerow(data)
    except IOError:
        print("I/O error")

""" Kindly Note: I have left place holders for features that we need to add. 
If you have the time and bandwidth kindly add features via functions"""

# def isolectric_point():
# def steric parameters():
# def hydrophobicity():
# def volume ():
# def polarizability():
# def isoelectric point():
# def helix probability():
# def sheet probability():

def main():
    generate_pdb_list()
    generate_solubility_mapping()
    pdb_data_curation()
    pdb_to_csv()

if __name__ == '__main__':
    main()

TabError: inconsistent use of tabs and spaces in indentation (<ipython-input-18-cc8ad1adfc4a>, line 84)

In [25]:
!pip install  --user DSSPparser

Collecting DSSPparser
Installing collected packages: DSSPparser
Successfully installed DSSPparser-0.12
[33mYou are using pip version 18.1, however version 21.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [31]:
    #import DSSPparser
    #import DSSParser
    #import DSSparser
    from DSSPparser import parseDSSP
    dssp_file = "/home/sadali/projects/copenhagen_bio_hackathon/cbh21-protein-solubility-challenge/data/training/crystal_structs/dssp_out/B7MIY4.pdb.dssp"
    parse_dssp = parseDSSP(dssp_file)
    parse_dssp.parse()


    pddict = parse_dssp.dictTodataframe()
    print(pddict)


    resnum inscode chain aa struct structdetails bp1 bp2  acc   h_nho1  ...  \
0        1       1     A  M                >       0   0  110   0, 0.0  ...   
1        2       2     A  D      H      H  >  +    0   0  129   2,-0.2  ...   
2        3       3     A  L      H      H  > S+    0   0   81   2,-0.2  ...   
3        4       4     A  A      H      H  > S+    0   0   61   2,-0.2  ...   
4        5       5     A  S      H      H  X S+    0   0   67  -4,-1.6  ...   
..     ...     ...   ... ..    ...           ...  ..  ..  ...      ...  ...   
203    204     204     A  W      H      H  X S+    0   0   74  -4,-3.1  ...   
204    205     205     A  A      H      H  < S+    0   0    0  -4,-2.1  ...   
205    206     206     A  D      H      H  < S+    0   0   49  -4,-1.8  ...   
206    207     207     A  A      H      H  <       0   0   68  -4,-1.7  ...   
207    208     208     A  V                <       0   0   44  -4,-2.0  ...   

       tco  kappa  alpha    phi    psi   xca   yca 

In [13]:
import importlib
import predict
#from predict import compute_dssp_based
importlib.reload(predict)

<module 'predict' from '/home/sadali/projects/copenhagen_bio_hackathon/cbh21-protein-solubility-challenge/predict.py'>

In [101]:
import os

from os.path import join as join_p
import DSSPparser
map_surface = {'A': 118.1, 'R': 256.0, 'N': 165.5, 'D': 158.7, 'C': 146.1, 'Q': 193.2,
               'E': 186.2, 'G': 88.1, 'H': 202.5, 'I': 181.0, 'L': 193.1, 'K': 225.8,
               'M': 203.4, 'F': 222.8, 'P': 146.8, 'S': 129.8, 'T': 152.5, 'W': 266.3,
               'Y': 236.8, 'V': 164.5, 'X': 88.1}

DSSP_codes = {"H" # = α-helix
              "B", #= residue in isolated β-bridge
              "E",  # = extended strand, participates in β ladder
              "G",  # = 3-helix(310 helix)
              "I",  # = 5 helix(π-helix)
              "T",  # = hydrogen bonded turn
              "S",  # = bend
}

predict.compute_dssp_based(join_p("/home/sadali","projects/copenhagen_bio_hackathon/cbh21-protein-solubility-challenge","data/training/crystal_structs","A0A140NA.pdb"))


DSSP run:
 /usr/bin/mkdssp -i /home/sadali/projects/copenhagen_bio_hackathon/cbh21-protein-solubility-challenge/data/training/crystal_structs/A0A140NA.pdb -o /tmp/A0A140NA.pdb/A0A140NA.dssp
DSSP ret code 0
{'A'}
defaultdict(None, {'': ['exposed', 'mod_buried', 'buried', 'mod_buried', 'mod_buried', 'mod_buried', 'mod_buried', 'buried', 'mod_buried', 'exposed', 'mod_buried', 'buried', 'exposed', 'buried', 'mod_buried', 'mod_buried', 'exposed'], 'E': ['exposed', 'mod_buried', 'mod_buried', 'mod_buried', 'mod_buried', 'mod_buried', 'mod_buried', 'buried', 'buried', 'mod_buried', 'buried', 'mod_buried', 'buried', 'buried', 'buried', 'buried', 'buried', 'mod_buried', 'mod_buried', 'buried', 'buried', 'buried', 'buried', 'buried', 'mod_buried', 'buried', 'mod_buried'], 'H': ['exposed', 'mod_buried', 'buried', 'mod_buried', 'exposed', 'buried', 'buried', 'exposed', 'mod_buried', 'buried', 'exposed', 'exposed', 'buried', 'exposed', 'mod_buried', 'buried', 'exposed', 'exposed', 'buried', 'buried

defaultdict(float,
            {'helix_buried': 0.022540983606557378,
             'helix_mod_buried': 0.018442622950819672,
             'helix_exposed': 0.018442622950819672,
             'beta_buried': 0.0,
             'beta_mod_buried': 0.0,
             'beta_exposed': 0.0,
             'coil_buried': 0.0,
             'coil_mod_buried': 0.0,
             'coil_exposed': 0.0,
             'buried_A': 0.004098360655737705,
             'mod_buried_A': 0.0020491803278688526,
             'exposed_A': 0.004098360655737705,
             'buried_R': 0.0020491803278688526,
             'mod_buried_R': 0.006147540983606557,
             'exposed_R': 0.0020491803278688526,
             'buried_N': 0,
             'mod_buried_N': 0,
             'exposed_N': 0.006147540983606557,
             'buried_D': 0.004098360655737705,
             'mod_buried_D': 0.004098360655737705,
             'exposed_D': 0.006147540983606557,
             'buried_C': 0,
             'mod_buried_C': 0,
      