 # README

The following code takes a SASA output TSV file (from https://git.sr.ht/~nsg/DexDesign) and appends descriptors to it using the RCSB PDB Data API.


### Installation
The install requirements are:
- Python `requests` (Requests: HTTP for Humans). Step-by-step installation instructions can be found here: https://requests.readthedocs.io/en/latest/user/install/#install
- `pandas`, which the code imports to build and merge the tables.

### API
- Information about the RCSB PDB Data GraphQL-based API can be found here: https://data.rcsb.org/index.html#data-api
- The Entry Attributes can be found here: https://data.rcsb.org/data-attributes.html

### Usage
- By changing `query_category` and `query_item`, any Entry Attribute can be retrieved and appended to the TSV file.

In [7]:
#import libraries
import csv
import requests
import pandas as pd

In [8]:
# Entry Attribute to retrieve, addressed as <query_category>.<query_item>
# Alter Entry Attributes query here
query_category, query_item = "struct", "title"

# empty description table: one (PDB id, descriptor) row is appended per lookup
ddf = pd.DataFrame({column: [] for column in ("PDB ID", "Description")})

In [9]:
# Iterate through the old TSV file and look up one descriptor per PDB id.
#
# For every row whose status field (column index 8) is "OK", the PDB id in
# column 0 is sent to the RCSB GraphQL endpoint and the requested
# <query_category>.<query_item> value is appended to ddf.
#
# Raises requests.HTTPError on a non-2xx response and RuntimeError when the
# GraphQL layer reports errors (e.g. an invalid category/item name).
# newline='' is what the csv module documents for files handed to csv.reader.
with open("./SASA_kCAL01.tsv", 'r', newline='') as file:

    reader = csv.reader(file, delimiter="\t")

    # skip header line through the reader itself; the default keeps an
    # empty file from raising StopIteration
    next(reader, None)

    # the field selection does not depend on the row, so build it once
    selection = "{" + query_category + "{" + query_item + "}}"

    # ids already present in ddf; a PDB id repeated on several rows (or a
    # re-run of this cell) then costs a single request
    fetched = set(ddf['PDB ID'])

    # one Session reuses the HTTP connection across all lookups
    with requests.Session() as session:
        for row in reader:
            # handle empty/short rows from DSSP errors + bad SASA
            if len(row) <= 8 or row[8] != "OK":
                continue

            # pull the PDB id from tsv
            pdb_num = row[0]
            if pdb_num in fetched:
                continue

            # format request content; requests URL-encodes the query string
            query = f'{{entry(entry_id:"{pdb_num}"){selection}}}'

            # get response object from GraphQL; the timeout keeps a stalled
            # connection from hanging the whole run
            r = session.get('https://data.rcsb.org/graphql',
                            params={'query': query},
                            timeout=30)

            # error handling (transport level)
            r.raise_for_status()

            # parse JSON response content
            response = r.json()

            # GraphQL reports query problems in the body with HTTP 200
            if response.get('errors'):
                raise RuntimeError(
                    f"RCSB query failed for {pdb_num}: {response['errors']}")

            # an unknown id or an unset category comes back as null;
            # record a missing descriptor instead of crashing
            entry = (response.get('data') or {}).get('entry') or {}
            descriptor = (entry.get(query_category) or {}).get(query_item)

            # append descriptor to DataFrame
            ddf.loc[len(ddf.index)] = [pdb_num, descriptor]
            fetched.add(pdb_num)

In [10]:
# Create a DataFrame from the TSV, attach the descriptions and write the result.
# The key column is read as text so ids that look numeric (e.g. "1E10") are
# not parsed as numbers and still match the string ids collected in ddf.
tsv_df = pd.read_csv("./SASA_kCAL01.tsv", delimiter='\t',
                     dtype={"PDB ID": str})

# Keep one description per PDB id: repeated ids in ddf would otherwise
# multiply the matching rows in the left merge. Casting the key to str keeps
# both merge keys the same type even when ddf is still empty.
descriptions = (ddf.drop_duplicates(subset=["PDB ID"], keep='first')
                   .astype({"PDB ID": str}))
tsv_df = tsv_df.merge(descriptions, on=["PDB ID"], how='left')
tsv_df = tsv_df.drop_duplicates(subset=["Struct"], keep='first')

# trim rows w/ no description (bad SASA); rows with a missing 'Surface?'
# value are kept, exactly as an equality-based drop would keep them
tsv_df = tsv_df[tsv_df['Surface?'] != "Bad"]

# output new tsv
tsv_df.to_csv('Descript_kCAL01.tsv', index=False, sep='\t')