# Create tables in AWS DynamoDB


In [None]:
import json
import requests
import csv
import io
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Fetch two raw PubChem BioAssay payloads through the PUG REST API:
# the JSON export for AID 450 and the CSV export for AID 400.
# NOTE(review): the two requests target different assay IDs (450 vs 400) —
# confirm that this is intentional.
jsonf = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/450/JSON")
csvf = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/400/CSV")

In [None]:
FILEPATH = "../data"

class PubChemBioAssayRecordFromCsv(object):
    """Download the CSV export of a PubChem BioAssay and expose it as a DataFrame.

    The CSV text is fetched from the PUG REST endpoint when the object is
    created and kept in ``self.record``.
    """

    def __init__(self, assay_id):
        """Fetch the CSV export of assay ``assay_id`` and store its text."""
        self.assay_id = assay_id
        result = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{0}/CSV".format(self.assay_id))
        self.record = result.text

    def save_csv(self):
        """Write the downloaded CSV text to ``FILEPATH/pubchem_aid<assay_id>.csv``."""
        with open(os.path.join(FILEPATH, "pubchem_aid{0}.csv".format(self.assay_id)), "w") as f:
            f.write(self.record)

    def csv2df(self):
        """Parse the stored CSV text and classify its columns.

        Columns whose name contains ``"PUBCHEM"`` are reported as standard
        columns; every other column is reported as an experimental column and
        gets three companion columns (``<name>_TYPE``, ``<name>_DESCR``,
        ``<name>_UNIT``) filled with NaN.

        Returns:
            tuple: ``(df, st_cols, exp_cols)`` — the DataFrame (including the
            added companion columns) and the two lists of original column
            names.
        """
        df = pd.read_csv(io.StringIO(self.record), sep=",")
        st_cols = []
        exp_cols = []
        # Iterate over a snapshot of the columns: the loop adds new columns.
        for c in list(df.columns):
            if "PUBCHEM" in c:
                st_cols.append(c)
                continue
            exp_cols.append(c)
            for suffix in ("TYPE", "DESCR", "UNIT"):
                # A scalar NaN is broadcast to every row.
                df["{}_{}".format(c, suffix)] = np.nan
        return df, st_cols, exp_cols

    def get_smiles_from_cid(self):
        """Placeholder for a CID -> SMILES lookup; currently returns ``None``."""
        # The method has to be *called*; the previous code only bound the
        # method object to a local name.
        df, _, _ = self.csv2df()
        # TODO: resolve the CIDs found in ``df`` to SMILES (not implemented).

In [None]:
# Build a CSV-backed record for assay 400; the constructor performs the download.
r = PubChemBioAssayRecordFromCsv(400)

In [None]:
# Parse the downloaded CSV: the DataFrame plus the "PUBCHEM" column names and
# the remaining (experimental) column names.
df, st_cols, exp_cols = r.csv2df()

In [None]:
# Display the experimental column names.
exp_cols

In [None]:
# Gather every column whose name does not contain "PUBCHEM"; these are the
# columns that get regrouped further down.
cols_to_order = []
for c in df.columns:
    if "PUBCHEM" in c:
        continue
    cols_to_order.append(c)

In [None]:
# Regroup the non-PUBCHEM columns so that each experimental column is followed
# by its own annotation columns (<name>_TYPE, <name>_DESCR, <name>_UNIT).
new_order = []
placed = set()  # guards against listing the same column twice

for c in exp_cols:
    # Match the column and its annotation columns by exact name. A substring
    # test would also pick up unrelated columns that merely contain ``c`` and
    # could place one column in the list several times.
    family = (c, "{}_TYPE".format(c), "{}_DESCR".format(c), "{}_UNIT".format(c))
    for c2 in cols_to_order:
        if c2 in family and c2 not in placed:
            new_order.append(c2)
            placed.add(c2)
all_cols = st_cols + new_order
all_cols

df = df[all_cols]

for c in exp_cols:
    # Look at the first row by position; ``df[0]`` would be a lookup of a
    # column labelled 0, which does not exist here.
    df.iloc[0][c]

In [2]:
import json
import requests

PUBCHEM_PREFIX = "PUBCHEM"


class PubChemBioAssayRecordFromJson(object):
    """Assemble a PubChem BioAssay record through the PUG REST API.

    The object starts from the assay's SID listing (downloaded, or loaded from
    a JSON file) and can then fetch the assay description and per-substance
    data, and attach a CID and SMILES to every data row.
    """

    _ASSAY_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{}"

    def __init__(self, assay_id=None, assay_json_file=None, batch_size=10000):
        """Load the SID listing of an assay.

        Args:
            assay_id: PubChem assay ID; used to build the request URL and, when
                no file is given, to download the SID listing.
            assay_json_file: optional path to a JSON file holding the record;
                takes precedence over downloading.
            batch_size: maximum number of IDs sent per request.

        Raises:
            ValueError: if neither ``assay_id`` nor ``assay_json_file`` is given.
        """
        self.batch_size = batch_size
        self.url = None
        if assay_id is not None:
            self.url = self._ASSAY_URL.format(assay_id)
        if assay_json_file is not None:
            with open(assay_json_file, "r") as f:
                self.record = json.load(f)
            if self.url is None:
                # Assumes the file has the same layout as the /sids/json
                # response; if it does not, the URL simply stays unset.
                try:
                    self.url = self._ASSAY_URL.format(self._get_id(self.record))
                except (KeyError, IndexError, TypeError):
                    pass
        elif assay_id is not None:
            r = requests.get("{0}/sids/json".format(self.url))
            self.record = json.loads(r.text)
        else:
            raise ValueError("either assay_id or assay_json_file must be provided")

    def get_record(self):
        """Return the raw record loaded by the constructor."""
        return self.record

    def _chunker(self, seq, size):
        """Yield consecutive slices of ``seq`` holding at most ``size`` items."""
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    def _get_id(self, record):
        """Return the assay ID stored in ``record``."""
        return record["InformationList"]["Information"][0]["AID"]

    def _get_sids(self, record):
        """Return the record's SID list, sorted in place."""
        sids = record["InformationList"]["Information"][0]["SID"]
        sids.sort()
        return sids

    def _get_description(self, record):
        """Download and return the assay description.

        The ``record`` argument is not used: the description is always
        requested from ``self.url``.
        """
        req = requests.get("{}/description/json".format(self.url))
        record = json.loads(req.text)
        return record["PC_AssayContainer"][0]["assay"]["descr"]

    def _get_data(self, record):
        """Download the data rows for every SID of ``record``, batch by batch."""
        data = []
        for chunk in self._chunker(self._get_sids(record), self.batch_size):
            s = ",".join(str(sid) for sid in chunk)
            r = requests.post("{0}/json".format(self.url), data={'sid': s})
            data.extend(json.loads(r.text)["PC_AssaySubmit"]["data"])
        return data

    def _get_cids_from_sids(self, sids):
        """Return a ``{sid: [cid, ...]}`` mapping for the SIDs that have CIDs."""
        sid2cids = {}
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/cids/json"
        for chunk in self._chunker(sids, self.batch_size):
            s = ",".join(str(sid) for sid in chunk)
            r = requests.post(url, data={'sid': s})
            result = json.loads(r.text)
            if "InformationList" not in result:
                # Skip only this batch; mappings from earlier batches are kept.
                continue
            for res in result["InformationList"]["Information"]:
                if "CID" not in res:
                    continue
                sid2cids[res["SID"]] = res["CID"]
        return sid2cids

    def _get_smiles_from_cids(self, cids):
        """Return a ``{cid: smiles}`` mapping, preferring the canonical SMILES."""
        cid2smiles = {}
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/property/CanonicalSmiles,IsomericSmiles/JSON"
        # De-duplicate once, then send each batch only its own CIDs.
        unique_cids = list(dict.fromkeys(cids))
        for chunk in self._chunker(unique_cids, self.batch_size):
            s = ",".join(str(cid) for cid in chunk)
            r = requests.post(url, data={"cid": s})
            result = json.loads(r.text)
            # A response without a property table contributes nothing.
            for res in result.get("PropertyTable", {}).get("Properties", []):
                if "CanonicalSMILES" in res:
                    cid2smiles[res["CID"]] = res["CanonicalSMILES"]
                elif "IsomericSMILES" in res:
                    cid2smiles[res["CID"]] = res["IsomericSMILES"]
        return cid2smiles

    def _select_compound(self, cids, cid2smiles):
        """Return ``[cid, smiles]`` for the first CID with a SMILES, else ``[None, None]``."""
        for cid in cids:
            if cid in cid2smiles:
                return [cid, cid2smiles[cid]]
        return [None, None]

    def _get_substances(self, sids):
        """Return ``{sid: [cid, smiles]}`` for every SID.

        SIDs without a CID, or whose CIDs have no SMILES, map to
        ``[None, None]``.
        """
        sid2cids = self._get_cids_from_sids(sids)
        all_cids = list({c for cids in sid2cids.values() for c in cids})
        if not all_cids:
            # Same mapping shape as the regular path, so callers can index by SID.
            return {sid: [None, None] for sid in sids}
        cid2smiles = self._get_smiles_from_cids(all_cids)
        return {
            sid: self._select_compound(sid2cids.get(sid, ()), cid2smiles)
            for sid in sids
        }

    def _get_data_with_compounds(self, record):
        """Return the data rows with ``"cid"`` and ``"smiles"`` added to each row."""
        sids = self._get_sids(record)
        compounds = self._get_substances(sids)
        data = self._get_data(record)
        for entry in data:
            cid, smiles = compounds.get(entry["sid"], (None, None))
            entry["cid"] = cid
            entry["smiles"] = smiles
        return data

    def get(self):
        """Return the assembled record as ``{"AssayId", "Description", "Data"}``."""
        assay_id = self._get_id(self.record)
        description = self._get_description(self.record)
        data = self._get_data_with_compounds(self.record)
        return {"AssayId": "{0}{1}".format(PUBCHEM_PREFIX, assay_id),
                "Description": description,
                "Data": data}

    def save_json(self, path):
        """Assemble the record and write it to ``<path>/<AssayId>.json``."""
        data = self.get()
        assay_id = data["AssayId"]
        with open(os.path.join(path, "{}.json".format(assay_id)), 'w') as outfile:
            json.dump(data, outfile)

In [3]:
# Download the SID listing for assay 1851 and keep the raw record in ``j``.
r = PubChemBioAssayRecordFromJson(assay_id=1851)
j = r.get_record()

<xs:enumeration value="ppt" ncbi:intvalue="1"/>
<xs:enumeration value="ppm" ncbi:intvalue="2"/>
<xs:enumeration value="ppb" ncbi:intvalue="3"/>
<xs:enumeration value="mm" ncbi:intvalue="4"/>
<xs:enumeration value="um" ncbi:intvalue="5"/>
<xs:enumeration value="nm" ncbi:intvalue="6"/>
<xs:enumeration value="pm" ncbi:intvalue="7"/>
<xs:enumeration value="fm" ncbi:intvalue="8"/>
<xs:enumeration value="mgml" ncbi:intvalue="9"/>
<xs:enumeration value="ugml" ncbi:intvalue="10"/>
<xs:enumeration value="ngml" ncbi:intvalue="11"/>
<xs:enumeration value="pgml" ncbi:intvalue="12"/>
<xs:enumeration value="fgml" ncbi:intvalue="13"/>
<xs:enumeration value="m" ncbi:intvalue="14"/>
<xs:enumeration value="percent" ncbi:intvalue="15"/>
<xs:enumeration value="ratio" ncbi:intvalue="16"/>
<xs:enumeration value="sec" ncbi:intvalue="17"/>
<xs:enumeration value="rsec" ncbi:intvalue="18"/>
<xs:enumeration value="min" ncbi:intvalue="19"/>
<xs:enumeration value="rmin" ncbi:intvalue="20"/>
<xs:enumeration value="day" ncbi:intvalue="21"/>
<xs:enumeration value="rday" ncbi:intvalue="22"/>
<xs:enumeration value="ml-min-kg" ncbi:intvalue="23"/>
<xs:enumeration value="l-kg" ncbi:intvalue="24"/>
<xs:enumeration value="hr-ng-ml" ncbi:intvalue="25"/>
<xs:enumeration value="cm-sec" ncbi:intvalue="26"/>
<xs:enumeration value="mg-kg" ncbi:intvalue="27"/>
<xs:enumeration value="none" ncbi:intvalue="254"/>
<xs:enumeration value="unspecified" ncbi:intvalue="255"/>

In [18]:
import os
from collections import OrderedDict

class PubChemBioAssayJsonConverter(object):
    """Convert a saved assay JSON file into pandas tables.

    The file is expected to hold a dict with the keys ``"AssayId"``,
    ``"Description"`` and ``"Data"``. Rows of the resulting tables are indexed
    by the position of each entry in ``"Data"``.
    """

    def __init__(self, path, json_file):
        """Load ``<path>/<json_file>``; the text before the first dot of the file name becomes ``self.name``."""
        self.name = json_file.split(".")[0]
        with open(os.path.join(path, json_file), "r") as f:
            self.record = json.load(f)

    def _get_assay_id(self, record):
        """Return ``record["AssayId"]``."""
        return record["AssayId"]

    def _get_description(self, record):
        """Return ``record["Description"]``."""
        return record["Description"]

    def _get_data(self, record):
        """Return ``record["Data"]``."""
        return record["Data"]

    def _get_tid_names(self):
        """Return ``{tid: name}`` in the order the description lists its results."""
        results = self._get_description(self.record)["results"]
        return {res["tid"]: res["name"] for res in results}

    def _get_sid_cid_smiles(self):
        """Return ``{row: [sid, cid, smiles]}`` for every data entry."""
        data = self._get_data(self.record)
        return {i: [d["sid"], d["cid"], d["smiles"]] for i, d in enumerate(data)}

    def _get_tid_results(self):
        """Return ``{row: OrderedDict(tid -> value)}`` for every data entry.

        Every row lists the TIDs declared in the description, in that order,
        with ``None`` for TIDs the entry did not report. TIDs that are reported
        but not declared are kept in front of the declared ones.
        """
        tid_order = list(self._get_tid_names())
        declared = set(tid_order)
        sid2value = {}
        for i, entry in enumerate(self._get_data(self.record)):
            measured = OrderedDict()
            # Entries without a "data" list yield None for every TID.
            for res in entry.get("data", []):
                values = list(res["value"].values())
                # Keep the last value of the dict; an empty dict yields None
                # instead of reusing a value left over from another result.
                measured[res["tid"]] = values[-1] if values else None
            row = OrderedDict(
                (tid, val) for tid, val in measured.items() if tid not in declared
            )
            for tid in tid_order:
                row[tid] = measured.get(tid)
            sid2value[i] = row
        return sid2value

    def _get_outcome(self):
        """Return ``{row: outcome}`` for every data entry."""
        data = self._get_data(self.record)
        return {i: d["outcome"] for i, d in enumerate(data)}

    def _substances_to_df(self):
        """Return a DataFrame with the columns ``sid``, ``cid`` and ``smiles``."""
        substances = self._get_sid_cid_smiles()
        return pd.DataFrame.from_dict(data=substances,
                                      orient="index",
                                      columns=["sid", "cid", "smiles"])

    def _outcome_to_df(self):
        """Return a single-column DataFrame named ``outcome``."""
        outcome = self._get_outcome()
        return pd.DataFrame.from_dict(data=outcome,
                                      orient="index",
                                      columns=["outcome"])

    def _tid_to_df(self):
        """Return a DataFrame with one column per declared TID, labelled by TID name."""
        sid2value = self._get_tid_results()
        tid_names = list(self._get_tid_names().values())
        df = pd.DataFrame.from_dict(data=sid2value, orient="index")
        # Columns are renamed by position, relying on the TID order produced
        # by _get_tid_results.
        df.columns = tid_names
        return df

    def get_description(self, path):
        """Write the description as ``key:value`` lines to ``<path>/<name>_descr.txt``.

        Nothing is returned.
        """
        descr = self._get_description(self.record)
        with open(os.path.join(path, "{}_descr.txt".format(self.name)), 'w') as f:
            for k, v in descr.items():
                f.write('%s:%s\n' % (k, v))

    def get_outcome(self):
        """Return the substance columns joined with the outcome column."""
        df1 = self._substances_to_df()
        df2 = self._outcome_to_df()  # must be called; concat needs a DataFrame
        return pd.concat([df1, df2], axis=1)

    def get_all_results(self):
        """Return substance columns, outcome and all TID columns side by side."""
        df1 = self._substances_to_df()
        df2 = self._outcome_to_df()
        df3 = self._tid_to_df()
        return pd.concat([df1, df2, df3], axis=1)

    def save_df(self, df, path):
        """Write ``df`` to ``<path>/<name>.csv`` without the index column."""
        df.to_csv(os.path.join(path, "{}.csv".format(self.name)), index=False)

    def print_json(self):
        """Return the loaded record."""
        return self.record

In [19]:
# Load the saved assay JSON and write its description to ./PUBCHEM1851_descr.txt.
# NOTE(review): get_description() has no return statement, so ``j`` is None
# after this cell — confirm whether ``j = c.print_json()`` was intended.
c = PubChemBioAssayJsonConverter("../","PUBCHEM1851.json")
j = c.get_description(".")

In [None]:
# Print the position of every data entry whose SID is 26751440.
data = j["Data"]
for i, entry in enumerate(data):
    if entry["sid"] != 26751440:
        continue
    print(i)

In [None]:
import requests

# Fetch the JSON export of assay 7001; the response is kept but not parsed here.
result = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/7001/JSON")
#json.loads(result.text)

In [None]:
import json
from pubchempy import get_substances


# Top-level key of the assay JSON record.
HEADER = "PC_AssaySubmit"
# Prefix put in front of the numeric assay ID in the output record.
PUBCHEM_PREFIX = "PUBCHEM"


class PubChemBioAssayRecordFromJson(object):
    """Read an assay JSON file and attach a CID and SMILES to each data row.

    Compound information is looked up with ``pubchempy.get_substances``.
    """

    def __init__(self, json_file):
        """Load the record from ``json_file``."""
        # Open the path that was passed in rather than a module-level variable.
        with open(json_file, "r") as f:
            self.record = json.load(f)

    def get_id(self, record):
        """Return the assay ID stored in the record's description."""
        return record[HEADER]["assay"]["descr"]["aid"]["id"]

    def get_sids(self, record):
        """Return the SID of every data row, in row order."""
        return [d["sid"] for d in record[HEADER]["data"]]

    def get_compounds_from_sids(self, sids):
        """Return one ``(standardized_cid, canonical_smiles)`` tuple per substance.

        Returns an empty list, without querying PubChem, when ``sids`` is empty.
        """
        if not sids:
            return []
        compounds = []
        for subs in get_substances(sids):
            compound = subs.standardized_compound
            # Guard for substances without a standardized compound
            # (presumably reported as None by pubchempy — verify).
            smiles = compound.canonical_smiles if compound is not None else None
            compounds.append((subs.standardized_cid, smiles))
        return compounds

    def get_description(self, record):
        """Return the assay description block of the record."""
        return record[HEADER]["assay"]["descr"]

    def get_data(self, record):
        """Return the list of data rows of the record."""
        return record[HEADER]["data"]

    def get_data_with_compounds(self, record):
        """Return the data rows with ``"cid"`` and ``"smiles"`` added.

        Rows and compounds are paired by position, so this relies on
        ``get_compounds_from_sids`` returning one entry per SID in the same
        order.
        """
        sids = self.get_sids(record)
        compounds = self.get_compounds_from_sids(sids)
        data = self.get_data(record)
        for i, entry in enumerate(data):
            entry["cid"] = compounds[i][0]
            entry["smiles"] = compounds[i][1]
        return data

    def get(self):
        """Return the record as ``{"AssayId", "Description", "Data"}``."""
        assay_id = self.get_id(self.record)
        description = self.get_description(self.record)
        data = self.get_data_with_compounds(self.record)
        return {
            "AssayId": "{0}{1}".format(PUBCHEM_PREFIX, assay_id),
            "Description": description,
            "Data": data,
        }

In [None]:
#https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/Concise/JSON/0000001_0001000.zip

# Path of a local concise BioAssay JSON file, and a record reader built on it.
# NOTE(review): presumably extracted from the FTP archive referenced in the
# comment above — confirm.
file_name = "../1.concise.json" 
getter = PubChemBioAssayRecordFromJson("../1.concise.json")

In [None]:
# Build the output record (assay ID, description, data rows with compounds).
data = getter.get()