# Create tables in AWS DynamoDB


In [1]:
import json
import requests
import csv
import io
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Fetch two sample assays from the PubChem PUG REST API for inspection:
# assay 450 as JSON and assay 400 as CSV. Both names hold the raw
# `requests` response objects, not parsed content.
jsonf = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/450/JSON")
csvf = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/400/CSV")

In [None]:
FILEPATH = "../data"

class PubChemBioAssayRecordFromCsv(object):
    """Download one PubChem BioAssay as CSV text and expose it as a DataFrame.

    Attributes:
        assay_id: the assay ID (AID) the record was requested for.
        record: the CSV payload of the response, as text.
    """

    def __init__(self, assay_id):
        """Fetch the CSV export of ``assay_id`` from the PUG REST API.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        self.assay_id = assay_id
        result = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{0}/CSV".format(self.assay_id))
        # Fail loudly instead of keeping an error page around as "CSV".
        result.raise_for_status()
        self.record = result.text

    def save_csv(self):
        """Write the downloaded CSV text to ``FILEPATH/pubchem_aid<assay_id>.csv``."""
        target = os.path.join(FILEPATH, "pubchem_aid{0}.csv".format(self.assay_id))
        # Explicit encoding so the output does not depend on the platform default.
        with open(target, "w", encoding="utf-8") as f:
            f.write(self.record)

    def csv2df(self):
        """Parse the CSV text and add empty annotation columns.

        Columns whose name contains ``"PUBCHEM"`` are reported as standard
        columns; every other column is reported as an experimental column and
        gets three all-NaN companions named ``<col>_TYPE``, ``<col>_DESCR``
        and ``<col>_UNIT`` appended to the frame.

        Returns:
            tuple: ``(df, st_cols, exp_cols)`` - the DataFrame, the list of
            standard column names and the list of experimental column names.
        """
        df = pd.read_csv(io.StringIO(self.record), sep=",")
        st_cols = []
        exp_cols = []
        # Iterate over a snapshot: the loop appends new columns to df.
        for c in list(df.columns):
            if "PUBCHEM" in c:
                st_cols.append(c)
                continue
            exp_cols.append(c)
            for suffix in ("TYPE", "DESCR", "UNIT"):
                # A scalar NaN is broadcast to every row.
                df["{}_{}".format(c, suffix)] = np.nan
        return df, st_cols, exp_cols

    def get_smiles_from_cid(self):
        """Placeholder for a CID -> SMILES lookup; currently returns ``None``.

        TODO: the lookup itself is not implemented yet; only the parsed
        frame is obtained.
        """
        # Call csv2df (the method object itself was being bound before).
        df, _, _ = self.csv2df()
        return None

In [None]:
# Download assay 400 as CSV; the constructor performs the HTTP request.
r = PubChemBioAssayRecordFromCsv(400)

In [None]:
# Parse the CSV into a frame plus the lists of standard (PUBCHEM*) and
# experimental column names.
df, st_cols, exp_cols = r.csv2df()

In [None]:
exp_cols

In [None]:
# Gather, in the frame's current order, every column whose name does not
# contain "PUBCHEM".
cols_to_order = list()
for c in df.columns:
    if "PUBCHEM" in c:
        continue
    cols_to_order.append(c)

In [None]:
# Reorder the frame: PUBCHEM* columns first, then every experimental column
# immediately followed by its _TYPE / _DESCR / _UNIT companions.
new_order = []

for c in exp_cols:
    # Match the exact companion names. A substring test would also pull in
    # unrelated columns whose name merely contains `c`, duplicating them.
    for c2 in (c, "{}_TYPE".format(c), "{}_DESCR".format(c), "{}_UNIT".format(c)):
        if c2 in cols_to_order:
            new_order += [c2]
all_cols = st_cols + new_order
all_cols

df = df[all_cols]

# Touch the first row's value of each experimental column. Row access has to
# be positional: df[0] would look for a *column* labelled 0 and raise KeyError.
for c in exp_cols:
    df.iloc[0][c]

In [2]:
import json
import requests

PUBCHEM_PREFIX = "PUBCHEM"


class PubChemBioAssayRecordFromJson(object):
    """Assemble one PubChem BioAssay record through the PUG REST API.

    ``self.record`` holds the assay's substance-ID listing (the
    ``.../sids/json`` payload). The description, the per-substance data rows
    and the CID / SMILES annotation are fetched on demand by :meth:`get`.
    """

    _ASSAY_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{}"

    def __init__(self, assay_id=None, assay_json_file=None, batch_size=10000):
        """Load the SID listing from a file or download it.

        Args:
            assay_id: PubChem assay ID (AID). Required unless
                ``assay_json_file`` is given; with a file it overrides the
                AID stored in the file when building request URLs.
            assay_json_file: path to a JSON file with a previously saved SID
                listing. Takes precedence over downloading.
            batch_size: maximum number of IDs sent in a single request.

        Raises:
            ValueError: if neither ``assay_id`` nor ``assay_json_file`` is given.
        """
        self.batch_size = batch_size
        if assay_json_file is not None:
            with open(assay_json_file, "r") as f:
                self.record = json.load(f)
            # Later requests need the assay URL even when the listing came
            # from disk, so derive the AID from the record if not supplied.
            if assay_id is None:
                assay_id = self._get_id(self.record)
            self.url = self._ASSAY_URL.format(assay_id)
        elif assay_id is not None:
            self.url = self._ASSAY_URL.format(assay_id)
            r = requests.get("{0}/sids/json".format(self.url))
            self.record = json.loads(r.text)
        else:
            raise ValueError("either assay_id or assay_json_file must be provided")

    def _chunker(self, seq, size):
        """Yield consecutive slices of ``seq`` holding at most ``size`` items."""
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    def _get_id(self, record):
        """Return the AID stored in a SID-listing record."""
        return record["InformationList"]["Information"][0]["AID"]

    def _get_sids(self, record):
        """Return the list of SIDs stored in a SID-listing record."""
        return record["InformationList"]["Information"][0]["SID"]

    def _get_description(self, record):
        """Download and return the assay description.

        ``record`` is accepted for interface symmetry but not used; the
        description is always fetched from ``self.url``.
        """
        req = requests.get("{}/description/json".format(self.url))
        payload = json.loads(req.text)
        return payload["PC_AssayContainer"][0]["assay"]["descr"]

    def _get_data(self, record):
        """Download the data rows for every SID in ``record``, in batches."""
        data = []
        for chunk in self._chunker(self._get_sids(record), self.batch_size):
            s = ",".join(str(sid) for sid in chunk)
            r = requests.post("{0}/json".format(self.url), data={'sid': s})
            data.extend(json.loads(r.text)["PC_AssaySubmit"]["data"])
        return data

    def _get_cids_from_sids(self, sids):
        """Return ``{sid: [cid, ...]}`` for the SIDs that have CIDs.

        A batch whose response carries no ``InformationList`` is skipped so
        the mappings collected from the other batches are kept.
        """
        sid2cids = {}
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/cids/json"
        for chunk in self._chunker(sids, self.batch_size):
            s = ",".join(str(sid) for sid in chunk)
            r = requests.post(url, data={'sid': s})
            result = json.loads(r.text)
            if "InformationList" not in result:
                continue
            for res in result["InformationList"]["Information"]:
                if "CID" not in res:
                    continue
                sid2cids[res["SID"]] = res["CID"]
        return sid2cids

    def _get_smiles_from_cids(self, cids):
        """Return ``{cid: smiles}``, preferring canonical over isomeric SMILES."""
        cid2smiles = {}
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/property/CanonicalSmiles,IsomericSmiles/JSON"
        # De-duplicate once (keeping order) and send each batch on its own;
        # posting the full list on every iteration defeats the batching.
        unique_cids = list(dict.fromkeys(cids))
        for chunk in self._chunker(unique_cids, self.batch_size):
            s = ",".join(str(cid) for cid in chunk)
            r = requests.post(url, data={"cid": s})
            result = json.loads(r.text).get("PropertyTable", {}).get("Properties", [])
            for res in result:
                if "CanonicalSMILES" in res:
                    cid2smiles[res["CID"]] = res["CanonicalSMILES"]
                elif "IsomericSMILES" in res:
                    cid2smiles[res["CID"]] = res["IsomericSMILES"]
        return cid2smiles

    def _get_substances(self, sids):
        """Return ``{sid: [cid, smiles]}`` for every SID in ``sids``.

        The first CID of a substance that has a SMILES is used; substances
        without any resolvable CID map to ``[None, None]``. The result is a
        dict in every case, which is what ``_get_data_with_compounds``
        indexes by SID.
        """
        sid2cids = self._get_cids_from_sids(sids)
        all_cids = list({cid for cids in sid2cids.values() for cid in cids})
        cid2smiles = self._get_smiles_from_cids(all_cids) if all_cids else {}
        compounds = {}
        for sid in sids:
            compounds[sid] = [None, None]
            for cid in sid2cids.get(sid, []):
                if cid in cid2smiles:
                    compounds[sid] = [cid, cid2smiles[cid]]
                    break
        return compounds

    def _get_data_with_compounds(self, record):
        """Return the data rows with ``cid`` and ``smiles`` keys added."""
        sids = self._get_sids(record)
        compounds = self._get_substances(sids)
        data = self._get_data(record)
        for row in data:
            cid, smiles = compounds.get(row["sid"], (None, None))
            row["cid"] = cid
            row["smiles"] = smiles
        return data

    def get(self):
        """Fetch everything and return the assembled assay record.

        Returns:
            dict: keys ``AssayId`` (``PUBCHEM_PREFIX`` + AID), ``Description``
            and ``Data``.
        """
        result = {
            "assay_id": self._get_id(self.record),
            "description": self._get_description(self.record),
            "data": self._get_data_with_compounds(self.record)
        }
        return {"AssayId": "{0}{1}".format(PUBCHEM_PREFIX, result["assay_id"]),
                "Description": result["description"],
                "Data": result["data"]}

    def save_json(self, path):
        """Run :meth:`get` and dump the result to ``<path>/<AssayId>.json``."""
        data = self.get()
        assay_id = data["AssayId"]
        with open(os.path.join(path, "{}.json".format(assay_id)), 'w') as outfile:
            json.dump(data, outfile)

In [3]:
# Download the SID listing of assay 450, then assemble the full record
# (get() issues the description, data and compound requests).
r = PubChemBioAssayRecordFromJson(assay_id=450)
data = r.get()

In [None]:
# Write the assembled record to the parent directory as <AssayId>.json.
# Note that save_json() calls get() again, so everything is re-downloaded.
r.save_json("../")

In [None]:
data["Description"]

In [None]:
data["Data"]

In [None]:
# Look at the readouts of the data row at index 1 of the fetched assay.
results = data["Data"][1]["data"]
results

# Map each readout's tid to the value stored under it.
tid2value = {}

for i in range(len(results)):
    value = results[i]["value"]
    # `value` is a dict; this keeps whichever entry is iterated last.
    # NOTE(review): with an empty `value`, `val` would carry over from the
    # previous readout (or be undefined on the first one).
    for k,v in value.items():
        val = v
    tid2value[results[i]["tid"]] = val

In [None]:
tid2name.keys()

In [None]:
results[1]["tid"]

In [None]:
        for k,v in value.items():
            print(k)
            if k in tid2name.keys():
                val = v
            else:
                val = None
        tid2value[results[i]["tid"]] = val

In [7]:
# From here on `data` is only the list of data rows, not the whole record.
data = data["Data"]

In [11]:
from collections import OrderedDict

# For every data row, build an ordered tid -> value mapping.
# NOTE: despite its name, sid2value is keyed by the row's position in
# `data`, not by the row's SID.
sid2value = {}

for i in range(len(data)):
    results = data[i]["data"]
    tid2value = OrderedDict()
    for i2 in range(len(results)):
        value = results[i2]["value"]
        # Keeps the last entry of the value dict (see the stale-`val` caveat:
        # an empty dict would reuse the previous readout's value).
        for k,v in value.items():
            val = v
        tid2value[results[i2]["tid"]] = val
    sid2value[i] = tid2value



sid2value

{0: OrderedDict([(1, 'increasing'),
              (2, '='),
              (3, 1.55e-09),
              (4, -8.81),
              (5, 1.95),
              (6, 0.93),
              (7, 'qHTS Primary'),
              (9, 'Verified'),
              (10, 'Full Curve'),
              (11, '4pHill (AC50,n,S0,Sinf)'),
              (12, 15),
              (13, 90),
              (14, 75),
              (15, 0.145),
              (16, '{1.7, 1.11}'),
              (17, 12),
              (18, 24),
              (19, 49),
              (20, 68),
              (21, 95),
              (22, 83),
              (23, 95),
              (24, 149),
              (25, 92),
              (26, 93),
              (27, 85),
              (28, 64),
              (29, 89),
              (30, 93),
              (31, 83),
              (33, 'qHTS ECL - Prestwick')]),
 1: OrderedDict([(1, 'increasing'),
              (2, '='),
              (3, 2.03e-09),
              (4, -8.693),
              (5, 2.14),
      

In [12]:
# tid2name maps tid -> readout name; it must already exist in the session.
# NOTE(review): no visible cell defines tid2name - confirm where it comes from.
tid_int = [k for k in tid2name.keys()]

tid_names = [k for k in tid2name.values()]

# Give every row an explicit None for each known tid it has no readout for.
# New keys are appended, so padded tids end up after the observed ones.
for t in tid_int:
    for k,v in sid2value.items():
        tid_res = []
        for k2, v2 in v.items():
            tid_res += [k2]
        if t not in tid_res:
            v[t]=None

In [13]:
sid2value

{0: OrderedDict([(1, 'increasing'),
              (2, '='),
              (3, 1.55e-09),
              (4, -8.81),
              (5, 1.95),
              (6, 0.93),
              (7, 'qHTS Primary'),
              (9, 'Verified'),
              (10, 'Full Curve'),
              (11, '4pHill (AC50,n,S0,Sinf)'),
              (12, 15),
              (13, 90),
              (14, 75),
              (15, 0.145),
              (16, '{1.7, 1.11}'),
              (17, 12),
              (18, 24),
              (19, 49),
              (20, 68),
              (21, 95),
              (22, 83),
              (23, 95),
              (24, 149),
              (25, 92),
              (26, 93),
              (27, 85),
              (28, 64),
              (29, 89),
              (30, 93),
              (31, 83),
              (33, 'qHTS ECL - Prestwick'),
              (8, None),
              (32, None)]),
 1: OrderedDict([(1, 'increasing'),
              (2, '='),
              (3, 2.03e-09),
       

In [15]:
# Remove and re-append every known tid in turn, which leaves each row's
# keys arranged in tid_int order.
for k, v in sid2value.items():
    for key in tid_int:
        moved = v.pop(key)
        v[key] = moved

sid2value

{0: OrderedDict([(1, 'increasing'),
              (2, '='),
              (3, 1.55e-09),
              (4, -8.81),
              (5, 1.95),
              (6, 0.93),
              (7, 'qHTS Primary'),
              (8, None),
              (9, 'Verified'),
              (10, 'Full Curve'),
              (11, '4pHill (AC50,n,S0,Sinf)'),
              (12, 15),
              (13, 90),
              (14, 75),
              (15, 0.145),
              (16, '{1.7, 1.11}'),
              (17, 12),
              (18, 24),
              (19, 49),
              (20, 68),
              (21, 95),
              (22, 83),
              (23, 95),
              (24, 149),
              (25, 92),
              (26, 93),
              (27, 85),
              (28, 64),
              (29, 89),
              (30, 93),
              (31, 83),
              (32, None),
              (33, 'qHTS ECL - Prestwick')]),
 1: OrderedDict([(1, 'increasing'),
              (2, '='),
              (3, 2.03e-09),
       

In [20]:
tid_names

['Activity Direction',
 'Activity Qualifier',
 'Qualified AC50',
 'Log of AC50',
 'Hill Coefficient',
 'Curve R2',
 'Data Type',
 'Compound QC',
 'Data Analysis QC',
 'NCGC Comment',
 'Curve Fit Model',
 'Hill S0',
 'Hill Sinf',
 'Hill dS',
 'Log AC50 Std Error',
 'Excluded Points',
 'Number of Points',
 'Activity at 0.59nM',
 'Activity at 1.319nM',
 'Activity at 2.95nM',
 'Activity at 6.597nM',
 'Activity at 14.751nM',
 'Activity at 0.033uM',
 'Activity at 0.074uM',
 'Activity at 0.165uM',
 'Activity at 0.369uM',
 'Activity at 0.824uM',
 'Activity at 1.844uM',
 'Activity at 4.122uM',
 'Activity at 9.217uM',
 'Activity at 20.61uM',
 'Activity at 0.046mM',
 'Compound Type']

In [21]:
# One frame row per entry of sid2value, one column per tid, then label the
# columns with the readout names (relies on both being in the same order).
df = pd.DataFrame.from_dict(sid2value, orient="index")
df.columns = list(tid_names)

In [22]:
df

Unnamed: 0,Activity Direction,Activity Qualifier,Qualified AC50,Log of AC50,Hill Coefficient,Curve R2,Data Type,Compound QC,Data Analysis QC,NCGC Comment,...,Activity at 0.074uM,Activity at 0.165uM,Activity at 0.369uM,Activity at 0.824uM,Activity at 1.844uM,Activity at 4.122uM,Activity at 9.217uM,Activity at 20.61uM,Activity at 0.046mM,Compound Type
0,increasing,=,1.550000e-09,-8.810,1.95,0.93,qHTS Primary,,Verified,Full Curve,...,149.0,92.0,93.0,85.0,64.0,89.0,93.0,83.0,,qHTS ECL - Prestwick
1,increasing,=,2.030000e-09,-8.693,2.14,0.89,qHTS Primary,,Verified,Full Curve,...,120.0,73.0,91.0,105.0,82.0,88.0,94.0,101.0,,qHTS ECL - Prestwick
2,increasing,=,2.490000e-09,-8.603,8.73,0.98,qHTS Primary,QC'd by DPI,Verified,Full Curve,...,95.0,,133.0,,88.0,,94.0,86.0,,qHTS MLSMR
3,increasing,=,2.290000e-09,-8.640,2.30,0.83,qHTS Primary,,Verified,Full Curve,...,98.0,107.0,129.0,131.0,102.0,116.0,102.0,85.0,,qHTS ECL - Prestwick
4,increasing,=,3.470000e-09,-8.460,2.97,0.88,qHTS Primary,QC'd by DPI,Verified,Full Curve,...,251.0,,96.0,,78.0,,91.0,111.0,,qHTS MLSMR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10944,inactive,>,4.610000e-03,-2.336,,,qHTS Primary,,Verified,Inactive,...,2.0,1.0,20.0,-2.0,-6.0,,2.0,-3.0,3.0,qHTS ECL - Tocris
10945,inactive,>,4.610000e-03,-2.336,,,qHTS Primary,,Verified,Inactive,...,0.0,4.0,16.0,-7.0,-1.0,,-5.0,-1.0,6.0,qHTS ECL - Tocris
10946,inactive,>,4.610000e-03,-2.336,,,qHTS Primary,,Verified,Inactive,...,-2.0,-2.0,-1.0,-10.0,-6.0,,1.0,0.0,-7.0,qHTS ECL - Tocris
10947,inactive,>,4.610000e-03,-2.336,,,qHTS Primary,,Verified,Inactive,...,-4.0,-7.0,7.0,-4.0,-5.0,,-7.0,-9.0,-8.0,qHTS ECL - Tocris


In [38]:
class PubChemBioAssayJsonConverter(object):
    """Turn a saved assay JSON file into pandas DataFrames.

    The loaded record is read through the keys ``AssayId``, ``Description``
    (with a ``results`` list of ``tid``/``name`` entries) and ``Data`` (a list
    of rows with ``sid``, ``cid``, ``smiles``, ``outcome`` and ``data``).
    All frames are indexed by the row's position in ``Data``.
    """

    def __init__(self, path, json_file):
        """Load ``<path>/<json_file>`` into ``self.record``."""
        with open(os.path.join(path, json_file), "r") as f:
            self.record = json.load(f)

    def _get_assay_id(self, record):
        """Return the record's ``AssayId``."""
        return record["AssayId"]

    def _get_description(self, record):
        """Return the record's ``Description``."""
        return record["Description"]

    def _get_data(self, record):
        """Return the record's ``Data`` rows."""
        return record["Data"]

    def _get_tid_names(self):
        """Return ``{tid: name}`` in the order the description lists them."""
        results = self._get_description(self.record)["results"]
        return {entry["tid"]: entry["name"] for entry in results}

    def _get_sid_cid_smiles(self):
        """Return ``{row_index: [sid, cid, smiles]}`` for every data row."""
        data = self._get_data(self.record)
        return {i: [row["sid"], row["cid"], row["smiles"]]
                for i, row in enumerate(data)}

    def _get_tid_results(self):
        """Return ``{row_index: OrderedDict(tid -> value)}``.

        Every row carries all tids named in the description, in description
        order, with ``None`` for readouts the row does not have. Tids that a
        row reports but the description does not name are kept in front.
        """
        tid_order = list(self._get_tid_names())
        known = set(tid_order)
        sid2value = {}
        for i, row in enumerate(self._get_data(self.record)):
            observed = OrderedDict()
            for result in row.get("data", []):
                value = result["value"]
                # The value dict is reduced to its last entry; an empty dict
                # yields None instead of leaking the previous readout.
                observed[result["tid"]] = list(value.values())[-1] if value else None
            ordered = OrderedDict(
                (tid, val) for tid, val in observed.items() if tid not in known
            )
            for tid in tid_order:
                ordered[tid] = observed.get(tid)
            sid2value[i] = ordered
        return sid2value

    def _get_outcome(self):
        """Return ``{row_index: outcome}`` for every data row."""
        data = self._get_data(self.record)
        return {i: row["outcome"] for i, row in enumerate(data)}

    def _substances_to_df(self):
        """Return a frame with the columns ``sid``, ``cid`` and ``smiles``."""
        substances = self._get_sid_cid_smiles()
        return pd.DataFrame.from_dict(data=substances,
                                      orient="index",
                                      columns=["sid", "cid", "smiles"])

    def _outcome_to_df(self):
        """Return a single-column frame named ``outcome``."""
        outcome = self._get_outcome()
        return pd.DataFrame.from_dict(data=outcome,
                                      orient="index",
                                      columns=["outcome"])

    def _tid_to_df(self):
        """Return the readouts as a frame with one column per readout name."""
        sid2value = self._get_tid_results()
        df = pd.DataFrame.from_dict(data=sid2value, orient="index")
        # Rename by tid rather than assigning names positionally, so a column
        # can never receive another readout's name.
        return df.rename(columns=self._get_tid_names())

    def get_outcome(self):
        """Return substances and their outcome side by side."""
        df1 = self._substances_to_df()
        df2 = self._outcome_to_df()
        return pd.concat([df1, df2], axis=1)

    def get_all_results(self):
        """Return substances, outcome and every readout side by side."""
        df1 = self._substances_to_df()
        df2 = self._outcome_to_df()
        df3 = self._tid_to_df()
        return pd.concat([df1, df2, df3], axis=1)

    def print_json(self):
        """Return the loaded record (nothing is printed)."""
        return self.record

In [39]:
# Load the record saved earlier as ../PUBCHEM450.json and flatten it into one
# frame: substance columns, outcome, then one column per readout.
c = PubChemBioAssayJsonConverter("../","PUBCHEM450.json")
df = c.get_all_results()

In [40]:
df

Unnamed: 0,sid,cid,smiles,outcome,Activity Direction,Activity Qualifier,Qualified AC50,Log of AC50,Hill Coefficient,Curve R2,...,Activity at 0.074uM,Activity at 0.165uM,Activity at 0.369uM,Activity at 0.824uM,Activity at 1.844uM,Activity at 4.122uM,Activity at 9.217uM,Activity at 20.61uM,Activity at 0.046mM,Compound Type
0,11112339,6604245.0,CC1CC2C3CCC(C3(CC(C2(C4(C1=CC(=O)C=C4)C)F)O)C)...,2,increasing,=,1.550000e-09,-8.810,1.95,0.93,...,149.0,92.0,93.0,85.0,64.0,89.0,93.0,83.0,,qHTS ECL - Prestwick
1,11112336,6604243.0,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,2,increasing,=,2.030000e-09,-8.693,2.14,0.89,...,120.0,73.0,91.0,105.0,82.0,88.0,94.0,101.0,,qHTS ECL - Prestwick
2,855789,6741.0,CC1CC2C3CCC(C3(CC(C2C4(C1=CC(=O)C=C4)C)O)C)(C(...,2,increasing,=,2.490000e-09,-8.603,8.73,0.98,...,95.0,,133.0,,88.0,,94.0,86.0,,qHTS MLSMR
3,11112906,6604461.0,CCC(=O)OCC(=O)C1(C(CC2C1(CC(C3(C2CCC4=CC(=O)C=...,2,increasing,=,2.290000e-09,-8.640,2.30,0.83,...,98.0,107.0,129.0,131.0,102.0,116.0,102.0,85.0,,qHTS ECL - Prestwick
4,855803,5754.0,CC12CCC(=O)C=C1CCC3C2C(CC4(C3CCC4(C(=O)CO)O)C)O,2,increasing,=,3.470000e-09,-8.460,2.97,0.88,...,251.0,,96.0,,78.0,,91.0,111.0,,qHTS MLSMR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10944,11114235,6604919.0,CC1=C(C(C(=C(N1CC#C)C)C(=O)OC)C2=CC(=CC=C2)N3O...,1,inactive,>,4.610000e-03,-2.336,,,...,2.0,1.0,20.0,-2.0,-6.0,,2.0,-3.0,3.0,qHTS ECL - Tocris
10945,11114238,438981.0,C1=CC2=NC3=C(NN=C3N=C2C=C1)N,1,inactive,>,4.610000e-03,-2.336,,,...,0.0,4.0,16.0,-7.0,-1.0,,-5.0,-1.0,6.0,qHTS ECL - Tocris
10946,11114258,127382.0,CC(C)(C)OC(=O)C1=C2CN(C(=O)C3=C(N2C=N1)C=CS3)C,1,inactive,>,4.610000e-03,-2.336,,,...,-2.0,-2.0,-1.0,-10.0,-6.0,,1.0,0.0,-7.0,qHTS ECL - Tocris
10947,11114261,6604928.0,CSC1=NC(=C(C(=N1)NC2CCCC2)[N+](=O)[O-])NC3CCCC3,1,inactive,>,4.610000e-03,-2.336,,,...,-4.0,-7.0,7.0,-4.0,-5.0,,-7.0,-9.0,-8.0,qHTS ECL - Tocris


In [34]:
# Column-wise concatenation of three frames.
# NOTE(review): df1, df2 and df3 are not defined by any visible cell -
# presumably left over from an earlier interactive session; confirm.
df_all = pd.concat([df1, df2, df3], axis=1)

In [35]:
df_all

Unnamed: 0,sid,cid,smiles,outcome,Activity Direction,Activity Qualifier,Qualified AC50,Log of AC50,Hill Coefficient,Curve R2,...,Activity at 0.074uM,Activity at 0.165uM,Activity at 0.369uM,Activity at 0.824uM,Activity at 1.844uM,Activity at 4.122uM,Activity at 9.217uM,Activity at 20.61uM,Activity at 0.046mM,Compound Type
0,11112339,6604245.0,CC1CC2C3CCC(C3(CC(C2(C4(C1=CC(=O)C=C4)C)F)O)C)...,2,increasing,=,1.550000e-09,-8.810,1.95,0.93,...,149.0,92.0,93.0,85.0,64.0,89.0,93.0,83.0,,qHTS ECL - Prestwick
1,11112336,6604243.0,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,2,increasing,=,2.030000e-09,-8.693,2.14,0.89,...,120.0,73.0,91.0,105.0,82.0,88.0,94.0,101.0,,qHTS ECL - Prestwick
2,855789,6741.0,CC1CC2C3CCC(C3(CC(C2C4(C1=CC(=O)C=C4)C)O)C)(C(...,2,increasing,=,2.490000e-09,-8.603,8.73,0.98,...,95.0,,133.0,,88.0,,94.0,86.0,,qHTS MLSMR
3,11112906,6604461.0,CCC(=O)OCC(=O)C1(C(CC2C1(CC(C3(C2CCC4=CC(=O)C=...,2,increasing,=,2.290000e-09,-8.640,2.30,0.83,...,98.0,107.0,129.0,131.0,102.0,116.0,102.0,85.0,,qHTS ECL - Prestwick
4,855803,5754.0,CC12CCC(=O)C=C1CCC3C2C(CC4(C3CCC4(C(=O)CO)O)C)O,2,increasing,=,3.470000e-09,-8.460,2.97,0.88,...,251.0,,96.0,,78.0,,91.0,111.0,,qHTS MLSMR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10944,11114235,6604919.0,CC1=C(C(C(=C(N1CC#C)C)C(=O)OC)C2=CC(=CC=C2)N3O...,1,inactive,>,4.610000e-03,-2.336,,,...,2.0,1.0,20.0,-2.0,-6.0,,2.0,-3.0,3.0,qHTS ECL - Tocris
10945,11114238,438981.0,C1=CC2=NC3=C(NN=C3N=C2C=C1)N,1,inactive,>,4.610000e-03,-2.336,,,...,0.0,4.0,16.0,-7.0,-1.0,,-5.0,-1.0,6.0,qHTS ECL - Tocris
10946,11114258,127382.0,CC(C)(C)OC(=O)C1=C2CN(C(=O)C3=C(N2C=N1)C=CS3)C,1,inactive,>,4.610000e-03,-2.336,,,...,-2.0,-2.0,-1.0,-10.0,-6.0,,1.0,0.0,-7.0,qHTS ECL - Tocris
10947,11114261,6604928.0,CSC1=NC(=C(C(=N1)NC2CCCC2)[N+](=O)[O-])NC3CCCC3,1,inactive,>,4.610000e-03,-2.336,,,...,-4.0,-7.0,7.0,-4.0,-5.0,,-7.0,-9.0,-8.0,qHTS ECL - Tocris


In [None]:
import requests

# Fetch assay 7001 as JSON; `result` is the raw response (parsing is left
# commented out below).
result = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/7001/JSON")
#json.loads(result.text)

In [None]:
import json
from pubchempy import get_substances


HEADER = "PC_AssaySubmit"
PUBCHEM_PREFIX = "PUBCHEM"
# Key under which get() stores the prefixed assay identifier.
KEYNAME = "AssayId"


class PubChemBioAssayRecordFromJson(object):
    """Read one assay from a local JSON file and annotate it with compounds.

    The record is accessed under the top-level ``PC_AssaySubmit`` key
    (``HEADER``): the description at ``["assay"]["descr"]`` and the data rows,
    each carrying a ``sid``, at ``["data"]``.
    """

    def __init__(self, json_file):
        """Load the assay record from ``json_file``."""
        # Use the argument itself, not a module-level variable that merely
        # happens to hold the same path.
        with open(json_file, "r") as f:
            self.record = json.load(f)

    def get_id(self, record):
        """Return the assay ID stored in the description."""
        return record[HEADER]["assay"]["descr"]["aid"]["id"]

    def get_sids(self, record):
        """Return the SID of every data row, in row order."""
        return [d["sid"] for d in record[HEADER]["data"]]

    def get_compounds_from_sids(self, sids):
        """Return one ``(cid, smiles)`` tuple per substance from pubchempy.

        Substances without a standardized compound yield a ``None`` SMILES
        instead of raising ``AttributeError``.
        """
        compounds = []
        for subs in get_substances(sids):
            standardized = subs.standardized_compound
            smiles = standardized.canonical_smiles if standardized is not None else None
            compounds.append((subs.standardized_cid, smiles))
        return compounds

    def get_description(self, record):
        """Return the assay description."""
        return record[HEADER]["assay"]["descr"]

    def get_data(self, record):
        """Return the list of data rows."""
        return record[HEADER]["data"]

    def get_data_with_compounds(self, record):
        """Return the data rows with ``cid`` and ``smiles`` keys added.

        Compounds are matched to rows by position.
        NOTE(review): this assumes get_substances() returns exactly one
        substance per requested SID, in request order - confirm.
        """
        sids = self.get_sids(record)
        compounds = self.get_compounds_from_sids(sids)
        data = self.get_data(record)
        for i, row in enumerate(data):
            cid, smiles = compounds[i]
            row["cid"] = cid
            row["smiles"] = smiles
        return data

    def get(self):
        """Return the assembled record.

        Returns:
            dict: keys ``AssayId`` (``PUBCHEM_PREFIX`` + assay ID),
            ``Description`` and ``Data``.
        """
        result = {
            "assay_id": self.get_id(self.record),
            "description": self.get_description(self.record),
            "data": self.get_data_with_compounds(self.record)
        }
        return {KEYNAME: "{0}{1}".format(PUBCHEM_PREFIX, result["assay_id"]), "Description": result["description"], "Data": result["data"]}

In [None]:
#https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/Concise/JSON/0000001_0001000.zip

# Path to a local concise assay JSON file (the URL above points to the
# archive such files come from); the same path is passed to the constructor.
file_name = "../1.concise.json" 
getter = PubChemBioAssayRecordFromJson("../1.concise.json")

In [None]:
# Assemble the record; this looks the substances up through pubchempy.
data = getter.get()