# Tox21 get data

Get Tox21 data from PubChem _via_ the [PUG API](https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html).

In [1]:
%run setup.py

### Config

In [2]:
# PUG API URL to retrieve all SIDs associated with an AID...

sids_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/sids/JSON"

# PUG API URL to retrieve data associated with an AID and a list of SIDs...

data_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/JSON?sid={sids}"

In [3]:
# Number of entities to request data for in each call to PUG API...

chunk_size = 500

In [4]:
# Endpoint columns (ordered)...

endpoint_cols = ['Summary', 'Activity', 'Potency (uM)', 'Efficacy (%)', 'Viability Activity', 'Viability Potency (uM)', 'Viability Efficacy (%)']

In [None]:
# Directory for reading and writing data files...

data_dir = 'data'

### Initialisation

In [5]:
if not 'logger' in locals(): logger = make_logger.run(__name__)

### Reload Tox21 Summary Assays

See [here](0_Tox21_assays.ipynb) for details.

In [6]:
summary_assays_df = pd.read_pickle(os.path.join(data_dir, 'tox21_summary_assays.pkl'))

summary_assays_df.shape

(35, 4)

## Get Tox21 data

Get assay data for Tox21 summary assays.

In [7]:
# Get data for an AID for a list of SIDs...

def do_sids(aid, sids):

    response = requests.get(data_url.format(aid=aid, sids=','.join(str(x) for x in sids)))

    assert response.status_code == 200

    response = response.json()['PC_AssayContainer'][0]

    ### assay_df = pd.DataFrame([(x['tid'], x['name'], x['description'][0], x['type'], x['unit']) for x in response['assay']['descr']['results']], columns=['tid', 'name', 'description', 'type', 'unit'])
    
    assay_df = pd.DataFrame([(x['tid'], x['name']) for x in response['assay']['descr']['results']], columns=['tid', 'name'])

    def f(record):

        df = pd.DataFrame([(y, list(z.values())[0], list(z.keys())[0]) for y, z in [(x['tid'], x['value']) for x in record['data']]], columns=['tid', 'value', 'dtype'])

        df['sid'] = record['sid']

        return df

    return pd.concat(f(x) for x in response['data']).merge(assay_df, on='tid').sort_values(['sid', 'tid']).drop('tid', axis=1)

In [8]:
# Get data for an AID...

def do_aid(aid):
    
    response = requests.get(sids_url.format(aid=aid))

    assert response.status_code == 200

    sids = response.json()['InformationList']['Information'][0]['SID']

    n_sids = len(sids)

    logger.info("\tGot {} SIDs ({} chunks to do)...".format(n_sids, ceil(10001 / chunk_size)))

    dfs = []

    for n, chunk in enumerate((sids[i:i+chunk_size] for i in range(0, n_sids, chunk_size)), 1):

        logger.info("\tStarting chunk {}...".format(n))

        df = do_sids(aid, chunk)

        logger.info("\t...finished chunk: got {} records.".format(df.shape[0]))

        dfs.append(df)

    df = pd.concat(dfs)

    df['aid'] = aid

    return df

In [9]:
n_aids = summary_assays_df['AID'].size
    
dfs = []

logger.info("Starting to process AIDs: {} to do...".format(n_aids))

for n, aid in enumerate(summary_assays_df['AID'], 1):
    
    logger.info("Starting AID {} ({}/{})...".format(aid, n, n_aids))

    df = do_aid(aid)
    
    dfs.append(df)
    
    logger.info("... done AID {}.".format(aid))
    
logger.info("Finished.")

tox21_data_df = pd.concat(dfs).reset_index(drop=True)

[26/01/17 09:03:38 __main__ INFO] Starting to process AIDs: 35 to do...
[26/01/17 09:03:38 __main__ INFO] Starting AID 743228 (1/35)...
[26/01/17 09:03:39 __main__ INFO] 	Got 9305 SIDs (21 chunks to do)...
[26/01/17 09:03:39 __main__ INFO] 	Starting chunk 1...
[26/01/17 09:03:41 __main__ INFO] 	...finished chunk: got 5614 records.
[26/01/17 09:03:41 __main__ INFO] 	Starting chunk 2...
[26/01/17 09:03:43 __main__ INFO] 	...finished chunk: got 5599 records.
[26/01/17 09:03:43 __main__ INFO] 	Starting chunk 3...
[26/01/17 09:03:45 __main__ INFO] 	...finished chunk: got 5623 records.
[26/01/17 09:03:45 __main__ INFO] 	Starting chunk 4...
[26/01/17 09:03:46 __main__ INFO] 	...finished chunk: got 5603 records.
[26/01/17 09:03:46 __main__ INFO] 	Starting chunk 5...
[26/01/17 09:03:48 __main__ INFO] 	...finished chunk: got 5577 records.
[26/01/17 09:03:48 __main__ INFO] 	Starting chunk 6...
[26/01/17 09:03:50 __main__ INFO] 	...finished chunk: got 5586 records.
[26/01/17 09:03:50 __main__ INFO

In [10]:
tox21_data_df.shape

(3185911, 5)

In [11]:
tox21_data_df.head()

Unnamed: 0,value,dtype,sid,name,aid
0,inactive,sval,144206330,Activity Summary,743228
1,inactive,sval,144206330,Ratio Activity,743228
2,0,fval,144206330,Ratio Efficacy (%),743228
3,inactive,sval,144206330,530 nm Activity,743228
4,0,fval,144206330,530 nm Efficacy (%),743228


In [12]:
# Save a copy of the raw data, as it takes a while to generate...

# tox21_data_df.to_pickle(os.path.join(data_dir, 'tox21_data_raw.pkl'))

In [13]:
# tox21_data_df = pd.read_pickle(os.path.join(data_dir, 'tox21_data_raw.pkl'))

# tox21_data_df.shape

In [14]:
# Rename and reorder columns...

tox21_data_df = (
    tox21_data_df
        .rename(columns={'aid': 'AID', 'sid': 'SID', 'name': 'endpoint'})
        [['AID', 'SID', 'endpoint', 'value', 'dtype']]
        .reset_index(drop=True)
)

tox21_data_df.shape

(3185911, 5)

In [15]:
tox21_data_df.head()

Unnamed: 0,AID,SID,endpoint,value,dtype
0,743228,144206330,Activity Summary,inactive,sval
1,743228,144206330,Ratio Activity,inactive,sval
2,743228,144206330,Ratio Efficacy (%),0,fval
3,743228,144206330,530 nm Activity,inactive,sval
4,743228,144206330,530 nm Efficacy (%),0,fval


See [here](0_Tox21_assays.ipynb#assay_endpoints) for more details on the standardisation of assay endpoints.

In [16]:
# Standardise endpoint (a.k.a. 'name')...

tox21_data_df['endpoint_original'] = tox21_data_df['endpoint']

tox21_data_df['endpoint'] = tox21_data_df['endpoint_original'].apply(lambda x: re.sub('^\w+', '', x).strip() if not re.search("nm|Viability|Source|Supplier", x) else x)

In [17]:
tox21_data_df.head()

Unnamed: 0,AID,SID,endpoint,value,dtype,endpoint_original
0,743228,144206330,Summary,inactive,sval,Activity Summary
1,743228,144206330,Activity,inactive,sval,Ratio Activity
2,743228,144206330,Efficacy (%),0,fval,Ratio Efficacy (%)
3,743228,144206330,530 nm Activity,inactive,sval,530 nm Activity
4,743228,144206330,530 nm Efficacy (%),0,fval,530 nm Efficacy (%)


In [18]:
# Inspect changes...

df = tox21_data_df.query("endpoint != endpoint_original").apply(lambda df: "{} -> {}".format(df['endpoint_original'], df['endpoint']), axis=1).value_counts().to_frame('count').reset_index()

df.head(20)

Unnamed: 0,index,count
0,Activity Summary -> Summary,346670
1,Ratio Activity -> Activity,245086
2,Ratio Efficacy (%) -> Efficacy (%),239537
3,Antagonist Activity -> Activity,39487
4,Antagonist Efficacy (%) -> Efficacy (%),36672
5,Ratio Potency (uM) -> Potency (uM),30020
6,Antagonist Potency (uM) -> Potency (uM),10584
7,TR Activity -> Activity,10486
8,AhR Activity -> Activity,10486
9,ATAD5 Activity -> Activity,10486


In [19]:
# Filter rows on the standardised endpoint...

tox21_data_df = tox21_data_df[tox21_data_df['endpoint'].isin(endpoint_cols)]

tox21_data_df.drop('endpoint_original', axis=1, inplace=True) # The original endpoint column is now superfluous

tox21_data_df.shape

(1694115, 5)

In [20]:
tox21_data_df.head()

Unnamed: 0,AID,SID,endpoint,value,dtype
0,743228,144206330,Summary,inactive,sval
1,743228,144206330,Activity,inactive,sval
2,743228,144206330,Efficacy (%),0,fval
7,743228,144206330,Viability Activity,inactive,sval
8,743228,144206330,Viability Efficacy (%),0,fval


In [21]:
# Check number of assays is as expected...

tox21_data_df['AID'].unique().size

35

In [22]:
# Save filtered table of Tox21 assay data...

tox21_data_df.to_pickle(os.path.join(data_dir, 'tox21_data.pkl'))

In [23]:
# tox21_data_df = pd.read_pickle(os.path.join(data_dir, 'tox21_data.pkl'))

# tox21_data_df.shape