# LiverTox parser

LiverTox has been moved from Toxnet to NLM bookshelf with a public domain licensing. It has not been retired and is still updated. NLM bookshelf has an ftp server for retrieving open-licensed materials.  In the process of migrating LiverTox over, it has also been more strictly formatted, making it slightly more easy to parse for information.

To obtain the latest dataset, pull the ftp index from here:
ftp://ftp.ncbi.nlm.nih.gov/pub/litarch/file_list.txt

Next, search for LiverTox and get the url. Every entry in livertox is stored as a pdf and xml file in the compressed file. The filename is the livertox urlstub. The urlbase for LiverTox is https://www.ncbi.nlm.nih.gov/books/n/livertox/


In [1]:
from wikidataintegrator import wdi_core, wdi_login, wdi_helpers
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
import ftplib
from tqdm.notebook import trange, tqdm
import ipywidgets 
import widgetsnbextension
import xml.etree.ElementTree as et 
import time
import os


In [33]:
## Note that the property start date is used for list date.
## When placed in the references, Deltabot moved it out as a qualifier

from datetime import datetime
import copy
def create_reference(LiverToxUrl,LastUpdate):
    refStatedIn = wdi_core.WDItemID(value="Q78239405", prop_nr="P248", is_reference=True)
    timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    refRetrieved = wdi_core.WDTime(timeStringNow, prop_nr="P813", is_reference=True)
    refURL = wdi_core.WDUrl(value=LiverToxUrl, prop_nr="P854", is_reference=True)
    refUpdate = wdi_core.WDTime(LastUpdate, prop_nr="P5017", is_reference=True)
    return [refStatedIn, refRetrieved, refURL, refUpdate]

In [6]:
print("Logging in...")
import wdi_user_config ## Credentials stored in a wdi_user_config file
login_dict = wdi_user_config.get_credentials()
login = wdi_login.WDLogin(login_dict['WDUSER'], login_dict['WDPASS'])


Logging in...
Successfully logged in as Gtsulab


In [7]:
urls_dict = {
    'ftp_index':'ftp://ftp.ncbi.nlm.nih.gov/pub/litarch/file_list.txt',
    'ftp_base':'ftp://ftp.ncbi.nlm.nih.gov/pub/litarch/',
    'bookshelf_base':'https://www.ncbi.nlm.nih.gov/books/n/livertox/',
    'local_data':'data/'
    }

In [None]:
## Fetch the index file to determine the url of the current livertox dataset
#bookshelf_index = ftplib.FTP(urls_dict['ftp_base'])
#bookshelf_index.login()

#print(bookshelf_index.text)

## This is an example of how to use the ftplib to pull things from an ftp server
"""
import os
from ftplib import FTP
 
ftp = FTP("www.myWebsite.com", "USERNAME", "PASSWORD")
ftp.login()
ftp.retrlines("LIST")
 
ftp.cwd("folderOne")
ftp.cwd("subFolder") # or ftp.cwd("folderOne/subFolder")
 
listing = []
ftp.retrlines("LIST", listing.append)
words = listing[0].split(None, 8)
filename = words[-1].lstrip()
 
# download the file
local_filename = os.path.join(r"c:\myfolder", filename)
lf = open(local_filename, "wb")
ftp.retrbinary("RETR " + filename, lf.write, 8*1024)
lf.close()
"""

In [8]:
## Once the file has been downloaded, process it for data of interest
import tarfile
import tempfile
import stat

tar_file = urls_dict['local_data']+'livertox_NBK547852.tar.gz'
tar = tarfile.open(tar_file, "r:gz")
members = tar.getmembers()
print(len(members))

3155


In [9]:
phenotypes = []
drug_types = []
misc_pages = []
drug_pages = []
for member in members:
    if '.nxml' in str(member):
        extracted = tar.extractfile(member)
        tree = et.parse(extracted)
        root = tree.getroot()
        try:
            metainfo = root.find('book-part').find('book-part-meta')
            url_stub = metainfo.find('book-part-id').text
            drug_name = metainfo.find('title-group').find('title').text
            date_info = metainfo.find('pub-history').find('date')
            date_day = date_info.find('day').text
            date_month = date_info.find('month').text
            date_year = date_info.find('year').text
            tmp_date = str(date_year)+"-"+str(date_month)+"-"+str(date_day)
            if date_info.attrib['date-type'] == 'updated':
                update_date = tmp_date
                original_date = None
            else:
                original_date = tmp_date
                update_date = None
                
            basic_meta = {'Title':drug_name,'url_stub':url_stub,
                          'last_update':update_date,'original_date':original_date}
            ## Determine if the page is about a drug, drug class, or phenotype
            if 'Phenotypes' in url_stub:
                phenotypes.append(basic_meta)
            else:
                book_content = root.find('book-part').find('body').find('sec')
                ## We're only interested in the Overview section of the drug and drug class pages
                if ".OVERVIEW" in book_content.attrib['id']:
                    ## If the xml gives a list at the top level of the overview page, it's likely a drug class
                    basic_tags = [elem.tag for elem in book_content.iter()]
                    if 'list' in basic_tags:
                        drug_types.append(basic_meta)
                    ## If it has a hepatotoxicity section, it's likely a drug page                        
                    id_to_find = url_stub+".Hepatotoxicity"
                    for elem in book_content.iter(tag='sec'):
                        if elem.attrib["id"] == id_to_find:
                            ## If it's a drug page, grab the likelihood score
                            for eachp in elem.iter(tag='p'):
                                if "Likelihood score" in eachp.text:
                                    tmpinfo = eachp.text[18:].split(" (")
                                    basic_meta['likelihood_score'] = tmpinfo[0]
                                    basic_meta['score_info'] = tmpinfo[1].replace(").","")
                                    drug_pages.append(basic_meta)
                                  
        except:
            misc_pages.append(str(root))

In [10]:
drug_page_df = pd.DataFrame(drug_pages)
phenotypes_df = pd.DataFrame(phenotypes)
drug_types_df = pd.DataFrame(drug_types)
print(len(drug_page_df))

918


In [None]:
## Export Results for Mix N Match

#drug_types_df.to_csv('results/drug_types_df.tsv',sep='\t',header=True)
#phenotypes_df.to_csv('results/phenotypes_df.tsv',sep='\t',header=True)
#drug_page_df.to_csv('results/drug_page_df.tsv',sep='\t',header=True)

In [11]:
### Filter out LiverTox entries for which a liver toxicity rating was determined

liver_tox_scores = pd.concat((drug_page_df.loc[~drug_page_df['likelihood_score'].isnull()],
                              drug_types_df.loc[~drug_page_df['likelihood_score'].isnull()]), ignore_index=True)
liver_tox_scores.drop_duplicates(keep='first',inplace=True)
print(len(liver_tox_scores))
#print(liver_tox_scores.head(n=2))

987


In [12]:
## Pull Items mapped to LiverTox IDs
sparqlQuery = "SELECT ?item ?livertox WHERE {?item wdt:P7830 ?livertox}"
result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
livertox_in_wd_list = []

i=0
while i < len(result["results"]["bindings"]):
    livertox_id = result["results"]["bindings"][i]["livertox"]["value"]
    wdid = result["results"]["bindings"][i]["item"]["value"].replace("http://www.wikidata.org/entity/", "")
    livertox_in_wd_list.append({'WDID':wdid,'url_stub':livertox_id})
    i=i+1

livertox_in_wd = pd.DataFrame(livertox_in_wd_list)
print(livertox_in_wd.head(n=2))

      WDID     url_stub
0  Q337668  Acamprosate
1  Q338005     Acarbose


In [13]:
## Results to generate list of Scores to be added
livertox2add = liver_tox_scores.merge(livertox_in_wd,on='url_stub',how='inner')
print(len(livertox2add))
print(livertox2add.head(n=2))
livertox2add.to_csv('results/livertox2add.tsv',sep='\t',header=True,encoding='UTF-8')

709
         Title     url_stub last_update original_date likelihood_score  \
0      Quinine      Quinine   2018-5-15          None                B   
1  Leflunomide  Leflunomide   2019-4-15          None                B   

                                          score_info     WDID  
0  highly likely cause of clinically apparent liv...  Q189522  
1  well known cause of idiosyncratic clinically a...  Q248550  


In [None]:
### Add LiverTox score rubric to Wikidata as Items as well as qualifiers
### Create property proposals for LiverTox IDs and scores
### Add the actual score text using Stated as (P1932) string
### Model the high dose scores with Has contributing factor (P1479) or has cause (P828)
### LiverTox score property proposal approved (P8026)

In [20]:
livertox2add = read_csv('results/livertox2add.tsv',delimiter='\t',header=0,encoding='UTF-8',index_col=0)
livertox2add['scorelength'] = livertox2add['likelihood_score'].str.len()
no_quals = livertox2add.loc[livertox2add['scorelength']<=2.0].copy()
qualed = livertox2add.loc[(~livertox2add['likelihood_score'].isna())&
                          (livertox2add['scorelength']>2)]
highdose = qualed.loc[(qualed['likelihood_score'].str.contains('H'))&
                      (qualed['scorelength']<6)]

issues = qualed.loc[~qualed['WDID'].isin(highdose['WDID'].unique().tolist())]
print(no_quals.head(n=2))

         Title     url_stub last_update  original_date likelihood_score  \
0      Quinine      Quinine   2018-5-15            NaN                B   
1  Leflunomide  Leflunomide   2019-4-15            NaN                B   

                                          score_info     WDID  scorelength  
0  highly likely cause of clinically apparent liv...  Q189522          1.0  
1  well known cause of idiosyncratic clinically a...  Q248550          1.0  


In [18]:
print(no_quals['likelihood_score'].unique().tolist())

['B', 'E', 'D', 'A', 'C', 'E*']


In [32]:
score_dict = {"A":"Q83283320", 
              "B":"Q83284157",
              "C":"Q83284310",
              "D":"Q83284515",
              "E":"Q83284667",
              "E*":"Q83284878",
              "X":"Q83285040",
              "HD":"Q83285233" 
             }

no_quals['likelihood_score'].replace(score_dict,inplace=True)
no_quals['url'] = 'https://www.ncbi.nlm.nih.gov/books/n/livertox/'+ no_quals['url_stub'].astype(str)
print(no_quals.iloc[0]['url'])
print(no_quals.head(n=2))

https://www.ncbi.nlm.nih.gov/books/n/livertox/Quinine
         Title     url_stub last_update  original_date likelihood_score  \
0      Quinine      Quinine   2018-5-15            NaN        Q83284157   
1  Leflunomide  Leflunomide   2019-4-15            NaN        Q83284157   

                                          score_info     WDID  scorelength  \
0  highly likely cause of clinically apparent liv...  Q189522          1.0   
1  well known cause of idiosyncratic clinically a...  Q248550          1.0   

                                                 url  
0  https://www.ncbi.nlm.nih.gov/books/n/livertox/...  
1  https://www.ncbi.nlm.nih.gov/books/n/livertox/...  


In [35]:
print(no_quals.iloc[0]['last_update'])
print(datetime.strptime(no_quals.iloc[0]['last_update'],"%Y-%m-%d"))
print(datetime.strptime(no_quals.iloc[0]['last_update'],"%Y-%m-%d").strftime("+%Y-%m-%dT00:00:00Z"))


2018-5-15
2018-05-15 00:00:00
+2018-05-15T00:00:00Z


In [37]:
## Unit test-- write a statement
drug_qid = 'Q189522'
phen_qid = 'Q83284157'
LiverToxUrl = no_quals.iloc[0]['url']
LastUpdate = datetime.strptime(no_quals.iloc[0]['last_update'],"%Y-%m-%d").strftime("+%Y-%m-%dT00:00:00Z")
reference = create_reference(LiverToxUrl,LastUpdate)
statement = [wdi_core.WDItemID(value=phen_qid, prop_nr="P8026", references=[copy.deepcopy(reference)])]
item = wdi_core.WDItemEngine(wd_item_id=drug_qid, data=statement, append_value="P8026",
                       global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
item.write(login)


'Q189522'

In [None]:
i=1
while i < len(no_quals):
    drug_qid = no_quals.iloc[i]['WDID']
    phen_qid = no_quals.iloc[i]['likelihood_score']
    LiverToxUrl = no_quals.iloc[i]['url']
    LastUpdate = datetime.strptime(no_quals.iloc[i]['last_update'],"%Y-%m-%d").strftime("+%Y-%m-%dT00:00:00Z")
    reference = create_reference(LiverToxUrl,LastUpdate)
    statement = [wdi_core.WDItemID(value=phen_qid, prop_nr="P8026", references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=drug_qid, data=statement, append_value="P8026",
                           global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login)  
    i=i+1
    time.sleep(0.5)