# LiverTox parser

LiverTox has been moved from Toxnet to the NLM Bookshelf under a public domain license. It has not been retired and is still updated. The NLM Bookshelf has an FTP server for retrieving open-licensed materials. In the process of migrating LiverTox, its content has also been more strictly formatted, making it slightly easier to parse for information.

To obtain the latest dataset, pull the ftp index from here:
ftp://ftp.ncbi.nlm.nih.gov/pub/litarch/file_list.txt

Next, search the index for LiverTox and get the URL of its archive. Every entry in LiverTox is stored as a PDF and an XML file in the compressed archive. The filename is the LiverTox URL stub. The URL base for LiverTox is https://www.ncbi.nlm.nih.gov/books/n/livertox/


In [1]:
from wikidataintegrator import wdi_core, wdi_login, wdi_helpers
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
import ftplib
from tqdm.notebook import trange, tqdm
import ipywidgets 
import widgetsnbextension
import xml.etree.ElementTree as et 
import time
import os


In [2]:
## Note that the property start date is used for list date.
## When placed in the references, Deltabot moved it out as a qualifier

from datetime import datetime
import copy
def create_reference(prop65_url):
    """Build the list of reference snaks attached to a statement.

    The list holds, in order: a "stated in" (P248) item reference, a
    "retrieved" (P813) timestamp for today at day precision, and a
    "reference URL" (P854) pointing at ``prop65_url``.

    NOTE(review): Q28455381 is presumably the Wikidata item for LiverTox --
    confirm. The date is taken from the local clock but formatted with a
    ``Z`` suffix.

    Args:
        prop65_url: URL of the source page to cite.

    Returns:
        ``[stated_in, retrieved, reference_url]`` as wdi_core reference
        objects.
    """
    retrieved_on = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    return [
        wdi_core.WDItemID(value="Q28455381", prop_nr="P248", is_reference=True),
        wdi_core.WDTime(retrieved_on, prop_nr="P813", is_reference=True),
        wdi_core.WDUrl(value=prop65_url, prop_nr="P854", is_reference=True),
    ]

In [None]:
## Log in with stored credentials (presumably to Wikidata via wikidataintegrator);
## the resulting session object is bound to the module-level name `login`.
print("Logging in...")
import wdi_user_config ## Credentials stored in a wdi_user_config file
## get_credentials() must return a mapping with 'WDUSER' and 'WDPASS' keys,
## since both are read on the next line.
login_dict = wdi_user_config.get_credentials()
login = wdi_login.WDLogin(login_dict['WDUSER'], login_dict['WDPASS'])


In [3]:
## Locations used throughout the notebook:
##   ftp_index      - listing of every archive on the Bookshelf FTP server
##   ftp_base       - FTP directory that holds the archives
##   bookshelf_base - prefix for building a LiverTox page URL from a url stub
##   local_data     - local directory where the downloaded archive is kept
urls_dict = dict(
    ftp_index='ftp://ftp.ncbi.nlm.nih.gov/pub/litarch/file_list.txt',
    ftp_base='ftp://ftp.ncbi.nlm.nih.gov/pub/litarch/',
    bookshelf_base='https://www.ncbi.nlm.nih.gov/books/n/livertox/',
    local_data='data/',
)

In [12]:
## Fetch the index file to determine the url of the current livertox dataset
## NOTE: this step is not implemented yet -- the statements below are a disabled
## draft, so nothing in this cell downloads anything.
## NOTE(review): ftplib.FTP() takes a bare host name (e.g. "ftp.ncbi.nlm.nih.gov"),
## not a full "ftp://..." URL such as urls_dict['ftp_base']; the directory part
## has to be selected with cwd() after login. Fix before enabling.
#bookshelf_index = ftplib.FTP(urls_dict['ftp_base'])
#bookshelf_index.login()

#print(bookshelf_index.text)

## This is an example of how to use the ftplib to pull things from an ftp server
## It is kept as an inert string literal: none of it is executed.
## NOTE(review): the literal is not a raw string, so the `\m` in the Windows
## path inside it is an invalid escape sequence and triggers a warning on
## recent Python versions.
"""
import os
from ftplib import FTP
 
ftp = FTP("www.myWebsite.com", "USERNAME", "PASSWORD")
ftp.login()
ftp.retrlines("LIST")
 
ftp.cwd("folderOne")
ftp.cwd("subFolder") # or ftp.cwd("folderOne/subFolder")
 
listing = []
ftp.retrlines("LIST", listing.append)
words = listing[0].split(None, 8)
filename = words[-1].lstrip()
 
# download the file
local_filename = os.path.join(r"c:\myfolder", filename)
lf = open(local_filename, "wb")
ftp.retrbinary("RETR " + filename, lf.write, 8*1024)
lf.close()
"""

In [4]:
## Once the file has been downloaded, process it for data of interest
import tarfile
import tempfile
import stat

## Open the gzip-compressed LiverTox archive from the local data directory and
## list its members. The archive handle `tar` is intentionally left open:
## the parsing loop reads individual members from it with tar.extractfile().
tar_file = "{}livertox_NBK547852.tar.gz".format(urls_dict['local_data'])
tar = tarfile.open(name=tar_file, mode="r:gz")
members = tar.getmembers()
## Report how many entries (of any kind) the archive contains
print(len(members))

3372


In [102]:
## Paragraphs that carry the LiverTox rating contain this marker,
## e.g. "Likelihood score: B (highly likely cause of ...)."
_SCORE_MARKER = "Likelihood score"


def _read_basic_meta(root):
    """Extract title, url stub and publication date from a parsed page.

    Only the first ``date`` element of ``pub-history`` is read. If its
    ``date-type`` is ``'updated'`` the date is stored as ``last_update``,
    otherwise as ``original_date``; the other key is set to ``None``.

    Raises:
        AttributeError: an expected element is missing.
        KeyError: the url stub is empty or ``date-type`` is missing.
    """
    metainfo = root.find('book-part').find('book-part-meta')
    url_stub = metainfo.find('book-part-id').text
    if url_stub is None:
        ## Every later step keys on the stub, so an empty one is unusable
        raise KeyError('book-part-id')
    drug_name = metainfo.find('title-group').find('title').text
    date_info = metainfo.find('pub-history').find('date')
    ## Dates are kept as unpadded "year-month-day" strings, as found in the XML
    tmp_date = "-".join(
        str(date_info.find(part).text) for part in ('year', 'month', 'day')
    )
    is_update = date_info.attrib['date-type'] == 'updated'
    return {
        'Title': drug_name,
        'url_stub': url_stub,
        'last_update': tmp_date if is_update else None,
        'original_date': None if is_update else tmp_date,
    }


def _parse_likelihood(text):
    """Split a likelihood paragraph into ``(score, explanation)``.

    ``"Likelihood score: B (highly likely cause)."`` yields
    ``("B", "highly likely cause")``. The explanation is ``None`` when the
    paragraph has no parenthesised part. Returns ``None`` when ``text`` does
    not contain the marker or no score follows it.
    """
    _, marker, remainder = text.partition(_SCORE_MARKER)
    if not marker:
        return None
    ## Locate the score relative to the marker instead of a fixed offset so
    ## leading whitespace or text before the marker does not corrupt it
    remainder = remainder.strip().lstrip(":")
    score, opener, explanation = remainder.partition(" (")
    score = score.strip()
    if not score:
        return None
    if not opener:
        return score, None
    return score, explanation.replace(").", "").strip()


def _find_likelihood(overview, url_stub):
    """Return the ``(score, explanation)`` of a page's Hepatotoxicity section.

    Searches the ``sec`` element whose id is ``<url_stub>.Hepatotoxicity``.
    If several paragraphs carry a score the last one wins. Returns ``None``
    when the section or the score is absent.
    """
    id_to_find = url_stub + ".Hepatotoxicity"
    found = None
    for section in overview.iter(tag='sec'):
        ## .get() because nested sections are not guaranteed to have an id
        if section.get('id') != id_to_find:
            continue
        for paragraph in section.iter(tag='p'):
            ## itertext() also covers paragraphs whose text starts inside a
            ## child element, where paragraph.text is None
            parsed = _parse_likelihood("".join(paragraph.itertext()))
            if parsed is not None:
                found = parsed
    return found


## Sort every .nxml page of the archive into one of four buckets:
##   phenotypes - pages whose url stub contains 'Phenotypes'
##   drug_types - overview pages containing a list (likely a drug class)
##   drug_pages - overview pages with a likelihood score (likely a drug)
##   misc_pages - archive member names of pages that could not be parsed
## Pages that fit none of the buckets are skipped.
phenotypes = []
drug_types = []
misc_pages = []
drug_pages = []
for member in members:
    if not (member.isfile() and member.name.endswith('.nxml')):
        continue
    try:
        root = et.parse(tar.extractfile(member)).getroot()
        basic_meta = _read_basic_meta(root)
    except (et.ParseError, AttributeError, KeyError):
        ## Malformed XML or missing metadata: remember which file it was
        misc_pages.append(member.name)
        continue

    url_stub = basic_meta['url_stub']
    ## Determine if the page is about a drug, drug class, or phenotype
    if 'Phenotypes' in url_stub:
        phenotypes.append(basic_meta)
        continue

    body = root.find('book-part').find('body')
    book_content = None if body is None else body.find('sec')
    if book_content is None or 'id' not in book_content.attrib:
        misc_pages.append(member.name)
        continue
    ## We're only interested in the Overview section of the drug and drug class pages
    if ".OVERVIEW" not in book_content.attrib['id']:
        continue

    ## If the xml gives a list anywhere in the overview section, it's likely a drug class
    if any(elem.tag == 'list' for elem in book_content.iter()):
        drug_types.append(basic_meta)

    ## If it has a hepatotoxicity section with a likelihood score, it's likely a drug page.
    ## A page can land in both drug_types and drug_pages; both lists then share
    ## the same dict, so the score is visible from the drug_types entry as well.
    likelihood = _find_likelihood(book_content, url_stub)
    if likelihood is not None:
        basic_meta['likelihood_score'], basic_meta['score_info'] = likelihood
        ## Append once per page so several matching paragraphs cannot
        ## produce duplicate rows
        drug_pages.append(basic_meta)

In [104]:
## Tabulate the drug pages, then show the first two rows and the row count
drug_page_df = pd.DataFrame(data=drug_pages)
print(drug_page_df.head(2))
print(drug_page_df.shape[0])

         Title last_update likelihood_score original_date  \
0      Quinine   2018-5-15                B          None   
1  Leflunomide   2019-4-15                B          None   

                                          score_info     url_stub  
0  highly likely cause of clinically apparent liv...      Quinine  
1  well known cause of idiosyncratic clinically a...  Leflunomide  
720


In [108]:
## Tabulate the phenotype pages; print how many were found, then the full table
phenotypes_df = pd.DataFrame(data=phenotypes)
print(len(phenotypes), phenotypes_df, sep="\n")

18
                                                Title last_update  \
0                               Cholestatic Hepatitis    2019-5-4   
1                                        Liver Tumors    2019-5-4   
2   Sinusoidal Obstruction Syndrome (Veno-occlusiv...    2019-5-4   
3                                   Bland Cholestasis    2019-5-4   
4                                Autoimmune Hepatitis    2019-5-4   
5                                           Cirrhosis    2019-5-4   
6                                     Acute Hepatitis    2019-5-4   
7                    Nodular Regenerative Hyperplasia    2019-5-4   
8                            Immunoallergic Hepatitis    2019-5-4   
9             Phenotypes Of Drug Induced Liver Injury    2019-5-4   
10                                Acute Liver Failure    2019-5-4   
11                                    Mixed Hepatitis    2019-5-4   
12                       Vanishing Bile Duct Syndrome    2019-5-4   
13                           No

In [107]:
## Tabulate the drug class pages; print the row count, then the full table
drug_types_df = pd.DataFrame(data=drug_types)
print(drug_types_df.shape[0], drug_types_df, sep="\n")

95
                                         Title last_update likelihood_score  \
0                             Psoriasis Agents   2016-11-2              NaN   
1                        Antiarrhythmic Agents    2017-7-6              NaN   
2                                     Vitamins    2016-5-9              NaN   
3                            Antiemetic Agents   2018-1-15              NaN   
4                             Antiulcer Agents   2019-4-15              NaN   
5          Benign Prostatic Hypertrophy Agents    2018-1-8              NaN   
6                 Penicillins (4th Generation)   2014-1-16              NaN   
7                           Antilipemic Agents    2019-6-4              NaN   
8                               CNS Stimulants   2014-2-13              NaN   
9                                    Alfuzosin    2018-1-8                C   
10                         Antidiabetic Agents    2017-6-6              NaN   
11                            Muscle Relaxants   

[95 rows x 6 columns]


In [None]:
### Filter out drug type entries for which a liver toxicity rating was determined
### Put drug types and phenotypes to mix n match
### Add LiverTox score rubric to Wikidata as Items as well as qualifiers
### Create property proposals for LiverTox IDs and scores