In [53]:
# importing package
import pandas as pd
import numpy as np
import bs4
import requests
import biothings.utils.dataload as bioloading
import re
import os
import csv

In [54]:
# predefined variable
TTD_WEB_LINK = "https://db.idrblab.net/ttd/full-data-download"

# Predefined function

In [55]:
def loadingPage(url):
    request = requests.get(url)
    return bs4.BeautifulSoup(request.text)

def obtain_txt_file(txt_url):
    # creating a new output directory if needed
    folder_path = os.getcwd() + '/output_raw_txtFile'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # write down the txt file
    output_fileName = txt_url[txt_url.rfind('/')+1:txt_url.rfind('.')]
    with open(f'output_raw_txtFile/f{output_fileName}.txt', 'w') as file:
        file.write(requests.get(txt_url).text)

# Actual Program

In [56]:
# loading the whole Web Page
main_webPage = loadingPage(TTD_WEB_LINK)

# collecting link to each individual txt file
doc_collections = main_webPage.findAll('td', attrs={'class' : "v-center"})


# downloading file
print('start downloading file...')
nontxt_files = []
txt_files = []
for doc in doc_collections:
    txt_url = 'https://db.idrblab.net/' + doc.find('a').get('href')

    if txt_url[txt_url.rfind('.')+1:] == 'txt':
        obtain_txt_file(txt_url)
        txt_files.append(txt_url)
    else:
        nontxt_files.append(txt_url)
print(f'processed {txt_files}')
print(f"can't process {nontxt_files}")

start downloading file...
processed ['https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-01-TTD_target_download.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-02-TTD_drug_download.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-03-TTD_crossmatching.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-04-Drug_synonyms.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-05-Drug_disease.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-06-Target_disease.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-08-Biomarker_disease.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P1-09-Target_compound_activity.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P2-01-TTD_uniprot_all.txt', 'https://db.idrblab.net//ttd/sites/default/files/ttd_database/P2-02-TTD_uniprot_successful.txt', 'https://db.idrblab.net//ttd/sites/default/

# One example of utilizing ChatGPT to extract data...
if we use ChatGPT APU, it is going to be fast and efficient

In [73]:
play = list(bioloading.tabfile_feeder('output_raw_txtFile/fP1-01-TTD_target_download.txt'))


In [74]:
# The given nested list
nested_list = play

# Define a function to determine if a line is to be skipped
def is_non_data_line(line):
    # Skip empty lines or lines with dashes or single words that are not data lines
    return not line or re.match(r'^-+$', line[0]) or len(line) == 1

# Extract schema
schema_start_idx = 16
schema_end_idx = 28

schema = []
for item in nested_list[schema_start_idx:schema_end_idx]:
    schema.append(item[0])

# Initialize a dictionary for the DataFrame
data_dict = {key: [] for key in schema}
data_dict["DRUGINFO"] = []

# Iterate over the nested list to extract data
current_target = None
for i in range(schema_end_idx + 1, len(nested_list)):
    row = nested_list[i]
    
    if is_non_data_line(row):
        continue  # Skip non-data lines

    if len(row) < 2:
        continue  # Skip rows that do not have enough elements
    
    key = row[1]
    if key == "TARGETID":
        current_target = row[2]
        # Initialize new data entries for this target
        for k in schema:
            data_dict[k].append(None)
        data_dict["TARGETID"][-1] = current_target
        data_dict["DRUGINFO"].append([])
    elif key in schema:
        value = row[2] if len(row) > 2 else None
        data_dict[key][-1] = value
    elif key == "DRUGINFO":
        drug_info = row[2:]
        data_dict["DRUGINFO"][-1].append(drug_info)

# Convert nested DRUGINFO list to a string representation for simplicity
data_dict["DRUGINFO"] = [str(drug_list) for drug_list in data_dict["DRUGINFO"]]

# Create the DataFrame
df = pd.DataFrame(data_dict)

In [76]:
df

Unnamed: 0,TARGETID,FORMERID,UNIPROID,TARGNAME,GENENAME,TARGTYPE,SYNONYMS,FUNCTION,PDBSTRUC,BIOCLASS,ECNUMBER,SEQUENCE,DRUGINFO
0,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,"[['D0O6UY', 'Pemigatinib', 'Approved'], ['D09H..."
1,T59328,TTDS00355,EGFR_HUMAN,Epidermal growth factor receptor (EGFR),EGFR,Successful,Receptor tyrosine-protein kinase erbB-1; Proto...,Receptor tyrosine kinase binding ligands of th...,6D8E; 6B3S; 6ARU; 5ZWJ; 5YU9,Kinase,EC 2.7.10.1,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,"[['D03WPP', 'Necitumumab', 'Approved'], ['D05G..."
2,T89515,TTDS00208,DEFM_HUMAN,Polypeptide deformylase (PDF),PDF,Successful,PDF,Bifunctional enzyme. Involved in de novo dTMP ...,3G5P; 3G5K,CH-NH donor oxidoreductase,EC 3.5.1.88,MARLWGALSLWPLWAAVPWGGAAAVGVRACSSTAAPDGVEGPALRR...,"[['D02LWU', 'Pralatrexate', 'Approved'], ['D0E..."
3,T08391,TTDC00187,JAK2_HUMAN,Janus kinase 2 (JAK-2),JAK2,Successful,Tyrosine-protein kinase JAK2,Mediates essential signaling events in both in...,6M9H; 6E2Q; 6E2P; 6DRW; 6BSS,Kinase,EC 2.7.10.2,MGMACLTMTEMEGTSTSSIYQNGDISGNANSMKQIDPVLQVYLYHS...,"[['D0G8BM', 'Fedratinib', 'Approved'], ['D0D5Z..."
4,T07663,TTDS00296,PDE5A_HUMAN,Phosphodiesterase 5A (PDE5A),PDE5A,Successful,"cGMP-specific 3',5'-cyclic phosphodiesterase; ...",Plays a role in signal transduction by regulat...,6ACB; 5ZZ2; 5JO3; 4OEX; 4OEW,Phosphoric diester hydrolase,EC 3.1.4.35,MERAGPSFGQQRQQQQPQQQKQQQRDQDSVEAWLDDHWDFTFSYFV...,"[['D01GUS', 'Udenafil', 'Approved'], ['D05MQK'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4293,T48436,TTDNC00488,,Ubiquitin-proteasome pathway (UP pathway),,Literature-reported,,,,,,,[]
4294,T17893,TTDI01305,,mRNA-decapping enzyme (DCP),,Literature-reported,,,,,,,[]
4295,T47583,,WRN_HUMAN,Werner syndrome ATP-dependent helicase (WRN),WRN,Literature-reported,"DNA helicase, RecQ-like type 3; RecQ3; Exonucl...",Multifunctional enzyme that has both magnesium...,,,EC 3.1.-.-,MSEKKLETTAQQRKCPEWMNVQNKRCAVEERKACVRKSVFEDDLPF...,"[['DF5LB8', 'NSC 19630', 'Investigative']]"
4296,T50261,,Family,PI3-kinase (PIK3C),,Literature-reported,,,,,,,"[['DHG36S', 'ETP-45658', 'Investigative']]"
