In [1]:
import pandas as pd
import re
from pathlib import Path
from urllib.request import urlopen
from tqdm import tqdm
import csv

In [2]:
# Notebook configuration: directories are derived from the parent of the
# current working directory.
debug_local = True  # toggle (True/False); not read by any cell visible here
local = Path("..").resolve()
data = local.joinpath("data")
# NOTE(review): `input` shadows the built-in function of the same name; the
# name is kept because later cells refer to it.
input = data.joinpath("input")
output = data.joinpath("output")

In [31]:
# Load the gene list from disk into a single-column DataFrame.
# `columns` lists every field of the final table; only the first entry
# (the gene symbol) exists in the input file, the rest are scraped later.
columns = [
    'HGNC symbol',
    'Aliases',
    'Common name',
    'Description',
    'Cytogenetic Location',
    'OMIM',
    'Ensembl',
    'UniProt/Swiss-Prot',
    'Entrez Gene',
    'UniGene',
]
inputname = (input / "genes.txt").resolve()
df = pd.read_csv(inputname)
# NOTE(review): read_csv consumes the first line of the file as a header before
# it is renamed here - confirm genes.txt really starts with a header row.
df.columns = columns[:1]
index_rows = len(df)  # row count of the input, used by the later self-check
print("Dimension:", df.shape)
df.head()

Dimension: (885, 1)


Unnamed: 0,HGNC symbol
0,ABCA1
1,ABCA7
2,ABCC4
3,ABCC8
4,ACACA


In [12]:
# Gene symbols to query, in the order they appear in the input frame.
genes = df.loc[:, columns[0]].tolist()

# One container per output column. Dictionaries (rather than lists) were chosen
# so duplicates can be sanity-checked and the order kept; none of them is
# filled in by the cells visible here.
aliases = dict()      # Aliases column
common_name = dict()  # Common name column
description = dict()  # Description column
cyto_loc = dict()     # Cytogenetic Location column
omim = dict()         # OMIM column
uniprot = dict()      # UniProt/Swiss-Prot column
entrez = dict()       # Entrez Gene column
unigene = dict()      # UniGene column
kgen = dict()         # 1000 Genomes column

# Base URL of the per-gene page; the gene symbol is appended as the `id` value.
qurl = "https://genomics.senescence.info/longevity/gene.php?id="

In [16]:
# Build the regular expressions used to scrape a gene page:
#   rex[col]   - captures the <dd> value that follows the <dt> labelled `col`
#   grabs[col] - list that will receive one scraped value per gene
rex = {}
grabs = {}  # dictionary of lists

# Raw strings keep regex escapes such as \s out of Python's own string-escape
# processing (non-raw '\s' / '\/' are invalid escape sequences and trigger
# warnings on newer Python versions). '/', '<' and '"' need no escaping in a
# regex, so the patterns match exactly what they matched before.
re_pre = r'<dt>'
re_post = r'</dt>\s+<dd>(?:<a href=".*?">)?(.*?)(?:</a>)?</dd>'

nbsp_re = re.compile(r'&nbsp;')
# Search scope: the first <dl class="section-entry"> after "Gene details".
sel_re = re.compile(r'Gene details.*?<dl class="section-entry">\s+(.*?)</dl>', re.DOTALL)

for col in columns:
    # re.escape makes the column label match literally, whatever characters
    # it contains.
    rex[col] = re.compile(re_pre + re.escape(col) + re_post, re.DOTALL)
    grabs[col] = []

print(grabs)

{'HGNC symbol': [], 'Aliases': [], 'Common name': [], 'Description': [], 'Cytogenetic Location': [], 'OMIM': [], 'Ensembl': [], 'UniProt/Swiss-Prot': [], 'Entrez Gene': [], 'UniGene': []}


In [17]:
# Scrape every gene page and collect one value per column into `grabs`.

# Start from empty lists so that re-running this cell does not append a second
# copy of every gene.
for col in columns:
    grabs[col] = []

for gene in tqdm(genes):
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(qurl + gene) as page:
        # Decode with the charset announced by the server, falling back to
        # UTF-8 when none is given.
        encoding = page.headers.get_content_charset() or 'utf-8'
        content = page.read().decode(encoding)  # html

    # Narrow the search scope to the gene-details list so the per-column
    # patterns are not run against the whole page.
    scope_match = sel_re.search(content)
    if scope_match is None:
        raise ValueError("Gene details section not found for gene {!r}".format(gene))
    scope = nbsp_re.sub('', scope_match.group(1).strip())  # clean nbsp mess

    # Extract the complete row before storing anything, so a missing field
    # cannot leave the lists in `grabs` with unequal lengths.
    row = {}
    for col in columns:
        field_match = rex[col].search(scope)
        if field_match is None:
            raise ValueError("Field {!r} not found for gene {!r}".format(col, gene))
        row[col] = field_match.group(1).strip()
    for col in columns:
        grabs[col].append(row[col])
                           

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 885/885 [12:23<00:00,  1.19it/s]


In [24]:
# Assemble the scraped columns into a DataFrame.
gdf = pd.DataFrame(grabs)
# Measure the scraped frame itself (not the input frame `df`) so the later
# self-check compares the number of scraped rows with the number of input rows.
grabs_rows = gdf.shape[0]
print("Dimension:", gdf.shape)
gdf.head()

Dimension: (885, 10)


Unnamed: 0,HGNC symbol,Aliases,Common name,Description,Cytogenetic Location,OMIM,Ensembl,UniProt/Swiss-Prot,Entrez Gene,UniGene
0,ABCA1,TGD; ABC1; CERP; ABC-1; HDLDT1,ATP binding cassette subfamily A member 1,The membrane-associated protein encoded by thi...,9q31.1,600046,ENSG00000165029,ABCA1_HUMAN,19,659274
1,ABCA7,AD9; ABCX; ABCA-SSN,ATP binding cassette subfamily A member 7,The protein encoded by this gene is a member o...,19p13.3,605414,ENSG00000064687,ABCA7_HUMAN,10347,134514
2,ABCC4,MRP4; MOATB; MOAT-B,ATP binding cassette subfamily C member 4,The protein encoded by this gene is a member o...,13q32.1,605250,ENSG00000125257,A8K2Q2_HUMAN,10257,508423
3,ABCC8,HI; SUR; HHF1; MRP8; PHHI; SUR1; ABC36; HRINS;...,ATP binding cassette subfamily C member 8,The protein encoded by this gene is a member o...,11p15.1,600509,ENSG00000006071,ABCC8_HUMAN,6833,54470
4,ACACA,ACC; ACAC; ACC1; ACCA; ACACAD,acetyl-CoA carboxylase alpha,Acetyl-CoA carboxylase (ACC) is a complex mult...,17q12,200350,ENSG00000278540,A0A024R0Y2_HUMAN,31,160556


In [32]:
# Self-check: join the scraped annotations onto the input gene list and verify
# that no rows were lost or duplicated.
# Merge from the key column only, so re-running this cell does not merge an
# already-merged frame (which would produce suffixed duplicate columns).
df = pd.merge(df[[columns[0]]], gdf, on=columns[0], how="left")
assert df.shape[0] == index_rows, "Output length doesn't match input index length"
# Compare the scraped frame directly against the input row count.
assert gdf.shape[0] == index_rows, "Grab length doesn't match input index length"
print("Dimension:", df.shape)
df.head()

Dimension: (885, 10)


Unnamed: 0,HGNC symbol,Aliases,Common name,Description,Cytogenetic Location,OMIM,Ensembl,UniProt/Swiss-Prot,Entrez Gene,UniGene
0,ABCA1,TGD; ABC1; CERP; ABC-1; HDLDT1,ATP binding cassette subfamily A member 1,The membrane-associated protein encoded by thi...,9q31.1,600046,ENSG00000165029,ABCA1_HUMAN,19,659274
1,ABCA7,AD9; ABCX; ABCA-SSN,ATP binding cassette subfamily A member 7,The protein encoded by this gene is a member o...,19p13.3,605414,ENSG00000064687,ABCA7_HUMAN,10347,134514
2,ABCC4,MRP4; MOATB; MOAT-B,ATP binding cassette subfamily C member 4,The protein encoded by this gene is a member o...,13q32.1,605250,ENSG00000125257,A8K2Q2_HUMAN,10257,508423
3,ABCC8,HI; SUR; HHF1; MRP8; PHHI; SUR1; ABC36; HRINS;...,ATP binding cassette subfamily C member 8,The protein encoded by this gene is a member o...,11p15.1,600509,ENSG00000006071,ABCC8_HUMAN,6833,54470
4,ACACA,ACC; ACAC; ACC1; ACCA; ACACAD,acetyl-CoA carboxylase alpha,Acetyl-CoA carboxylase (ACC) is a complex mult...,17q12,200350,ENSG00000278540,A0A024R0Y2_HUMAN,31,160556


In [33]:
# Save the merged table as CSV, quoting every non-numeric field.
# Create the output directory first; to_csv does not create missing parent
# directories.
output.mkdir(parents=True, exist_ok=True)
df.to_csv(output / 'genes_db.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)