In [21]:
import csv
import re
from pathlib import Path
from urllib.request import urlopen

import pandas as pd
from tqdm import tqdm

In [2]:
# Project directory layout, anchored at the parent of the current working
# directory: <parent>/data/input and <parent>/data/output.
debug_local = True  # False
local = Path("..").resolve()
data = local.joinpath("data")
# NOTE: `input` shadows the built-in input(); the name is kept because the
# data-loading cell refers to it.
input, output = (data.joinpath(subdir) for subdir in ("input", "output"))

In [3]:
# Load the original longevity gene table from the input directory.
inputname = input.joinpath("longevity_genes_original.csv").resolve()
df = pd.read_csv(inputname)

# Report the table size, then preview the first rows as the cell output.
print("Dimension:", df.shape)
df.head()

Dimension: (550, 7)


Unnamed: 0,id,Association,Population,Variant(s),Gene(s),PubMed,Unnamed: 6
0,1,non-significant,Dutch,HLA-B40,HLA-B,1859103,
1,2,non-significant,Dutch,HLA-DRB5,HLA-DRB5,1859103,
2,3,non-significant,Finnish,APOB,APOB,8018664,
3,4,significant,Finnish,APOC3,APOC3,8018664,
4,5,significant,Finnish,E2/E3/E4,APOE,8018664,


### Grabbing data from the website

In [4]:
# Entry ids to look up, in table order.
ids = df['id'].to_list()
# Results are stored in dictionaries keyed by id (rather than in lists) so
# that duplicate ids are easy to spot and each text stays tied to its entry.
st_des = dict()  # Study Design column
concl = dict()   # Conclusions column

# Base URL of a single entry page; the entry id is appended to it.
qurl = "https://genomics.senescence.info/longevity/entry.php?id="
# Raw string literals keep the regex escapes (\/ and \s) away from Python's
# own string-escape processing, which otherwise warns about invalid escape
# sequences. The pattern text itself is unchanged.
# re.DOTALL lets the captured <dd> body span several lines.
std_pattern = re.compile(r'<dt>Study Design<\/dt>\s+<dd>(.*?)<\/dd>', re.DOTALL)
concl_pattern = re.compile(r'<dt>Conclusions<\/dt>\s+<dd>(.*?)<\/dd>', re.DOTALL)

In [5]:
# Download every entry page and extract its "Study Design" and "Conclusions"
# sections into st_des / concl, keyed by the original id value.
for entry_id in tqdm(ids):
    # str() so the URL can be built whether the ids were parsed as integers
    # or as strings; the dictionaries are still keyed by the original value.
    entry_url = qurl + str(entry_id)
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(entry_url) as page:
        # Decode with the charset declared by the server, defaulting to UTF-8.
        encoding = page.headers.get_content_charset() or 'utf-8'
        content = page.read().decode(encoding)  # html

    std_match = std_pattern.search(content)
    concl_match = concl_pattern.search(content)
    # Fail with the offending URL instead of an opaque AttributeError on None.
    if std_match is None or concl_match is None:
        raise ValueError(
            f"Study Design/Conclusions section not found at {entry_url}"
        )
    st_des[entry_id] = std_match.group(1).strip()
    concl[entry_id] = concl_match.group(1).strip()
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 550/550 [07:57<00:00,  1.15it/s]


In [10]:
# Output dataframe: start from an independent copy of the id column only.
odf = df[['id']].copy()
odf.head()

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5


In [18]:
# Add descriptions: look up each id's Study Design text.
# .str.strip() returns a new Series, so it is applied before assignment;
# called on its own line its result would simply be discarded.
odf['Study Design'] = odf['id'].map(st_des).str.strip()
odf.head()

Unnamed: 0,id,Study Design,Conclusions
0,1,964 inhabitants aged 85 years and over and 244...,"Without correcting for multiple testing, HLA-D..."
1,2,The apolipoprotein B Xba I polymorphism was ex...,The frequencies of the Xba I alleles among the...
2,3,964 inhabitants aged 85 years and over and 244...,"Without correcting for multiple testing, HLA-B..."
3,4,The Sst I polymorphism was examined in 179 Fin...,The S2 allele (Sst I restriction site present)...
4,5,The common polymorphism of apolipoprotein E (E...,The frequency of the E2 allele was higher and ...


In [19]:
# Add conclusions: look up each id's Conclusions text.
# .str.strip() returns a new Series, so it is applied before assignment;
# called on its own line its result would simply be discarded.
odf['Conclusions'] = odf['id'].map(concl).str.strip()
odf.head()

Unnamed: 0,id,Study Design,Conclusions
0,1,964 inhabitants aged 85 years and over and 244...,"Without correcting for multiple testing, HLA-D..."
1,2,The apolipoprotein B Xba I polymorphism was ex...,The frequencies of the Xba I alleles among the...
2,3,964 inhabitants aged 85 years and over and 244...,"Without correcting for multiple testing, HLA-B..."
3,4,The Sst I polymorphism was examined in 179 Fin...,The S2 allele (Sst I restriction site present)...
4,5,The common polymorphism of apolipoprotein E (E...,The frequency of the E2 allele was higher and ...


In [23]:
# Save results together.
# Create the output directory first so the write does not fail when the
# directory does not exist yet.
output.mkdir(parents=True, exist_ok=True)
# QUOTE_NONNUMERIC wraps every non-numeric field in quotes, which keeps the
# free-text columns (commas, line breaks) intact in the CSV.
odf.to_csv(output / 'longevity_map_descriptions.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)