# Gene card scraper

this notebook loads the data from the `data/gene_list/genes.list` file.

and scrapes Gene cards database specifically trying to grab the data located in the `https://www.genecards.org/cgi-bin/carddisp.pl?gene=${GENE}`

In [81]:
# options
sub_seperator = ","

# load list of genes
gene_rows = []
with open("../../data/gene_list/genes.list","r") as f:
  gene_rows = [line.strip() for line in f.readlines()]

gene_set = set()
genes = []
for g in gene_rows:
  for sg in g.split(sub_seperator):
    if sg not in gene_set:
      gene_set.add(sg)
      genes.append(sg)

print("{} genes loaded".format(len(genes)))

150 genes loaded


In [82]:
genes[:5]

['CFAP74', 'PRDM16-DT', 'AGMAT', 'MACF1', 'NFIA']

In [57]:
# load the libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import regex as re

In [109]:
selected_gene = genes[0]
genecard_url = 'https://www.genecards.org/cgi-bin/carddisp.pl?gene={}'

selected_url = genecard_url.format(selected_gene)

payload={}

UAS = ("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 
       "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
       "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
       )

ua = UAS[random.randrange(len(UAS))]
headers = {
  'user-agent':ua
}

response = requests.request("GET", selected_url, headers=headers, data=payload)

#print(response.text)

soup = BeautifulSoup(response.text, 'html.parser')
all_divs = soup.findAll("section",id="summaries")
print(len(all_divs))
#sub_soup = BeautifulSoup(all_divs[0],"html.parser")
all_subsections = all_divs[0].findAll("div",class_="gc-subsection")
print(len(all_subsections))
print(all_subsections[0])

h1 = soup.findAll("h1",id="geneSymbol")
if h1 and h1[0]:
  print(h1[0].findAll("small"))

1
5
<div class="gc-subsection">
<div class="gc-subsection-header">
<h3>Entrez Gene Summary for CFAP74 Gene</h3>
<a class="gc-ga-link glyphicon glyphicon-new-window" data-ga-action="ENT" data-ga-source-accession="85452" href="https://www.ncbi.nlm.nih.gov/gene/85452#summary" target="_blank" title="See NCBI Entrez Gene entry for CFAP74"></a>
</div>
<ul class="list-unstyled">
<li>
<p>Predicted to be involved in axoneme assembly. Predicted to be located in axoneme. [provided by Alliance of Genome Resources, Apr 2022]</p>
</li>
</ul>
</div>
[<small>
                    Gene
                        <span id="geneDescription" title="Cilia And Flagella Associated Protein 74">- Cilia And Flagella Associated Protein 74</span>
</small>]


In [68]:
def clean_html(html):
    """
    Remove HTML markup from the given string.
    :param html: the HTML string to be cleaned
    :type html: str
    :rtype: str
    """

    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()

In [105]:
entry = {}

for subsection in all_subsections:
  header = subsection.findAll("div",class_="gc-subsection-header")
  if header:
    #print(html2text.html2text(str(header[0])))
    header_txt = clean_html(BeautifulSoup.get_text(header[0]))
    if "Summary" in header_txt:
      summary_index = header_txt.index("Summary")
      #print("end header")
      summary_txt = subsection.findAll('p')
      if not summary_txt:
        li_section = subsection.findAll('li')
        if li_section:
          summary_txt = li_section[0].findAll("div")
      if summary_txt:
        entry[header_txt[:summary_index].strip()]=clean_html(BeautifulSoup.get_text(summary_txt[0])).strip()
      #print("end section")

entry

{'Entrez Gene': 'Predicted to be involved in axoneme assembly. Predicted to be located in axoneme. [provided by Alliance of Genome Resources, Apr 2022]',
 'GeneCards': 'CFAP74 (Cilia And Flagella Associated Protein 74) is a Protein Coding gene.\r\n           Diseases associated with CFAP74 include Familial Cold Autoinflammatory Syndrome 2 and Chromosome 1P36 Deletion Syndrome.',
 'UniProtKB/Swiss-Prot': 'As part of the central apparatus of the cilium axoneme may play a role in cilium movement.\r\n      \nCFA74_HUMAN,Q9C0B2'}

In [119]:
def get_entry(gene):
  genecard_url = 'https://www.genecards.org/cgi-bin/carddisp.pl?gene={}'
  selected_url = genecard_url.format(gene)

  payload={}
  UAS = ("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        )
  ua = UAS[random.randrange(len(UAS))]
  headers = {
    'user-agent':ua
  }
  response = requests.request("GET", selected_url, headers=headers, data=payload)

  soup = BeautifulSoup(response.text, 'html.parser')
  all_divs = soup.findAll("section",id="summaries")
  all_subsections = all_divs[0].findAll("div",class_="gc-subsection")
  
  entry = {}
  h1 = soup.findAll("h1",id="geneSymbol")
  if h1 and h1[0]:
    small = h1[0].findAll("small")
    if small and small[0]:
      entry["Description"] = " ".join(clean_html(BeautifulSoup.get_text(small[0])).split())

  for subsection in all_subsections:
    header = subsection.findAll("div",class_="gc-subsection-header")
    if header:
      #print(html2text.html2text(str(header[0])))
      header_txt = clean_html(BeautifulSoup.get_text(header[0]))
      if "Summary" in header_txt:
        summary_index = header_txt.index("Summary")
        #print("end header")
        summary_txt = subsection.findAll('p')
        if not summary_txt:
          li_section = subsection.findAll('li')
          if li_section:
            summary_txt = li_section[0].findAll("div")
        if summary_txt:
          entry[header_txt[:summary_index].strip()]=" ".join(clean_html(BeautifulSoup.get_text(summary_txt[0])).strip().split())
        #print("end section")

  return entry

In [120]:
get_entry(genes[0])

{'Description': 'Gene - Cilia And Flagella Associated Protein 74',
 'Entrez Gene': 'Predicted to be involved in axoneme assembly. Predicted to be located in axoneme. [provided by Alliance of Genome Resources, Apr 2022]',
 'GeneCards': 'CFAP74 (Cilia And Flagella Associated Protein 74) is a Protein Coding gene. Diseases associated with CFAP74 include Familial Cold Autoinflammatory Syndrome 2 and Chromosome 1P36 Deletion Syndrome.',
 'UniProtKB/Swiss-Prot': 'As part of the central apparatus of the cilium axoneme may play a role in cilium movement. CFA74_HUMAN,Q9C0B2'}

In [121]:
summaries = {}
for i,g in enumerate(genes):
  try:
    summaries[g] = get_entry(g)
  except Exception as e:
    print("Problem with: {}".format(g))
  if i%10 == 0:
    print("Done with {}/{}".format(i,len(genes)))
print(len(summaries))

Done with 0/150
Done with 10/150
Done with 20/150
Done with 30/150
Done with 40/150
Done with 50/150
Done with 60/150
Done with 70/150
Done with 80/150
Done with 90/150
Done with 100/150
Problem with: ENSG00000259450
Done with 110/150
Done with 120/150
Done with 130/150
Done with 140/150
149


In [122]:
summary_data = pd.DataFrame(summaries).transpose()

In [123]:
summary_data.shape

(149, 6)

In [124]:
summary_data

Unnamed: 0,Description,Entrez Gene,GeneCards,UniProtKB/Swiss-Prot,Tocris,CIViC
CFAP74,Gene - Cilia And Flagella Associated Protein 74,Predicted to be involved in axoneme assembly. ...,CFAP74 (Cilia And Flagella Associated Protein ...,As part of the central apparatus of the cilium...,,
PRDM16-DT,Gene - PRDM16 Divergent Transcript,,PRDM16-DT (PRDM16 Divergent Transcript) is an ...,,,
AGMAT,Gene - Agmatinase,Predicted to enable agmatinase activity. Predi...,AGMAT (Agmatinase) is a Protein Coding gene. D...,,,
MACF1,Gene - Microtubule Actin Crosslinking Factor 1,This gene encodes a large protein containing n...,MACF1 (Microtubule Actin Crosslinking Factor 1...,[Isoform 2]: F-actin-binding protein which pla...,,
NFIA,Gene - Nuclear Factor I A,This gene encodes a member of the NF1 (nuclear...,NFIA (Nuclear Factor I A) is a Protein Coding ...,Recognizes and binds the palindromic sequence ...,,
...,...,...,...,...,...,...
TOP1,Gene - DNA Topoisomerase I,"This gene encodes a DNA topoisomerase, an enzy...",TOP1 (DNA Topoisomerase I) is a Protein Coding...,Releases the supercoiling and torsional tensio...,Topoisomerases are ubiquitously expressed enzy...,
EYA2,Gene - EYA Transcriptional Coactivator And Pho...,This gene encodes a member of the eyes absent ...,EYA2 (EYA Transcriptional Coactivator And Phos...,Functions both as protein phosphatase and as t...,,
TSHZ2,Gene - Teashirt Zinc Finger Homeobox 2,This gene is a member of the teashirt C2H2-typ...,TSHZ2 (Teashirt Zinc Finger Homeobox 2) is a P...,Probable transcriptional regulator involved in...,,
BMP7,Gene - Bone Morphogenetic Protein 7,This gene encodes a secreted ligand of the TGF...,BMP7 (Bone Morphogenetic Protein 7) is a Prote...,Growth factor of the TGF-beta superfamily that...,,


In [125]:
summary_data.index.name = 'Gene'
summary_data.reset_index(inplace=True)

In [126]:
summary_data

Unnamed: 0,Gene,Description,Entrez Gene,GeneCards,UniProtKB/Swiss-Prot,Tocris,CIViC
0,CFAP74,Gene - Cilia And Flagella Associated Protein 74,Predicted to be involved in axoneme assembly. ...,CFAP74 (Cilia And Flagella Associated Protein ...,As part of the central apparatus of the cilium...,,
1,PRDM16-DT,Gene - PRDM16 Divergent Transcript,,PRDM16-DT (PRDM16 Divergent Transcript) is an ...,,,
2,AGMAT,Gene - Agmatinase,Predicted to enable agmatinase activity. Predi...,AGMAT (Agmatinase) is a Protein Coding gene. D...,,,
3,MACF1,Gene - Microtubule Actin Crosslinking Factor 1,This gene encodes a large protein containing n...,MACF1 (Microtubule Actin Crosslinking Factor 1...,[Isoform 2]: F-actin-binding protein which pla...,,
4,NFIA,Gene - Nuclear Factor I A,This gene encodes a member of the NF1 (nuclear...,NFIA (Nuclear Factor I A) is a Protein Coding ...,Recognizes and binds the palindromic sequence ...,,
...,...,...,...,...,...,...,...
144,TOP1,Gene - DNA Topoisomerase I,"This gene encodes a DNA topoisomerase, an enzy...",TOP1 (DNA Topoisomerase I) is a Protein Coding...,Releases the supercoiling and torsional tensio...,Topoisomerases are ubiquitously expressed enzy...,
145,EYA2,Gene - EYA Transcriptional Coactivator And Pho...,This gene encodes a member of the eyes absent ...,EYA2 (EYA Transcriptional Coactivator And Phos...,Functions both as protein phosphatase and as t...,,
146,TSHZ2,Gene - Teashirt Zinc Finger Homeobox 2,This gene is a member of the teashirt C2H2-typ...,TSHZ2 (Teashirt Zinc Finger Homeobox 2) is a P...,Probable transcriptional regulator involved in...,,
147,BMP7,Gene - Bone Morphogenetic Protein 7,This gene encodes a secreted ligand of the TGF...,BMP7 (Bone Morphogenetic Protein 7) is a Prote...,Growth factor of the TGF-beta superfamily that...,,


In [127]:
summary_data.to_csv("../../data/output/genes_list.csv",index=False)