# Gene Lists

In [1]:
# Genes of interest - some test genes 
TEST_GENES = [
    'TP53',
    'ERBB2',  # Herceptin target
    'EGFR',
    'AKT1',
    'KRAS',
    'PTEN',
    'APOE',
]

TUMOUR_MUT_GENES = [  # www.tumourportal.org - highly mutated
    'TP53',
    'PIK3CA',
    'PTEN',
    'KRAS',
    'APC',
    'MLL3',  # aka KMT2C
    'KMT2C',
    'FAT1',
    'MLL2',  # aka KMT2D
    'KMT2D',
    'ARID1A',
    'VHL',
    'PBRM1',
    'NF1',
    'EGFR',
    'ATM',
    'PIK3R1',
    'BRAF',
    'CDKN2A',
    'SETD2',
    'CREBBP',
    'FBXW7',
    'SPEN',
    'MTOR',
    'RB1',
    'SMARCA4',
    'NOTCH1',
]

In [2]:
import pandas as pd
# !pip install -U lxml
# !pip install -U html5lib 

In [3]:
from logzero import logger
import os
import pathlib
import requests
from tqdm import tqdm

UNIPROT_URL = "http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz"
HGNC_URL = "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt"

script_dir = pathlib.Path().resolve()
cache_dir = os.path.join(script_dir, "data")

if os.path.exists(cache_dir):
    logger.info(f"Found {cache_dir}")
else:
    os.mkdir(cache_dir)

for url in [UNIPROT_URL, HGNC_URL]:
    dest = os.path.join(cache_dir, os.path.basename(url))
    dest_unzip = os.path.splitext(dest)[0]
    if os.path.exists(dest):
        logger.info(f"found existing: {dest}")
    elif os.path.exists(dest_unzip):
        logger.info(f"found existing: {dest_unzip}")
    else:
        logger.info(f"Downloading {dest}")
        # Open a handle onto the GTEx expression data
        response = requests.get(url, stream=True)

        with open(dest, "wb") as fh:
            for data in tqdm(response.iter_content()):
                fh.write(data)
        logger.info(f"Completed {dest}")

[I 230309 15:07:23 3651806166:14] Found /home/dustin/fleet_gene_update/data
[I 230309 15:07:23 3651806166:24] found existing: /home/dustin/fleet_gene_update/data/uniprot_sprot.xml
[I 230309 15:07:23 3651806166:22] found existing: /home/dustin/fleet_gene_update/data/hgnc_complete_set.txt


In [4]:
# https://www.genenames.org/download/archive/
# Loading straight from the website works but is wasteful.  Use a cached file
# hgnc = pd.read_csv("http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt", sep="\t", low_memory=False)
hgnc = pd.read_csv(os.path.join(cache_dir, os.path.basename(HGNC_URL)), sep="\t", low_memory=False)

In [5]:
hgnc.head()

Unnamed: 0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_ids,lncipedia,gtrnadb,agr,mane_select,gencc
0,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,19q13.43,19q13.43,FLJ23569,,...,,,,,,A1BG-AS1,,HGNC:37133,,
2,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,...,,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,
3,HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,...,,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7
4,HGNC:27057,A2M-AS1,A2M antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,12p13.31,12p13.31,,,...,,,,,,A2M-AS1,,HGNC:27057,,


In [12]:
import pandas as pd
import os

# https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot.xsd

uniprot_fn = os.path.join(cache_dir, os.path.basename(UNIPROT_URL))
uniprot_unzip = os.path.splitext(uniprot_fn)[0]
uniprot = pd.read_xml(uniprot_unzip, iterparse={"entry":["name", "accession", "geneLocation", "keyword", "feature"]})

In [13]:
uniprot

Unnamed: 0,accession,name,feature,keyword,geneLocation
0,P0C9F0,Rock D.L.,,,
1,P0C9F1,Rock D.L.,,,
2,P0C9F2,Rock D.L.,,,
3,P0C9E9,Rock D.L.,,,
4,Q65209,J. Virol.,,Reference proteome,
...,...,...,...,...,...
568739,A5H447,Dev. Dyn.,,Zinc,
568740,Q0VA45,NIH - Xenopus Gene Collection (XGC) project,,Zinc,
568741,Q9NVB9,Zn(2+),,Zinc-finger,
568742,Q8CI54,Zn(2+),,Zinc-finger,


In [19]:
uniprot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568744 entries, 0 to 568743
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   accession     568744 non-null  object 
 1   name          568744 non-null  object 
 2   feature       0 non-null       float64
 3   keyword       560494 non-null  object 
 4   geneLocation  0 non-null       float64
dtypes: float64(2), object(3)
memory usage: 21.7+ MB
