# Prep Big Gene Acquisition

So the 47 gene list that Prabhu gave earlier was a nice way to test all the small components of this project; however, ultimately I want this tool to be able to run against all human genes.

So we need to start storing the results from massive sets of downloads.

There are ~28,000 human genes. Redundancy in the system is a must, and before scaling up to 10,000 genes, I need to make sure that runs of 100 genes and 500 genes work fine.

# Imports and Globals 🌎

In [18]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import regex as re
from pickle import dump,load
import sys
import sqlite3
from collections import Counter
# NOTE(review): raises the interpreter recursion limit far above its default,
# presumably so that pickling deeply nested scraped objects does not hit
# RecursionError -- TODO confirm this is still needed.
sys.setrecursionlimit(100000)

# get old working code

In [4]:
def get_bp_and_cds(gene, timeout=60):
    """Scrape NCBI for a gene's genomic interval, CDS coordinates and sequence.

    Four pages are fetched in order: the NCBI gene search page for ``gene``,
    the gene page linked from the first search hit, the GenBank report linked
    from that gene page, and finally the sequence-viewer page for the same
    interval, from which the CDS ``join(...)`` feature and the sequence lines
    are extracted.

    Args:
        gene: Search term substituted into the NCBI gene search URL.
        timeout: Seconds passed to ``requests`` as the timeout of every
            request, so a stalled connection raises instead of hanging.

    Returns:
        A dict with the keys ``gene``, ``organism``, ``gene_link``,
        ``gene_bank_url``, ``ncbi_id``, ``start``, ``stop``, ``strand``
        (``"on"`` when the GenBank URL contains ``strand=true``, otherwise
        ``"off"``), ``ncbi_phid``, ``genbank_jquery``, ``CDS`` (the contents
        of the first matched ``CDS join(...)`` with all whitespace removed)
        and ``seq`` (the concatenated, whitespace-stripped text of every
        ``span.ff_line`` element). All values are plain strings.

    Raises:
        requests.RequestException: On a network error, a timeout, or a
            non-success HTTP status.
        IndexError: If an expected element is missing from one of the pages.
        ValueError: If the GenBank URL carries no ``from=``/``to=`` range or
            the viewer page contains no ``CDS join(...)`` feature.
    """
    result = {}

    UAS = ("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
           "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
           "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
           "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
           "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
           )

    # One randomly chosen user agent is reused for all four requests.
    headers = {
      'user-agent': random.choice(UAS)
    }

    def fetch(url):
        # Fail fast on stalled connections and HTTP errors rather than
        # silently parsing an error page.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response

    # 1) Search page: only the first hit is used.
    response1 = fetch("https://www.ncbi.nlm.nih.gov/gene/?term={}".format(gene))
    soup1 = BeautifulSoup(response1.text, 'html.parser')
    titles = soup1.find_all("a", id="feat_gene_title")
    result["gene"] = titles[0].contents[0].split()[0]
    taxons = soup1.find_all("p", class_="ncbi-doc-authors")
    # str() detaches the value from the parse tree: a NavigableString keeps a
    # reference to its whole document, which would otherwise be pickled along
    # with the result.
    result["organism"] = str(taxons[0].find("i").contents[0])
    result["gene_link"] = titles[0]["href"]

    # 2) Gene page: locate the link to the GenBank report.
    response2 = fetch(result["gene_link"])
    soup2 = BeautifulSoup(response2.text, 'html.parser')
    result["gene_bank_url"] = "https://www.ncbi.nlm.nih.gov/"+soup2.find_all("a", title="Nucleotide GenBank report")[0]["href"]

    # 3) GenBank report: read the ids from the <meta> tags and the interval
    #    from the report URL itself.
    response3 = fetch(result["gene_bank_url"])
    soup3 = BeautifulSoup(response3.text, "html.parser")
    result["ncbi_id"] = soup3.find_all("meta", {"name": "ncbi_uidlist"})[0]["content"]
    span = re.search(r"from=(\d+)&to=(\d+)", result["gene_bank_url"])
    if span is None:
        raise ValueError(
            "no from/to range in GenBank URL: {}".format(result["gene_bank_url"])
        )
    result["start"], result["stop"] = span.groups()
    result["strand"] = "on" if "strand=true" in result["gene_bank_url"] else "off"
    result["ncbi_phid"] = soup3.find_all("meta", {"name": "ncbi_phid"})[0]["content"].split()[0]

    # 4) Sequence viewer: same interval, rendered as HTML with feature markup.
    target_url_template = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id={}&db=nuccore&report=genbank&conwithfeat=on&basic_feat=on&hide-cdd=on&from={}&to={}&strand={}&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000"
    result["genbank_jquery"] = target_url_template.format(result["ncbi_id"], result["start"], result["stop"], result["strand"])
    print(result["gene_bank_url"])
    print(result["genbank_jquery"])
    response4 = fetch(result["genbank_jquery"])
    print(len(response4.text))

    # Only the first "CDS join(...)" feature in the page is captured.
    m = re.search(r"CDS\s+join\(((?:&|l|t|;|\.|,|\d|\s)+)\)", response4.text)
    if m is None:
        raise ValueError("no CDS join(...) feature found for {}".format(gene))
    result["CDS"] = "".join(m.group(1).split())

    soup4 = BeautifulSoup(response4.text, "html.parser")
    result["seq"] = "".join(
        "".join(s.contents[0].split())
        for s in soup4.find_all("span", class_="ff_line")
    )
    print("done!")
    return result

In [5]:
# Smoke test: run the scraper on one gene of interest (SLC5A6) and let the
# notebook display the returned dict.
get_bp_and_cds("SLC5A6")

https://www.ncbi.nlm.nih.gov//nuccore/NC_000002.12?report=genbank&from=27199587&to=27212787&strand=true
https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=568815596&db=nuccore&report=genbank&conwithfeat=on&basic_feat=on&hide-cdd=on&from=27199587&to=27212787&strand=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000
107156
done!


{'gene': 'SLC5A6',
 'organism': 'Homo sapiens',
 'gene_link': 'https://www.ncbi.nlm.nih.gov/gene/8884',
 'gene_bank_url': 'https://www.ncbi.nlm.nih.gov//nuccore/NC_000002.12?report=genbank&from=27199587&to=27212787&strand=true',
 'ncbi_id': '568815596',
 'start': '27199587',
 'stop': '27212787',
 'strand': 'on',
 'ncbi_phid': 'CE8986BD335BA1D10000000005AE014C.m_23',
 'genbank_jquery': 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=568815596&db=nuccore&report=genbank&conwithfeat=on&basic_feat=on&hide-cdd=on&from=27199587&to=27212787&strand=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000',
 'CDS': '5138..5530,5846..5911,6254..6305,6695..6762,7284..7438,7857..7997,8198..8327,8921..9009,9443..9555,9908..9975,10714..10800,10941..11122,12209..12398',
 'seq': 'accgcacgatatatggcgttcgtgcgtagagagattacacaagaggcagaggacccgtctgtagaaagaacttaaagttattctaaaattaaggggcgaaatcaagtttcacaggtctttgaaagtcggcaggactggggcgccgaaggcctgcacgacgtgagtacttgggaacaggacctggctgggcagggtcggg

# get list of all human genes

In [7]:
# Open the local SQLite database file and create a cursor for querying it.
# NOTE(review): the connection is never closed in the visible cells.
connection = sqlite3.connect("../data/EnsDb.Hsapiens.v79.sqlite")
cursor = connection.cursor()

In [10]:
# Pull every (gene_name, gene_biotype) pair from the gene table and report
# how many rows came back.
cursor.execute("SELECT gene_name,gene_biotype FROM gene")
rows = cursor.fetchall()
print(len(rows))

65774


In [12]:
# Tally how many genes fall under each biotype (second column of each row).
Counter(biotype for _, biotype in rows)

Counter({'protein_coding': 22002,
         'unitary_pseudogene': 169,
         'unprocessed_pseudogene': 3137,
         'processed_pseudogene': 10702,
         'processed_transcript': 767,
         'transcribed_unprocessed_pseudogene': 751,
         'antisense': 5648,
         'polymorphic_pseudogene': 79,
         'lincRNA': 7839,
         'sense_intronic': 931,
         'transcribed_processed_pseudogene': 478,
         'sense_overlapping': 199,
         'IG_V_pseudogene': 190,
         'pseudogene': 43,
         'translated_unprocessed_pseudogene': 1,
         'TR_V_gene': 106,
         'transcribed_unitary_pseudogene': 1,
         '3prime_overlapping_ncrna': 29,
         'IG_V_gene': 154,
         'snRNA': 2032,
         'miRNA': 4548,
         'misc_RNA': 2468,
         'snoRNA': 1017,
         'rRNA': 564,
         'Mt_tRNA': 22,
         'Mt_rRNA': 2,
         'IG_C_gene': 14,
         'IG_J_gene': 18,
         'TR_J_gene': 73,
         'TR_C_gene': 5,
         'TR_V_pseudogene':

Wowza, that's a lot of genes. I will focus on the protein coding genes for now as I think they are the most relevant to the project.

In [13]:
# Keep only the names of genes whose biotype is exactly "protein_coding".
protein_coding_genes = [
    name
    for name, biotype in rows
    if biotype == "protein_coding"
]
print(len(protein_coding_genes))

22002


In [14]:
# File that the download loop appends failed gene names to, one per line.
problem_file = "gathered_data/problem_genes.list"

In [15]:
# Trial batch: the first 1000 protein-coding gene names.
test_genes = protein_coding_genes[:1000]

In [None]:
# Scrape every gene in the trial batch and pickle each result to its own
# file. Any failure is reported and the gene name is appended to
# problem_file, so one bad gene does not stop the run.
for gene in test_genes:
    try:
        print(gene)
        result = get_bp_and_cds(gene)
        # The gene name must be formatted into the *path*. Formatting the
        # mode string instead wrote every result to a single file literally
        # named "{}.pickle", each gene overwriting the previous one.
        with open("gathered_data/raw_gene_results/{}.pickle".format(gene), "wb") as f:
            dump(result, f)
    except Exception as e:
        # Deliberately broad: this is the top-level boundary of a long
        # unattended run, and the error is printed rather than swallowed.
        print("failed")
        print(e)
        with open(problem_file, "a+") as f:
            f.write("{}\n".format(gene))

TSPAN6
https://www.ncbi.nlm.nih.gov//nuccore/NC_000023.11?report=genbank&from=100627108&to=100637104&strand=true
https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=568815575&db=nuccore&report=genbank&conwithfeat=on&basic_feat=on&hide-cdd=on&from=100627108&to=100637104&strand=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000
51679
done!
TNMD
https://www.ncbi.nlm.nih.gov//nuccore/NC_000023.11?report=genbank&from=100584936&to=100599885
https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=568815575&db=nuccore&report=genbank&conwithfeat=on&basic_feat=on&hide-cdd=on&from=100584936&to=100599885&strand=off&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000
48633
done!
DPM1
https://www.ncbi.nlm.nih.gov//nuccore/NC_000020.11?report=genbank&from=50934855&to=50958564&strand=true
https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=568815578&db=nuccore&report=genbank&conwithfeat=on&basic_feat=on&hide-cdd=on&from=50934855&to=50958564&strand=on&ret