# Analyze failed genes

So I have a list of failed genes that I need to investigate further.

# imports & globals 🌎

In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import regex as re
from pickle import dump,load
import sys
import sqlite3
from collections import Counter
from os.path import exists
import glob
from multiprocessing import Pool

# Raise the interpreter's recursion limit to 100000.
# NOTE(review): presumably needed so the pickled scraping results (possibly
# deeply nested objects) can be dumped/loaded without a RecursionError -
# confirm before lowering or removing.
sys.setrecursionlimit(100000)

# Validate the downloaded genes

In [5]:
# Collect the path of every pickled per-gene result file and report how many
# were found. glob.glob already returns a list, so no copy is needed.
file_names = glob.glob('gathered_data/raw_gene_results/*.pickle')
print(len(file_names))

15954


In [31]:
def validate(data,gene):
    """Run sanity checks on one scraped gene record.

    Parameters
    ----------
    data : dict
        Mapping of gene name -> record. The record is read through the keys
        "gene", "organism", "seq", "start", "stop" and "CDS".
    gene : str
        Key of the record in ``data`` to check.

    Returns
    -------
    list of str
        One message per failed check, in the order the checks are made;
        empty when every check passes.
    """
    record = data[gene]
    seq = record["seq"]
    status = []
    if record["gene"] != gene:
        status.append("gene name not match")
    if record['organism'] != 'Homo sapiens':
        status.append("organism not homo sapien")
    if len(seq) != int(record["stop"]) - int(record["start"]) + 1:
        status.append("seq length not equal to start-stop")

    # Drop partial-location markers ("<" / ">", possibly HTML-escaped).
    # str.strip("&lt;") would strip a *set of characters* from both ends and
    # leaves a trailing "&gt;" half-removed, so remove the markers explicitly.
    cds = record["CDS"]
    for marker in ("&lt;", "&gt;", "<", ">"):
        cds = cds.replace(marker, "")

    # Gather the 1-based positions of the first three coding bases. A segment
    # "l..r" contains r-l+1 bases, and the start codon may be split across
    # consecutive segments, so take bases from each segment until we have 3.
    codon_positions = []
    for segment in cds.split(","):
        if not segment.strip():
            continue  # empty CDS field or stray comma
        if ".." in segment:
            l, r = map(int, segment.split(".."))
            needed = 3 - len(codon_positions)
            codon_positions.extend(range(l, r + 1)[:needed])
        else:
            codon_positions.append(int(segment))
        if len(codon_positions) >= 3:
            break

    # Positions outside the sequence contribute no base, which makes the
    # comparison below fail instead of wrapping around or raising.
    met = "".join(
        seq[i - 1] for i in codon_positions[:3] if 1 <= i <= len(seq)
    )
    if met != "atg":
        status.append("seq doesn't start with atg")
    return status

In [None]:
# Sequentially load every pickled gene result into data_list.
# NOTE: pickle can execute arbitrary code while loading; only load files that
# this project produced itself.
validate_msgs = []
data_list = []
for file_name in file_names:
    # Gene name = file name without the ".pickle" suffix.
    # str.strip(".pickle") would instead remove any of the characters
    # ". p i c k l e" from both ends and mangle names such as "Clec".
    gene = file_name.split("/")[-1]
    if gene.endswith(".pickle"):
        gene = gene[:-len(".pickle")]
    with open(file_name,"rb") as f:
        data_list.append(load(f))

In [12]:
def get_file_data(fname):
    """Load one pickled gene result file.

    Parameters
    ----------
    fname : str
        Path to a pickle file, using "/" as the separator; the last path
        component without its ".pickle" suffix is taken as the gene name.

    Returns
    -------
    tuple
        ``(gene, data)`` where ``data`` is the unpickled file content.

    Notes
    -----
    pickle can execute arbitrary code while loading; only pass files that
    this project produced itself.
    """
    suffix = ".pickle"
    gene = fname.split("/")[-1]
    # Remove the suffix exactly once. str.strip(".pickle") would strip any of
    # the characters ". p i c k l e" from both ends and corrupt names that
    # start or end with those letters.
    if gene.endswith(suffix):
        gene = gene[:-len(suffix)]
    with open(fname,"rb") as f:
        return gene,load(f)

In [19]:
# Load all pickle files in parallel: a pool of 32 worker processes maps
# get_file_data over file_names, producing one (gene, data) tuple per file.
with Pool(processes=32) as P:
    file_datas = P.map(get_file_data, file_names)
# Number of files that were loaded.
print(len(file_datas))

15954


In [36]:
# Index the loaded (gene, data) pairs by gene name, then run validate() on
# every gene and keep its list of messages.
all_scraped_data = dict(file_datas)
validate_msgs = {
    gene_name: validate(all_scraped_data, gene_name)
    for gene_name in all_scraped_data
}
print(len(validate_msgs))

15954


In [37]:
# Tally how often each combination of validation messages occurs
# (the empty string stands for "no problems found").
Counter("\n".join(msgs) for msgs in validate_msgs.values())

Counter({'': 15323,
         "seq doesn't start with atg": 468,
         "gene name not match\norganism not homo sapien\nseq doesn't start with atg": 3,
         "gene name not match\nseq doesn't start with atg": 4,
         'gene name not match\norganism not homo sapien': 85,
         'gene name not match': 56,
         'seq length not equal to start-stop': 12,
         "seq length not equal to start-stop\nseq doesn't start with atg": 3})

In [38]:
# List the genes whose sequence length disagrees with stop - start + 1.
for k, m in validate_msgs.items():
    if "seq length not equal to start-stop" not in m:
        continue
    print(k)

TCAF2
TTC34
CAPN8
GRK1
C1R
CT47A12
CCNB3
DLGAP4
KATNAL2
GALNT9
ECSCR
FAM20C
RILPL1
GAGE12B
RAB38


In [42]:
# Spot-check: length of the sequence stored for TCAF2, one of the genes
# flagged with a length mismatch.
len(all_scraped_data["TCAF2"]["seq"])

59437

In [44]:
# Sequence length of every scraped gene, and the longest one among them.
seq_lengths = [len(record["seq"]) for record in all_scraped_data.values()]
print(max(seq_lengths))

448200


In [46]:
# Cache the combined gene -> record dict in a single pickle file, because
# rebuilding it from the individual files takes about 15 minutes.
with open("gathered_data/all_scraped_data.pickle","wb+") as f:
    dump(all_scraped_data,f)