# Validate PubMed Data

The validation should consist of:
- verify gene name (key and field should match)
- verify the organism is *Homo sapiens*
- verify the seq length is equal to (stop-start+1)
- verify the positions of the CDS are within start & stop
- verify the CDS begins with the start codon "atg"
  - there may be some exceptions to this (e.g., TRAC)

# imports & globals 🌎

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import regex as re
from pickle import dump,load
import sys

# Load the data

In [None]:
# Load the pickled mapping that the validation cells below index by gene name.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
pickle_path = "../data/gene_to_data.pickle"
with open(pickle_path, "rb") as pickle_file:
    data = load(pickle_file)
# Number of entries loaded.
print(len(data))

47


# validate

In [7]:
def _parse_cds_start(cds):
    """Return the 1-based start coordinate of the first CDS segment.

    The coordinate may carry a partial-feature marker, either raw (``<``)
    or HTML-escaped (``&lt;``); the marker is removed before conversion.
    """
    first_coord = str(cds).split("..")[0].strip()
    # Remove markers by exact match. ``str.strip("&lt;")`` would instead strip
    # the character set {&, l, t, ;} and leave a raw "<" in place.
    for marker in ("&lt;", "&gt;", "<", ">"):
        first_coord = first_coord.replace(marker, "")
    return int(first_coord)


def validate(data,gene):
    """Run the sanity checks on one gene record and report every failure.

    Checks performed on ``data[gene]``:
      * the record's "gene" field equals the dictionary key ``gene``
      * "organism" is exactly 'Homo sapiens'
      * len("seq") equals stop - start + 1
      * the first CDS coordinate (1-based) and its codon lie inside "seq"
      * the three bases at that coordinate are "atg" (case-insensitive)

    Parameters
    ----------
    data : dict
        Mapping of gene name -> record dict with the keys "gene",
        "organism", "seq", "start", "stop" and "CDS".
    gene : str
        Key of the record to validate.

    Returns
    -------
    list of str
        One message per failed check; empty when every check passes.

    Raises
    ------
    KeyError
        If ``gene`` or one of the required fields is missing.
    ValueError
        If "start", "stop" or the first CDS coordinate is not an integer.
    """
    record = data[gene]
    seq = record["seq"]
    status = []
    if record["gene"] != gene:
        status.append("gene name not match")
    if record['organism'] != 'Homo sapiens':
        status.append("organism not homo sapien")
    if len(seq) != int(record["stop"]) - int(record["start"]) + 1:
        status.append("seq length not equal to start-stop")
    start_cds = _parse_cds_start(record["CDS"])
    # Coordinates are 1-based, so the codon occupies start_cds..start_cds+2.
    # A start below 1 would otherwise turn into a wrapped/empty slice and be
    # misreported as a wrong codon instead of an out-of-range position.
    if start_cds < 1 or start_cds + 2 > len(seq):
        status.append("start cds not in seq")
    elif seq[start_cds - 1:start_cds + 2].lower() != "atg":
        status.append("seq doesn't start with atg")
    return status

In [10]:
# Genes whose CDS is known not to begin with "atg". For these genes only the
# start-codon failure is waived; any other failed check still excludes them.
exceptions = {"TRAC"}
ATG_FAILURE = "seq doesn't start with atg"  # must match the message emitted by validate()

final_ds = []
for g in data:
    stat = validate(data,g)
    if stat:
        # Report every gene that failed at least one check, waived or not.
        print(g,stat)
    waived = g in exceptions and all(msg == ATG_FAILURE for msg in stat)
    if not stat or waived:
        final_ds.append(data[g])
# One row per accepted record; columns come from the record dict keys.
final_df = pd.DataFrame(final_ds)
print(final_df.shape)
final_df

TRAC ["seq doesn't start with atg"]
(47, 12)


Unnamed: 0,gene,organism,gene_link,gene_bank_url,ncbi_id,start,stop,strand,ncbi_phid,genbank_jquery,CDS,seq
0,ADIPOQ,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/9370,https://www.ncbi.nlm.nih.gov//nuccore/NC_00000...,568815595,186842710,186858463,off,CE89552532CE2F510000000003E2017B.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"10350..10563,11475..11995",attctgactgcagtctgtggttctgattccataccagagggtaaga...
1,ZMPSTE24,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/10269,https://www.ncbi.nlm.nih.gov//nuccore/NC_00000...,568815597,40258236,40294180,off,CE8CBAE332CF2051000000000113006A.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"37..159,2604..2750,9551..9637,10184..10300,117...",ggtgcacgctgaaggagccggcggaaccgggtggccatggggatgt...
2,TRAC,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/28755,https://www.ncbi.nlm.nih.gov//nuccore/NC_00001...,568815584,22547506,22552132,off,CE8ACB4F32CCD9310000000007030296.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"&lt;1..274,2133..2177,3052..3159",atatccagaaccctgaccctgccgtgtaccagctgagagactctaa...
3,PNPLA3,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/80339,https://www.ncbi.nlm.nih.gov//nuccore/NC_00002...,568815576,43923805,43947582,off,CE8CEE2A32CF6C8100000000004B001C.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"108..294,3131..3363,5020..5085,9074..9283,1080...",agagagcgcttgcgggcgccgggcggagctgctgcggatcaggacc...
4,GPAM,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/57678,https://www.ncbi.nlm.nih.gov//nuccore/NC_00001...,568815588,112149865,112227677,on,CE88042932CE4D6100000000031B0118.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"45894..45995,47083..47205,49621..49694,51965.....",actcacagcaagatgagaggcaactgctcagtgcgtgtcttcagct...
5,TOR1A,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/1861,https://www.ncbi.nlm.nih.gov//nuccore/NC_00000...,568815589,129812942,129824136,on,CE8B959832CEEE510000000001730084.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"52..229,1291..1556,5217..5392,5490..5617,9915....",gcaccggttcgcggtcggcgcgagaacaagcagggtggcgcgggtc...
6,PNPLA2,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/57104,https://www.ncbi.nlm.nih.gov//nuccore/NC_00001...,568815587,818914,825573,off,CE8B942D32CD907100000000056A01FE.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"806..992,2715..2947,3045..3110,3484..3693,4614...",agacgcaggcagccccaaagcctgaacaggcagggccagacccagg...
7,HSD17B11,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/51170,https://www.ncbi.nlm.nih.gov//nuccore/NC_00000...,568815594,87336515,87391188,on,CE89C80332CD6E4100000000061E023F.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"119..328,8827..8934,16359..16490,18374..18480,...",agttcctccttgctctcgcccctactctttctggtgttagatcgag...
8,ACACA,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/31,https://www.ncbi.nlm.nih.gov//nuccore/NC_00001...,568815581,37084992,37406836,on,CE8C96B532CEEEA10000000001A6009C.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"538..575,66987..67033,76502..76664,121867..121...",gccgcctccgcccctcggccgtggaggcccccgccgggtgctgagc...
9,HADHB,Homo sapiens,https://www.ncbi.nlm.nih.gov/gene/3032,https://www.ncbi.nlm.nih.gov//nuccore/NC_00000...,568815596,26244939,26290465,off,CE8C223B32CE4FC10000000003700130.m_23,https://www.ncbi.nlm.nih.gov/sviewer/viewer.fc...,"9317..9380,9492..9536,18442..18541,25015..2505...",acttggacctgaaccttgctccgagagggagtcctcgcggacgtca...


# write to csv file

In [14]:
# Persist the validated records; the DataFrame index is written as the first CSV column.
output_csv = "../data/gene_data.csv"
final_df.to_csv(output_csv)