# Validate PubMed Data

The validation should consist of:
- verify gene name (key and field should match)
- verify the organism is Homo sapiens
- verify the seq length is equal to (stop-start+1)
- verify the positions of the CDS are within start & stop
- verify the CDS begins with the start codon "atg"
  - there may be some exceptions to this (e.g. TRAC)

# imports & globals 🌎

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import regex as re
from pickle import dump,load
import sys

# Load the data

In [None]:
# Read the pickled gene-keyed mapping from disk and print how many entries it has.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
pickle_path = "../data/gene_to_data.pickle"
with open(pickle_path, mode="rb") as pickle_file:
    data = load(pickle_file)
print(len(data))

47


# validate

In [7]:
def validate(data, gene):
    """Run sanity checks on one gene record and return the failures found.

    Args:
        data: Mapping of gene name to a record dict. The record must provide
            the keys "gene", "organism", "seq", "start", "stop" and "CDS".
        gene: Key of the record in ``data`` to check.

    Returns:
        A list of failure messages; an empty list means every check passed.
        Only the start of the CDS range is checked, not its end.

    Raises:
        KeyError: If ``gene`` or one of the required record keys is missing.
        ValueError: If "start", "stop" or the CDS start is not an integer.
    """
    record = data[gene]
    seq = record["seq"]
    status = []
    if record["gene"] != gene:
        status.append("gene name not match")
    if record["organism"] != "Homo sapiens":
        status.append("organism not homo sapien")
    if len(seq) != int(record["stop"]) - int(record["start"]) + 1:
        status.append("seq length not equal to start-stop")
    # The CDS start may be prefixed with "<" (presumably the partial-feature
    # marker), either literally or HTML-escaped as "&lt;". Normalise the
    # escaped form and drop the prefix explicitly: str.strip("&lt;") would
    # treat its argument as a set of characters and miss a literal "<".
    start_field = record["CDS"].split("..")[0]
    start_cds = int(start_field.replace("&lt;", "<").lstrip("<"))
    # start_cds is 1-based; the codon occupies positions start_cds..start_cds+2.
    # Rejecting values below 1 keeps the slice from wrapping around the end.
    if start_cds < 1 or start_cds + 2 > len(seq):
        status.append("start cds not in seq")
    elif seq[start_cds - 1:start_cds + 2].lower() != "atg":
        # Compared case-insensitively so an upper-case sequence also passes.
        status.append("seq doesn't start with atg")
    return status

In [15]:
# Validate every gene record, print any failures, and collect the records
# that are accepted into a DataFrame.
final_ds = []
# Genes allowed to fail the start-codon check (their failure is still printed).
exceptions = {"TRAC"}
# Only this failure is waived for the genes above; any other failure
# (wrong organism, length mismatch, ...) still rejects the record.
excusable = {"seq doesn't start with atg"}
for g in data:
    stat = validate(data, g)
    if stat:
        print(g, stat)
    if g in exceptions:
        stat = [msg for msg in stat if msg not in excusable]
    if not stat:
        final_ds.append(data[g])
final_df = pd.DataFrame(final_ds)
print(final_df.shape)
#final_df

TRAC ["seq doesn't start with atg"]
(47, 12)


# write to csv file

In [14]:
# Save the accepted records as CSV; the DataFrame index is written as the
# first column (pandas default, spelled out here).
final_df.to_csv(path_or_buf="../data/gene_data.csv", index=True)