# DNA Panda

This is a small script that will import your genome, and query specified genes against NCBI returning a data_frame and .csv with positive matches. 

In [3]:
# Imports
import os
import pandas as pd
from os import listdir
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pylev

import re

import seaborn as sns
sns.set_style('darkgrid')
sns.color_palette('Spectral')
import matplotlib.pyplot as plt


import requests

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

## Import User Data

In [4]:
user_frame = []

user_frame.append(pd.read_csv('data/23andme_MG_v4.txt', sep='\t', 
                   dtype={'rsid':'str', 'chromosome':'object', 'position':'int', 'genotype':'str'}, 
                   comment='#'))


data_frame = pd.concat(user_frame, axis=0, ignore_index=True)


#import_frame = pd.read_csv("rccx.csv") 
#merged_frame = pd.concat([data_frame, import_frame], axis=0, sort=True)
#print(merged_frame)
#df = pd.DataFrame(merged_frame)


In [6]:
# Read the data into a pandas DataFrame and do some EDA
df = pd.DataFrame(data_frame)
#df = pd.DataFrame(merged_frame)
df.info
#df = df.fillna("0")

<bound method DataFrame.info of                rsid chromosome  position genotype
0        rs12564807          1    734462       GG
1         rs3131972          1    752721       GG
2       rs148828841          1    760998       CC
3        rs12124819          1    776546       AA
4       rs115093905          1    787173       GT
...             ...        ...       ...      ...
601866     i4000757         MT     16526        G
601867      i701671         MT     16526        G
601868     i4990307         MT     16527        C
601869     i4000756         MT     16540        C
601870     i3001931         MT     16547        C

[601871 rows x 4 columns]>

In [7]:
#df.isna().any()
# How many chromosomes are on the Y chromosome?
df['chromosome'].unique()
Y_chromosome = df[df.chromosome == 'Y']
len(Y_chromosome)
# Show unique counts
df.nunique()


rsid          601871
chromosome        25
position      597380
genotype          20
dtype: int64

In [None]:
## Display how many missing SNPs are in your genome
genotype_na = df[df.genotype == '--']
len(genotype_na)

In [11]:
# Print the length of any chromosome
df6 = df[df.chromosome == "6"]
len(df6)
df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40384 entries, 199607 to 239990
Data columns (total 4 columns):
rsid          40384 non-null object
chromosome    40384 non-null object
position      40384 non-null int64
genotype      40384 non-null object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


In [12]:
df6.head()

Unnamed: 0,rsid,chromosome,position,genotype
199607,rs12209455,6,167510,AG
199608,rs6596796,6,175522,GG
199609,rs1535053,6,181089,TT
199610,rs6937355,6,183900,CC
199611,rs6937363,6,183917,CC


In [32]:
# See the frequency of genotypes
#df6['genotype'].value_counts()
df6.count()
notch4 = df6[(df6['position'] >= 32194843) & (df6['position'] <= 32224067)]
notch3 = df[(df['position'] >= 15159038) & (df6['position'] <= 15200995)]
notch4.count()
notch3.count()
notch = pd.concat([notch4, notch3], axis=0, sort=True)
notch.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39 entries, 210570 to 203706
Data columns (total 4 columns):
chromosome    39 non-null object
genotype      39 non-null object
position      39 non-null int64
rsid          39 non-null object
dtypes: int64(1), object(3)
memory usage: 1.5+ KB


## Isolate the RCCX module

In [33]:
# CYP21A2 :: 32,038,306 to 32,041,670 on chromosome 6 

# tnxb :: 32,041,153 to 32,109,338 

# C4 :: 31,982,057 to 32,002,681

# stk19 :: 31,971,175 to 31,981,446 

# NOTCH 3 :: 15,159,038 to 15,200,995

# NOTCH 4 32,194,843 to 32,224,067

In [34]:
rccx = df6[(df6['position'] >= 31971175) & (df6['position'] <= 32109338)]
rccx = rccx[rccx.genotype != "--"]
rccx.count()



rsid          75
chromosome    75
position      75
genotype      75
dtype: int64

In [35]:
toScan = pd.concat([notch, rccx], axis=0, sort=True)

toScan['genotype'].value_counts()

GG    40
CC    33
TT    13
AA     9
CT     9
AG     5
CG     3
AT     2
Name: genotype, dtype: int64

In [36]:
pd.options.display.max_rows = 999
toScan.count()

chromosome    114
genotype      114
position      114
rsid          114
dtype: int64

## Crawling NCBI

In [21]:
import urllib.request
from bs4 import BeautifulSoup
count = 0
toScan['Parsed'] = "0"

for i, row in toScan.iterrows():
    count = count + 1
    if(row.Parsed != "1"):
        try:
            print("trying...", row.rsid,"(", count, "out of", len(rccx['rsid']),")")
            url = "https://www.ncbi.nlm.nih.gov/snp/" + row.rsid + "#clinical_significance"
            response = urllib.request.urlopen(url)
            html = response.read()
            bs = BeautifulSoup(html, "html.parser")

            classification = bs.find(id="clinical_significance")

            if classification:
                rows = classification.find_all("tr")
                ClinVar = []
                for row in rows:
                    cols = row.find_all("td")
                    cols = [ele.text.strip() for ele in cols]
                    ClinVar.append([ele for ele in cols if ele])
                listToStr = ' '.join([str(elem) for elem in ClinVar]) 

                toScan.at[i, 'ClinVar'] = listToStr

            ncbi = bs.find(class_="summary-box usa-grid-full")
            if ncbi:
                dbSNP = []

                rows = ncbi.find_all("div")

                for row in rows:
                    cols = row.find_all("div")
                    cols = [ele.text.strip() for ele in cols]
                    dbSNP.append(cols)

                try:
                    print("Risk", dbSNP[2][0][0])
                    print("Frequency",dbSNP[2][0][3:7])

                    toScan.at[i, 'Risk'] = dbSNP[2][0][0]
                    toScan.at[i, 'Frequency'] = dbSNP[2][0][3:7]
                except IndexError:
                    print("index error")

                dbSNPTwo= []
                rows = ncbi.find_all("dl")

                for row in rows:
                    cols = row.find_all("dd")
                    cols = [ele.text.strip() for ele in cols]
                    dbSNPTwo.append(cols)

            try:
                print("Gene", dbSNPTwo[1][1].split(' ')[0])
                toScan.at[i, 'Gene'] = dbSNPTwo[1][1].split(' ')[0]
                print("Publications", dbSNPTwo[1][2][0]) 
                toScan.at[i, 'Citations'] = dbSNPTwo[1][2][0]
                toScan.at[i, 'Parsed'] = "1"
            except IndexError:
                    print("index error")


        except urllib.error.HTTPError:
            print(url + " was not found or on dbSNP or contained no valid information")


trying... rs3096691 ( 1 out of 39 )
Risk G
Frequency .459
Gene None
Publications 0
trying... rs482759 ( 2 out of 39 )
Risk G
Frequency .187
Gene None
Publications 0
trying... rs365053 ( 3 out of 39 )
Risk G
Frequency .252
Gene None
Publications 0
trying... rs495089 ( 4 out of 39 )
Risk G
Frequency .340
Gene None
Publications 0
trying... rs436845 ( 5 out of 39 )
Risk G
Frequency .277
Gene None
Publications 0
trying... rs404890 ( 6 out of 39 )
Risk A
Frequency .334
Gene None
Publications 1
trying... rs2849015 ( 7 out of 39 )
Risk A
Frequency .377
Gene None
Publications 0
trying... rs9267873 ( 8 out of 39 )
Risk C
Frequency .193
Gene None
Publications 0
trying... rs3134926 ( 9 out of 39 )
Risk G
Frequency .353
Gene None
Publications 1
trying... rs12182351 ( 10 out of 39 )
index error
Gene None
Publications 0
trying... rs3130299 ( 11 out of 39 )
Risk G
Frequency .240
Gene None
Publications 0
trying... rs549182 ( 12 out of 39 )
Risk A
Frequency .080
Gene None
Publications 0
trying... rs5492

In [None]:
rccx

In [None]:
#rccx.to_csv('rccx.csv', index=False)

In [None]:
rccx_filled = rccx.fillna("0")

In [None]:
rccx_filled

In [None]:
rccx_present = rccx_filled
rccx_present = rccx_filled[rccx_filled.apply(lambda x: x.Risk in x.genotype, axis=1)]

In [None]:
rccx_present

In [24]:
notch
notch_filed = notch.fillna("0")
notch_present = notch_filed
notch_present = notch_filed[notch_filed.apply(lambda x: x.Risk in x.genotype, axis=1)]
notch_present

Unnamed: 0,chromosome,genotype,position,rsid,ClinVar,Risk,Frequency,Gene,Citations
210570,6,AG,32194854,rs3096691,,G,0.459,,0
210573,6,AG,32197463,rs495089,,G,0.34,,0
210574,6,AG,32197736,rs436845,,G,0.277,,0
210577,6,CT,32199352,rs9267873,,C,0.193,,0
210592,6,CT,32211317,rs411326,,T,0.29,,2
210593,6,CT,32212985,rs17576984,,T,0.118,,0
210598,6,CT,32217092,rs6936204,,T,0.237,,2
210602,6,CC,32220484,rs3115572,,C,0.397,,1
210604,6,CT,32223258,rs3130320,,T,0.271,LOC101929163,3


In [25]:
with open('rccx.csv', 'a') as f:
    df.to_csv(f, header=False)