# analysis of SNPs in 70 genes associated with kidney, liver, small intestine transport and metabolism

### Import dependencies

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

### construct gene profile array

scrape bookmarked gnomAD pages for gene names and IDs

In [2]:
# Parse the exported gnomAD bookmarks page and collect every anchor tag.
# The file is opened in a `with` block so the handle is closed as soon as
# BeautifulSoup has consumed it; a bare open() leaves it open until the
# interpreter happens to reclaim it.
# NOTE(review): no parser is named, so bs4 uses whichever one is installed --
# consider pinning one so the scrape is reproducible across machines.
with open('gnomad bookmarks.html') as bookmarks_file:
    soup = BeautifulSoup(bookmarks_file)
links = soup.find_all('a')

make 2D array of gene profiles with [gene name, gene ID]

In [3]:
# One [gene name, gene ID] pair per bookmark link.
# The name is the link text with its last 9 characters removed and the ID is
# characters 39-53 of the href; both are fixed offsets, so they presumably
# rely on every bookmark sharing the same title suffix and URL prefix --
# TODO confirm against the bookmarks file.
gene_profiles = [
    [anchor.text[:-9], anchor.get('href')[39:54]]
    for anchor in links
]

make array of corresponding CSV filenames

In [4]:
# Names of all entries in the geneCSVs directory (bare filenames, no path).
# os.listdir returns them in arbitrary order.
gene_files = os.listdir('geneCSVs')

add corresponding CSV filename to gene profile [gene name, gene ID, corresponding file]

In [5]:
# Extend each profile in place with the CSV filename(s) whose characters
# 14-28 contain the profile's gene ID, giving
# [gene name, gene ID, corresponding file].
# A profile with no matching file is left without a filename entry; a profile
# with several matches receives all of them, in listing order.
for profile in gene_profiles:
    gene_id = profile[1]
    matching_files = [
        str(csv_name) for csv_name in gene_files if gene_id in csv_name[14:29]
    ]
    profile.extend(matching_files)

### read data into dataframe(s)

loop over profiles, read corresponding CSV into DF, load DFs into array

In [6]:
# Read each gene's variant CSV into its own dataframe, tag every row with the
# gene name and gene ID, and index the frame by gene name so the name
# survives the later concatenation.
gene_variant_tables = []
for profile in gene_profiles:
    variants = pd.read_csv('geneCSVs/' + profile[2])
    variants = variants.assign(gene=profile[0], geneID=profile[1])
    gene_variant_tables.append(variants.set_index('gene'))

### concatenate dataframes, keeping gene names as index

In [7]:
# Alias for the list of per-gene dataframes (same list object, not a copy).
frames = gene_variant_tables

In [8]:
# Stack all per-gene frames row-wise into one table; each frame's index
# (the gene name) is carried over, so index labels repeat within the result.
complete_table = pd.concat(frames)

### write text file list of rsIDs (column index 2) for batch querying

make rsID array from column in main data frame

# Collect the rsID (positional column 2) of every variant row.
# `.values` is taken once: for a table with mixed column types each access
# assembles a new object array of the whole table, so reading it inside the
# loop made this step quadratic in the number of variants.
variant_rows = complete_table.values
rsIDlist = [variant_row[2] for variant_row in variant_rows]

write rsID array to lines in text file

# Write the rsIDs to listrsIDs.txt, one per line, for the batch queries.
with open('listrsIDs.txt', 'w') as IDfile:
    IDfile.writelines("%s\n" % ID for ID in rsIDlist)

(use text file for PolyPhen-2 batch query at http://genetics.bwh.harvard.edu/pph2/bgi.shtml and SIFT prediction query at https://sift.bii.a-star.edu.sg/www/SIFT_dbSNP.html)

### prepare main dataframe for adding PolyPhen-2 and SIFT annotations

make empty columns for the SIFT prediction and the PolyPhen-2 prediction (score columns are not added here)

In [9]:
# New column, initialised to the empty string for every variant.
complete_table['SIFT_prediction'] = ''

In [10]:
# New column, initialised to the empty string for every variant.
complete_table['PPH2_prediction'] = ''

### prepare arrays with rsIDs, referenceNTs, alternateNTs, SIFT predictions, and PolyPhen-2 predictions - SNP prediction profiles

(obtain SIFT results in html file - scrape)

scrape rsID, reference nucleotide, alternate nucleotide, and SIFT prediction into SIFT profiles 2D array

In [11]:
# Parse the saved SIFT results page.
# The file is opened in a `with` block so the handle is closed as soon as
# BeautifulSoup has consumed it; a bare open() leaves it open until the
# interpreter happens to reclaim it.
# NOTE(review): no parser is named, so bs4 uses whichever one is installed --
# consider pinning one so the scrape is reproducible across machines.
with open('SIFT-scores.html') as sift_file:
    soup = BeautifulSoup(sift_file)

In [12]:
# Every table row (<tr>) in the SIFT results page, in document order.
rows = soup.find_all('tr')

In [13]:
# Build one profile per SIFT result row, skipping the first row (presumably
# the table header -- TODO confirm). Cells 0, 4, 5 and 15 of each row are
# kept as text.
SIFT_profiles = []
for table_row in rows[1:]:
    cells = table_row.find_all('td')
    # rsID, referenceNT, alternateNT, SIFTprediction
    SIFT_profiles.append([cells[position].text for position in (0, 4, 5, 15)])

(obtain PolyPhen-2 results in tab separated text file)

read the PolyPhen-2 batch results into a dataframe (the predictions are collected into a profiles array below)

In [14]:
# Load the PolyPhen-2 batch output, parsed as tab-separated text.
pph2_data = pd.read_csv('pph2-full.txt', sep='\t')

strip the leading and trailing spaces from every cell (the PolyPhen-2 output contains a lot of them)

In [15]:
# Strip leading/trailing whitespace from every cell of the PolyPhen-2 table.
# Each value goes through str() first, so afterwards every cell is a string
# (missing values become the text 'nan').
# This is done one column at a time with Series.map instead of cell by cell
# through .iloc: the result is the same, but it avoids a scalar read and
# write per cell and never writes a string into a column that still has a
# numeric dtype.
pph2_data = pph2_data.apply(
    lambda column: column.map(lambda value: str(value).strip())
)

collect rsID, reference nucleotide, alternate nucleotide, and PolyPhen-2 prediction into the PPH2 profiles 2D array

In [16]:
# Build one profile per PolyPhen-2 row from positional columns 0, 9, 10, 11:
# rsID, referenceNT, alternateNT, PPH2 prediction
# The four columns are selected and converted once; reading
# `pph2_data.values` inside a per-row loop reassembled the whole table as an
# array for every row.
PPH2_profiles = pph2_data.iloc[:, [0, 9, 10, 11]].values.tolist()

take off last few lines which are not SNP data

In [17]:
# Drop the last five entries.
# NOTE(review): this rebinds the same name, so running the cell again drops
# five more entries each time.
PPH2_profiles = PPH2_profiles[:-5]

### append main dataframe with predictions

matching requirement for both SIFT and PolyPhen-2: rsID, refNT, and altNT must all agree (rsID alone is not unique, since one rsID can have more than one altNT)

In [None]:
# Fill the SIFT (positional column 51) and PolyPhen-2 (positional column 52)
# prediction columns of complete_table from the prediction profiles.
#
# A variant matches a profile only when rsID, reference nucleotide and
# alternate nucleotide (positional columns 2, 3, 4) all agree.
#
# The profiles are indexed by that triple once, so each variant costs one
# dict lookup instead of a scan over every SIFT and every PolyPhen-2 profile
# with scalar .iloc reads. When several profiles share a triple the last one
# wins, which is what the scan did as well (it never stopped at the first
# hit).
SIFT_by_variant = {
    (profile[0], profile[1], profile[2]): profile[3]
    for profile in SIFT_profiles
}
PPH2_by_variant = {
    (profile[0], profile[1], profile[2]): profile[3]
    for profile in PPH2_profiles
}

variant_keys = list(zip(
    complete_table.iloc[:, 2],
    complete_table.iloc[:, 3],
    complete_table.iloc[:, 4],
))

# Variants without a matching profile keep whatever the column already holds.
complete_table.iloc[:, 51] = [
    SIFT_by_variant.get(variant_key, current)
    for variant_key, current in zip(variant_keys, complete_table.iloc[:, 51])
]
complete_table.iloc[:, 52] = [
    PPH2_by_variant.get(variant_key, current)
    for variant_key, current in zip(variant_keys, complete_table.iloc[:, 52])
]

In [26]:
# Spot check: first PolyPhen-2 profile (rsID, referenceNT, alternateNT, PPH2 prediction).
PPH2_profiles[0]

['rs45607933', 'C', 'T', 'benign']

In [27]:
# Spot check: first variant row as a raw array, to read off positional column indices.
complete_table.values[0]

array([11, 62744274, 'rs371714911', 'C', 'A',
       'gnomAD Exomes,gnomAD Genomes', 'PASS', 'PASS', 'p.Gly562Ter',
       'p.Gly562Ter', 'c.1684G>T', 'stop_gained', 'lc_lof', 9, 281000,
       3.202846975088968e-05, 0, 0, 0, 24778, 0, 0, 0, 35240, 0, 0, 0,
       10286, 0, 0, 0, 19924, 0, 0, 0, 25024, 0, 0, 9, 128350, 0, 0, 0,
       7188, 0, 0, 0, 30210, 0, 0, 'ENSG00000197901', '', ''],
      dtype=object)

In [28]:
# Spot check: name of positional column 51, the column the matching step writes SIFT predictions to.
complete_table.columns[51]

'SIFT_prediction'

In [29]:
# Spot check (repeated): first PolyPhen-2 profile.
PPH2_profiles[0]

['rs45607933', 'C', 'T', 'benign']

In [30]:
# Number of SIFT profiles scraped from the results page.
len(SIFT_profiles)

3139

add columns:
SIFT using sequence,
ortholog,
VP