Count Snps

Code used to count the number of SNPs, differentiated by location in exons or introns, based on the start and stop coordinates of the gene and its exons in Ensembl.
Gathers gene, exon, and SNP information from the Ensembl REST API, from the canonical transcript
Input: List of official gene symbols
Output: Data frame with gene information, exon information, and SNP information

In [9]:
# use UCSC human genome browser to count the number of SNPs across the genome
# gather a total, but also separate by intron and exon

import pandas as pd
import json 
import requests
import sys
import statistics as stats

In [1]:
# read in the protein coding genome list; the context manager guarantees the
# file handle is closed once the contents have been read
with open(r"WGCNA_gene_lists\20754_human_protein_coding_genes.txt") as gene_file:
    genomeFile = gene_file.read()
# the file contents are split on ", " to obtain the individual gene symbols
gene_list_full = genomeFile.split(', ')




In [11]:
def get_exons(gene_list):
    """Fetch Ensembl IDs, canonical transcripts and exon ranges for genes.

    The gene symbols are resolved with one POST to ``lookup/symbol``; then,
    for every returned Ensembl gene ID, the ``overlap/id`` endpoint is queried
    once for transcripts and once for exons.

    Parameters
    ----------
    gene_list : list of str
        Gene symbols to look up (sent as the ``symbols`` JSON payload).

    Returns
    -------
    pandas.DataFrame
        One row per gene returned by the lookup, with columns
        ``gene symbol``, ``id``, ``canon transcript id``, ``start``, ``end``,
        ``gene length``, ``exon start stop`` (dict mapping exon id to a
        ``(start, end)`` tuple) and ``total exon length``.
        ``start``/``end``/``gene length`` describe the canonical transcript.

    Raises
    ------
    requests.HTTPError
        If any Ensembl request returns a non-OK status.
    ValueError
        If a gene does not have exactly one transcript flagged canonical
        whose ``Parent`` is that gene (raised by ``Series.item()``).
    """
    # define the server, extension and content type
    server = "http://rest.ensembl.org/"
    con = "application/json"
    ext_symbols = "lookup/symbol/homo_sapiens/"

    # function for sending dumped data to ensembl api
    def fetch_endpoint_POST(server, request, data, content_type):
        r = requests.post(server + request,
                          headers={"Accept": content_type},
                          data=data)
        # check the status BEFORE touching the body: an error response is not
        # guaranteed to contain valid JSON
        r.raise_for_status()
        if content_type == 'application/json':
            return r.json()
        return r.text

    # function for fetching one feature type overlapping an Ensembl ID
    def fetch_overlap(ensembl_id, feature):
        ext = "/overlap/id/" + ensembl_id + "?feature=" + feature + ";species=homo_sapiens"
        r = requests.get(server + ext, headers={"Content-Type": "application/json"})
        r.raise_for_status()
        # parse the body once, only after the status check passed
        return pd.DataFrame(r.json())

    # convert the list of gene symbols into json format and run the query
    data = json.dumps({"symbols": gene_list})
    post_lookup_symbols = fetch_endpoint_POST(server, ext_symbols, data, con)

    # Load the data into a pandas dataframe, indexed by gene symbol, and keep
    # only the Ensembl ID column
    genes = pd.DataFrame.from_dict(post_lookup_symbols, orient="index")
    ID_start_end = pd.DataFrame(genes.loc[:, 'id'])

    # accumulators, one entry per gene, in the same order as the 'id' column
    canon_t_ids = []
    canon_t_starts = []
    canon_t_ends = []
    total_exon_data = []
    total_exon_length_list = []

    for gene_id in ID_start_end['id']:
        # select the canonical transcript of this gene once and read its
        # start, end and id from the single matching row
        transcripts = fetch_overlap(gene_id, "transcript")
        is_canonical = (transcripts['is_canonical'] == 1) & (transcripts['Parent'] == gene_id)
        canonical = transcripts.loc[is_canonical]
        canon_t_starts.append(canonical['start'].item())
        canon_t_ends.append(canonical['end'].item())
        canon_id = canonical['id'].item()
        canon_t_ids.append(canon_id)

        # keep only the exons that originate from the canonical transcript
        exons = fetch_overlap(gene_id, "exon")
        exons_canon = exons.loc[exons['Parent'] == canon_id]

        # map exon id -> (start, end) and accumulate the summed exon length
        # NOTE(review): lengths are computed as end - start; if Ensembl
        # coordinates are inclusive on both ends this is one base short per
        # feature -- confirm before relying on absolute lengths.
        exon_dic = {}
        total_exon_length = 0
        for exon_id, exon_start, exon_end in zip(exons_canon['exon_id'],
                                                 exons_canon['start'],
                                                 exons_canon['end']):
            exon_dic[exon_id] = (exon_start, exon_end)
            total_exon_length += exon_end - exon_start

        total_exon_data.append(exon_dic)
        total_exon_length_list.append(total_exon_length)

    # assign the accumulator lists as columns of the data frame
    ID_start_end['canon transcript id'] = canon_t_ids
    ID_start_end['start'] = canon_t_starts
    ID_start_end['end'] = canon_t_ends
    ID_start_end['exon start stop'] = total_exon_data
    # positions 4 and 6 place the length columns right after 'end' and
    # right after 'exon start stop' respectively
    ID_start_end.insert(4, 'gene length', ID_start_end['end'] - ID_start_end['start'])
    ID_start_end.insert(6, 'total exon length', total_exon_length_list)

    # move the gene symbols out of the index into a regular column
    ID_start_end_exon = ID_start_end.reset_index()
    ID_start_end_exon = ID_start_end_exon.rename({'index': 'gene symbol'}, axis=1)

    return ID_start_end_exon



In [12]:
def count_snps(ens_df):
    # iterate over the ensemble id and exon dataframe, grab the SNPs for each gene and clean the dataframe
    total_snp_exon_count = []
    count_df_list = []

    EnsID = ens_df['id']

    for i in EnsID:
        # get the row number from the exon dataframe
        ID_index = ens_df['id'].tolist().index(i)

        # get the report of SNPs from the start and stop coordinates of the canonical transcript
        server = "http://rest.ensembl.org/"
        ext = "/overlap/id/"+i+"?feature=variation;species=homo_sapiens"
        r = requests.get(server+ext,headers={ "Content-Type" : "application/json"})
        decoded = r.json()

        if not r.ok:
            r.raise_for_status()
            sys.exit()
        decoded = r.json()

        snps = pd.DataFrame(decoded)
    # ADD THIS CONDITIONAL
        if len(snps) > 0:
            snps['length'] = snps['end'] - snps['start']
            # remove the data that we do not need
            snps= snps.drop(labels=['seq_region_name', 'feature_type', 'source', 'strand', 'alleles', 'assembly_name'], axis=1)

            levels = list(set(snps['consequence_type']))
            # count the number of occurences of each consequence of the SNP - may be able to use for exon/intron differentation at a finer grain
            levels_count = {}
            for level in levels:
                matches = sum(snps['consequence_type'] == level)
                levels_count[level] = matches

            levels_df = pd.DataFrame(levels_count, index=[i])

            count_df_list.append(levels_df)

            # accumulators for exon or non exon snps
            snp_exon = 0
            snp_n_exon = 0
            # get the exon range dictionary
            exon_dic = ens_df.loc[ID_index, 'exon start stop']
            # alternate method: count the snps against the list of exons 
            for rowInd in range(len(snps)):
                in_exon = False
                snp_start = snps.loc[rowInd, 'start']
                snp_end = snps.loc[rowInd, 'end']
                for key in exon_dic.keys():
                    # if snp location is inside the exon add and break the loop
                    if snp_start >= exon_dic[key][0] and snp_end <= exon_dic[key][1]:
                        in_exon = True
                        break
                # if the count is equal to the length of the keys it means it's in an intron
                if in_exon:
                    snp_exon += 1
                else:
                    snp_n_exon += 1


        # ADD THIS
        else:
            snp_exon = None
            snp_n_exon = None

            temp_df = pd.DataFrame(data = None, columns=['missense_variant'], index=[i])
            count_df_list.append(temp_df)
        # END ADD SEGMENT
            
        # append to the accumulator  with ensemble id for binding
        total_snp_exon_count.append([i, snp_exon, snp_n_exon,])

    count_2_df = pd.DataFrame(data=total_snp_exon_count, columns=['id', 'exon snp', 'not exon snp'])    


    total_count_df = pd.concat(count_df_list, axis=0, ignore_index=True)
    total_count_df['Total SNPs'] = total_count_df.sum(axis=1, numeric_only=True)

    # merge the two data frames together through a temporary column, then remove the column
    ens_df['temp'] = list(range(len(ens_df)))
    total_count_df['temp'] = list(range(len(total_count_df)))


    total_output = ens_df.merge(total_count_df, on='temp', copy=True)
    total_output = total_output.drop(labels='temp', axis=1)


    total_output2 = total_output.merge(count_2_df, on='id', copy=True)

    return total_output2

In [13]:
# Trial run of the two-step pipeline on a ten-gene slice (positions 50-59)
# of the full gene list
test_chunk = gene_list_full[50:60]

# step 1: Ensembl IDs, canonical transcript coordinates and exon ranges
exon_check = get_exons(test_chunk)

# step 2: SNP counts per gene, added to the exon data frame
snp_check = count_snps(exon_check)

# show the combined result (display is presumably the notebook's built-in)
display(snp_check)

Unnamed: 0,gene symbol,id,canon transcript id,start,end,gene length,exon start stop,total exon length,splice_donor_region_variant,3_prime_UTR_variant,...,splice_donor_variant,frameshift_variant,inframe_insertion,non_coding_transcript_exon_variant,splice_donor_5th_base_variant,protein_altering_variant,coding_sequence_variant,Total SNPs,exon snp,not exon snp
0,CNNM2,ENSG00000148842,ENST00000369878,102918294,103090222,171928,"{'ENSE00001788039': (102918294, 102920101), 'E...",15849,4,5141,...,2.0,25,11.0,105.0,2.0,1.0,,62637.0,6205,56432
1,CACNA1S,ENSG00000081248,ENST00000362061,201039512,201112426,72914,"{'ENSE00004020756': (201112188, 201112426), 'E...",5984,59,178,...,45.0,96,4.0,292.0,21.0,3.0,3.0,28935.0,2949,25986
2,MYO19,ENSG00000278259,ENST00000614623,36495636,36534868,39232,"{'ENSE00003716040': (36534761, 36534868), 'ENS...",3905,43,1331,...,42.0,162,4.0,2358.0,17.0,,1.0,20355.0,1818,18537
3,MFRP,ENSG00000235718,ENST00000619721,119338942,119346705,7763,"{'ENSE00003549514': (119346460, 119346705), 'E...",3934,16,364,...,11.0,71,4.0,53.0,9.0,,,3648.0,1909,1739
4,ANGPTL6,ENSG00000130812,ENST00000253109,10092338,10102678,10340,"{'ENSE00001383912': (10102568, 10102678), 'ENS...",1779,5,271,...,7.0,44,2.0,77.0,5.0,3.0,1.0,4406.0,921,3485
5,DCLK1,ENSG00000133083,ENST00000360631,35768652,36131382,362730,"{'ENSE00003901953': (36131114, 36131382), 'ENS...",8377,14,3723,...,4.0,8,2.0,228.0,2.0,3.0,2.0,129464.0,2904,126560
6,TVP23C,ENSG00000175106,ENST00000518321,15536980,15563483,26503,"{'ENSE00002137998': (15563437, 15563483), 'ENS...",4073,10,1738,...,16.0,28,,232.0,2.0,,1.0,21993.0,1492,20501
7,GPR12,ENSG00000132975,ENST00000405846,26755200,26760786,5586,"{'ENSE00001555433': (26760579, 26760786), 'ENS...",4849,1,1243,...,,15,1.0,,,1.0,,2090.0,1766,324
8,ESR1,ENSG00000091831,ENST00000206249,151807682,152103274,295592,"{'ENSE00001877305': (151807682, 151808364), 'E...",6319,30,3228,...,22.0,28,3.0,809.0,11.0,,,168255.0,3212,165043
9,ABCB1,ENSG00000085563,ENST00000622132,87503017,87600884,97867,"{'ENSE00003748396': (87600755, 87600884), 'ENS...",5177,49,462,...,33.0,45,4.0,835.0,17.0,1.0,1.0,76804.0,2260,74544


In [45]:
# entry point for running the pipeline on part of the gene list
def run_gene_list_chunk(gene_list, index_start, index_stop):
    """Run exon lookup and SNP counting on one slice of a gene list.

    gene_list   -- list of gene symbol strings
    index_start -- slice start (inclusive)
    index_stop  -- slice stop (exclusive)

    Returns the data frame produced by count_snps for that slice.
    """
    selected_genes = gene_list[slice(index_start, index_stop)]
    return count_snps(get_exons(selected_genes))
