### The updated product of VAPr formatting.py
- Created after ATAC-seq
- Compatible with Python 2 and 3 (by using a copy library)
- Processes large inputs by breaking it into batches

## Import libraries

In [1]:
from VAPr import vapr_core
import os
from pymongo import MongoClient
import pandas as pd
import numpy as np

In [2]:
IN_PATH = "."
OUT_PATH = "./results"
ANNOVAR_PATH = "/shared/workspace/software/annovar"
MONGODB = 'VariantDatabase'
COLLECTION = 'Cancer'

In [5]:
annotator = vapr_core.VaprAnnotator(input_dir=IN_PATH,
                                    output_dir=OUT_PATH,
                                    mongo_db_name=MONGODB,
                                    mongo_collection_name=COLLECTION,
                                    build_ver='hg19',
                                    vcfs_gzipped=False,
                                    annovar_install_path=ANNOVAR_PATH)

## Download Annovar Databases

In [None]:
annotator.download_annovar_databases()

## Drop MongoDB Collections

In [8]:
c = MongoClient()
c.test_database
db = c['VariantDatabase']
db.list_collection_names()

[u'Cancer']

In [12]:
db.drop_collection('Cancer')

{u'nIndexesWas': 1, u'ns': u'VariantDatabase.Cancer', u'ok': 1.0}

## Run Annovar and MyVariant.info queries, upload variant annotations to MongoDB

In [7]:
dataset = annotator.annotate(num_processes=8)

100%|██████████| 8/8 [00:19<00:00,  2.39s/it]


## Skip Annovar step, export only MyVariant data to MongoDB

In [None]:
dataset_light = annotator.annotate_lite(num_processes=8)

## Query Rare Deleterious Variants

In [8]:
dataset = vapr_core.VaprDataset(MONGODB, COLLECTION)

In [9]:
rare_deleterious_variants = dataset.get_rare_deleterious_variants()
rare_deleterious_variants[14]

{u'1000g2015aug_all': 0.00838658,
 u'_id': ObjectId('5c06f3572c788e6b1668bf12'),
 u'alt': u'A',
 u'cadd': {u'1000g': {u'af': 0.01, u'afr': 0.01, u'amr': 0.01, u'eur': 0.03},
  u'_license': u'http://goo.gl/bkpNhq',
  u'esp': {u'af': 0.015, u'afr': 0.005, u'eur': 0.02},
  u'gerp': {u'n': 5.45, u'rs': 613.1, u'rs_pval': 7.78414e-78, u's': 5.45},
  u'phred': 23.1,
  u'polyphen': {u'cat': u'benign', u'val': 0.031},
  u'sift': {u'cat': u'tolerated', u'val': 0.4}},
 u'chr': u'14',
 u'dbsnp': {u'_license': u'https://goo.gl/Ztr5rl', u'rsid': u'rs77202343'},
 u'end': 24803710,
 u'exonicfunc_knowngene': u'nonsynonymous SNV',
 u'func_knowngene': u'exonic',
 u'gene_knowngene': u'ADCY4',
 u'hgvs_id': u'chr14:g.24803710G>A',
 u'ref': u'G',
 u'samples': [{u'AD': [292, 0],
   u'filter_passing_reads_count': 292,
   u'genotype': None,
   u'sample_id': u'N7_vs_T7'}],
 u'start': 24803710,
 u'wellderly': {u'_license': u'https://goo.gl/e8OO17',
  u'alleles': [{u'allele': u'A', u'freq': 0.025},
   {u'allele':

### Filter #1: Rare deleterious variants

In [10]:
# Apply filter.
rare_deleterious_variants = dataset.get_rare_deleterious_variants()
len(rare_deleterious_variants)

15

### Filter #2: Known disease variants

In [11]:
# Apply filter.
known_disease_variants = dataset.get_known_disease_variants()

In [12]:
len(known_disease_variants)

80

### Filter #3: Deleterious compound heterozygous variants

In [13]:
# Apply filter
deleterious_compound_heterozygous = dataset.get_deleterious_compound_heterozygous_variants()

In [14]:
len(deleterious_compound_heterozygous)

0

### Filter #4: De novo variants

In [15]:
# Apply filter.
denovo_variants = dataset.get_de_novo_variants(proband="NA12878",
                                               ancestor1="NA12891",
                                               ancestor2="NA12892")
denovo_variants[0]

IndexError: list index out of range

## Output Files

## Streamlined pipeline

### Extract samples

In [16]:
# Take each sample out from the sample column and append it in its individual row
def extract_samples(dataset_list_in):
    dataset_list_out = []  # output list

    # for every row 
    for x in range(len(dataset_list_in)):
        temp_sample_list = copy.deepcopy(dataset_list_in[x]['samples'])  # temp copy of samples list

        # for every sample in samples list
        for y in range(len(temp_sample_list)):
            temp_row_dict = copy.deepcopy(dataset_list_in[x])  # copy over the entire original row
            temp_row_dict['samples'] = copy.deepcopy(temp_sample_list[y])  # over-write with an individual sample
            dataset_list_out.append(temp_row_dict)  # add the new dict to output list

    return dataset_list_out


### Unnest dictionaries

In [17]:
# Call unnest_dict_core and re-assemble dictionaries
def unnest_dict(dataset_list_in):
    dataset_list_out = []
    for x in range(len(dataset_list_in)):
        temp_dict = {}
        temp_dict = unnest_dict_core(dataset_list_in[x], "", True, temp_dict)
        dataset_list_out.append(temp_dict)
    return dataset_list_out


# unnest dictionaries recursively
def unnest_dict_core(nested_dict_in, prev_key, first_level, unnested_dict_out):
    keys = list(nested_dict_in.keys())  # a list of keys

    for key in keys:

        # set key names
        if first_level:
            key_name = str(key)
        else:
            key_name = prev_key + "." + str(key)

        # check type
        if type(nested_dict_in[str(key)]) == dict:
            unnest_dict_core(nested_dict_in[str(key)], key_name, False, unnested_dict_out)
        else:
            unnested_dict_out[key_name] = nested_dict_in[str(key)]
    return unnested_dict_out


### Unnest list

In [18]:
# A wrapper that unnnest lists by calling find_list
def unnest_list(dataset_list_in):
    dataset_list_out = copy.deepcopy(dataset_list_in)  # make a copy for output

    # for every row
    for x in range(len(dataset_list_in)):
        # print(x)
        dataset_list_out[x] = find_list(dataset_list_in[x])  # find any key whose value is a list

    return dataset_list_out


# Finds field that are dictionaries. Find column in the table that contain a list whose keys that would be identical
# when unnested and calls rename list. Look for key whose value is a list, call rename_list_content.
def find_list(dict_in):
    dict_out = copy.deepcopy(dict_in)  # make a copy of the row
    dict_keys = list(dict_in.keys())  # a list of keys

    # for every key in that row
    for dict_key in dict_keys:

        # find fields that are lists
        if type(dict_in[dict_key]) == list:
            # print(dict_key + " is a list")

            has_dict = True  # contains dictionaries

            # check if the list contains dictionaries
            for key_values in dict_in[dict_key]:
                if type(key_values) != dict:
                    has_dict = False

            # proceed when the list contains dictionaries
            if has_dict:
                # print(dict_key + " contains dictionaries")
                dict_out[dict_key] = rename_list_content(dict_in[dict_key])  # rename list content

    return dict_out


# Since the keys can have the same names in the dictionary, this function renames the items by appending a number at
# the end of each item. Having keys with the same name in a dictionary throws an error.
# rename dictionaries in the list such that they can be unnested in the same level and and keys can be unique.
def rename_list_content(same_keys_list):
    new_keys_dict_out = {}  # output

    # for every dictionary in the list
    for x in range(len(same_keys_list)):
        old_keys = list(same_keys_list[x].keys())  # original keys
        new_keys = copy.deepcopy(old_keys)  # keys to be modified

        # for every key
        for y in range(len(new_keys)):
            new_keys[y] += str(x + 1)  # modify the key with index

            # add the modified key and its value to a new dict
            new_keys_dict_out[str(new_keys[y])] = same_keys_list[x][old_keys[y]]

    return new_keys_dict_out


### Change 'samples.AD' to 't_ref_count' and 't_alt_count'

In [19]:
# Call change_name to change 'samples.AD' to 't_ref_count' and 't_alt_count', because they are required fields in MAf. 
def samples_AD(dataset_list_in):
    dataset_list_out = []  # output list

    # for every row
    for x in range(len(dataset_list_in)):
        dataset_list_out.append(change_name(dataset_list_in[x]))
    return dataset_list_out

# Change 'samples.AD' to 't_ref_count' and 't_alt_count'
def change_name(sample_dict_in):
    sample_dict_out = {}  # output dict
    keys = list(sample_dict_in.keys())
    for key in keys:

        # change 'samples.AD'
        if key == 'samples.AD':
            sample_dict_out['t_ref_count'] = sample_dict_in['samples.AD'][0]
            sample_dict_out['t_alt_count'] = sample_dict_in['samples.AD'][1]
        else:
            sample_dict_out[key] = sample_dict_in[key]  # copy over other fields
    return sample_dict_out


### Record Variant_Type

In [20]:
# Record variant type according to ref and alt alleles. Determines the variant type based on the 
# number of ref and alt alleles
def varType(dataset_list_in):
    for x in range(len(dataset_list_in)):
        ref = len(dataset_list_in[x]['ref'])
        alt = len(dataset_list_in[x]['alt'])
        if ref == alt == 1:
            dataset_list_in[x]['Variant_Type'] = 'SNP'
        if ref == alt == 2:
            dataset_list_in[x]['Variant_Type'] = 'DNP'
        if ref == alt == 3:
            dataset_list_in[x]['Variant_Type'] = 'TNP'
        if ref == alt and ref > 3:
            dataset_list_in[x]['Variant_Type'] = 'ONP'
        if ref < alt:
            dataset_list_in[x]['Variant_Type'] = 'INS'
        if ref > alt:
            dataset_list_in[x]['Variant_Type'] = 'DEL'

    return dataset_list_in 


### Unnest multiple values separated by ";" in a single field

In [21]:
# Fields with multiple values to separate: gene_knowngene, genedetail_knowngene, func_knowngene
# Takes in the whole table. Find fields with multiple values and attemp to separates those values and put each in a 
# new row. 
def unnest_semicolon_values(data_list_in):
    debug_genes = False
    debug_genes_and_func = False
    debug_genes_and_genedetails = False
    debug_genes_and_func_and_genedetails = False
    debug_lengths_check = False
    multiple_values_count = 0  # testing use

    data_list_out = []  # the output table to be returned

    # for every row in the whole table
    for row in data_list_in:
        if row['gene_knowngene'].find(';') is not -1:

            # testing use
            if debug_genes:
                print(row['gene_knowngene'])

            temp_rows_list = []  # holds row copies temporarily
            gene_knowngene_values = row['gene_knowngene'].split(';')

            # for every gene after splitting
            for gene in gene_knowngene_values:
                temp_row_dict = copy.deepcopy(row)  # copy the whole row
                temp_row_dict['gene_knowngene'] = gene  # replace with individual gene
                temp_rows_list.append(temp_row_dict)  # put rows in a list temporarily

            func_knowngene_values = row['func_knowngene'].split(';')

            # Match genes with corresponding funcs only when there are the same numbers of them
            # In the case of multiple genes with only 1 func, that 1 func is copied over to a new row 
            #    with every gene, so that case is always taken care of. 
            if len(func_knowngene_values) == len(gene_knowngene_values):
                # for every new row created after splitting genes
                for index in range(len(temp_rows_list)):
                    temp_rows_list[index]['func_knowngene'] = func_knowngene_values[
                        index]  # replace with individual func

                # testing use
                if debug_genes_and_func:
                    print(row['gene_knowngene'])
                    print(row['func_knowngene'] + "\n")
                    for x in temp_rows_list:
                        print(x['gene_knowngene'] + "  " + x['func_knowngene'])
                    print('\n')

            if 'genedetail_knowngene' in row.keys():
                genedetail_knowngene_values = row['genedetail_knowngene'].split(';')

                # Match genes with corresponding genedetails only when there are the same numbers of them
                # In the case of multiple genes with only 1 genedetail, that 1 genedetail is copied over to a new row 
                #    with every gene, so that case is always taken care of. 
                if len(genedetail_knowngene_values) == len(gene_knowngene_values):

                    # for every new row created after splitting genes
                    for index in range(len(temp_rows_list)):
                        temp_rows_list[index]['genedetail_knowngene'] = genedetail_knowngene_values[index]

                    # testing use
                    if debug_genes_and_genedetails:
                        print(row['gene_knowngene'])
                        print(row['func_knowngene'])
                        print(row['genedetail_knowngene'] + "\n")
                        for x in temp_rows_list:
                            print(x['gene_knowngene'] + "  " + x['func_knowngene'] + "  " + x['genedetail_knowngene'])
                        print('\n')

                    # testing use
                    if debug_genes_and_func_and_genedetails:
                        if len(genedetail_knowngene_values) == len(gene_knowngene_values) == len(func_knowngene_values):
                            print(row['gene_knowngene'])
                            print(row['func_knowngene'])
                            print(row['genedetail_knowngene'] + "\n")
                            for x in temp_rows_list:
                                print(
                                x['gene_knowngene'] + "  " + x['func_knowngene'] + "  " + x['genedetail_knowngene'])
                            print('\n')

            # testing use
            if debug_genes:
                for x in temp_rows_list:
                    print(x['gene_knowngene'] + "  " + x['func_knowngene'])

            # write the newly created rows to the output table
            for temp_row in temp_rows_list:
                data_list_out.append(temp_row)

                # testing use
                if debug_lengths_check:
                    multiple_values_count += 1

            # testing use
            if debug_lengths_check:
                multiple_values_count -= 1

        # write the row to the output table as it is
        else:
            data_list_out.append(copy.deepcopy(row))

    # testing use
    if debug_lengths_check:
        print("The lenght of data input: " + str(len(data_list_in)))
        print("The lenght of data output: " + str(len(data_list_out)))
        print("The difference in lengths: " + str(multiple_values_count))
        print("New rows created: " + str(len(data_list_in) - len(data_list_out)))

    return data_list_out


### Re-arrange and rename columns

In [22]:
# Re-arrange and rename columns to match the MAF format
def change_cols(df):
    cols = list(df)
    cols.insert(0, cols.pop(cols.index('gene_knowngene')))
    cols.insert(1, cols.pop(cols.index('chr')))
    cols.insert(2, cols.pop(cols.index('start')))
    cols.insert(3, cols.pop(cols.index('end')))
    cols.insert(4, cols.pop(cols.index('ref')))
    cols.insert(5, cols.pop(cols.index('alt')))
    cols.insert(6, cols.pop(cols.index('Variant_Type')))
    cols.insert(7, cols.pop(cols.index('func_knowngene')))
    cols.insert(8, cols.pop(cols.index('samples.sample_id')))
    cols.insert(9, cols.pop(cols.index('dbsnp.rsid')))
    cols.insert(10, cols.pop(cols.index('t_ref_count')))
    cols.insert(11, cols.pop(cols.index('t_alt_count')))
    # cols.insert(12, cols.pop(cols.index('aachange_knowngene')))

    df = df.loc[:, cols]

    df = df.rename(columns={'gene_knowngene': 'Hugo_Symbol',
                            'chr': 'Chromosome',
                            'start': 'Start_Position',
                            'end': 'End_Position',
                            'ref': 'Reference_Allele',
                            'alt': 'Tumor_Seq_Allele2',
                            'func_knowngene': 'Variant_Classification',
                            'samples.sample_id': 'Tumor_Sample_Barcode',
                            'cadd.1000g.afr': 'AFR_MAF',
                            'cadd.1000g.amr': 'AMR_MAF',
                            'cadd.1000g.asn': 'ASN_MAF',
                            'cadd.1000g.eur': 'EUR_MAF',
                            'dbsnp.rsid': 'dbSNP_RS'
                            # 'aachange_knowngene': 'Protein_Change'
                            })

    df['Variant_Classification'] = df['Variant_Classification'].replace({
        'intronic': 'Intron',
        'intergenic': 'IGR',
        'UTR3': "3'UTR",
        "UTR5": "5'UTR",
        'downstream': "3'Flank",
        'upstream': "5'Flank",
        'splicing': 'Splice_Site', 'ncRNA_exonic': 'RNA',
        'ncRNA_intronic': 'RNA', 'ncRNA_UTR3': 'RNA',
        'ncRNA_UTR5': 'RNA', 'ncRNA': 'RNA', })

    mask = df.exonicfunc_knowngene == 'nonsynonymous SNV'
    df.loc[mask, 'Variant_Classification'] = "Missense_Mutation"
    mask = df.exonicfunc_knowngene == 'synonymous SNV'
    df.loc[mask, 'Variant_Classification'] = "Silent"
    mask = df.exonicfunc_knowngene == 'stopgain'
    df.loc[mask, 'Variant_Classification'] = "Nonsense_Mutation"
    mask = df.exonicfunc_knowngene == 'stoploss'
    df.loc[mask, 'Variant_Classification'] = "Nonstop_Mutation"
    mask = df.exonicfunc_knowngene == 'frameshift insertion'
    df.loc[mask, 'Variant_Classification'] = "Frame_Shift_Ins"
    mask = df.exonicfunc_knowngene == 'frameshift deletion'
    df.loc[mask, 'Variant_Classification'] = "Frame_Shift_Del"
    mask = df.exonicfunc_knowngene == 'nonframeshift insertion'
    df.loc[mask, 'Variant_Classification'] = "In_Frame_Ins"
    mask = df.exonicfunc_knowngene == 'nonframeshift deletion'
    df.loc[mask, 'Variant_Classification'] = "In_Frame_Del"

    return df


### Maf formatter wrapper function

In [144]:
# Formats the output from VAPr such that the output matches MAF format, allowing for downstream processing and 
# analysis in Maftools A wrapper for the functions above
# Input: a VAPr output list
# Output: a formatted data frame ready to be saved as a MAF file
def maf_formatter(dataset_list_in):
    import pandas as pd
    import copy

    debug = False
    dataset_list_out = []  # the formatted list
    if debug == True:
        print(len(dataset_list_in))
    
    interval = 5   # the size of each subset of the input
    left_ptr = 0   # the left pointer of the subset
    right_ptr = left_ptr + interval  # the right pointer of the subset
    
    # Subsets the input iteratively, calls maf_formatter_core to format each subset, and then appends each 
    # subset to the formatted list for output
    while left_ptr < len(dataset_list_in):
        unformatted_subset_list = dataset_list_in[left_ptr:right_ptr]  # subsets the list
        
        if debug == True:
            print("Length of fragment: " + str(len(unformatted_subset_list)))
        
        formatted_subset_list = maf_formatter_core(unformatted_subset_list) # formats the subset
        
        # appends the items in the subset to the formatted list for output
        for item in formatted_subset_list:
            dataset_list_out.append(item)
            
        if debug == True:
            print("Output list current length: " + str(len(output_list)))
        
        # increment the pointers for the next subset
        left_ptr += interval
        right_ptr += interval
        
    if debug == True:
        print(len(dataset_list_out))

    df = pd.DataFrame(data=dataset_list_out)
    df2 = change_cols(df)

    return df2

def maf_formatter_core(dataset_list_in):
    k0 = copy.deepcopy(dataset_list_in)
    k1 = extract_samples(k0)
    k2 = unnest_dict(k1)
    k3 = unnest_list(k2)
    k4 = unnest_dict(k3)
    k5 = samples_AD(k4)
    k6 = varType(k5)
    k7 = unnest_semicolon_values(k6)
    
    return k7    

### Main

In [148]:
myDf = maf_formatter(known_disease_variants)
myDf

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,Variant_Type,Variant_Classification,Tumor_Sample_Barcode,dbSNP_RS,...,exonicfunc_knowngene,genedetail_knowngene,hgvs_id,samples.filter_passing_reads_count,samples.genotype,wellderly._license,wellderly.alleles.allele1,wellderly.alleles.allele2,wellderly.alleles.freq1,wellderly.alleles.freq2
0,PTCH1,9,98231215,98231215,C,T,SNP,Missense_Mutation,N40_vs_T40,rs750970743,...,nonsynonymous SNV,,chr9:g.98231215C>T,263,,,,,,
1,PPP1R26,9,138376649,138376649,T,C,SNP,Missense_Mutation,N39_vs_T39,rs3748192,...,nonsynonymous SNV,,chr9:g.138376649T>C,88,,https://goo.gl/e8OO17,C,T,0.2075,0.7925
2,NOTCH1,9,139396889,139396889,G,A,SNP,Missense_Mutation,N44_vs_T44,rs777962754,...,nonsynonymous SNV,,chr9:g.139396889G>A,102,,,,,,
3,NOTCH1,9,139412607,139412607,C,T,SNP,Missense_Mutation,N39_vs_T39,rs373770404,...,nonsynonymous SNV,,chr9:g.139412607C>T,71,,,,,,
4,NOTCH1,9,139413084,139413084,C,T,SNP,Missense_Mutation,N44_vs_T44,rs750215904,...,nonsynonymous SNV,,chr9:g.139413084C>T,106,,,,,,
5,AKAP17A,X,1719897,1719897,C,G,SNP,Missense_Mutation,N39_vs_T39,rs28661622,...,nonsynonymous SNV,,chrX:g.1719897C>G,118,,https://goo.gl/e8OO17,C,G,0.4025,0.5975
6,ASMT,X,1755404,1755404,C,T,SNP,Silent,N7_vs_T7,rs4933063,...,synonymous SNV,,chrX:g.1755404C>T,542,,https://goo.gl/e8OO17,C,T,0.9225,0.0775
7,HRAS,11,534285,534285,C,A,SNP,Missense_Mutation,N1_vs_T1,rs104894226,...,nonsynonymous SNV,,chr11:g.534285C>A,136,,,,,,
8,HRAS,11,534285,534285,C,A,SNP,Missense_Mutation,N17_vs_T17,rs104894226,...,nonsynonymous SNV,,chr11:g.534285C>A,146,,,,,,
9,HRAS,11,534285,534285,C,T,SNP,Missense_Mutation,N1_vs_T1,rs104894226,...,nonsynonymous SNV,,chr11:g.534285C>T,136,,,,,,


In [None]:
myDf.to_csv('rare_deleterious_variants.maf',  sep= "\t", index=False)

## Whole data collection

In [146]:
col = db[COLLECTION]
col_lect = col.find({})

wholeDataSet = []
for document in col_lect:
    wholeDataSet.append(document)
print(len(wholeDataSet))
wholeDataSet[823]

15630


{u'_id': ObjectId('5c06f3542c788e6b1d68bec2'),
 u'alt': u'G',
 u'chr': u'Y',
 u'end': 58983134,
 u'func_knowngene': u'intergenic',
 u'gene_knowngene': u'NONE;SPRY3',
 u'genedetail_knowngene': u'dist=NONE;dist=117323',
 u'hgvs_id': u'chrY:g.58983134A>G',
 u'notfound': True,
 u'ref': u'A',
 u'samples': [{u'AD': [38, 0],
   u'filter_passing_reads_count': 29,
   u'genotype': None,
   u'sample_id': u'N10_vs_T10'}],
 u'start': 58983134}

In [147]:
myDf = maf_formatter(wholeDataSet)
myDf

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,Variant_Type,Variant_Classification,Tumor_Sample_Barcode,dbSNP_RS,...,notfound,samples.filter_passing_reads_count,samples.genotype,wellderly._license,wellderly.alleles.allele1,wellderly.alleles.allele2,wellderly.alleles.allele3,wellderly.alleles.freq1,wellderly.alleles.freq2,wellderly.alleles.freq3
0,BC064148,9,69512439,69512439,A,C,SNP,IGR,N43_vs_T43,rs370901073,...,,3,,https://goo.gl/e8OO17,A,C,,0.8675,0.1325,
1,BC070322,9,69512439,69512439,A,C,SNP,IGR,N43_vs_T43,rs370901073,...,,3,,https://goo.gl/e8OO17,A,C,,0.8675,0.1325,
2,BC064148,9,69519038,69519038,G,T,SNP,IGR,N50_vs_T50,rs866556502,...,,10,,,,,,,,
3,BC070322,9,69519038,69519038,G,T,SNP,IGR,N50_vs_T50,rs866556502,...,,10,,,,,,,,
4,BC070322,9,69649186,69649186,A,G,SNP,RNA,N6_vs_T6,rs377224548,...,,4,,https://goo.gl/e8OO17,A,G,,0.8850,0.1150,
5,AK310876,9,69772113,69772113,C,A,SNP,IGR,N33_vs_T33,rs4269641,...,,6,,,,,,,,
6,FOXD4L5,9,69772113,69772113,C,A,SNP,IGR,N33_vs_T33,rs4269641,...,,6,,,,,,,,
7,AK310876,9,69798394,69798394,C,T,SNP,IGR,N47_vs_T47,rs62563631,...,,5,,,,,,,,
8,FOXD4L5,9,69798394,69798394,C,T,SNP,IGR,N47_vs_T47,rs62563631,...,,5,,,,,,,,
9,AK310876,9,69798414,69798414,G,A,SNP,IGR,N8_vs_T8,rs199650048,...,,5,,,,,,,,


In [92]:
cols = list(df2)
for x in cols:
    print(x)

Hugo_Symbol
Chromosome
Start_Position
End_Position
Reference_Allele
Tumor_Seq_Allele2
Variant_Type
Variant_Classification
Tumor_Sample_Barcode
dbSNP_RS
t_ref_count
t_alt_count
1000g2015aug_all
_id
cadd.1000g.af
AFR_MAF
AMR_MAF
ASN_MAF
EUR_MAF
cadd._license
cadd.esp.af
cadd.esp.afr
cadd.esp.eur
cadd.gerp.n
cadd.gerp.rs
cadd.gerp.rs_pval
cadd.gerp.s
cadd.phred
cadd.polyphen.cat
cadd.polyphen.cat1
cadd.polyphen.cat2
cadd.polyphen.val
cadd.polyphen.val1
cadd.polyphen.val2
cadd.sift.cat
cadd.sift.val
cgi._license
cgi.association
cgi.cdna
cgi.drug
cgi.evidence_level
cgi.gene
cgi.primary_tumor_type
cgi.protein_change
cgi.region
cgi.source
cgi.transcript
civic._license
civic.description
civic.evidence_items.clinical_significance
civic.evidence_items.clinical_significance1
civic.evidence_items.clinical_significance10
civic.evidence_items.clinical_significance11
civic.evidence_items.clinical_significance12
civic.evidence_items.clinical_significance13
civic.evidence_items.clinical_significance14


In [217]:
myDf.to_csv('all_variants.maf',  sep= "\t", index=False)

# Debugging/Testing

### Original format

In [239]:
df = pd.DataFrame(data=rare_deleterious_variants)
df

Unnamed: 0,1000g2015aug_all,_id,alt,cadd,chr,clinvar,cosmic,dbsnp,end,exonicfunc_knowngene,func_knowngene,gene_knowngene,hgvs_id,ref,samples,start,wellderly
0,0.022764,5b7222622c788e54da746c5e,T,"{'1000g': {'af': 0.04, 'afr': 0.002, 'amr': 0....",9,,,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",138377498,nonsynonymous SNV,exonic,PPP1R26,chr9:g.138377498G>T,G,"[{'sample_id': 'N7_vs_T7', 'genotype': None, '...",138377498,"{'_license': 'https://goo.gl/e8OO17', 'alleles..."
1,0.001398,5b7222622c788e54da746c79,T,"{'1000g': {'af': 0.001, 'amr': 0.01}, '_licens...",9,"{'_license': 'https://goo.gl/OaHML9', 'rcv': [...",,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",139405649,nonsynonymous SNV,exonic,NOTCH1,chr9:g.139405649C>T,C,"[{'sample_id': 'N7_vs_T7', 'genotype': None, '...",139405649,
2,0.005391,5b7222632c788e54d4747089,T,"{'1000g': {'af': 0.01, 'afr': 0.004, 'amr': 0....",1,,,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",152280110,nonsynonymous SNV,exonic,FLG,chr1:g.152280110C>T,C,"[{'sample_id': 'N39_vs_T39', 'genotype': None,...",152280110,"{'_license': 'https://goo.gl/e8OO17', 'alleles..."
3,0.002396,5b7222632c788e54d47470a0,T,"{'1000g': {'af': 0.003, 'afr': 0.01}, '_licens...",1,,"{'_license': 'https://goo.gl/2tibWa', 'cosmic_...","{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",152281795,nonsynonymous SNV,exonic,FLG,chr1:g.152281795C>T,C,"[{'sample_id': 'N9_vs_T9', 'genotype': None, '...",152281795,"{'_license': 'https://goo.gl/e8OO17', 'alleles..."
4,0.015974,5b7222632c788e54d5746cd0,A,"{'1000g': {'af': 0.03, 'afr': 0.004, 'amr': 0....",16,"{'_license': 'https://goo.gl/OaHML9', 'rcv': [...",,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",89838086,nonsynonymous SNV,exonic,FANCA,chr16:g.89838086C>A,C,"[{'sample_id': 'N39_vs_T39', 'genotype': None,...",89838086,"{'_license': 'https://goo.gl/e8OO17', 'alleles..."
5,0.041933,5b7222632c788e54d5746d03,T,"{'1000g': {'af': 0.05, 'afr': 0.02, 'amr': 0.0...",16,"{'_license': 'https://goo.gl/OaHML9', 'rcv': [...",,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",89883007,nonsynonymous SNV,exonic,FANCA,chr16:g.89883007A>T,A,"[{'sample_id': 'N39_vs_T39', 'genotype': None,...",89883007,"{'_license': 'https://goo.gl/e8OO17', 'alleles..."
6,0.0002,5b7222632c788e54d5747094,A,"{'1000g': {'af': 0.001, 'eur': 0.001}, '_licen...",19,,,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",1036460,nonsynonymous SNV,exonic,CNN2,chr19:g.1036460G>A,G,"[{'sample_id': 'N44_vs_T44', 'genotype': None,...",1036460,
7,0.008387,5b7222632c788e54d3746eda,A,"{'1000g': {'af': 0.01, 'afr': 0.01, 'amr': 0.0...",14,,,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",24803710,nonsynonymous SNV,exonic,ADCY4,chr14:g.24803710G>A,G,"[{'sample_id': 'N7_vs_T7', 'genotype': None, '...",24803710,"{'_license': 'https://goo.gl/e8OO17', 'alleles..."
8,0.0002,5b7222642c788e54d8746d0e,A,"{'_license': 'http://goo.gl/bkpNhq', 'gerp': {...",4,,,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",187627812,nonsynonymous SNV,exonic,FAT1,chr4:g.187627812G>A,G,"[{'sample_id': 'N44_vs_T44', 'genotype': None,...",187627812,
9,0.021566,5b7222642c788e54d8746fdf,T,"{'1000g': {'af': 0.03, 'afr': 0.01, 'amr': 0.0...",6,,,"{'_license': 'https://goo.gl/Ztr5rl', 'rsid': ...",27834677,nonsynonymous SNV,exonic,HIST1H1B,chr6:g.27834677C>T,C,"[{'sample_id': 'N39_vs_T39', 'genotype': None,...",27834677,"{'_license': 'https://goo.gl/e8OO17', 'alleles..."


In [12]:
df_k = pd.DataFrame(data=known_disease_variants)
for x in range(len(df_k.samples)):
    print(df_k.samples[x])

[{'sample_id': 'N40_vs_T40', 'genotype': None, 'filter_passing_reads_count': 263, 'AD': [263, 0]}]
[{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 88, 'AD': [88, 0]}]
[{'sample_id': 'N44_vs_T44', 'genotype': None, 'filter_passing_reads_count': 102, 'AD': [102, 0]}]
[{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 71, 'AD': [71, 0]}]
[{'sample_id': 'N44_vs_T44', 'genotype': None, 'filter_passing_reads_count': 106, 'AD': [106, 0]}]
[{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 118, 'AD': [118, 0]}]
[{'sample_id': 'N7_vs_T7', 'genotype': None, 'filter_passing_reads_count': 542, 'AD': [542, 0]}]
[{'sample_id': 'N17_vs_T17', 'genotype': None, 'filter_passing_reads_count': 146, 'AD': [146, 0]}, {'sample_id': 'N1_vs_T1', 'genotype': None, 'filter_passing_reads_count': 136, 'AD': [136, 0]}]
[{'sample_id': 'N17_vs_T17', 'genotype': None, 'filter_passing_reads_count': 146, 'AD': [146, 0]}, {'sample_id': 'N1_v

### Extract samples

In [14]:
myList = extract_samples(rare_deleterious_variants)

In [15]:
for x in range(len(rare_deleterious_variants)):
    print(rare_deleterious_variants[x]['samples'])

[{'sample_id': 'N7_vs_T7', 'genotype': None, 'filter_passing_reads_count': 75, 'AD': [75, 0]}]
[{'sample_id': 'N7_vs_T7', 'genotype': None, 'filter_passing_reads_count': 276, 'AD': [276, 0]}]
[{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 322, 'AD': [398, 0]}]
[{'sample_id': 'N9_vs_T9', 'genotype': None, 'filter_passing_reads_count': 410, 'AD': [409, 1]}]
[{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 337, 'AD': [337, 0]}]
[{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 25, 'AD': [25, 0]}]
[{'sample_id': 'N44_vs_T44', 'genotype': None, 'filter_passing_reads_count': 36, 'AD': [36, 0]}]
[{'sample_id': 'N7_vs_T7', 'genotype': None, 'filter_passing_reads_count': 292, 'AD': [292, 0]}]
[{'sample_id': 'N44_vs_T44', 'genotype': None, 'filter_passing_reads_count': 242, 'AD': [242, 0]}]
[{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 138, 'AD': [138, 0]}]
[{'sample_id': 'N7_vs_T7

In [16]:
for x in range(len(myList)):
    print(myList[x]['samples'])

{'sample_id': 'N7_vs_T7', 'genotype': None, 'filter_passing_reads_count': 75, 'AD': [75, 0]}
{'sample_id': 'N7_vs_T7', 'genotype': None, 'filter_passing_reads_count': 276, 'AD': [276, 0]}
{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 322, 'AD': [398, 0]}
{'sample_id': 'N9_vs_T9', 'genotype': None, 'filter_passing_reads_count': 410, 'AD': [409, 1]}
{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 337, 'AD': [337, 0]}
{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 25, 'AD': [25, 0]}
{'sample_id': 'N44_vs_T44', 'genotype': None, 'filter_passing_reads_count': 36, 'AD': [36, 0]}
{'sample_id': 'N7_vs_T7', 'genotype': None, 'filter_passing_reads_count': 292, 'AD': [292, 0]}
{'sample_id': 'N44_vs_T44', 'genotype': None, 'filter_passing_reads_count': 242, 'AD': [242, 0]}
{'sample_id': 'N39_vs_T39', 'genotype': None, 'filter_passing_reads_count': 138, 'AD': [138, 0]}
{'sample_id': 'N7_vs_T7', 'genotype': None, 

### Annotation keys

In [17]:
a = list(rare_deleterious_variants[0].keys())
for key in a:
    print(key)

_id
chr
start
end
ref
alt
func_knowngene
gene_knowngene
exonicfunc_knowngene
1000g2015aug_all
hgvs_id
samples
cadd
dbsnp
wellderly


In [18]:
print(type(rare_deleterious_variants[14]['wellderly']['alleles']))
print(rare_deleterious_variants[14]['wellderly']['alleles'])

<class 'list'>
[{'allele': 'C', 'freq': 0.0325}, {'allele': 'T', 'freq': 0.9675}]


### Unnest dictionaries

In [20]:
myList2 = unnest_dict(myList)
df2 = pd.DataFrame(data=myList2)
df2

Unnamed: 0,1000g2015aug_all,_id,alt,cadd.1000g.af,cadd.1000g.afr,cadd.1000g.amr,cadd.1000g.asn,cadd.1000g.eur,cadd._license,cadd.esp.af,...,gene_knowngene,hgvs_id,ref,samples.AD,samples.filter_passing_reads_count,samples.genotype,samples.sample_id,start,wellderly._license,wellderly.alleles
0,0.022764,5b7222622c788e54da746c5e,T,0.04,0.002,0.05,,0.08,http://goo.gl/bkpNhq,0.048,...,PPP1R26,chr9:g.138377498G>T,G,"[75, 0]",75,,N7_vs_T7,138377498,https://goo.gl/e8OO17,"[{'allele': 'T', 'freq': 0.0575}, {'allele': '..."
1,0.001398,5b7222622c788e54da746c79,T,0.001,,0.01,,,http://goo.gl/bkpNhq,0.003,...,NOTCH1,chr9:g.139405649C>T,C,"[276, 0]",276,,N7_vs_T7,139405649,,
2,0.005391,5b7222632c788e54d4747089,T,0.01,0.004,0.01,,0.02,http://goo.gl/bkpNhq,0.014,...,FLG,chr1:g.152280110C>T,C,"[398, 0]",322,,N39_vs_T39,152280110,https://goo.gl/e8OO17,"[{'allele': 'C', 'freq': 0.98}, {'allele': 'T'..."
3,0.002396,5b7222632c788e54d47470a0,T,0.003,0.01,,,,http://goo.gl/bkpNhq,0.002,...,FLG,chr1:g.152281795C>T,C,"[409, 1]",410,,N9_vs_T9,152281795,https://goo.gl/e8OO17,"[{'allele': 'C', 'freq': 0.9975}, {'allele': '..."
4,0.015974,5b7222632c788e54d5746cd0,A,0.03,0.004,0.07,,0.04,http://goo.gl/bkpNhq,0.022,...,FANCA,chr16:g.89838086C>A,C,"[337, 0]",337,,N39_vs_T39,89838086,https://goo.gl/e8OO17,"[{'allele': 'A', 'freq': 0.03}, {'allele': 'C'..."
5,0.041933,5b7222632c788e54d5746d03,T,0.05,0.02,0.06,0.004,0.1,http://goo.gl/bkpNhq,0.05,...,FANCA,chr16:g.89883007A>T,A,"[25, 0]",25,,N39_vs_T39,89883007,https://goo.gl/e8OO17,"[{'allele': 'A', 'freq': 0.9175}, {'allele': '..."
6,0.0002,5b7222632c788e54d5747094,A,0.001,,,,0.001,http://goo.gl/bkpNhq,,...,CNN2,chr19:g.1036460G>A,G,"[36, 0]",36,,N44_vs_T44,1036460,,
7,0.008387,5b7222632c788e54d3746eda,A,0.01,0.01,0.01,,0.03,http://goo.gl/bkpNhq,0.015,...,ADCY4,chr14:g.24803710G>A,G,"[292, 0]",292,,N7_vs_T7,24803710,https://goo.gl/e8OO17,"[{'allele': 'A', 'freq': 0.025}, {'allele': 'G..."
8,0.0002,5b7222642c788e54d8746d0e,A,,,,,,http://goo.gl/bkpNhq,,...,FAT1,chr4:g.187627812G>A,G,"[242, 0]",242,,N44_vs_T44,187627812,,
9,0.021566,5b7222642c788e54d8746fdf,T,0.03,0.01,0.02,0.002,0.06,http://goo.gl/bkpNhq,0.039,...,HIST1H1B,chr6:g.27834677C>T,C,"[138, 0]",138,,N39_vs_T39,27834677,https://goo.gl/e8OO17,"[{'allele': 'C', 'freq': 0.94}, {'allele': 'T'..."


In [21]:
myList2[5]

{'_id': ObjectId('5b7222632c788e54d5746d03'),
 'chr': '16',
 'start': 89883007,
 'end': 89883007,
 'ref': 'A',
 'alt': 'T',
 'func_knowngene': 'exonic',
 'gene_knowngene': 'FANCA',
 'exonicfunc_knowngene': 'nonsynonymous SNV',
 '1000g2015aug_all': 0.0419329,
 'hgvs_id': 'chr16:g.89883007A>T',
 'samples.sample_id': 'N39_vs_T39',
 'samples.genotype': None,
 'samples.filter_passing_reads_count': 25,
 'samples.AD': [25, 0],
 'cadd.1000g.af': 0.05,
 'cadd.1000g.afr': 0.02,
 'cadd.1000g.amr': 0.06,
 'cadd.1000g.asn': 0.004,
 'cadd.1000g.eur': 0.1,
 'cadd._license': 'http://goo.gl/bkpNhq',
 'cadd.esp.af': 0.05,
 'cadd.esp.afr': 0.014,
 'cadd.esp.eur': 0.066,
 'cadd.gerp.n': 4.49,
 'cadd.gerp.s': -2.07,
 'cadd.phred': 17.1,
 'cadd.polyphen.cat': 'possibly_damaging',
 'cadd.polyphen.val': 0.485,
 'cadd.sift.cat': 'tolerated',
 'cadd.sift.val': 0.07,
 'clinvar._license': 'https://goo.gl/OaHML9',
 'clinvar.rcv': [{'accession': 'RCV000312371',
   'clinical_significance': 'Likely benign',
   'condi

### Unnest list

In [23]:
myList3 = unnest_list(myList2)

In [24]:
myList3[4]

{'_id': ObjectId('5b7222632c788e54d5746cd0'),
 'chr': '16',
 'start': 89838086,
 'end': 89838086,
 'ref': 'C',
 'alt': 'A',
 'func_knowngene': 'exonic',
 'gene_knowngene': 'FANCA',
 'exonicfunc_knowngene': 'nonsynonymous SNV',
 '1000g2015aug_all': 0.0159744,
 'hgvs_id': 'chr16:g.89838086C>A',
 'samples.sample_id': 'N39_vs_T39',
 'samples.genotype': None,
 'samples.filter_passing_reads_count': 337,
 'samples.AD': [337, 0],
 'cadd.1000g.af': 0.03,
 'cadd.1000g.afr': 0.004,
 'cadd.1000g.amr': 0.07,
 'cadd.1000g.eur': 0.04,
 'cadd._license': 'http://goo.gl/bkpNhq',
 'cadd.esp.af': 0.022,
 'cadd.esp.afr': 0.005,
 'cadd.esp.eur': 0.031,
 'cadd.gerp.n': 4.45,
 'cadd.gerp.s': 3.48,
 'cadd.phred': 15.03,
 'cadd.polyphen.cat': 'benign',
 'cadd.polyphen.val': 0.019,
 'cadd.sift.cat': 'tolerated',
 'cadd.sift.val': 0.13,
 'clinvar._license': 'https://goo.gl/OaHML9',
 'clinvar.rcv': {'accession1': 'RCV000312324',
  'clinical_significance1': 'Benign/Likely benign',
  'conditions1': {'identifiers': {

In [25]:
myList4 = unnest_dict(myList3)

### Chagne 'samples.AD' to 't_ref_count' and 't_alt_count'

In [27]:
myList5 = samples_AD(myList4)
myList5[6]

{'_id': ObjectId('5b7222632c788e54d5747094'),
 'chr': '19',
 'start': 1036460,
 'end': 1036460,
 'ref': 'G',
 'alt': 'A',
 'func_knowngene': 'exonic',
 'gene_knowngene': 'CNN2',
 'exonicfunc_knowngene': 'nonsynonymous SNV',
 '1000g2015aug_all': 0.000199681,
 'hgvs_id': 'chr19:g.1036460G>A',
 'samples.sample_id': 'N44_vs_T44',
 'samples.genotype': None,
 'samples.filter_passing_reads_count': 36,
 't_ref_count': 36,
 't_alt_count': 0,
 'cadd.1000g.af': 0.001,
 'cadd.1000g.eur': 0.001,
 'cadd._license': 'http://goo.gl/bkpNhq',
 'cadd.gerp.n': 4.18,
 'cadd.gerp.rs': 371.9,
 'cadd.gerp.rs_pval': 8.69202e-66,
 'cadd.gerp.s': 4.18,
 'cadd.phred': 24.0,
 'cadd.polyphen.cat': 'possibly_damaging',
 'cadd.polyphen.val': 0.723,
 'cadd.sift.cat': 'tolerated',
 'cadd.sift.val': 0.06,
 'dbsnp._license': 'https://goo.gl/Ztr5rl',
 'dbsnp.rsid': 'rs148071782'}

### Record Variant_Type

In [179]:
myList6 = varType(myList5)

### Unnest multiple values separated by ";" in a single field

In [221]:
myList7 = unnest_semicolon_values(myList6)
df7 = pd.DataFrame(myList7)
df7

Unnamed: 0,1000g2015aug_all,Variant_Type,_id,alt,cadd.1000g.af,cadd.1000g.afr,cadd.1000g.amr,cadd.1000g.asn,cadd.1000g.eur,cadd._license,...,samples.genotype,samples.sample_id,start,t_alt_count,t_ref_count,wellderly._license,wellderly.alleles.allele1,wellderly.alleles.allele2,wellderly.alleles.freq1,wellderly.alleles.freq2
0,0.022764,SNP,5b7222622c788e54da746c5e,T,0.04,0.002,0.05,,0.08,http://goo.gl/bkpNhq,...,,N7_vs_T7,138377498,0,75,https://goo.gl/e8OO17,T,G,0.0575,0.9425
1,0.001398,SNP,5b7222622c788e54da746c79,T,0.001,,0.01,,,http://goo.gl/bkpNhq,...,,N7_vs_T7,139405649,0,276,,,,,
2,0.005391,SNP,5b7222632c788e54d4747089,T,0.01,0.004,0.01,,0.02,http://goo.gl/bkpNhq,...,,N39_vs_T39,152280110,0,398,https://goo.gl/e8OO17,C,T,0.98,0.02
3,0.002396,SNP,5b7222632c788e54d47470a0,T,0.003,0.01,,,,http://goo.gl/bkpNhq,...,,N9_vs_T9,152281795,1,409,https://goo.gl/e8OO17,C,T,0.9975,0.0025
4,0.015974,SNP,5b7222632c788e54d5746cd0,A,0.03,0.004,0.07,,0.04,http://goo.gl/bkpNhq,...,,N39_vs_T39,89838086,0,337,https://goo.gl/e8OO17,A,C,0.03,0.97
5,0.041933,SNP,5b7222632c788e54d5746d03,T,0.05,0.02,0.06,0.004,0.1,http://goo.gl/bkpNhq,...,,N39_vs_T39,89883007,0,25,https://goo.gl/e8OO17,A,T,0.9175,0.0825
6,0.0002,SNP,5b7222632c788e54d5747094,A,0.001,,,,0.001,http://goo.gl/bkpNhq,...,,N44_vs_T44,1036460,0,36,,,,,
7,0.008387,SNP,5b7222632c788e54d3746eda,A,0.01,0.01,0.01,,0.03,http://goo.gl/bkpNhq,...,,N7_vs_T7,24803710,0,292,https://goo.gl/e8OO17,A,G,0.025,0.975
8,0.0002,SNP,5b7222642c788e54d8746d0e,A,,,,,,http://goo.gl/bkpNhq,...,,N44_vs_T44,187627812,0,242,,,,,
9,0.021566,SNP,5b7222642c788e54d8746fdf,T,0.03,0.01,0.02,0.002,0.06,http://goo.gl/bkpNhq,...,,N39_vs_T39,27834677,0,138,https://goo.gl/e8OO17,C,T,0.94,0.06


In [None]:
for index, row in df7.iterrows():
    if row['Variant_Classification'] == 'exonic':
        print(row['exonicfunc_knowngene'])

In [161]:
for key in k6[1].keys():
    print(key + ": ")
    print(k6[1][key])

_id: 
5b7222622c788e54da746b54
chr: 
9
start: 
69519038
end: 
69519038
ref: 
G
alt: 
T
func_knowngene: 
intergenic
gene_knowngene: 
BC064148;BC070322
genedetail_knowngene: 
dist=37363;dist=114311
hgvs_id: 
chr9:g.69519038G>T
samples.sample_id: 
N50_vs_T50
samples.genotype: 
None
samples.filter_passing_reads_count: 
10
t_ref_count: 
14
t_alt_count: 
0
dbsnp._license: 
https://goo.gl/Ztr5rl
dbsnp.rsid: 
rs866556502
Variant_Type: 
SNP


### Re-arrange and rename columns

In [82]:
df4 = pd.DataFrame(data=myList6)
df4

Unnamed: 0,1000g2015aug_all,Variant_Type,_id,alt,cadd.1000g.af,cadd.1000g.afr,cadd.1000g.amr,cadd.1000g.asn,cadd.1000g.eur,cadd._license,...,samples.genotype,samples.sample_id,start,t_alt_count,t_ref_count,wellderly._license,wellderly.alleles.allele1,wellderly.alleles.allele2,wellderly.alleles.freq1,wellderly.alleles.freq2
0,0.022764,SNP,5b7222622c788e54da746c5e,T,0.04,0.002,0.05,,0.08,http://goo.gl/bkpNhq,...,,N7_vs_T7,138377498,0,75,https://goo.gl/e8OO17,T,G,0.0575,0.9425
1,0.001398,SNP,5b7222622c788e54da746c79,T,0.001,,0.01,,,http://goo.gl/bkpNhq,...,,N7_vs_T7,139405649,0,276,,,,,
2,0.005391,SNP,5b7222632c788e54d4747089,T,0.01,0.004,0.01,,0.02,http://goo.gl/bkpNhq,...,,N39_vs_T39,152280110,0,398,https://goo.gl/e8OO17,C,T,0.98,0.02
3,0.002396,SNP,5b7222632c788e54d47470a0,T,0.003,0.01,,,,http://goo.gl/bkpNhq,...,,N9_vs_T9,152281795,1,409,https://goo.gl/e8OO17,C,T,0.9975,0.0025
4,0.015974,SNP,5b7222632c788e54d5746cd0,A,0.03,0.004,0.07,,0.04,http://goo.gl/bkpNhq,...,,N39_vs_T39,89838086,0,337,https://goo.gl/e8OO17,A,C,0.03,0.97
5,0.041933,SNP,5b7222632c788e54d5746d03,T,0.05,0.02,0.06,0.004,0.1,http://goo.gl/bkpNhq,...,,N39_vs_T39,89883007,0,25,https://goo.gl/e8OO17,A,T,0.9175,0.0825
6,0.0002,SNP,5b7222632c788e54d5747094,A,0.001,,,,0.001,http://goo.gl/bkpNhq,...,,N44_vs_T44,1036460,0,36,,,,,
7,0.008387,SNP,5b7222632c788e54d3746eda,A,0.01,0.01,0.01,,0.03,http://goo.gl/bkpNhq,...,,N7_vs_T7,24803710,0,292,https://goo.gl/e8OO17,A,G,0.025,0.975
8,0.0002,SNP,5b7222642c788e54d8746d0e,A,,,,,,http://goo.gl/bkpNhq,...,,N44_vs_T44,187627812,0,242,,,,,
9,0.021566,SNP,5b7222642c788e54d8746fdf,T,0.03,0.01,0.02,0.002,0.06,http://goo.gl/bkpNhq,...,,N39_vs_T39,27834677,0,138,https://goo.gl/e8OO17,C,T,0.94,0.06


In [84]:
df4 = change_cols(df4)
df4

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,Variant_Type,Variant_Classification,Tumor_Sample_Barcode,dbSNP_RS,...,dbsnp._license,exonicfunc_knowngene,hgvs_id,samples.filter_passing_reads_count,samples.genotype,wellderly._license,wellderly.alleles.allele1,wellderly.alleles.allele2,wellderly.alleles.freq1,wellderly.alleles.freq2
0,PPP1R26,9,138377498,138377498,G,T,SNP,Missense_Mutation,N7_vs_T7,rs34151777,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr9:g.138377498G>T,75,,https://goo.gl/e8OO17,T,G,0.0575,0.9425
1,NOTCH1,9,139405649,139405649,C,T,SNP,Missense_Mutation,N7_vs_T7,rs35136134,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr9:g.139405649C>T,276,,,,,,
2,FLG,1,152280110,152280110,C,T,SNP,Missense_Mutation,N39_vs_T39,rs138488969,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr1:g.152280110C>T,322,,https://goo.gl/e8OO17,C,T,0.98,0.02
3,FLG,1,152281795,152281795,C,T,SNP,Missense_Mutation,N9_vs_T9,rs111360507,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr1:g.152281795C>T,410,,https://goo.gl/e8OO17,C,T,0.9975,0.0025
4,FANCA,16,89838086,89838086,C,A,SNP,Missense_Mutation,N39_vs_T39,rs1131660,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr16:g.89838086C>A,337,,https://goo.gl/e8OO17,A,C,0.03,0.97
5,FANCA,16,89883007,89883007,A,T,SNP,Missense_Mutation,N39_vs_T39,rs1800282,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr16:g.89883007A>T,25,,https://goo.gl/e8OO17,A,T,0.9175,0.0825
6,CNN2,19,1036460,1036460,G,A,SNP,Missense_Mutation,N44_vs_T44,rs148071782,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr19:g.1036460G>A,36,,,,,,
7,ADCY4,14,24803710,24803710,G,A,SNP,Missense_Mutation,N7_vs_T7,rs77202343,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr14:g.24803710G>A,292,,https://goo.gl/e8OO17,A,G,0.025,0.975
8,FAT1,4,187627812,187627812,G,A,SNP,Missense_Mutation,N44_vs_T44,rs543528838,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr4:g.187627812G>A,242,,,,,,
9,HIST1H1B,6,27834677,27834677,C,T,SNP,Missense_Mutation,N39_vs_T39,rs34144478,...,https://goo.gl/Ztr5rl,nonsynonymous SNV,chr6:g.27834677C>T,138,,https://goo.gl/e8OO17,C,T,0.94,0.06


In [85]:
df4.to_csv('rare_deleterious_variants.maf',  sep= "\t", index=False)

### Format the input using subsets iteratively

In [None]:
txt1 = str(maf_formatter(wholeDataSet)) # using list as output instead of data frame
txt2 = str(maf_formatter_core(wholeDataSet))
txt1 == txt2

# Citation

Birmingham, A., Mark, A. M., Mazzaferro, C., Xu, G., & Fisch, K. M. (2018). Efficient population-scale variant analysis and prioritization with VAPr. Bioinformatics (Oxford, England), 34(16), 2843–2845. doi:10.1093/bioinformatics/bty192