In [1]:
import numpy as np
import pandas as pd

In [2]:
def gene_annotate(filename, ref_file):
    '''
    This function will create a dataframe from a vcf file and add a column with the genes associated with each variant.


    The variant file is read as a tab-separated table, skipping everything after a '#' (the vcf header lines), and
    only the 'CHROM', 'POS', 'REF' and 'ALT' columns are kept. The reference file is read as a tab-separated table
    with a header row and must provide 'START', 'STOP' and 'GENESYMBOL' columns. A gene is associated with a variant
    when START <= POS <= STOP (both ends inclusive).

    NOTE(review): only the position is compared; the chromosome of the variant is never checked against the
    reference. This presumably relies on the reference file covering a single chromosome region - confirm before
    using a genome-wide reference.

    Parameters:

    filename (string): This argument is a (.vcf) file location as a string corresponding to gene variations present
                       in a cancer subset.


    ref_file (string): This argument is a tab-separated (.txt) file location as a string corresponding to genes and
                       their start/stop positions in the human genome.


    Returns:
    data_frame (DataFrame): The output is a DataFrame consisting of each variant from the original (.vcf) file with the
                            'CHROM', 'POS', 'REF', 'ALT' columns, as well as, a new 'GENE_SYMBOL' column. Each entry
                            of 'GENE_SYMBOL' is a tuple of the distinct gene symbols overlapping the variant, sorted
                            so the result is reproducible between runs, or NaN when no gene overlaps it.



    '''
    vcf_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'GROUP_C']
    data_frame = pd.read_csv(filename, header=None, sep='\t', index_col=None, comment='#',
                             names=vcf_columns, usecols=['CHROM', 'POS', 'REF', 'ALT'])
    genesymbol_df = pd.read_csv(ref_file, header=0, sep='\t', index_col=None)

    # Pull the reference columns out once so the per-variant lookup does not re-index the frame every time.
    starts = genesymbol_df['START'].to_numpy()
    stops = genesymbol_df['STOP'].to_numpy()
    symbols = genesymbol_df['GENESYMBOL'].to_numpy()

    def _genes_at(position):
        # Distinct symbols of every reference interval that contains the position.
        overlapping = symbols[(starts <= position) & (stops >= position)]
        # Sorting fixes the tuple order (plain set iteration order can differ between interpreter runs);
        # key=str keeps the sort from failing if a symbol is missing (NaN) in the reference.
        unique_genes = sorted(set(overlapping.tolist()), key=str)
        return tuple(unique_genes) if unique_genes else np.nan

    data_frame['GENE_SYMBOL'] = [_genes_at(position) for position in data_frame['POS']]
    return data_frame

In [3]:
def common_variants(dataframe1, dataframe2, dataframe3):

    '''
    This function returns the gene variants that are shared by all three input dataframes.


    Starting from dataframe1, the running result is inner-merged with dataframe2 and then with dataframe3. No join
    columns are named, so pandas joins on every column name the two frames have in common; with how='inner' a row
    is kept only when its values in those columns occur on both sides. After both merges only the rows found in
    all three inputs remain.

    Parameters:

    dataframe1 (DataFrame): This argument is a DataFrame, corresponding to a (.vcf) file of gene variants present
                            in a cancer subset.


    dataframe2 (DataFrame): This argument is a DataFrame, corresponding to a (.vcf) file of gene variants present
                            in a cancer subset.

    dataframe3 (DataFrame): This argument is a DataFrame, corresponding to a (.vcf) file of gene variants present
                            in a cancer subset.


    Returns:
    variants (DataFrame): The output is a DataFrame consisting of every gene variant common among the three input
                          cancer subsets.



    '''
    variants = dataframe1
    for other_subtype in (dataframe2, dataframe3):
        # Each pass narrows the running result to the rows also present in the next frame.
        variants = pd.merge(variants, other_subtype, how='inner')
    return variants

In [12]:
def specific_variants(dataframe1, dataframe2, dataframe3):

    '''
    This function returns the gene variants that are ONLY present in the first dataframe, dataframe1, and NOT present
    in the second or third dataframe.


    dataframe1 is inner-merged with dataframe2 and, separately, with dataframe3 (pd.merge with how='inner' and no
    join columns named, so the join uses every column name the two frames share). Each merge yields the rows
    dataframe1 has in common with that other frame. dataframe1 and the two merge results are then concatenated, and
    pd.DataFrame.drop_duplicates is applied with keep=False, which removes EVERY copy of a row that occurs more than
    once. A row of dataframe1 that is also found in dataframe2 or dataframe3 therefore appears at least twice and is
    removed entirely, leaving only the rows unique to dataframe1. Note that a row repeated inside dataframe1 itself
    is removed as well.

    Parameters:

    dataframe1 (DataFrame): This argument is a DataFrame, corresponding to a (.vcf) file of gene variants present
                            in a cancer subset. This should be the cancer subset of which you want to determine
                            gene variants related to this cancer subtype ONLY and NOT the other two cancer subtypes.


    dataframe2 (DataFrame): This argument is a DataFrame, corresponding to a (.vcf) file of gene variants present
                            in a cancer subtype.

    dataframe3 (DataFrame): This argument is a DataFrame, corresponding to a (.vcf) file of gene variants present
                            in a cancer subtype.


    Returns:
    variants (DataFrame): The output is a DataFrame consisting of every gene variant specific to ONLY the first input
                          cancer subtype (dataframe1). The surviving rows keep the index labels they had in
                          dataframe1.



    '''
    # Rows of dataframe1 that also occur in each of the other two frames.
    shared_with_others = [
        pd.merge(dataframe1, other_subtype, how='inner')
        for other_subtype in (dataframe2, dataframe3)
    ]
    stacked = pd.concat([dataframe1] + shared_with_others)
    # keep=False drops all copies of a repeated row, not just the extras.
    return stacked.drop_duplicates(keep=False)

***THE FOLLOWING CELL PRINTS THE VARIANTS COMMON AMONG ALL THREE CANCER SUBTYPES AS variants_popular, AND THE VARIANTS SPECIFIC TO ONLY TRIPLE NEGATIVE BREAST CANCER (TNBC) AS CancerSubtype_ofInterest***

In [13]:
# Gene coordinate table used to annotate every variant file below.
REFERENCE_GENES_FILE = 'hg19_genes_chr1p_bed.txt'

TNBC = gene_annotate('TNBC.vcf', REFERENCE_GENES_FILE)
nTNBC = gene_annotate('nTNBC.vcf', REFERENCE_GENES_FILE)
HER2 = gene_annotate('HER2.vcf', REFERENCE_GENES_FILE)

# Variants present in all three annotated frames.
variants_popular = common_variants(TNBC, nTNBC, HER2)

# Variants found in TNBC (the first argument) but in neither of the other two frames.
CancerSubtype_ofInterest = specific_variants(TNBC, nTNBC, HER2)

# Separate the two tables with a blank line; the default single-space separator fuses the footer of the
# first table with the header of the second.
print(variants_popular, CancerSubtype_ofInterest, sep='\n\n')

    CHROM       POS REF ALT  GENE_SYMBOL
0       1    565286   C   T          NaN
1       1    879676   G   A    (SAMD11,)
2       1    879687   T   C    (SAMD11,)
3       1    881627   G   A          NaN
4       1    887801   A   G          NaN
..    ...       ...  ..  ..          ...
64      1  12072518   G   C      (MFN2,)
65      1  12267292   C   T  (TNFRSF1B,)
66      1  12401868   A   C    (VPS13D,)
67      1  13943724   T   C      (PDPN,)
68      1  14116491   C   G     (PRDM2,)

[69 rows x 5 columns]      CHROM       POS    REF  ALT GENE_SYMBOL
0        1    237763      G    A         NaN
4        1    567242      G    A         NaN
5        1    723801  AGAGA  AGA         NaN
6        1    729679      C    G         NaN
7        1    751832      T    C         NaN
..     ...       ...    ...  ...         ...
967      1  14458859      C    T         NaN
968      1  14468557      C    T         NaN
969      1  14474265      T    A         NaN
970      1  14490361      A    G   