In [1]:
import sqlite3
import pandas as pd

In [2]:
conn = sqlite3.connect('../variants.sqlite')
conn.executescript('''\
PRAGMA cache_size=-4192000;
PRAGMA temp_store=MEMORY;
PRAGMA journal_mode=OFF;
''')

<sqlite3.Cursor at 0x105143ab0>

In [8]:
df_variant_class = pd.merge(
    pd.read_sql('''\
SELECT variant_classification, count(*) AS 'mc3_count'
FROM mc3_selected
GROUP BY variant_classification
ORDER BY mc3_count DESC
''', conn),
    pd.read_sql('''\
SELECT variant_classification, count(*) AS 'gdc_count'
FROM gdc_grouped_callers
GROUP BY variant_classification
ORDER BY gdc_count DESC
''', conn),
    how='outer',
    on='variant_classification'
)

In [9]:
df_variant_class.sum()

variant_classification    Missense_MutationSilent3'UTRFrame_Shift_DelNon...
mc3_count                                                            445856
gdc_count                                                            515859
dtype: object

In [34]:
df_per_tx_sample_variant_type = pd.read_sql('''\
WITH gdc_by_tx AS (
    SELECT hugo_symbol, transcript_id, variant_classification, tumor_sample_barcode, count(*) AS 'gdc_count'
    FROM gdc_grouped_callers
    GROUP BY hugo_symbol, transcript_id, variant_classification, tumor_sample_barcode
), mc3_by_tx AS (
    SELECT hugo_symbol, transcript_id, variant_classification, tumor_sample_barcode, count(*) AS 'mc3_count'
    FROM mc3_selected
    GROUP BY hugo_symbol, transcript_id, variant_classification, tumor_sample_barcode
)
SELECT * FROM gdc_by_tx
LEFT JOIN mc3_by_tx 
USING (hugo_symbol, transcript_id, variant_classification, tumor_sample_barcode)
UNION ALL
SELECT * FROM mc3_by_tx
LEFT JOIN gdc_by_tx
USING (hugo_symbol, transcript_id, variant_classification, tumor_sample_barcode)
WHERE gdc_by_tx.gdc_count IS NULL
''', conn)

In [35]:
df_per_tx_sample_variant_type = (
    df_per_tx_sample_variant_type
    .fillna(0)
    .assign(
        gdc_count=lambda df: df.gdc_count.astype(int),
        mc3_count=lambda df: df.mc3_count.astype(int))
    .assign(
        diff=lambda df: df.gdc_count-df.mc3_count
    )
    .sort_values(['diff'], ascending=False)
)

In [36]:
df_per_tx_sample_variant_type.head()

Unnamed: 0,hugo_symbol,transcript_id,variant_classification,tumor_sample_barcode,gdc_count,mc3_count,diff
595842,TTN,ENST00000589042,Missense_Mutation,TCGA-CA-6717-01A-11D-1835-10,31,0,31
595727,TTN,ENST00000589042,Missense_Mutation,TCGA-AA-A010-01A-01D-A17O-10,30,0,30
437991,TTN,ENST00000591111,Missense_Mutation,TCGA-AA-A010-01A-01D-A17O-10,29,0,29
438110,TTN,ENST00000591111,Missense_Mutation,TCGA-CA-6717-01A-11D-1835-10,29,0,29
438021,TTN,ENST00000591111,Missense_Mutation,TCGA-AN-A046-01A-21W-A050-09,28,0,28


In [40]:
df_per_tx_sample = (
    df_per_tx_sample_variant_type
    .groupby(['hugo_symbol', 'transcript_id', 'tumor_sample_barcode'])
    .sum()
    .sort_values(['diff'], ascending=False)
    .reset_index()
)

In [41]:
df_per_tx_sample.head()

Unnamed: 0,hugo_symbol,transcript_id,tumor_sample_barcode,gdc_count,mc3_count,diff
0,TTN,ENST00000589042,TCGA-CA-6717-01A-11D-1835-10,69,0,69
1,TTN,ENST00000591111,TCGA-CA-6717-01A-11D-1835-10,68,0,68
2,TTN,ENST00000591111,TCGA-AA-A010-01A-01D-A17O-10,48,0,48
3,TTN,ENST00000589042,TCGA-AA-A010-01A-01D-A17O-10,48,0,48
4,TTN,ENST00000591111,TCGA-AN-A046-01A-21W-A050-09,45,0,45


In [43]:
df_per_tx = (
    df_per_tx_sample.groupby(['hugo_symbol', 'transcript_id'])
    .sum()
    .sort_values(['diff'], ascending=False)
    .reset_index()
)

In [53]:
df_per_tx.head(20)

Unnamed: 0,hugo_symbol,transcript_id,gdc_count,mc3_count,diff
0,TTN,ENST00000591111,1901,0,1901
1,TTN,ENST00000589042,1670,0,1670
2,APC,ENST00000257430,616,0,616
3,APC,ENST00000457016,555,0,555
4,OBSCN,ENST00000570156,386,0,386
5,DST,ENST00000312431,379,0,379
6,OBSCN,ENST00000422127,378,0,378
7,MUC4,ENST00000463781,493,131,362
8,TP53,ENST00000269305,1306,977,329
9,DST,ENST00000244364,324,0,324


In [55]:
# Export data to excel
df_per_tx.to_csv('../variant_count_diff.per_tx.csv', index=False)
df_per_tx_sample.to_csv('../variant_count_diff.per_tx_sample.csv', index=False)
df_per_tx_sample_variant_type.to_csv('../variant_count_diff.per_tx_sample_type.csv', index=False)

In [68]:
df_per_tx_sample[df_per_tx_sample['transcript_id'].isin(['ENST00000257430', 'ENST00000457016'])].head(10)

Unnamed: 0,hugo_symbol,transcript_id,tumor_sample_barcode,gdc_count,mc3_count,diff
89,APC,ENST00000457016,TCGA-CA-6717-01A-11D-1835-10,10,0,10
126,APC,ENST00000257430,TCGA-CA-6717-01A-11D-1835-10,9,0,9
304,APC,ENST00000257430,TCGA-AA-A010-01A-01D-A17O-10,7,0,7
306,APC,ENST00000257430,TCGA-AN-A046-01A-21W-A050-09,7,0,7
309,APC,ENST00000257430,TCGA-CM-6166-01A-11D-1650-10,7,0,7
319,APC,ENST00000457016,TCGA-CM-6166-01A-11D-1650-10,7,0,7
323,APC,ENST00000457016,TCGA-AN-A046-01A-21W-A050-09,7,0,7
328,APC,ENST00000457016,TCGA-AA-A010-01A-01D-A17O-10,7,0,7
503,APC,ENST00000457016,TCGA-A6-2675-01A-02D-1719-10,6,0,6
801,APC,ENST00000257430,TCGA-SS-A7HO-01A-21D-A36X-10,5,0,5


## TP53

Check if there is any flag for the variants of TP53.

In [71]:
sample = 'TCGA-WR-A838-01A-12D-A403-09'
gene = 'TP53'
pd.read_sql(
    '''
    SELECT *
    FROM gdc_grouped_callers
    WHERE hugo_symbol=? AND tumor_sample_barcode=?
    ''',
    conn,
    params=[gene, sample]
)

Unnamed: 0,hugo_symbol,entrez_gene_id,center,ncbi_build,chromosome,start_position,end_position,strand,variant_classification,variant_type,...,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,gdc_filter,cosmic,mc3_overlap,gdc_validation_status,cancer_type,"group_concat(caller, '|')"
0,TP53,7157,WUGSC,GRCh38,chr17,7675124,7675124,+,Missense_Mutation,SNP,...,d410118f-5ac1-4487-9e5b-6a91915e4a7b,fe8f1f10-e7c5-4faa-815e-c0f7b9d78767,fc0e1648-2ae8-4f56-8ebb-c93e3b1c48f5,b8023162-5e82-40e6-ad8c-8acf81821f01,,COSM10808;COSM129852;COSM129853;COSM129854;COS...,True,Unknown,OV,somaticsniper|muse|varscan|mutect
1,TP53,7157,WUGSC,GRCh38,chr17,7675229,7675229,+,Frame_Shift_Del,DEL,...,2318c30f-26cc-48a0-a5e3-86398c54d979,fe8f1f10-e7c5-4faa-815e-c0f7b9d78767,fc0e1648-2ae8-4f56-8ebb-c93e3b1c48f5,b8023162-5e82-40e6-ad8c-8acf81821f01,,COSM5198771;COSM5198772;COSM5198773;COSM519877...,True,Unknown,OV,varscan
2,TP53,7157,WUGSC,GRCh38,chr17,7675232,7675232,+,Missense_Mutation,SNP,...,3c685c63-6618-403d-979f-f5a69b9d805b,fe8f1f10-e7c5-4faa-815e-c0f7b9d78767,fc0e1648-2ae8-4f56-8ebb-c93e3b1c48f5,b8023162-5e82-40e6-ad8c-8acf81821f01,,COSM3403294;COSM3403295;COSM3403296;COSM340329...,True,Unknown,OV,somaticsniper
3,TP53,7157,WUGSC,GRCh38,chr17,7675232,7675232,+,Missense_Mutation,SNP,...,2318c30f-26cc-48a0-a5e3-86398c54d979,fe8f1f10-e7c5-4faa-815e-c0f7b9d78767,fc0e1648-2ae8-4f56-8ebb-c93e3b1c48f5,b8023162-5e82-40e6-ad8c-8acf81821f01,,COSM3403294;COSM3403295;COSM3403296;COSM340329...,True,Unknown,OV,varscan
