In [2]:
import sqlite3
import pandas as pd

In [3]:
# Open the variant database and tune this connection for large analytical queries.
conn = sqlite3.connect('../processed_data/all_variants.sqlite')
conn.executescript(
    'PRAGMA cache_size=-4192000;\n'  # negative value: page cache sized in KiB (see SQLite PRAGMA docs)
    'PRAGMA temp_store=MEMORY;\n'    # keep temporary tables and indices in memory
    'PRAGMA journal_mode=OFF;\n'     # turn the rollback journal off
)

<sqlite3.Cursor at 0x112cdb1f0>

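Generate the overlap between the GDC and MC3 variant calls by exact genomic location (same sample, chromosome, start, end, reference allele and tumor allele 2)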

In [7]:
%%time
# SQL column list shared by both SELECTs of the overlap query below.
# The first line holds the six columns the query joins on with USING, so they
# are written unqualified; every other column is taken explicitly from the GDC
# side (alias g) or the MC3 side (alias m) and aliased with a gdc_/mc3_ prefix.
FIELDS_TO_REPORT = '''\
    chromosome, start_position, end_position, tumor_sample_barcode, reference_allele, tumor_seq_allele2,
    g.tumor_seq_allele1 AS gdc_tumor_seq_allele1, 
    m.tumor_seq_allele1 AS mc3_tumor_seq_allele1,
    g.variant_classification AS gdc_variant_classification, 
    m.variant_classification AS mc3_variant_classification, 
    g.variant_type AS gdc_variant_type,
    m.variant_type AS mc3_variant_type,
    g.transcript_id AS gdc_transcript_id,
    g.callers AS gdc_callers,
    m.centers AS mc3_callers, 
    m.ncallers AS mc3_ncallers
'''

# Emulate a FULL OUTER JOIN between the GDC and MC3 calls on sample, genomic
# location and alleles:
#   * the first SELECT keeps every GDC call, with the MC3 columns NULL when no
#     MC3 call matches;
#   * the second SELECT appends the MC3 calls that have no GDC match.
# MC3 chromosomes get a 'chr' prefix before joining (presumably so they use the
# same naming as the GDC table -- verify against the source tables).
#
# The anti-join filter tests a join key of the GDC side: it can only be NULL
# when no GDC row matched.  Testing a payload column such as tumor_seq_allele1
# would also pass for a *matched* GDC row whose value happens to be NULL, and
# that row would then be emitted by both halves of the UNION ALL.
df_exact_overlap = pd.read_sql(
f'''\
WITH gdc_simple AS (
    SELECT 
        chromosome, start_position, end_position, tumor_sample_barcode,
        variant_classification, variant_type, transcript_id,
        reference_allele, tumor_seq_allele1, tumor_seq_allele2,
        t_depth_per_caller, t_ref_count_per_caller, t_alt_count_per_caller,
        n_depth_per_caller, callers
    FROM gdc_grp_shared_samples
), mc3_simple AS (
    SELECT 
        'chr' || chromosome AS chromosome, start_position, end_position, tumor_sample_barcode,
        variant_classification, variant_type, transcript_id,
        reference_allele, tumor_seq_allele1, tumor_seq_allele2,
        t_depth, t_ref_count, t_alt_count, n_depth,
        centers, ncallers
    FROM mc3_selected
) 
SELECT {FIELDS_TO_REPORT}
FROM gdc_simple g
LEFT JOIN mc3_simple m
    USING (tumor_sample_barcode, chromosome, start_position, end_position, reference_allele, tumor_seq_allele2)
UNION ALL
SELECT {FIELDS_TO_REPORT}
FROM mc3_simple m
LEFT JOIN gdc_simple g
    USING (tumor_sample_barcode, chromosome, start_position, end_position, reference_allele, tumor_seq_allele2)
WHERE g.tumor_sample_barcode IS NULL
''', conn)

CPU times: user 38.6 s, sys: 4.08 s, total: 42.7 s
Wall time: 43.2 s


In [14]:
# Flag every variant by the call set(s) that reported it.  A call set counts as
# having the variant when its callers column is non-null.
df_exact_overlap = df_exact_overlap.assign(
    shared_by_gdc_mc3=lambda frame: (
        frame['gdc_callers'].notna() & frame['mc3_callers'].notna()
    ),
    only_in_gdc=lambda frame: (
        frame['mc3_callers'].isna() & frame['gdc_callers'].notna()
    ),
    only_in_mc3=lambda frame: (
        frame['mc3_callers'].notna() & frame['gdc_callers'].isna()
    ),
)

In [19]:
# Number of variants in each overlap category (each True counts as 1).
df_exact_overlap.loc[
    :, ['shared_by_gdc_mc3', 'only_in_gdc', 'only_in_mc3']
].sum()

shared_by_gdc_mc3    412388
only_in_gdc           99336
only_in_mc3           37919
dtype: int64

In [21]:
# Write the flagged overlap table to CSV, leaving out the DataFrame index.
df_exact_overlap.to_csv(
    '../processed_data/gdc_mc3_overlap_by_genomic_location.csv',
    index=False,
)