In [1]:
import sys

# Extend the module search path with ../scripts/ (relative to the working
# directory) so that modules stored there can be imported by later cells.
sys.path += ['../scripts/']

In [84]:
import argparse
from collections import Counter
from pathlib import Path
import pandas as pd
import sqlite3
from sqlalchemy import (
    create_engine,
    MetaData,
    Table, Column, Integer, Text, Index
)
from sqlalchemy.engine import Engine
from sqlalchemy import event

In [13]:
# Collect one (cancer_type, caller, path) entry per GDC MAF in the release folder.
GDC_MAF_MAPPING = []
gdc_release_dir = Path('../../GDC_QC_data/GDC_data_release/Release_10.0')
for maf_path in gdc_release_dir.glob('*/TCGA*.maf.gz'):
    # The 2nd and 3rd dot-separated fields of the file name are taken as the
    # cancer type and the caller, e.g. TCGA.BRCA.muse.<uuid>.DR-10.0.somatic.maf.gz
    _prefix, cancer_type, caller, *_rest = maf_path.name.split('.')
    GDC_MAF_MAPPING.append((cancer_type, caller, maf_path))

In [19]:
# Tabulate the discovered MAFs, ordered by cancer type and then by caller.
# The rows keep their original (pre-sort) index labels.
gdc_mafs_df = (
    pd.DataFrame(
        GDC_MAF_MAPPING,
        columns=['cancer_type', 'caller', 'maf_path'],
    )
    .sort_values(by=['cancer_type', 'caller'])
)
gdc_mafs_df

Unnamed: 0,cancer_type,caller,maf_path
4,BRCA,muse,../../GDC_QC_data/GDC_data_release/Release_10....
9,BRCA,mutect,../../GDC_QC_data/GDC_data_release/Release_10....
6,BRCA,somaticsniper,../../GDC_QC_data/GDC_data_release/Release_10....
8,BRCA,varscan,../../GDC_QC_data/GDC_data_release/Release_10....
2,COAD,muse,../../GDC_QC_data/GDC_data_release/Release_10....
3,COAD,mutect,../../GDC_QC_data/GDC_data_release/Release_10....
11,COAD,somaticsniper,../../GDC_QC_data/GDC_data_release/Release_10....
7,COAD,varscan,../../GDC_QC_data/GDC_data_release/Release_10....
0,OV,muse,../../GDC_QC_data/GDC_data_release/Release_10....
1,OV,mutect,../../GDC_QC_data/GDC_data_release/Release_10....


In [96]:
# Show every row as a namedtuple (index excluded) to inspect the full, untruncated paths.
[*gdc_mafs_df.itertuples(index=False)]

[Pandas(cancer_type='BRCA', caller='muse', maf_path=PosixPath('../../GDC_QC_data/GDC_data_release/Release_10.0/b8ca5856-9819-459c-87c5-94e91aca4032/TCGA.BRCA.muse.b8ca5856-9819-459c-87c5-94e91aca4032.DR-10.0.somatic.maf.gz')),
 Pandas(cancer_type='BRCA', caller='mutect', maf_path=PosixPath('../../GDC_QC_data/GDC_data_release/Release_10.0/995c0111-d90b-4140-bee7-3845436c3b42/TCGA.BRCA.mutect.995c0111-d90b-4140-bee7-3845436c3b42.DR-10.0.somatic.maf.gz')),
 Pandas(cancer_type='BRCA', caller='somaticsniper', maf_path=PosixPath('../../GDC_QC_data/GDC_data_release/Release_10.0/7dd592e3-5950-4438-96d5-3c718aca3f13/TCGA.BRCA.somaticsniper.7dd592e3-5950-4438-96d5-3c718aca3f13.DR-10.0.somatic.maf.gz')),
 Pandas(cancer_type='BRCA', caller='varscan', maf_path=PosixPath('../../GDC_QC_data/GDC_data_release/Release_10.0/6c93f518-1956-4435-9806-37185266d248/TCGA.BRCA.varscan.6c93f518-1956-4435-9806-37185266d248.DR-10.0.somatic.maf.gz')),
 Pandas(cancer_type='COAD', caller='muse', maf_path=PosixPath('.

In [15]:
# Label-based lookup: returns the path of the row whose index label is 0,
# which is not necessarily the first row of the sorted frame.
gdc_mafs_df.loc[0, 'maf_path']

PosixPath('../../GDC_QC_data/GDC_data_release/Release_10.0/51423d79-e9c5-4c4d-b12c-99c1338dbd43/TCGA.OV.muse.51423d79-e9c5-4c4d-b12c-99c1338dbd43.DR-10.0.somatic.maf.gz')

In [67]:
# Load the MC3 MAF and the four GDC BRCA MAFs (one per variant caller).
mc3_maf_p = Path('../../GDC_QC_data/MC3/mc3.v0.2.8.PUBLIC.GRCh38_converted.maf.gz')
mc3 = MC3MAF(mc3_maf_p)

# gdc_mafs_df keeps its pre-sort index labels, so `gdc_mafs_df['maf_path'][0]`
# is a label lookup that returns whichever file happened to be globbed first
# (the OV/muse file in the saved output), not the first row of the sorted
# frame. Select the BRCA files explicitly by cancer type and caller instead.
brca_maf_paths = (
    gdc_mafs_df[gdc_mafs_df['cancer_type'] == 'BRCA']
    .set_index('caller')['maf_path']
)
gdc_brca_muse = GDCMAF(brca_maf_paths['muse'])
gdc_brca_mutect = GDCMAF(brca_maf_paths['mutect'])
gdc_brca_somaticsniper = GDCMAF(brca_maf_paths['somaticsniper'])
gdc_brca_varscan = GDCMAF(brca_maf_paths['varscan'])

In [68]:
# Sanity check: the other GDC MAFs must expose exactly the same record fields
# as the MuSE MAF.
reference_fields = gdc_brca_muse._record._fields
all(
    other_maf._record._fields == reference_fields
    for other_maf in (gdc_brca_mutect, gdc_brca_somaticsniper, gdc_brca_varscan)
)

True

In [80]:
# Engine for the SQLite file variants.sqlite (path relative to the working directory).
db_engine = create_engine('sqlite:///variants.sqlite')
# Registry that collects the table definitions for this database.
metadata = MetaData()

In [83]:
# Create every table registered on `metadata`, skipping tables that already exist.
# NOTE(review): no Table definition is visible in this part of the notebook —
# confirm the tables are registered on `metadata` before this cell runs.
metadata.create_all(bind=db_engine, checkfirst=True)

In [85]:
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Apply bulk-load oriented SQLite PRAGMAs to each new DBAPI connection.

    The listener is registered on the ``Engine`` class, so it is not limited
    to ``db_engine``.

    Args:
        dbapi_connection: The raw DBAPI connection that was just opened.
        connection_record: Pool bookkeeping record for the connection (unused).
    """
    cursor = dbapi_connection.cursor()
    try:
        # A negative cache_size is a size in KiB rather than in pages (about 4 GiB here).
        cursor.execute("PRAGMA cache_size=-4192000")
        # Keep temporary tables and indices in memory.
        cursor.execute("PRAGMA temp_store=MEMORY")
        # Keep the rollback journal in memory: faster, but a crash in the middle
        # of a transaction can leave the database file corrupted.
        cursor.execute("PRAGMA journal_mode=MEMORY")
    finally:
        # Release the cursor even when one of the PRAGMA statements fails.
        cursor.close()

In [92]:
# Number of records written per INSERT statement / transaction.
BATCH_SIZE = 1000
conn = db_engine.connect()

# Insert mutations
# NOTE(review): expects a table named 'mc3' to be registered on `metadata`;
# its definition is not visible in this part of the notebook.
ins = metadata.tables['mc3'].insert()
ins_batch = []
for i, record in enumerate(mc3, 1):
    ins_batch.append(record._asdict())
    if len(ins_batch) >= BATCH_SIZE:
        # One transaction per full batch.
        with conn.begin():
            conn.execute(ins, ins_batch)
        ins_batch = []
    # Report progress only after the batch containing record i has been
    # flushed (10000 is a multiple of BATCH_SIZE).
    if i % 10000 == 0:
        print(f'Inserted {i} records')

# Flush the final partial batch inside its own transaction, exactly like the
# full batches; without an explicit transaction these rows may be rolled back
# when the connection is closed.
if ins_batch:
    with conn.begin():
        conn.execute(ins, ins_batch)

Inserted 10000 records
Inserted 20000 records
Inserted 30000 records
Inserted 40000 records


KeyboardInterrupt: 

In [93]:
# Close the connection that was opened for the bulk insert.
conn.close()