# Loading the Project GENIE Cohort

In [1]:
import csv
from collections import defaultdict, OrderedDict
from timeit import default_timer
import pandas as pd

In [2]:
def get_coordinates_from_genie_record(record):
    """Reduce one GENIE mutation record to an ordered coordinate mapping.

    Parameters
    ----------
    record : mapping
        A row keyed by column name. It must provide 'NCBI_Build',
        'Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele',
        'Tumor_Seq_Allele1' and 'Tumor_Sample_Barcode'; 'Tumor_Seq_Allele2'
        is read only when allele 1 equals the reference allele.

    Returns
    -------
    OrderedDict
        Keys 'chr', 'start', 'stop', 'alt', 'barcode', in that order.
        'chr' is a string, 'start'/'stop' are ints, and 'alt' is None when
        the selected allele is '-'.

    Raises
    ------
    AssertionError
        If the record's 'NCBI_Build' is not 'GRCh37'.
    """
    assert record['NCBI_Build'] == 'GRCh37'

    chromosome = str(record['Chromosome'])
    start = int(record['Start_Position'])
    stop = int(record['End_Position'])

    # Take allele 1 when it differs from the reference; otherwise fall back
    # to allele 2 (which is only looked up in that case).
    reference = record['Reference_Allele']
    first_allele = record['Tumor_Seq_Allele1']
    variant_allele = (
        first_allele if reference != first_allele
        else record['Tumor_Seq_Allele2']
    )
    # '-' is mapped to "no alternate allele".
    variant_allele = None if variant_allele == '-' else variant_allele

    coordinates = OrderedDict()
    coordinates['chr'] = chromosome
    coordinates['start'] = start
    coordinates['stop'] = stop
    coordinates['alt'] = variant_allele
    coordinates['barcode'] = record['Tumor_Sample_Barcode']
    return coordinates

def genie_record_generator(records):
    """Lazily yield the coordinate mapping for each GENIE record.

    Every item of ``records`` is passed through
    ``get_coordinates_from_genie_record`` as it is consumed, so nothing is
    read from ``records`` until the generator is iterated.
    """
    yield from (get_coordinates_from_genie_record(entry) for entry in records)
        
def genie_caster(records):
    """Yield ordered copies of coordinate rows with integer positions.

    Each item of ``records`` must be a mapping with the keys 'chr', 'start',
    'stop', 'alt' and 'barcode'. 'start' and 'stop' are converted with
    ``int``; the other values are passed through unchanged. The yielded
    OrderedDict keeps the key order chr, start, stop, alt, barcode.
    """
    for row in records:
        cast_row = OrderedDict()
        cast_row['chr'] = row['chr']
        cast_row['start'] = int(row['start'])
        cast_row['stop'] = int(row['stop'])
        cast_row['alt'] = row['alt']
        cast_row['barcode'] = row['barcode']
        yield cast_row

In [3]:
# GENIE data downloaded from https://www.synapse.org/#!Synapse:syn17394041

with open('data/data_mutations_extended_5.0-public.txt', 'r') as f:
    # The first line sits above the column header; keep its
    # whitespace-separated tokens, minus the leading one, as the sample list.
    # NOTE(review): presumably this is a "#sequenced_samples"-style line of
    # sample IDs — confirm against the file.
    genie_samples = next(f).split()[1:]
    # The remaining lines are tab-delimited with a header row.
    genie_file_reader = csv.DictReader(f, delimiter='\t')
    # Build the frame straight from the stream of per-record mappings.
    # The plain constructor is the documented entry point for an iterable of
    # row dicts; DataFrame.from_dict is intended for a dict of columns/rows.
    df = pd.DataFrame(genie_record_generator(genie_file_reader))

In [15]:
# Rename the columns (the last one becomes 'key'), order the rows by every
# column from left to right, and write the result as a tab-delimited file
# without the index.

df.columns = ['chr', 'start', 'stop', 'alt', 'key']
df.sort_values(by=list(df.columns), inplace=True)
df.to_csv('data/genie_5.0_sorted.txt', index=False, sep='\t')

# Searching CIViC

In [16]:
from civicpy import civic
from collections import Counter
# Load civicpy's cache before running the searches below.
# NOTE(review): presumably this populates the civic.CACHE object inspected
# later in the notebook — confirm against the civicpy documentation.
civic.load_cache()

In [17]:
%%timeit
# Time one coordinate search for the single position 140453136 on
# chromosome 7, using search_mode='any'.
coords = civic.CoordinateQuery(chr='7', start=140453136, stop=140453136)
civic.search_variants_by_coordinates(coords, search_mode='any')

80.9 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Run the same single-position query with search_mode='include_larger' and
# display how many variants are returned. The result is kept in `x` for the
# next cell.
coords = civic.CoordinateQuery(chr='7', start=140453136, stop=140453136)
x = civic.search_variants_by_coordinates(coords, search_mode='include_larger')
len(x)

19

In [19]:
# Print the names of the matched variants, three per row.
match_names = [v.name for v in x]
# Use a named loop index instead of `_`: in IPython `_` is the last-output
# variable, and assigning to it shadows that for the rest of the session.
for i in range(0, len(match_names), 3):
    print(match_names[i:i + 3])

['AGK-BRAF', 'TRIM24-BRAF', 'AKAP9-BRAF']
['PAPSS1-BRAF', 'BRAF-CUL1', 'AMPLIFICATION']
['V600D', 'WILD TYPE', 'V600_K601DELINSD']
['KIAA1549-BRAF', 'V600E+V600M', 'V600E AMPLIFICATION']
['ZKSCAN1-BRAF', 'V600E', 'PPFIBP2-BRAF']
['V600R', 'V600', 'V600K']
['MUTATION']


In [30]:
# Time a bulk coordinate search over every row of df with search_mode='exact'.
# Each row dict is expanded into CoordinateQuery keyword arguments.
tick = default_timer()
records = [civic.CoordinateQuery(**row) for row in df.to_dict(orient='records')]
exact_results = civic.bulk_search_variants_by_coordinates(
    records,
    search_mode='exact',
)
tock = default_timer()
# Elapsed wall-clock seconds for building the queries plus the search.
print(tock - tick)

394.33967355799996


In [31]:
# Time a bulk coordinate search over every row of df with search_mode='any'.
# Each row dict is expanded into CoordinateQuery keyword arguments.
tick = default_timer()
records = [civic.CoordinateQuery(**row) for row in df.to_dict(orient='records')]
permissive_results = civic.bulk_search_variants_by_coordinates(
    records,
    search_mode='any',
)
tock = default_timer()
# Elapsed wall-clock seconds for building the queries plus the search.
print(tock - tick)

418.74014411200017


# Scratch cells — DELETE EVERYTHING BELOW THIS HEADING

In [105]:
from importlib import reload
# Re-import the civic module in place, then load its cache again.
# NOTE(review): looks like a development aid for picking up local edits to
# civicpy — confirm before keeping this cell.
reload(civic)
civic.load_cache()

In [85]:
# Index labels of the rows whose start position is at or below 533873.
# NOTE(review): the mask is built from a sorted copy of df.start; pandas
# presumably re-aligns a boolean Series mask by index label, which would make
# the sort_values() call redundant — confirm.
df.start[df.start.sort_values() <= 533873].index

Int64Index([103], dtype='int64')

In [88]:
# Sort the start positions, then collect the index labels of the rows at or
# below 36932096 (a) and at or above it (b).
x = df['start'].sort_values()
a = x.loc[x <= 36932096].index
b = x.loc[x >= 36932096].index

In [89]:
# Index labels present in both a and b. Index.intersection is used instead of
# the `&` operator, whose set-intersection meaning on Index objects was
# deprecated and later changed to an element-wise logical operation.
a.intersection(b)

Int64Index([16], dtype='int64')

In [140]:
# Fetch the CIViC variant with ID 12 for ad-hoc inspection.
# NOTE(review): the variable name suggests this is the V600E variant — confirm.
v600e = civic.get_variant_by_id(12)

In [62]:
# Re-read the sorted GENIE export and collect every row that is not found in
# exact_results.
# NOTE(review): r.values() is a dict view of the row's string fields, not a
# CoordinateQuery, so the membership test below may never match and every row
# may be recorded as missed — confirm what exact_results contains / is keyed by.
missed_results = list()
with open('data/genie_5.0_sorted.txt', 'r') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for r in reader:
        v = r.values()
        if v not in exact_results:
            missed_results.append(v)

In [97]:
# Number of tokens kept from the first line of the GENIE mutations file.
len(genie_samples)

59437

In [21]:
# Number of entries currently held in civic.CACHE.
len(civic.CACHE)

2696

In [22]:
# Length of civic.COORDINATE_TABLE.
len(civic.COORDINATE_TABLE)

1520

In [24]:
# Shape of civic.COORDINATE_TABLE.
civic.COORDINATE_TABLE.shape

(1520, 5)