In [10]:
import os
import json
from collections import namedtuple

In [2]:
# Conditions analysed in this notebook; used to filter the phenotype table
# and to drive per-condition manifest generation.
target_conditions = {'FRDA', 'DM1', 'HD', 'FXS'}

## Define paths

In [68]:
# All paths are relative, so they resolve against the notebook's working directory.
# Input data: the phenotype table plus the Diversity/ and RepeatExpansions/ profile folders.
inputs = '../../input/'
# Working area: one sub-directory per condition holding its manifest and analysis outputs.
scratch = '../../scratch/'
# Location of the ExpansionHunterDenovo checkout (binary and scripts) used by the bash cell.
tools = '../../tools/'
# Reference FASTA passed to `ExpansionHunterDenovo merge --reference`.
reference = '../../../../common/refs/grch37/human_g1k_v37.fasta'

## Load Coriell phenotype info

In [19]:
# One row of the phenotype table: sample id, condition name, and status.
Phenotype = namedtuple('Phenotype', 'id condition status')

# Read the table with plain Python instead of `! cat`: a missing file now raises
# FileNotFoundError instead of silently turning the shell's error text into records,
# and the cell no longer depends on IPython shell capture.
table_path = os.path.join(inputs, 'coriell_phenotype_table.tsv')
with open(table_path) as table_file:
    # Split each row once; blank lines are skipped so they cannot raise on indexing.
    rows = [line.split() for line in table_file if line.strip()]

# Only the first three whitespace-separated fields are used (id, condition, status).
# Keep the samples that belong to a target condition and are not 'Normal'.
phenotypes = [rec for rec in (Phenotype(*fields[:3]) for fields in rows)
              if rec.condition in target_conditions and rec.status != 'Normal']

print(len(phenotypes))
phenotypes

91


[Phenotype(id='NA06075', condition='DM1', status='Expansion'),
 Phenotype(id='NA04567', condition='DM1', status='Expansion'),
 Phenotype(id='NA05164', condition='DM1', status='Expansion'),
 Phenotype(id='NA04648', condition='DM1', status='Expansion'),
 Phenotype(id='NA05152', condition='DM1', status='Expansion'),
 Phenotype(id='NA23378', condition='DM1', status='Expansion'),
 Phenotype(id='NA23374', condition='DM1', status='Expansion'),
 Phenotype(id='NA23300', condition='DM1', status='Expansion'),
 Phenotype(id='NA03986', condition='DM1', status='Expansion'),
 Phenotype(id='NA03989', condition='DM1', status='Expansion'),
 Phenotype(id='NA03990', condition='DM1', status='Expansion'),
 Phenotype(id='NA03696', condition='DM1', status='Expansion'),
 Phenotype(id='NA03759', condition='DM1', status='Expansion'),
 Phenotype(id='NA04034', condition='DM1', status='Expansion'),
 Phenotype(id='NA03697', condition='DM1', status='Expansion'),
 Phenotype(id='NA03132', condition='DM1', status='Expan

## Generate a manifest for each target condition

In [31]:
def get_control_records():
    """Build manifest records for the control samples.

    Scans the ``Diversity/`` directory under ``inputs`` and keeps only files
    whose name ends in ``.str_profile.json``; the sample name is the file name
    with that suffix removed.

    Returns:
        list of ``(sample, 'control', absolute_profile_path)`` tuples, ordered
        by file name so that repeated runs produce the same manifest.
    """
    suffix = '.str_profile.json'
    dir_path = os.path.join(inputs, 'Diversity/')
    records = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for fname in sorted(os.listdir(dir_path)):
        # Ignore anything that is not an STR profile (stray files, sub-directories).
        if not fname.endswith(suffix):
            continue
        # Strip the suffix from the end only, not wherever it occurs in the name.
        sample = fname[:-len(suffix)]
        file_path = os.path.abspath(os.path.join(dir_path, fname))
        records.append((sample, 'control', file_path))
    return records

In [33]:
# Sanity check: how many control records were found in the Diversity directory.
len(get_control_records())

150

In [48]:
def get_case_records(condition):
    """Build manifest records for the case samples of one condition.

    Scans the ``RepeatExpansions`` directory under ``inputs`` for files ending
    in ``.str_profile.json`` and keeps those whose sample name (file name minus
    that suffix) belongs to an entry of ``phenotypes`` with the given condition.

    Args:
        condition: condition name to match against ``Phenotype.condition``.

    Returns:
        list of ``(sample, 'case', absolute_profile_path)`` tuples, ordered by
        file name so that repeated runs produce the same manifest.
    """
    suffix = '.str_profile.json'
    # A set gives constant-time membership tests inside the loop.
    target_samples = {pheno.id for pheno in phenotypes if pheno.condition == condition}
    dir_path = os.path.join(inputs, 'RepeatExpansions')
    records = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for fname in sorted(os.listdir(dir_path)):
        # Only real STR profiles count; a stray entry named like a sample id is ignored.
        if not fname.endswith(suffix):
            continue
        sample = fname[:-len(suffix)]
        if sample in target_samples:
            file_path = os.path.abspath(os.path.join(dir_path, fname))
            records.append((sample, 'case', file_path))
    return records

In [56]:
# Write one manifest per condition: that condition's cases followed by all controls.
# The controls are the same for every condition, so list the directory only once.
control_records = get_control_records()

# Sorted so the conditions are processed in a stable order (sets iterate arbitrarily).
for condition in sorted(target_conditions):
    records = get_case_records(condition)
    records.extend(control_records)

    # The per-condition scratch directory may not exist yet on a fresh checkout.
    condition_dir = os.path.join(scratch, condition)
    os.makedirs(condition_dir, exist_ok=True)

    # Each record becomes one tab-separated line: sample, case/control label, profile path.
    lines = ['\t'.join(line) for line in records]
    with open(os.path.join(condition_dir, 'manifest.tsv'), 'w') as manifest_file:
        print('\n'.join(lines), file=manifest_file)

## Perform case-control analysis

In [80]:
%%bash -s "$tools" "$scratch" "$reference"

# Stop at the first failing command, unset variable, or failed pipeline stage,
# so casecontrol.py never runs against a missing or stale merge output.
set -euo pipefail

# Positional arguments supplied by the cell magic line above.
tools=$1
scratch=$2
reference=$3

# NOTE: the condition list is repeated here by hand; keep it in sync with
# `target_conditions` defined near the top of the notebook.

# Step 1: merge the per-sample STR profiles listed in each manifest into one
# multisample profile per condition.
for condition in FRDA DM1 HD FXS
do
  "$tools/ExpansionHunterDenovo/build/ExpansionHunterDenovo" merge \
    --reference "$reference" \
    --manifest "$scratch/$condition/manifest.tsv" \
    --output-prefix "$scratch/$condition/dataset" \
    --min-unit-len 2 \
    --max-unit-len 15
done

# Step 2: run the locus-level case-control comparison on each merged profile.
for condition in FRDA DM1 HD FXS
do
  "$tools/ExpansionHunterDenovo/scripts/casecontrol.py" locus \
    --manifest "$scratch/$condition/manifest.tsv" \
    --min-inrepeat-reads 5 \
    --test-params normal \
    --multisample-profile "$scratch/$condition/dataset.multisample_profile.json" \
    --output "$scratch/$condition/anchored.case-control.tsv"
done

[2019-11-04 18:12:43.387] [info] Starting ExpansionHunter Denovo v0.8.0 profile workflow
[2019-11-04 18:12:43.391] [info] Loaded manifest describing 175 samples
[2019-11-04 18:12:43.391] [info] Loading STR profile of NA14519
[2019-11-04 18:12:43.472] [info] Loading STR profile of NA16210
[2019-11-04 18:12:43.551] [info] Loading STR profile of NA16197
[2019-11-04 18:12:43.632] [info] Loading STR profile of NA16240
[2019-11-04 18:12:43.713] [info] Loading STR profile of NA16200
[2019-11-04 18:12:43.792] [info] Loading STR profile of NA16214
[2019-11-04 18:12:43.866] [info] Loading STR profile of NA03816
[2019-11-04 18:12:43.947] [info] Loading STR profile of NA15848
[2019-11-04 18:12:44.030] [info] Loading STR profile of NA16205
[2019-11-04 18:12:44.108] [info] Loading STR profile of NA04079
[2019-11-04 18:12:44.191] [info] Loading STR profile of NA16215
[2019-11-04 18:12:44.260] [info] Loading STR profile of NA15847
[2019-11-04 18:12:44.342] [info] Loading STR profile of NA16227
[2019-1

2019-11-04 18:14:05,966: Loaded 81499 regions
2019-11-04 18:14:05,966: Normalizing counts
2019-11-04 18:14:06,150: Filtering counts
2019-11-04 18:14:06,237: 2263 regions left after filtering
2019-11-04 18:14:06,238: Comparing counts
2019-11-04 18:14:06,699: Correcting p-values
2019-11-04 18:14:06,952: Done
2019-11-04 18:14:07,615: Loaded 80460 regions
2019-11-04 18:14:07,615: Normalizing counts
2019-11-04 18:14:07,811: Filtering counts
2019-11-04 18:14:07,908: 2277 regions left after filtering
2019-11-04 18:14:07,910: Comparing counts
2019-11-04 18:14:08,379: Correcting p-values
2019-11-04 18:14:08,604: Done
2019-11-04 18:14:09,304: Loaded 78810 regions
2019-11-04 18:14:09,304: Normalizing counts
2019-11-04 18:14:09,480: Filtering counts
2019-11-04 18:14:09,565: 2253 regions left after filtering
2019-11-04 18:14:09,567: Comparing counts
2019-11-04 18:14:10,021: Correcting p-values
2019-11-04 18:14:10,240: Done
2019-11-04 18:14:10,920: Loaded 85286 regions
2019-11-04 18:14:10,920: Norma