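## Hail BGEN Extracts

Exports three Hail matrix tables (a simulated Balding-Nichols dataset, HapMap phase 3, and 1000 Genomes chrX) to BGEN files under a shared export directory.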

In [1]:
import hail as hl
from pathlib import Path
# Start the Hail session (the banner below reports the Spark and Hail versions in use).
hl.init()

Running on Apache Spark version 2.4.1
SparkUI available at http://6fb6b6adfa3a:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.44-6cfa355a1954
LOGGING: writing to /work/repos/docker-hail/hail-20200608-2230-0.2.44-6cfa355a1954.log


In [2]:
# Base directory for the BGEN exports; each export cell joins its output prefix onto it.
export_dir = Path('/work/data/hail-datasets/exports')

### Balding-Nichols

In [13]:
# Simulate a single-population Balding-Nichols dataset: 1,000 samples x 100,000 variants.
bn_mt = hl.balding_nichols_model(n_populations=1, n_samples=1_000, n_variants=100_000)

# The column key must be a string, so derive a string id `s` from the int32
# `sample_idx` field and key the columns by it.
sample_id = hl.str(bn_mt.sample_idx)
mt = bn_mt.annotate_cols(s=sample_id).key_cols_by('s')

# Show the resulting schema.
mt.describe()

----------------------------------------
Global fields:
    'bn': struct {
        n_populations: int32, 
        n_samples: int32, 
        n_variants: int32, 
        n_partitions: int32, 
        pop_dist: array<int32>, 
        fst: array<float64>, 
        mixture: bool
    }
----------------------------------------
Column fields:
    'sample_idx': int32
    'pop': int32
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'ancestral_af': float64
    'af': array<float64>
----------------------------------------
Entry fields:
    'GT': call
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


2020-06-08 23:04:37 Hail: INFO: balding_nichols_model: generating genotypes for 1 populations, 1000 samples, and 100000 variants...


In [14]:
# Output prefix for the simulated dataset; the write log below shows Hail
# adding the `.bgen` extension to it.
path = str(export_dir.joinpath('balding-nichols-1pop-100kvariants-1ksamples'))
hl.export_bgen(mt, output=path)
# Display the prefix that was written.
path

2020-06-08 23:04:42 Hail: INFO: Coerced sorted dataset
2020-06-08 23:04:44 Hail: INFO: while writing:
    /work/data/hail-datasets/exports/balding-nichols-1pop-100kvariants-1ksamples.bgen
  merge time: 25.254ms


'/work/data/hail-datasets/exports/balding-nichols-1pop-100kvariants-1ksamples'

### HapMap

In [3]:
# Load the HapMap phase 3 matrix table from local disk and show its schema.
hapmap_mt_path = '/work/data/tutorial/1_QC_GWAS/HapMap_3_r3_1.mt'
mt = hl.read_matrix_table(hapmap_mt_path)
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'fam_id': str
    'pat_id': str
    'mat_id': str
    'is_female': bool
    'is_case': bool
----------------------------------------
Row fields:
    'locus': locus<hapmap3_hg18>
    'alleles': array<str>
    'rsid': str
    'cm_position': float64
----------------------------------------
Entry fields:
    'GT': call
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [4]:
# Output prefix for the HapMap export; the write log below shows Hail
# adding the `.bgen` extension to it.
path = str(export_dir.joinpath('HapMap.phase_3'))
hl.export_bgen(mt, output=path)
# Display the prefix that was written.
path

2020-06-08 22:32:56 Hail: INFO: while writing:
    /work/data/hail-datasets/exports/HapMap.phase_3.bgen
  merge time: 209.473ms


'/work/data/hail-datasets/exports/HapMap.phase_3'

### 1KG chrX

Source: `gs://hail-datasets-hail-data/1000_Genomes_chrX.phase_3.GRCh38.mt` (the cell below reads a local copy of this matrix table).

In [5]:
# Load the 1000 Genomes chrX (phase 3, GRCh38) matrix table from local disk and show its schema.
kg_chrx_mt_path = '/work/data/hail-datasets/1000_Genomes_chrX.phase_3.GRCh38.mt'
mt = hl.read_matrix_table(kg_chrx_mt_path)
mt.describe()

----------------------------------------
Global fields:
    'metadata': struct {
        name: str, 
        version: str, 
        reference_genome: str, 
        n_rows: int32, 
        n_cols: int32, 
        n_partitions: int32
    }
----------------------------------------
Column fields:
    's': str
    'population': str
    'super_population': str
    'is_female': bool
    'family_id': str
    'relationship_role': str
    'maternal_id': str
    'paternal_id': str
    'children_ids': array<str>
    'sibling_ids': array<str>
    'second_order_relationship_ids': array<str>
    'third_order_relationship_ids': array<str>
    'sample_qc': struct {
        call_rate: float64, 
        n_called: int64, 
        n_not_called: int64, 
        n_hom_ref: int64, 
        n_het: int64, 
        n_hom_var: int64, 
        n_non_ref: int64, 
        n_singleton: int64, 
        n_snp: int64, 
        n_insertion: int64, 
        n_deletion: int64, 
        n_transition: int64, 
        n_trans

In [6]:
# Output prefix for the 1000 Genomes chrX export; the write log below shows
# Hail adding the `.bgen` extension to it.
path = str(export_dir.joinpath('1000_Genomes_chrX.phase_3.GRCh38'))
hl.export_bgen(mt, output=path)
# Display the prefix that was written.
path

2020-06-08 22:33:24 Hail: INFO: while writing:
    /work/data/hail-datasets/exports/1000_Genomes_chrX.phase_3.GRCh38.bgen
  merge time: 262.117ms


'/work/data/hail-datasets/exports/1000_Genomes_chrX.phase_3.GRCh38'