In [1]:
import hail as hl
import numpy as np
import pandas as pd
import shutil
from pathlib import Path
import yaml
hl.init()

Running on Apache Spark version 2.4.1
SparkUI available at http://ab8c17508922:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.47-d9e1f3a110c8
LOGGING: writing to /home/jovyan/work/repos/sgkit-dev/method/regenie/validation/hail-20200721-1804-0.2.47-d9e1f3a110c8.log


In [5]:
# Load the dataset definitions from config.yml (resolved relative to the working directory)
# and echo them so the notebook records what was read.
with open('config.yml') as fd:
    # NOTE(review): FullLoader can construct more than plain data types; the printed
    # config is a plain nested dict, so yaml.safe_load would likely suffice - confirm.
    config = yaml.load(fd, Loader=yaml.FullLoader)
print(config)

{'datasets': {'sim_01': {'n_variants': 1000, 'n_samples': 250, 'n_contigs': 1, 'n_covars': 3, 'n_traits': 1}}}


In [2]:
# Simulation parameters used by the cells below.
# NOTE(review): these repeat the 'sim_01' values printed from config.yml instead of being
# read from `config`, so the two must be kept in sync by hand (possibly an injected
# parameters cell - confirm). `dataset_name` is not referenced by the visible cells;
# `output_dir` hard-codes the same name.
dataset_name = 'sim_01'
n_variants = 1000
n_samples = 250
n_contigs = 1
n_covars = 3
n_traits = 1
output_dir = 'data/sim_01'

In [3]:
def add_default_plink_fields(mt):
    """Attach all-missing placeholder fields to a Hail MatrixTable.

    Rows gain a missing string field ``rsid``; columns gain missing string
    fields ``fam_id``, ``pat_id``, ``mat_id`` and missing boolean fields
    ``is_female``, ``is_case``.

    NOTE(review): the field names look like the schema produced by Hail's
    PLINK importer - confirm that this is the intent.

    Parameters
    ----------
    mt : hail MatrixTable to annotate.

    Returns
    -------
    A new MatrixTable with the extra row and column fields.
    """
    # Keyword order determines the field order, so strings come first, then booleans.
    column_defaults = {
        field: hl.null(hl.tstr) for field in ("fam_id", "pat_id", "mat_id")
    }
    column_defaults.update(
        {field: hl.null(hl.tbool) for field in ("is_female", "is_case")}
    )
    with_rsid = mt.annotate_rows(rsid=hl.null(hl.tstr))
    return with_rsid.annotate_cols(**column_defaults)


def get_plink_sim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=0):
    """Simulate a small genotype MatrixTable with random diploid calls.

    One record is generated per (position, sample, contig) combination, so
    ``n_variants`` is the number of positions *per contig*. Every variant is
    written as ``"<contig>:<position>:A:C"`` (both 1-based) with a constant
    ``cm`` of 0.1, and each genotype is a call built from two independent
    draws of ``randint(0, 2)`` (i.e. allele indices 0 or 1).

    Parameters
    ----------
    n_variants : number of positions generated on each contig.
    n_samples : number of samples; ids are ``S0000001``, ``S0000002``, ...
    n_contigs : number of contigs, named ``1`` .. ``n_contigs``.
    seed : seed for the ``np.random.RandomState`` that drives the genotype draws.

    Returns
    -------
    MatrixTable keyed by (locus, alleles) x (s) with entry field ``GT``, row
    field ``cm`` and the placeholder fields added by ``add_default_plink_fields``.
    """
    rng = np.random.RandomState(seed)

    def random_call():
        # Two sequential draws; the draw order is part of the reproducible output.
        first_allele = rng.randint(0, 2)
        second_allele = rng.randint(0, 2)
        return hl.Call([first_allele, second_allele])

    # Iteration order (position -> sample -> contig) fixes which draw lands where.
    records = [
        {
            "v": f"{contig + 1}:{position + 1}:A:C",
            "s": f"S{sample + 1:07d}",
            "cm": 0.1,
            "GT": random_call(),
        }
        for position in range(n_variants)
        for sample in range(n_samples)
        for contig in range(n_contigs)
    ]

    record_type = hl.dtype("struct{v: str, s: str, cm: float64, GT: call}")
    entries = hl.Table.parallelize(records, record_type)
    # Replace the variant string with the parsed locus / alleles fields.
    entries = entries.transmute(**hl.parse_variant(entries.v))
    matrix = entries.to_matrix_table(
        row_key=["locus", "alleles"],
        col_key=["s"],
        row_fields=["cm"],
    )
    return add_default_plink_fields(matrix)

In [4]:
# Build the simulated genotype MatrixTable from the notebook parameters (default seed)
# and print its schema.
mt = get_plink_sim_dataset(n_variants=n_variants, n_samples=n_samples, n_contigs=n_contigs)
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'fam_id': str
    'pat_id': str
    'mat_id': str
    'is_female': bool
    'is_case': bool
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'cm': float64
    'rsid': str
----------------------------------------
Entry fields:
    'GT': call
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [5]:
# Materialise the per-entry alternate-allele counts as a dense NumPy array.
# The result is laid out variants x samples (see the displayed shape).
gt = hl.linalg.BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles()).to_numpy()
gt.shape

2020-07-16 19:47:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 19:47:29 Hail: INFO: Wrote all 1 blocks of 1000 x 250 matrix with block size 4096.


(1000, 250)

In [6]:
# Peek at the top-left 5 x 5 corner of the genotype matrix.
gt[:5, :5]

array([[1., 1., 2., 2., 2.],
       [2., 0., 1., 2., 0.],
       [1., 1., 0., 1., 2.],
       [2., 0., 1., 2., 1.],
       [2., 0., 1., 0., 1.]])

In [7]:
# Collect the sample ids from the MatrixTable columns into a Python list.
sample_ids = mt.s.collect()
# NOTE: only the last expression of a cell is displayed, so this length is computed
# but never shown.
len(sample_ids)
sample_ids[:5]

['S0000001', 'S0000002', 'S0000003', 'S0000004', 'S0000005']

In [8]:
def get_covariates(n, sample_ids, seed=0):
    """Draw standard-normal covariates for every sample.

    Parameters
    ----------
    n : number of covariate columns; they are named ``X000``, ``X001``, ...
    sample_ids : sequence of sample identifiers, one row is produced per id.
    seed : seed for the ``np.random.RandomState`` used for the draws.

    Returns
    -------
    DataFrame of shape (len(sample_ids), n) indexed by ``sample_id``.
    """
    rng = np.random.RandomState(seed)
    column_names = [f'X{col:03d}' for col in range(n)]
    values = rng.normal(size=(len(sample_ids), n))
    covariates = pd.DataFrame(values, columns=column_names)
    # Attach the ids as a column first, then promote that column to the index.
    covariates['sample_id'] = sample_ids
    return covariates.set_index('sample_id')
# Simulate the covariates for all samples (default seed).
df_cov = get_covariates(n_covars, sample_ids)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() - that only appended a stray "None" line to the output.
df_cov.info()
df_cov.head()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, S0000001 to S0000250
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X000    250 non-null    float64
 1   X001    250 non-null    float64
 2   X002    250 non-null    float64
dtypes: float64(3)
memory usage: 7.8+ KB
None


Unnamed: 0_level_0,X000,X001,X002
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S0000001,1.764052,0.400157,0.978738
S0000002,2.240893,1.867558,-0.977278
S0000003,0.950088,-0.151357,-0.103219
S0000004,0.410599,0.144044,1.454274
S0000005,0.761038,0.121675,0.443863


In [9]:
def get_betas(n_traits, gt, df_cov, seed=0):
    """Draw random effect sizes for covariates and variants.

    Covariate effects are drawn from N(2, 1) and variant effects from
    N(-2, 1), in that order, from one ``np.random.RandomState(seed)``. The
    second half of the rows (from index ``rows // 2`` on) of each effect
    matrix is then set to zero.

    Parameters
    ----------
    n_traits : number of traits; columns are named ``Y0000``, ``Y0001``, ...
    gt : genotype array; only ``gt.shape[0]`` (the variant count) is used.
    df_cov : covariate DataFrame; its columns name the covariate effects.
    seed : seed for the random state.

    Returns
    -------
    (df_beta_cov, df_beta_var) : DataFrames of shape (n_covars, n_traits) and
    (n_variants, n_traits), indexed by ``B-<covariate>`` and ``B-V0000000``...
    """
    rng = np.random.RandomState(seed)
    covariate_count = df_cov.shape[1]
    variant_count = gt.shape[0]
    trait_names = [f'Y{t:04d}' for t in range(n_traits)]

    # Draw order (covariates first, then variants) is part of the reproducible output.
    cov_effects = rng.normal(loc=2.0, scale=1, size=(covariate_count, n_traits))
    var_effects = rng.normal(loc=-2.0, scale=1, size=(variant_count, n_traits))

    # Zero out the trailing half of each effect matrix.
    for effects in (cov_effects, var_effects):
        effects[effects.shape[0] // 2:, :] = 0

    cov_labels = [f'B-{name}' for name in df_cov.columns]
    var_labels = [f'B-V{v:07d}' for v in range(variant_count)]
    return (
        pd.DataFrame(cov_effects, index=cov_labels, columns=trait_names),
        pd.DataFrame(var_effects, index=var_labels, columns=trait_names),
    )

# Draw the covariate and variant effect sizes for the simulated traits (default seed).
df_beta_cov, df_beta_var = get_betas(n_traits, gt, df_cov)

In [10]:
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() - that only appended a stray "None" line to the output.
df_beta_cov.info()
df_beta_cov.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, B-X000 to B-X002
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Y0000   3 non-null      float64
dtypes: float64(1)
memory usage: 48.0+ bytes
None


Unnamed: 0,Y0000
B-X000,3.764052
B-X001,0.0
B-X002,0.0


In [11]:
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() - that only appended a stray "None" line to the output.
df_beta_var.info()
df_beta_var.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, B-V0000000 to B-V0000999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Y0000   1000 non-null   float64
dtypes: float64(1)
memory usage: 15.6+ KB
None


Unnamed: 0,Y0000
B-V0000000,0.240893
B-V0000001,-0.132442
B-V0000002,-2.977278
B-V0000003,-1.049912
B-V0000004,-2.151357


In [12]:
def get_traits(gt, df_cov, df_beta_var, df_beta_cov, scale=.001, seed=0):
    """Simulate traits as a linear model of genotypes and covariates plus noise.

    Computes ``gt.T @ beta_var + cov @ beta_cov + noise`` where the noise is
    drawn from N(0, scale) using ``np.random.RandomState(seed)``.

    NOTE(review): ``seed`` defaults to 0, the same default as
    ``get_covariates`` and ``get_betas``. Each builds its own
    ``RandomState(seed)``, so with defaults the noise is a scaled copy of the
    leading draws that also produced the covariates. Consider passing a
    distinct seed if independence matters.

    Parameters
    ----------
    gt : 2-D array laid out variants x samples.
    df_cov : covariates, one row per sample (row count must equal ``gt.shape[1]``).
    df_beta_var : variant effects, one column per trait.
    df_beta_cov : covariate effects, same number of columns as ``df_beta_var``.
    scale : standard deviation of the additive noise.
    seed : seed for the noise draws.

    Returns
    -------
    DataFrame of shape (n_samples, n_traits), indexed like ``df_cov`` with the
    columns of ``df_beta_cov``.

    Raises
    ------
    AssertionError if the sample or trait counts disagree, or if the result
    contains nulls.
    """
    # Unpacking also enforces that `gt` is two-dimensional.
    _, sample_count = gt.shape
    assert gt.shape[1] == df_cov.shape[0]
    assert df_beta_var.shape[1] == df_beta_cov.shape[1]
    trait_count = df_beta_var.shape[1]

    rng = np.random.RandomState(seed)
    residual = rng.normal(scale=scale, loc=0, size=(sample_count, trait_count))

    genetic_part = gt.T @ df_beta_var.values
    covariate_part = df_cov.values @ df_beta_cov.values
    traits = pd.DataFrame(
        genetic_part + covariate_part + residual,
        index=df_cov.index,
        columns=df_beta_cov.columns,
    )
    assert traits.notnull().all().all()
    return traits
    
# Simulate the traits from genotypes, covariates and their effect sizes (default seed).
df_trait = get_traits(gt, df_cov, df_beta_var, df_beta_cov, scale=.001)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() - that only appended a stray "None" line to the output.
df_trait.info()
df_trait.head()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, S0000001 to S0000250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Y0000   250 non-null    float64
dtypes: float64(1)
memory usage: 3.9+ KB
None


Unnamed: 0_level_0,Y0000
sample_id,Unnamed: 1_level_1
S0000001,-1015.78906
S0000002,-956.278049
S0000003,-969.424269
S0000004,-1025.074713
S0000005,-990.430356


In [13]:
# Start from an empty output directory.
# WARNING: anything already present under `output_dir` is deleted recursively first.
output_path = Path(output_dir)
if output_path.exists():
    shutil.rmtree(output_path)
output_path.mkdir(parents=True)
output_path

PosixPath('data/sim_01')

In [14]:
# Write the genotypes as a PLINK fileset (.bed/.bim/.fam) under the 'genotypes' prefix.
# NOTE(review): no optional field arguments are passed, so the `cm` row field and the
# placeholder column fields on `mt` are presumably not forwarded to the files - confirm
# against the hl.export_plink documentation whether that is intended.
path = str(output_path / 'genotypes')
hl.export_plink(mt, path)
path

2020-07-16 19:51:00 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 19:51:01 Hail: INFO: merging 17 files totalling 61.5K...
2020-07-16 19:51:01 Hail: INFO: while writing:
    data/sim_01/genotypes.bed
  merge time: 11.587ms
2020-07-16 19:51:01 Hail: INFO: merging 16 files totalling 23.2K...
2020-07-16 19:51:01 Hail: INFO: while writing:
    data/sim_01/genotypes.bim
  merge time: 7.064ms
2020-07-16 19:51:01 Hail: INFO: merging 16 files totalling 4.9K...
2020-07-16 19:51:01 Hail: INFO: while writing:
    data/sim_01/genotypes.fam
  merge time: 5.692ms
2020-07-16 19:51:01 Hail: INFO: wrote 1000 variants and 250 samples to 'data/sim_01/genotypes'


'data/sim_01/genotypes'

In [15]:
# Write the covariates; reset_index() turns the sample_id index into a regular column
# so it is written as the first CSV column.
path = str(output_path / 'covariates.csv')
df_cov.reset_index().to_csv(path, index=False)
path

'data/sim_01/covariates.csv'

In [16]:
# Write the simulated traits; reset_index() turns the sample_id index into a regular
# column so it is written as the first CSV column.
path = str(output_path / 'traits.csv')
df_trait.reset_index().to_csv(path, index=False)
path

'data/sim_01/traits.csv'

In [17]:
# Write the covariate effect sizes; the 'B-<covariate>' index labels are kept as the
# first (unnamed) CSV column.
path = str(output_path / 'beta_covariate.csv')
df_beta_cov.to_csv(path, index=True)
path

'data/sim_01/beta_covariate.csv'

In [18]:
# Write the variant effect sizes; the 'B-V<index>' index labels are kept as the
# first (unnamed) CSV column.
path = str(output_path / 'beta_variant.csv')
df_beta_var.to_csv(path, index=True)
path

'data/sim_01/beta_variant.csv'

----------

In [2]:
# List the packages installed in the active conda environment (presumably kept as a
# record of the environment that produced the data - confirm).
!conda list

# packages in environment at /opt/conda:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       0_gnu    conda-forge
alembic                   1.4.2              pyh9f0ad1d_0    conda-forge
appdirs                   1.4.4                    pypi_0    pypi
astroid                   2.4.2            py37hc8dfbb8_0    conda-forge
async_generator           1.10                       py_0    conda-forge
attrs                     19.3.0                     py_0    conda-forge
backcall                  0.2.0              pyh9f0ad1d_0    conda-forge
backports-tempfile        1.0                      pypi_0    pypi
backports-weakref         1.0.post1                pypi_0    pypi
bgen-reader               4.0.4                    pypi_0    pypi
black                     19.10b0                  py37_0    conda-forge
bleach                    3.1.5           