# Liftover summary stats from hg19 to hg38

This notebook lifts over GWAS summary statistics from one reference genome build to another (by default, from hg19 to hg38).

## Pre-requisites

### Two ways to use this pipeline on csglogin

`export PATH=/home/yh3455/miniconda3/bin:$PATH`

### Or install the following packages in your environment

Make sure you install the prerequisites before running this notebook:

```
pip install LDtoolsets
```

## Illustration on how to run this notebook

```
sos run Liftover_Sumstat.ipynb \
    --input_path sumstats.snp_stats.gz \
    --cwd  output \
    --container_lmm lmm.sif
```

In [None]:
[global]
# Work directory where output will be saved to
parameter: cwd = path
# Input file (sumstats to do liftover)
parameter: input_path = path
# From reference genome, default is hg19
parameter: fr = 'hg19'
# To reference genome, default is hg38
parameter: to = 'hg38'
# Remove unmapped coordinates from the sumstats (defaults to True)
parameter: remove_missing = True
# Container image passed to the report and python actions of this workflow
parameter: container_lmm = 'statisticalgenetics/lmm:3.0'

In [None]:
[default_1 (export utils script)]
# Write the liftover helper script that follows to {cwd}/utils.py.
# Requires the LDtools Python module to be importable.
depends:  Py_Module('LDtools')
output: f'{cwd:a}/utils.py'
report:container=container_lmm, expand = '${ }', output=f'{cwd:a}/utils.py'

    from LDtools.sumstat import Sumstat
    from LDtools.liftover import Liftover
    def main(input_path,output_path,output_unmapped,output_mapped,fr='hg19',to='hg38',remove_missing=True):
        """Lift a summary statistics file over from one genome build to another.

        Args:
            input_path: Summary statistics file, read with LDtools ``Sumstat``.
            output_path: Destination of the full lifted table; written only
                when ``remove_missing`` is false.
            output_unmapped: Destination of the rows whose ``CHR`` is 0 after
                liftover; written only when ``remove_missing`` is true.
            output_mapped: Destination of the rows whose ``CHR`` is not 0 after
                liftover; written only when ``remove_missing`` is true.
            fr: Name of the source genome build passed to ``Liftover``.
            to: Name of the target genome build passed to ``Liftover``.
            remove_missing: If true, split the lifted table into the
                unmapped/mapped files instead of writing a single file.

        All outputs are gzip-compressed, tab-separated, with a header row and
        without the index column.
        """
        lf = Liftover(fr, to)
        print("reading GWAS sumstat")
        sums = Sumstat(input_path)
        # Keep the words apart; plain concatenation printed "liftover fromhg19tohg38".
        print(f"liftover from {fr} to {to}")
        lifted = lf.sumstat_liftover(sums.ss)
        csv_options = dict(compression='gzip', sep="\t", header=True, index=False)
        if remove_missing:
            # NOTE(review): CHR == 0 appears to mark coordinates that could not
            # be lifted over -- confirm against LDtools.
            is_unmapped = lifted.CHR == 0
            lifted[is_unmapped].to_csv(output_unmapped, **csv_options)
            lifted[~is_unmapped].to_csv(output_mapped, **csv_options)
        else:
            lifted.to_csv(output_path, **csv_options)

In [None]:
[default_2 (do liftover)]
# Run the exported utils.py on the input summary statistics to lift them over
# from the `fr` build to the `to` build.
depends: f'{cwd:a}/utils.py'
input: input_path
# Output names carry the target build (identical to the previous hard-coded
# "hg38" names when `to` keeps its default).
# NOTE(review): main() writes only sumstats_lifted when remove_missing is False
# and only the unmapped/mapped pair when it is True, yet all three files are
# declared here -- confirm SoS does not fail on the target(s) left unwritten.
output: sumstats_lifted = f'{cwd}/{_input:bnn}.{to}.sumstats.gz',
        sumstats_unmapped = f'{cwd}/{_input:bnn}.{to}.sumstats_unmapped.gz',
        sumstats_mapped = f'{cwd}/{_input:bnn}.{to}.sumstats_mapped.gz'
task: trunk_workers = 1, job_size=1, walltime = '1h', mem = '8G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
python:container=container_lmm, input = f'{cwd:a}/utils.py', expand = '${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    
    input_path=${_input:r}
    output_path=${_output[0]:r}
    output_unmapped=${_output[1]:r}
    output_mapped=${_output[2]:r}
    # The build names are interpolated with !r so they arrive as quoted Python
    # string literals; unquoted they would be bare names and raise NameError.
    fr = ${fr!r}
    to = ${to!r}
    remove_missing=${remove_missing}

    main(input_path,output_path,output_unmapped,output_mapped,fr,to,remove_missing)