# Genotype data formatting utilities

This module implements a collection of workflows used to format genotype data.

## Input

The module streamlines conversion between PLINK and VCF formats (possibly more to add), specifically:

1. Conversion between VCF and PLINK formats
2. Split data by chromosomes
3. Merge data by chromosomes
4. Split data by genes

Depending on the analysis task, input files are specified in one of the following formats:

1. A single whole-genome data set in VCF format, or as a PLINK bed/bim/fam bundle; or,
2. A list of per-chromosome VCF or PLINK files (without suffix); or,
3. A file containing a list of per-chromosome VCF or PLINK files (without suffix)

## Output

Genotype data after reformatting.

## Examples

A minimal working example data set, as well as the Singularity container `bioinfo.sif`, can be downloaded from [Google Drive](https://drive.google.com/drive/u/0/folders/1ahIZGnmjcGwSd-BI91C9ayd_Ya8sB2ed).

### PLINK file merger

```
sos run genotype_formatting.ipynb merge_plink \
    --genoFile data/genotype/chr1.bed data/genotype/chr6.bed \
    --cwd output/genotype \
    --name chr1_chr6 \
    --container container/bioinfo.sif
```

In [2]:
[global]
import os
# Work directory & output directory
parameter: cwd = path
# Path to the container image passed to the script actions (e.g. a Singularity .sif file);
# leave empty to run without a container
parameter: container = ''
# File prefix for the analysis output
parameter: name = str
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Path to the .bed file of a merged, whole-genome genotype data set in PLINK bed/bim/fam format
parameter: genoFile = path
# Cis window size in base pairs, added on both sides of each region when extracting per-gene genotypes
parameter: window = 500000

# Get the unique chromosomes that have regions to be analyzed.
def extract(lst):
    """Return a list holding the first element of every record in ``lst``."""
    first_fields = []
    for record in lst:
        first_fields.append(record[0])
    return first_fields

import os


# FIXME: think of input: be a file of regions?
# Move away from global statement
# Optional file of regions, one whitespace-delimited record per line; blank lines and
# lines starting with '#' are ignored. Leave empty to skip region parsing.
parameter: region_list = ''

if region_list:
    # Read through a context manager so the file handle is closed as soon as parsing is done
    with open(region_list) as region_file:
        regions = [line.split() for line in map(str.strip, region_file) if line and not line.startswith('#')]
    # Unique values of the first field of every region record
    chrom = list(set(extract(regions)))
if not container:
    # Normalize the empty default to None before it is handed to the script actions
    container = None
# use this function to edit memory string for PLINK input
from sos.utils import expand_size
# Keep the work directory as an absolute path string
cwd = f"{cwd:a}"
# FIXME: For compatibility before I change everything to `cwd`
wd = cwd


# File listing the per-chromosome genotype files in PLINK bed/bim/fam format:
# one whitespace-delimited record per line, chromosome label first, then the file path.
# Blank lines and lines starting with '#' are ignored.
parameter: genotype_list = path

# Read through a context manager so the file handle is closed as soon as parsing is done
with open(genotype_list) as genotype_list_file:
    chrom_list = [line.split() for line in map(str.strip, genotype_list_file) if line and not line.startswith('#')]
# Get the unique chromosomes that have regions to be analyzed.
def extract(lst):
    """Collect the leading element of each record in ``lst`` into a new list."""
    return list(map(lambda record: record[0], lst))
# Unique chromosome labels taken from the first field of `chrom_list`. This assignment is
# unconditional, so it replaces any `chrom` derived from `region_list` earlier in this section;
# the order of the labels is arbitrary because they pass through a set.
chrom = list(set(extract(chrom_list)))

import os
import pandas as pd
def get_genotype_file(chrom, genotype_list, geno_inventory):
    """Resolve the genotype file that belongs to one chromosome.

    Parameters:
        chrom: chromosome label; a leading ``chr`` is dropped before the lookup.
        genotype_list: path of the genotype list file; its ``:ad`` formatting
            (the containing directory, as an absolute path, per SoS path
            format specifiers) is used to resolve relative file names.
        geno_inventory: mapping from chromosome label to genotype file name.

    Returns:
        ``path`` object of the genotype file. When the label is missing from
        ``geno_inventory`` the label itself is tried as the file name.

    Raises:
        ValueError: if the file exists neither as given nor relative to the
            directory of ``genotype_list``.
    """
    label = format(chrom)
    if label.startswith('chr'):
        label = label[3:]
    # Fall back to the bare label when the inventory has no entry for it
    candidate = geno_inventory[label] if label in geno_inventory else label
    if os.path.isfile(candidate):
        return path(candidate)
    # Not found as given: retry relative to the directory holding the genotype list
    relative_candidate = f'{genotype_list:ad}/' + candidate
    if os.path.isfile(relative_candidate):
        return path(relative_candidate)
    raise ValueError(f"Cannot find genotype file {candidate}")

# Map each chromosome label (first field of `genotype_list`) to its genotype file (second field).
# Blank lines and lines starting with '#' are skipped; every remaining line must have exactly
# two fields. The file is read through a context manager so the handle is closed afterwards.
with open(genotype_list) as genotype_list_file:
    geno_inventory = dict(line.split() for line in map(str.strip, genotype_list_file) if line and not line.startswith('#'))
# Second column of the first row after the header line, as parsed by pandas from the same list
genotype_dir = path(pd.read_csv(genotype_list,sep = "\t").values.tolist()[0][1])

### Plink to VCF transformation


In [None]:
[plink2vcf_1]
# Convert the PLINK data set `genoFile` into one bgzip-compressed, tabix-indexed VCF
# per chromosome label in `chrom`.
input: genoFile, for_each = "chrom"
output: f'{wd:a}/{name}_per_chrom_vcf/{name}_chr{_chrom}.vcf.gz',
        f'{wd:a}/{name}_per_chrom_vcf/{name}_chr{_chrom}.vcf.gz.tbi'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
# plink is given the output path with both extensions stripped (`:nn`) as its prefix; the
# resulting .vcf (`:n`) is compressed by bgzip into the declared .vcf.gz, which tabix then indexes.
bash: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genoFile:ad}:{genoFile:ad}']
    plink --bfile $[genoFile:n] \
    --recode vcf-iid       --out $[_output[0]:nn] --chr $[_chrom]
    bgzip $[_output[0]:n]  
    tabix -f -p vcf $[_output[0]]

In [None]:
[plink2vcf_2]
# Write a tab-delimited table with columns `#chr` and `dir` that lists, for every chromosome
# in `chrom`, the per-chromosome VCF path. The paths are rebuilt from the output prefix and
# the chromosome label rather than read from the step input.
# NOTE(review): the labels in `chrom` are interpolated unquoted into the R `c(...)` call, so
# this presumably assumes purely numeric chromosome labels -- confirm before using labels
# such as X or chr1.
input: group_by = "all"
output: f'{wd:a}/{name}_per_chrom_vcf/{name}.vcf_chrom_list.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genoFile:ad}:{genoFile:ad}']
    library("dplyr")
    library("tibble")
    library("readr")
    library("modelr")
    library("purrr")
    chrom = c($[",".join(chrom)])
    dir = "$[_output:nn]"
    geno_list = tibble(`#chr` = chrom, dir = map_chr(`#chr`,~paste(c(dir,"_chr",.x,".vcf.gz"),collapse ="")))%>%arrange(`#chr`)
    geno_list%>%write_delim("$[_output]","\t")

### Partition Plink by genes


In [None]:
[plink_by_gene_1]
# Extract one PLINK data set per record in `regions` from the whole-genome `genoFile`.
# Each region record is used as: [0] chromosome, [1] start position, [2] end position,
# [3] region name (becomes the output file name). The extracted interval is widened by
# `window` on both sides, with the lower bound replaced by 1 when it would be negative.
input: genoFile,  for_each = 'regions'
output: f'{wd:a}/{name}_per_gene_plink/{_regions[3]}.bed'

task: trunk_workers = 1, trunk_size = job_size, walltime = '12h',  mem = '6G', tags = f'{step_name}_{_output[0]:bn}'
# A failing plink call is tolerated (`|| true`) and the output is touched afterwards, so the
# declared .bed file always exists -- presumably to keep regions without variants from
# failing the workflow (confirm).
bash: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genoFile:ad}:{genoFile:ad}']
    ##### Get the locus genotypes for $[_regions[3]]
    plink --bfile $[genoFile:an] \
    --make-bed \
    --out $[_output[0]:n] \
    --chr $[_regions[0]] \
    --from-bp $[f'1' if (int(_regions[1]) - window) < 0 else f'{(int(_regions[1]) - window)}'] \
    --to-bp $[int(_regions[2]) + window ] \
    --allow-no-sex || true
    
    touch $[_output]

In [None]:
[plink_by_gene_2]
# Write a tab-delimited table that pairs each region name (4th field of `regions`)
# with the per-gene .bed file received as step input.
input: group_by = "all"
output: f'{wd:a}/{name}_per_gene_plink/{name}.plink_gene_list.txt'
import pandas as pd
# Assumes `_input` holds exactly one file per region, in the same order as `regions` -- TODO confirm
df = pd.DataFrame({"region" : [x[3] for x in regions] ,"dir" : _input})
# index = 0 is falsy, so the row index is not written to the file
df.to_csv(_output,sep = "\t",index = 0)

### Partition Plink by Chrom


In [None]:
[plink_by_chrom_1]
# Extract one PLINK bed/bim/fam data set per chromosome label in `chrom` from the
# whole-genome `genoFile`.
input: genoFile, for_each = "chrom"
output: f'{wd:a}/{name}_per_chrom_plink/{name}_chr{_chrom}.bed'
# look up for genotype file
task: trunk_workers = 1, trunk_size = job_size, walltime = '12h',  mem = '6G', tags = f'{step_name}_{_output[0]:bn}'
# `_chrom` is one chromosome label (a string), so it is passed whole to `--chr`; indexing it
# would keep only its first character (e.g. "1" for chromosome 10) and extract the wrong chromosome.
# A failing plink call is tolerated via `|| true`.
bash: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genoFile:ad}:{genoFile:ad}']
    ##### Get the locus genotypes for $[_chrom]
    plink --bfile $[_input:an] \
    --make-bed \
    --out $[_output[0]:n] \
    --chr $[_chrom] \
    --allow-no-sex || true

In [None]:
[plink_by_chrom_2]
# Write a tab-delimited table with columns `#chr` and `dir` that lists, for every chromosome
# in `chrom`, the per-chromosome .bed path. The paths are rebuilt from the output prefix and
# the chromosome label rather than read from the step input.
# NOTE(review): the labels in `chrom` are interpolated unquoted into the R `c(...)` call, so
# this presumably assumes purely numeric chromosome labels -- confirm before using labels
# such as X or chr1.
input: group_by = "all"
output: f'{wd:a}/{name}_per_chrom_plink/{name}.plink_chrom_list.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genoFile:ad}:{genoFile:ad}']
    library("dplyr")
    library("tibble")
    library("readr")
    library("modelr")
    library("purrr")
    chrom = c($[",".join(chrom)])
    dir = "$[_output:nn]"
    geno_list = tibble(`#chr` = chrom, dir = map_chr(`#chr`,~paste(c(dir,"_chr",.x,".bed"),collapse ="")))%>%arrange(`#chr`)
    geno_list%>%write_delim("$[_output]","\t")

## Merge plink files

In [None]:
[merge_plink]
# Merge several PLINK data sets into `{cwd}/{name}.bed` using `plink --merge-list`.
# Input is a list of PLINK bed files
parameter: genoFile = paths
# Nothing to merge when only a single file is given
skip_if(len(genoFile) == 1)
input: genoFile, group_by = 'all'
output: f"{cwd}/{name}.merge_list", f"{cwd}/{name}.bed"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[1]:bn}'

# The first input is passed to plink via --bfile; every remaining input goes into the
# merge list, one path per line with its extension stripped (`:n`).
with open(_output[0], 'w') as f:
    f.write('\n'.join([str(f'{x:n}') for x in _input[1:]]))

# --memory receives 90% of `mem` divided by 1e6, i.e. megabytes if `expand_size`
# returns bytes (assumption -- confirm against sos.utils).
bash: container=container, expand= "${ }", stderr = f'{_output[1]:n}.stderr', stdout = f'{_output[1]:n}.stdout'
    plink \
    --bfile ${_input[0]:n} \
    --merge-list ${_output[0]} \
    --make-bed \
    --out ${_output[1]:n} \
    --threads ${numThreads} \
    --memory ${int(expand_size(mem) * 0.9)/1e06}

## Merge VCF files

In [None]:
[merge_vcf]
# Input is a list of VCF files, concatenated in the order given
parameter: genoFile = paths
# Nothing to merge when only a single file is given
skip_if(len(genoFile) == 1)
input: genoFile, group_by = 'all'
output:  f"{cwd}/{name}.vcf.gz"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# Concatenate all inputs with bcftools into one compressed VCF (-Oz), then index it with tabix
bash: container=container, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    bcftools concat -Oz ${_input} > ${_output}
    tabix -p vcf ${_output}

In [None]:
# NOTE(review): this cell has no step header in view, so it is unclear which workflow
# (if any) executes it -- confirm before relying on it.
import pandas as pd
# Tab-delimited genotype list; it must contain a `dir` column
parameter: genotype_list = path
geno_file_inv = pd.read_csv(genotype_list, sep = "\t")
# Replace `genoFile` with the values of the `dir` column of the list
genoFile = geno_file_inv["dir"].values.tolist()