# Genome-wide Linkage Analysis

## Aim

Phase haplotypes from VCF files and run genome-wide linkage analysis.

### Input


- `--cwd`, the work directory where the output will be saved.
- `--chrom`, a list of chromosomes to analyze
    - e.g. `1 2 3`
- `--fam-path`, the path of the fam file.
- `--vcf-path`, the path of a genotype file in `vcf` format.
- `--anno-path`, the path of the directory containing the per-chromosome annotation files.
- `--pop-path`, the path of a sample source file.

### Output
- haplotypes
- lods

## Command Interface

sos run seqlink_sos.ipynb -h

## Example command

### seqlink
```
sos run nbs/seqlink_sos.ipynb seqlink --cwd data/wg20220316 --fam_path data/new_trim_ped_famless17_no:xx.fam --vcf_path /mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz --anno_path MWE/annotation --pop_path data/full_sample_fam_pop.txt --chrom 9 10
```
### linkage
```
sos run nbs/seqlink_sos.ipynb linkage --cwd data/wg20220316 --fam_path data/new_trim_ped_famless17_no:xx.fam --chrom 1 2 3 4 5 6 7 8
```
### seqlink and linkage
```
sos run nbs/seqlink_sos.ipynb --cwd data/wg20220316 --fam_path data/new_trim_ped_famless17_no:xx.fam --vcf_path /mnt/mfs/statgen/alzheimers-family/linkage_files/geno/full_sample/vcf/full_sample.vcf.gz --anno_path MWE/annotation --pop_path data/full_sample_fam_pop.txt --chrom 9 10
```

## Workflow codes

In [None]:
[global]
# Work directory where output will be saved to
parameter: cwd = path
# Fam file
parameter: fam_path = path
# Chromosomes to process; every step below runs once per entry (for_each = 'chrom')
parameter: chrom = list
# Wall-clock limit handed to the `task:` statement of each step
parameter: walltime = '24h'
# Memory request handed to the `task:` statement of each step
parameter: mem = '128G'

In [None]:
[seqlink (phasing haps)]
# VCF file
parameter: vcf_path = path
# Directory holding one EFIGA_NIALOAD_chr<chrom>.hg38.hg38_multianno.csv file per chromosome
parameter: anno_path = path
# Sample source file path
parameter: pop_path = path
# Runs the seqlink command once per chromosome listed in `chrom`.
input: fam_path, vcf_path, anno_path, pop_path, for_each = 'chrom'
# Each chromosome writes to <cwd>/chr<chrom>test (passed to seqlink via -o).
output: f'{cwd:a}/chr{_chrom}test'
task: walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}'
# stdout and stderr of the script are captured next to the output path.
bash: expand = '${ }', stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    
    echo "start"
    seqlink --fam ${fam_path} --vcf ${vcf_path} \
    --anno '${anno_path:a}/EFIGA_NIALOAD_chr${_chrom}.hg38.hg38_multianno.csv' \
    --pop ${pop_path} \
    -o ${_output}  \
    -f 'MERLIN' --build 'hg38' --freq 'AF' --bin 1 --maf-cutoff 0.05 --jobs 1

In [None]:
[rho (cutoffnone rho0)]
# For every chromosome in `chrom`, compute summarised LOD results at rho = 0
# with no frequency cutoff for each cached *.pickle file.
input: cwd, fam_path, for_each = 'chrom'
output: f'{cwd:a}/chr{_chrom}test/tmp/CACHE/chr{_chrom}test'
task: trunk_workers = 10, trunk_size=10, walltime = walltime, mem = mem, cores = 10, tags = f'{step_name}_{_output:bn}'
python: expand = '${ }', stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'

    import os.path
    import glob
    import pandas as pd
    import numpy as np
    import pickle
    from SEQLinkage.linkage import *

    def run_gene_lods(file,fam,rho=0,cutoff=None):
        """Compute and save summarised LODs for one cached gene bundle.

        Loads `<file>.pickle`, formats it with `format_haps_bunch`, runs
        `parallel_lods` at `rho`, summarises with `sum_variant_lods`, and
        pickles the summary to `<file>cutoff<cutoff>_rho<rho>.result`.

        file   -- path prefix of the cached bundle (without '.pickle')
        fam    -- dict mapping family id to that family's rows of the fam table
        rho    -- value forwarded to `parallel_lods`
        cutoff -- if not None, only variant columns with freqs > cutoff are kept
        """
        with open(file+'.pickle', 'rb') as handle:
            genes = pickle.load(handle)
        gene_variants,gene_fam_haps = format_haps_bunch(genes,fam)
        if cutoff is not None:
            for f,variants in gene_variants.items():
                # Keep the first 6 columns unconditionally, then two columns per
                # retained variant (presumably one per haplotype -- confirm).
                gene_fam_haps[f]=gene_fam_haps[f].loc[:,[True]*6+list(np.repeat((variants.freqs>cutoff)[variants.uniq],2))]
        res = parallel_lods(gene_fam_haps.values(),rho)
        smy_res = sum_variant_lods(res)
        with open(file+'cutoff'+str(cutoff)+'_rho'+str(rho)+'.result','wb') as handle:
            pickle.dump(smy_res, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    fam17 = pd.read_csv(f'${fam_path}',sep=r'\s+',header=None,names=['fid','iid','fathid','mothid','sex','ad'])
    fam17.index = list(fam17.iid)
    # Recode -9 in the `ad` column to 0 (presumably the missing-phenotype code -- confirm).
    # A single .loc assignment is used because the chained form
    # `fam17.ad[mask] = 0` may write to a temporary copy and leave fam17 untouched.
    fam17.loc[fam17.ad==-9,'ad']=0
    # One sub-table per family id.
    fam17_d = {fid: fam17[fam17.fid==fid] for fid in fam17.fid.unique()}

    inputs=glob.glob(f'${_input[0]}/chr${_chrom}test/tmp/CACHE/chr${_chrom}test*.pickle')
    r=0
    for i in inputs:
        prefix = i[:-len('.pickle')]
        print(prefix,r)
        result_file = prefix+'cutoff'+str(None)+'_rho'+str(r)+'.result'
        if os.path.isfile(result_file):
            # Result already on disk from a previous run; do not recompute.
            print('exist! jump',result_file)
        else:
            run_gene_lods(prefix,fam17_d,r)

In [None]:
[linkage (linkage analysis)]
# For every chromosome in `chrom`, compute summarised LOD results for each
# cached *.pickle file at rho = 0.0, 0.05, ..., 0.45 using a 0.05 frequency cutoff.
input: cwd, fam_path, for_each = 'chrom'
output: f'{cwd:a}/chr{_chrom}test/tmp/CACHE/chr{_chrom}test'
task: walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}'
python: expand = '${ }', stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'

    import os.path
    import glob
    import pandas as pd
    import numpy as np
    import pickle
    from SEQLinkage.linkage import *

    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    fam17 = pd.read_csv(f'${fam_path}',sep=r'\s+',header=None,names=['fid','iid','fathid','mothid','sex','ad'])
    fam17.index = list(fam17.iid)
    # Recode -9 in the `ad` column to 0 (presumably the missing-phenotype code -- confirm).
    # A single .loc assignment is used because the chained form
    # `fam17.ad[mask] = 0` may write to a temporary copy and leave fam17 untouched.
    fam17.loc[fam17.ad==-9,'ad']=0
    # One sub-table per family id.
    fam17_d = {fid: fam17[fam17.fid==fid] for fid in fam17.fid.unique()}
    cutoff=0.05

    inputs=glob.glob(f'${_input[0]}/chr${_chrom}test/tmp/CACHE/chr${_chrom}test*.pickle')
    for i in inputs:
        prefix=i[:-len('.pickle')]
        # Load and format once per input; the result is reused for every rho below.
        with open(prefix+'.pickle', 'rb') as handle:
            genes = pickle.load(handle)
        gene_variants,gene_fam_haps = format_haps_bunch(genes,fam17_d)
        if cutoff is not None:
            for f,variants in gene_variants.items():
                # Keep the first 6 columns unconditionally, then two columns per
                # retained variant (presumably one per haplotype -- confirm).
                gene_fam_haps[f]=gene_fam_haps[f].loc[:,[True]*6+list(np.repeat((variants.freqs>cutoff)[variants.uniq],2))]
        # NOTE: str(r) of an np.arange float can be long (e.g. 0.15000000000000002);
        # the file names are kept as-is so existing results are still recognised.
        for r in np.arange(0.0,0.5,0.05):
            print(prefix,r)
            result_file = prefix+'cutoff'+str(cutoff)+'_rho'+str(r)+'.result'
            if os.path.isfile(result_file):
                print('exist! jump',result_file)
            else:
                res = parallel_lods(gene_fam_haps.values(),r)
                smy_res = sum_variant_lods(res)
                # Write to exactly the path tested above. Appending the
                # 'cutoff..._rho....result' suffix a second time would produce a
                # different file name, so the existence check could never succeed.
                with open(result_file,'wb') as handle:
                    pickle.dump(smy_res, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
[makehap (create hap)]
# For every chromosome in `chrom`, format each cached *.pickle file into a
# `<prefix>cutoff0.05.input` pickle holding [gene_variants, gene_fam_haps].

# Optional pickle file; it is loaded when it exists
parameter: fam_vcf = path()
input: cwd, fam_path, for_each = 'chrom'
output: f'{cwd:a}/chr{_chrom}test/tmp/CACHE/chr{_chrom}test'
task: walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}'
python: expand = '${ }', stderr = f'{_output:n}makehap.stderr', stdout = f'{_output:n}makehap.stdout'

    import sys
    import os.path
    import glob
    import pandas as pd
    import numpy as np
    import pickle
    from SEQLinkage.linkage import *

    def format_haps_by_genes(file,fam,cutoff=None):
        """Format one cached gene bundle and save it as an `.input` pickle.

        Loads `<file>.pickle`, formats it with `format_haps_bunch`, optionally
        drops variant columns with freqs <= cutoff from the haplotype tables,
        and pickles `[gene_variants, gene_fam_haps]` to
        `<file>cutoff<cutoff>.input`.

        file   -- path prefix of the cached bundle (without '.pickle')
        fam    -- dict mapping family id to that family's rows of the fam table
        cutoff -- if not None, only variant columns with freqs > cutoff are kept
        """
        with open(file+'.pickle', 'rb') as handle:
            genes = pickle.load(handle)
        gene_variants,gene_fam_haps = format_haps_bunch(genes,fam)
        if cutoff is not None:
            for f,variants in gene_variants.items():
                # Keep the first 6 columns unconditionally, then two columns per
                # retained variant (presumably one per haplotype -- confirm).
                gene_fam_haps[f]=gene_fam_haps[f].loc[:,[True]*6+list(np.repeat((variants.freqs>cutoff)[variants.uniq],2))]
                # NOTE(review): an earlier revision rebound the loop variable here
                # (`variants = variants[variants.freqs>cutoff]`), which had no
                # effect on gene_variants. The variant tables are therefore stored
                # unfiltered; confirm whether `gene_variants[f]` was meant to be
                # filtered as well.
        with open(file+'cutoff'+str(cutoff)+'.input','wb') as handle:
            pickle.dump([gene_variants,gene_fam_haps], handle, protocol=pickle.HIGHEST_PROTOCOL)

    # NOTE(review): fam17_vcf is loaded but not used anywhere else in this step.
    if os.path.isfile(f'${fam_vcf}'):
        with open(f'${fam_vcf}', 'rb') as handle:
            fam17_vcf = pickle.load(handle)
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    fam17 = pd.read_csv(f'${fam_path}',sep=r'\s+',header=None,names=['fid','iid','fathid','mothid','sex','ad'])
    fam17.index = list(fam17.iid)
    # Recode -9 in the `ad` column to 0 (presumably the missing-phenotype code -- confirm).
    # A single .loc assignment is used because the chained form
    # `fam17.ad[mask] = 0` may write to a temporary copy and leave fam17 untouched.
    fam17.loc[fam17.ad==-9,'ad']=0
    # One sub-table per family id.
    fam17_d = {fid: fam17[fam17.fid==fid] for fid in fam17.fid.unique()}
    cutoff=0.05

    inputs=glob.glob(f'${_input[0]}/chr${_chrom}test/tmp/CACHE/chr${_chrom}test*.pickle')
    for i in inputs:
        prefix=i[:-len('.pickle')]
        output=prefix+'cutoff'+str(cutoff)+'.input'
        if os.path.isfile(output):
            # Already formatted by a previous run.
            print('exist! jump',output,file=sys.stderr)
        else:
            print('create input',output,file=sys.stdout)
            format_haps_by_genes(prefix,fam17_d,cutoff)

In [None]:
[lods (calculate lods)]
# For every chromosome in `chrom`, run parallel_lods on each
# `*cutoff0.05.input` file and pickle the result to a `.lods` file.

# Optional pickle indexed by the same keys as gene_fam_haps; when the file exists,
# rows where its mask is False get their non-leading columns set to 0 and the
# output is named `...unimputed.lods` instead of `....lods`
parameter: fam_vcf = path()
input: cwd, fam_path, for_each = 'chrom'
output: f'{cwd:a}/chr{_chrom}test/tmp/CACHE/chr{_chrom}test'
task: walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}'
python: expand = '${ }', stderr = f'{_output:n}lods.stderr', stdout = f'{_output:n}lods.stdout'

    import sys
    import os.path
    import glob
    import pandas as pd
    import numpy as np
    import pickle
    from SEQLinkage.linkage import *

    # Evaluate once; the same answer drives loading, masking and output naming.
    has_fam_vcf = os.path.isfile(f'${fam_vcf}')
    if has_fam_vcf:
        with open(f'${fam_vcf}', 'rb') as handle:
            fam17_vcf = pickle.load(handle)
    output_suffix = 'unimputed.lods' if has_fam_vcf else '.lods'

    inputs=glob.glob(f'${_input[0]}/chr${_chrom}test/tmp/CACHE/chr${_chrom}test*cutoff0.05.input')
    for input_file in inputs:
        output_file=input_file[:-len('.input')]+output_suffix
        if os.path.isfile(output_file):
            # Already computed by a previous run.
            print('exist! jump',output_file,file=sys.stderr)
            continue
        print('create input',output_file,file=sys.stdout)
        with open(input_file, 'rb') as handle:
            gene_variants,gene_fam_haps = pickle.load(handle)
        if has_fam_vcf:
            for k,hap in gene_fam_haps.items():
                # Zero every column after the first 6 for rows whose mask is False.
                hap.loc[~fam17_vcf[k],[False]*6+[True]*(hap.shape[1]-6)]=0
                variants=gene_variants[k]
                gene_variants[k]=variants[variants.freqs>0.05]
        # TODO: remove variants only have 1 or 2. The loop intended for this had
        # no body, which made the whole script fail to parse, so it is omitted
        # until the filter is actually implemented.
        res = parallel_lods(gene_fam_haps.values(),np.arange(0,0.5,0.05))
        with open(output_file,'wb') as handle:
            pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

 trunk_workers = 10, trunk_size=10, walltime = walltime, mem = mem, cores = 10, 

import pandas as pd
import pickle

# For chromosomes 1-22, pickle the `Otherinfo1` values of annotation rows
# whose AF is above 0.05.
for chrom_no in range(1, 23):
    print(chrom_no)
    anno = pd.read_csv(f'../MWE/annotation/EFIGA_NIALOAD_chr{chrom_no}.hg38.hg38_multianno.csv')
    # '.' entries in AF are replaced by 0 before the numeric comparison
    # (presumably '.' marks a missing frequency -- confirm).
    allele_freq = anno.AF.replace('.', 0).astype(float)
    common = anno[allele_freq > 0.05].Otherinfo1
    with open(f'../data/wg20220316/chr{chrom_no}_common_variants.pickle', 'wb') as handle:
        pickle.dump(common, handle, protocol=pickle.HIGHEST_PROTOCOL)

sos run nbs/seqlink_sos.ipynb linkage --cwd data/wg20220316 --fam_path data/new_trim_ped_famless17_no:xx.fam --chrom 11 12 13 14 15 16 17 18 19 -j 9

qsub sos run nbs/seqlink_sos.ipynb linkage --cwd data/wg20220316 --fam_path data/new_trim_ped_famless17_no:xx.fam --chrom 1 2 3 4 5 6 7 8 9 10 -j node1:10 node2:10 node3:10 node4:10 node5:10 node6:10 node7:10 node8:10 node9:10 node10:10