# Genotype alignment

## example

In [None]:
# Align the per-chromosome genotype (bim) files of several cohorts.
# The first folder is used as the reference that the others are aligned to.
# Note: a line-continuation backslash must be the last character on its line.
sos run pipeline/genotype_alignment genotype_alignment \
    --geno_list_paths ROSMAP/genotype/analysis_ready/geno_by_chrom \
                      MSBB/genotype/analysis_ready/genotype_per_chrom \
                      MIGA/genotype/analysis_ready/geno_per_chrom \
                      Knight/genotype/analysis_ready/geno_per_chrom \
                      STARNET/genotype/analysis_ready/geno_per_chrom \
    -c ~/env_files/csg.yml -q csg

You can use the functions below to align a given list of variants to the genotype reference.
```
# from pecotmr
# Query a region of a tabix-indexed file and return the rows as a tibble.
#
# file:         path to the bgzip-compressed, tabix-indexed file.
# region:       region string handed to tabix as-is (e.g. "11:1-100000").
# tabix_header: passed to fread()'s `header` argument.
# tabix_bin:    tabix executable to run. When NULL (default) the original
#               site-specific installation is used if it exists, otherwise
#               `tabix` is looked up on the PATH.
#
# Returns a tibble whose first column is character and second column is
# numeric, or an empty tibble when the query fails or yields no rows.
tabix_region <- function(file, region, tabix_header = "auto", tabix_bin = NULL) {
  if (is.null(tabix_bin)) {
    legacy_bin <- "/mnt/vast/hpc/homes/rf2872/software/htslib-1.9/tabix"
    tabix_bin <- if (file.exists(legacy_bin)) legacy_bin else "tabix"
  }

  # Execute tabix command and capture the output.
  # Errors are deliberately mapped to NULL: a failed or empty query is
  # reported to the caller as "no rows" rather than as a crash.
  cmd_output <- tryCatch(
    fread(
      cmd = paste0(tabix_bin, " -h ", file, " ", region),
      sep = "auto",
      header = tabix_header
    ),
    error = function(e) NULL
  )

  # Check if the output is empty and return an empty tibble if so
  if (is.null(cmd_output) || nrow(cmd_output) == 0) {
    return(tibble())
  }

  # Normalise the types of the first two columns, keeping their names
  cmd_output %>%
    as_tibble() %>%
    mutate(
      !!names(.)[1] := as.character(.[[1]]),
      !!names(.)[2] := as.numeric(.[[2]])
    )
}


# Align variant IDs to the alleles recorded in a tabix-indexed reference.
#
# var_list: character vector of variant IDs whose fields (chr, pos, ref, alt)
#           are separated by ":" and/or "_", with or without a "chr" prefix.
#           The format of the first ID decides the format of the output.
# geno_ref: path to the tabix-indexed reference; the queried rows are
#           labelled chr, pos, alt, ref in that order.
# region:   region string passed to tabix_region().
#
# Returns the variant IDs, rebuilt in the input format, with ref/alt swapped
# wherever merge_genotype_data() finds them flipped relative to the reference.
# Requires tabix_region() and merge_genotype_data() to be defined.
align_to_genoref <- function(var_list, geno_ref, region ){
    if (length(var_list) == 0) {
        return(character(0))
    }
    geno_ref <- tabix_region(file = geno_ref, region = region)
    if (nrow(geno_ref) == 0) {
        # Nothing to align against: keep the variants as they were given
        warning("No reference variants found in region ", region,
                "; returning var_list unchanged", call. = FALSE)
        return(var_list)
    }
    colnames(geno_ref) <- c('chr', 'pos', 'alt', 'ref')
    # pos must be integer on both sides: merge_genotype_data() builds its
    # match key with paste(), which renders a double 100000 as "1e+05" and
    # would therefore never match the "100000" parsed from a variant ID.
    geno_ref <- geno_ref %>%
        mutate(chr = gsub('chr', '', chr), pos = as.integer(pos))

    # Split every ID once into its chr / pos / ref / alt fields
    var_parts <- str_split(var_list, ":|_", simplify = TRUE)
    var_list_df <- data.frame(chr = gsub('chr', '', var_parts[, 1]),
        pos = as.integer(var_parts[, 2]),
        ref = var_parts[, 3],
        alt = var_parts[, 4])
    # merge_genotype_data from below cell
    aligned_var_df <- merge_genotype_data(geno_ref, var_list_df, all = FALSE)

    # Rebuild the IDs using the separators found in the first input ID
    has_colon <- grepl(":", var_list[1])
    has_underscore <- grepl("_", var_list[1])
    aligned_var <- with(aligned_var_df, {
        if (has_colon && has_underscore) {
            paste(chr, paste(pos, ref, alt, sep = "_"), sep = ':')
        } else if (has_colon) {
            paste(chr, pos, ref, alt, sep = ":")
        } else {
            paste(chr, pos, ref, alt, sep = "_")
        }
    })
    if (grepl("chr", var_list[1]))  aligned_var <- paste0("chr", aligned_var)
    return(aligned_var)
}
```

In [None]:
[global]
import glob
import pandas as pd
## Path to the work directory where output files are written
parameter: cwd = path("output")
## Prefix of the output file names
parameter: name = "demo"
## Container that provides the necessary packages
parameter: container = ""
import re
## Command prefix for running inside the container: 'micromamba run -a "" -n <env>', where <env> is the
## container's file name with a trailing "_apptainer:latest", "_docker:latest" or ".sif" removed.
## Empty when no container is given.
parameter: entrypoint= ('micromamba run -a "" -n' + ' ' + re.sub(r'(_apptainer:latest|_docker:latest|\.sif)$', '', container.split('/')[-1])) if container else ""
# For cluster jobs, number of commands to run per job
parameter: job_size = 50
# Expected wall clock time
parameter: walltime = "96h"
# Expected memory
parameter: mem = "6G"
# Number of threads
parameter: numThreads = 2
# Window size (presumably in base pairs; not referenced by the steps visible here - TODO confirm)
parameter: windows = 1000000
# expand_size can be used to convert the memory string for PLINK input
from sos.utils import expand_size

In [1]:
#align different cohort's genotype file to ROSMAP's. chr11 on ROSMAP and MIGA costed 4.5h
[genotype_alignment]
# Input
# A list of folder paths with bim files, with orders, rosmap should be first, and then maybe mssb. 
parameter: geno_list_paths = []
# a function to split the genofiles by chrom
import pandas as pd
import os
import re

def group_by_region(lst, partition):
    return partition

bim_files = []

for path in geno_list_paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if re.search(r'\d+\.bim$', file):
                bim_files.append(os.path.join(root, file))

data = []

for path in bim_files:
    basename = os.path.basename(path)
    chrom = basename.split('.')[-2]
    data.append({'geno_list': path, 'chr': chrom})

region = pd.DataFrame(data)
region = region.groupby('chr')['geno_list'].apply(lambda x: x.tolist()).reset_index()

regional_data = {
    'geno_list': [row['geno_list'] for _, row in region.iterrows()],
    'chr': [row['chr'] for _, row in region.iterrows()]
}

chr_info = regional_data['chr']

input: regional_data["geno_list"], group_by = lambda x: group_by_region(x, regional_data["geno_list"]), group_with = "chr_info"
# a funtion to get geno prefixs as genos

geno_list = regional_data['geno_list'][0]
genos = '.'.join([os.path.basename(path).split('.')[0] for path in geno_list])
# chrom = os.path.basename(geno_list[0]).split('.')[-2]

output: f"{cwd}/{name}.{genos}.{_chr_info}.aligned.bim.gz", f"{cwd}/{name}.{genos}.{_chr_info}.aligned.bim.gz.tbi"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr", container = container, entrypoint = entrypoint
    library(tidyverse) 
    library(data.table)
  
    # Align the alleles of df2 to those of df1 and optionally combine both tables.
    #
    # df1: reference variant table with columns chr, pos, alt, ref.
    # df2: variant table (same columns) to be aligned to df1.
    # all: if TRUE return the de-duplicated union of df1 and the aligned df2;
    #      if FALSE return only the aligned df2 variants.
    #
    # Returns a data.table with columns chr, pos, alt, ref.
    # Side effect: both inputs are converted to data.table by reference and gain
    # a `key` column (df2 also a `flip` column).
    merge_genotype_data <- function(df1, df2, all = TRUE) {
      setDT(df1)
      setDT(df2)
      # Allele-order-independent key: the two alleles are sorted with pmin/pmax,
      # so a variant and its ref/alt-swapped counterpart share the same key
      df1[, key := paste(chr, pos, pmin(alt, ref), pmax(alt, ref))]
      df2[, key := paste(chr, pos, pmin(alt, ref), pmax(alt, ref))]
      # For df2 rows that share a key with df1 (i.* are df1's columns), flag those
      # whose alt/ref are swapped relative to df1
      df2[df1, on = "key", flip := i.alt == ref & i.ref == alt, by = .EACHI]
      # Swap alt and ref of the flagged rows so they follow df1's orientation
      df2[flip == TRUE, c("alt", "ref") := .(ref, alt)]
      if (all) {
        # Union of both tables; variants identical after alignment are kept once
        df_combined <- unique(rbindlist(list(df1[, .(chr, pos, alt, ref)], df2[, .(chr, pos, alt, ref)])), by = c("chr", "pos", "alt", "ref"))
      } else {
        df_combined <- df2[, .(chr, pos, alt, ref)]
      }
      return(df_combined)
    }

   
    # Paths of the bim files of this chromosome group, injected by SoS.
    # The first file is the reference that every other file is aligned to.
    bim_files <- c(${",".join(['"%s"' % x.absolute() for x in _input])})
    bim_cols <- c('chr', 'id', 'Mics', 'pos', 'alt', 'ref')

    # Read a headerless bim file and label its six columns
    read_bim <- function(bim_path) {
      bim <- fread(bim_path, header = FALSE)
      colnames(bim) <- bim_cols
      bim
    }

    # Run an external command and fail loudly on a non-zero exit status
    run_or_stop <- function(cmd, args) {
      status <- system2(cmd, args)
      if (status != 0) {
        stop(cmd, " failed with exit status ", status, call. = FALSE)
      }
      invisible(status)
    }

    rosmap_bim <- read_bim(bim_files[1])

    # bim_files[-1] is empty when only one file is given, whereas
    # 2:length(bim_files) would iterate over c(2, 1) and read a missing file
    for (bim_file in bim_files[-1]) {
      message('Aligning ', bim_file)
      rosmap_bim <- merge_genotype_data(rosmap_bim, read_bim(bim_file))
    }

    # Always write the same four columns, sorted by position as tabix requires
    rosmap_bim <- rosmap_bim[, .(chr, pos, alt, ref)]
    setorder(rosmap_bim, chr, pos)

    # Uncompressed target: the declared output path without its .gz extension
    out_bim <- ${_output[0]:nr}
    # tabix only indexes tab-delimited text
    fwrite(rosmap_bim, out_bim, sep = "\t")

    # Compress and index the file that was just written, producing the step's
    # declared outputs (<out_bim>.gz and <out_bim>.gz.tbi).
    # -S 1 skips the header line; chromosome is column 1, position is column 2.
    run_or_stop("bgzip", c("-f", shQuote(out_bim)))
    run_or_stop("tabix", c("-f", "-S", "1", "-s", "1", "-b", "2", "-e", "2",
                           shQuote(paste0(out_bim, ".gz"))))