In [19]:
# Align sequences from several species against reference genomes.

# Load the command-line tools used by the cells below, one module at a time
# and in a fixed order: bowtie2, minimap2, cd-hit.
for _mod in bowtie2 minimap2 cd-hit; do
    module load "$_mod"
done
unset _mod

# To check which modules are currently active, run:
# module list

Currently Loaded Modulefiles:
 1) [46mpowerPlant/core[0m    5) [46mSlurm/21.08.8-2[0m      9) bowtie2/2.3.4.3  
 2) texlive/20230914   6) perlbrew/0.76       10) minimap2/2.22    
 3) pandoc/1.19.2      7) perl/5.36.0         11) cd-hit/4.6.1     
 4) git/2.21.0         8) slurm-utils/latest  

Key:
[46msticky[0m  


In [41]:
# Shared settings used by the cells below.

# Directories. Each value ends with a slash so that file names can be
# appended to it directly.
path_raw="/workspace/cfngle/raw-data/"
path_results="/workspace/cfngle/results-data/"

# Input FASTA file names. Both arrays use the same order:
# index 0 = AC, 1 = AS, 2 = EH.
# 200 bp sequences
seq_200bp=(
    "AC_CpG_200bp.fasta"
    "AS_CpG_200bp.fasta"
    "EH_CpG_200bp.fasta"
)
# 100 bp sequences
seq_100bp=(
    "AC_CpG_100bp.fasta"
    "AS_CpG_100bp.fasta"
    "EH_CpG_100bp.fasta"
)

# Disabled debugging snippet: the quoted here-document is fed to the no-op
# `:` builtin, so nothing inside it is executed. When enabled it prints, for
# every array element, the element itself, a countdown index and the element
# stored at that index. The index starts at the LAST valid position
# (length - 1); starting at the length itself would read past the end of
# the array.
: << 'COMMENT'
n=$(( ${#seq_200bp[@]} - 1 ))
for i in "${seq_200bp[@]}"; do
    echo "$i"
    echo "$n"
    echo "${seq_200bp[n]}"
    n=$((n-1))
done
COMMENT

In [40]:

#### CD-HIT ####
# Compare the 200 bp sequence sets of every species against every other
# species with cd-hit-est-2d: one run per ordered pair (db1 -> db2), six runs
# for three species. Results are written to
#   ${path_results}cd-hit/<db1>_<db2>_cd-hit<suffix>.fasta
# Needs path_results and seq_200bp from the variable set-up cell.

# Options handed to every run, and the suffix that tags the output names
# (kept in step with the -c value by hand).
cdhit_opt="-p 1 -c 0.75 -n 4 -T 0 -M 0"
cdhit_nmsfx="_75"

if [[ -z "${path_results:-}" || "${#seq_200bp[@]}" -eq 0 ]]; then
    echo "path_results / seq_200bp are not set - run the variable set-up cell first" >&2
else
    # Split the option string into separate words once, so that every other
    # expansion on the command line can be quoted safely.
    read -r -a cdhit_args <<< "$cdhit_opt"

    # cd-hit does not create the output directory itself.
    cdhit_dir="${path_results}cd-hit"
    mkdir -p "$cdhit_dir"

    for db1 in "${seq_200bp[@]}"; do
        for db2 in "${seq_200bp[@]}"; do
            # A set is never compared with itself.
            [[ "$db1" == "$db2" ]] && continue

            # ${name%%_*} keeps the species prefix in front of the first
            # underscore, e.g. "AC" from "AC_CpG_200bp.fasta".
            cd-hit-est-2d \
                -i "${path_results}${db1}" \
                -i2 "${path_results}${db2}" \
                -o "${cdhit_dir}/${db1%%_*}_${db2%%_*}_cd-hit${cdhit_nmsfx}.fasta" \
                "${cdhit_args[@]}"
        done
    done
fi

Program: CD-HIT, V4.6 (+OpenMP), Jun 26 2013, 16:45:20
Command: cd-hit-est-2d -i
         /workspace/cfngle/results-data/AC_CpG_200bp.fasta -i2
         /workspace/cfngle/results-data/AS_CpG_200bp.fasta -o
         /workspace/cfngle/results-data/cd-hit/AC_AS_cd-hit_85.fasta
         -p 1 -c 0.85 -n 6 -T 8 -M 8000

Started: Fri Feb  9 13:09:47 2024
                            Output                              
----------------------------------------------------------------
total seq in db1: 13755
total seq in db2: 24535
longest and shortest : 635 and 200
Total letters: 3867239
Sequences have been sorted
longest and shortest : 579 and 200
Total letters: 7326339

Approximated minimal memory consumption:
Sequence        : 14M
Buffer          : 8 X 12M = 98M
Table           : 2 X 0M = 0M
Miscellaneous   : 0M
Total           : 114M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 985739687

..........    10000  finished


: 127

In [None]:
#### BOWTIE2 ####
# Disabled: nothing in this cell is executed. The loop header below was left
# unfinished, and the commented-out command shows a single bowtie2 run that
# reads EH_CpG_200bp.fasta, uses the ZF_bowtie2 index and writes
# ZF_EH_200_bt2_N_1.sam.
#for i in 
#bowtie2 -x ${path_raw}ZF/rgenome/bowtie2-index/ZF_bowtie2 -f -U ${path_results}EH_CpG_200bp.fasta -S ${path_results}ZF_EH_200_bt2_N_1.sam -N 1

In [42]:
#### MINIMAP2 ####
# Align the 200 bp and 100 bp sequence sets of every species (AC, AS, EH)
# against the ZF reference genome and write one SAM file per input to
#   ${path_results}minimap2/ZF_<species>_<length>_minimap.sam
# Needs path_raw, path_results, seq_200bp and seq_100bp from the variable
# set-up cell; both arrays must list the species in the same order.
#
# NOTE(review): the map-ont preset is used for 100-200 bp queries; confirm
# that this preset (rather than a short-read preset) is intended.
# NOTE(review): the reference is passed as FASTA to every run; a prebuilt
# minimap2 index could be reused across the six runs - confirm before changing.

ref_genome="${path_raw}ZF/rgenome/GCF_000002035.6_GRCz11_genomic.fna"

# The shell redirection below fails if the output directory is missing.
minimap_dir="${path_results}minimap2"
mkdir -p "$minimap_dir"

for idx in "${!seq_200bp[@]}"; do
    # Species prefix in front of the first underscore, e.g. "AC".
    species="${seq_200bp[idx]%%_*}"

    minimap2 -ax map-ont "$ref_genome" "${path_results}${seq_200bp[idx]}" \
        > "${minimap_dir}/ZF_${species}_200_minimap.sam"
    minimap2 -ax map-ont "$ref_genome" "${path_results}${seq_100bp[idx]}" \
        > "${minimap_dir}/ZF_${species}_100_minimap.sam"
done





In [None]:
## Count the aligned records in every minimap2 SAM file
# `samtools view -c -F 4` prints the number of records that do NOT have the
# "unmapped" flag (0x4) set. The subcommand must be called through samtools:
# a bare `view` is a different program (usually vi/vim in read-only mode).
# samtools has to be on PATH; it is not among the modules loaded at the top
# of this notebook (NOTE(review): load the matching module first - module
# name to be confirmed for this cluster).
#
# One count per line, in this order:
#   AC 200, AC 100, AS 200, AS 100, EH 200, EH 100

# Use path_results when the set-up cell has been run, otherwise fall back to
# the results directory used throughout this notebook.
sam_dir="${path_results:-/workspace/cfngle/results-data/}minimap2"

for species in AC AS EH; do
    for len in 200 100; do
        samtools view -c -F 4 "${sam_dir}/ZF_${species}_${len}_minimap.sam"
    done
done