# __STRATEGIE D'ANALYSE BIOINFORMATIQUE - MENTORING PROJECT__

__AGMIMONHAN Attolou Raoul, NAME Pakyendou Estel__

__Tuteurs: Aurore COMTE & Sebastien RAVEL__

Jupyter notebook inspired by the model created by C. Tranchant (DIADE-IRD), J. Orjuela (DIADE-IRD), F. Sabot (DIADE-IRD) and A. Dereeper (PHIM-IRD)
***

# <span style="color: #006E7F">Table of contents</span>
<a class="anchor" id="home"></a>


[PRACTICE III - Mapping sur tous les échantillons](#mapping) 
   * [Reference indexation  `bwa-mem2 index`](#refindex)
   * [Mapping avec `bwa-mem2 mem`](#bwamem2-cmd)
   * [Convertir sam en bam `samtools view`](#samtoolsview)
   * [Etablir les statistiques du mapping `samtools flagstat`](#flagstats)
   * [Filtrer les reads correctement mappés `samtools view`](#corrmap)
   * [Trier les reads filtrés `samtools sort`](#sort)
   * [Indexation des fichiers bam triés `samtools index`](#indexbam)
   

***

# __Practice III - Mapping sur tous les échantillons__

## __III.1. Téléchargement & Indexation du génome de référence__

In [None]:
## Move to the SCRIPTS directory, where the job scripts of this practice are written

cd /scratch/MOryzae/SCRIPTS

In [None]:
## Open the nano text editor on Refseq.sh (paste the script of the next cell into it)

nano Refseq.sh

In [None]:
#!/bin/bash
#
# Refseq.sh - download the M. oryzae MG8 reference genome from NCBI,
# decompress it and build the bwa-mem2 index next to it.
#
# Usage:       sbatch Refseq.sh
# Output:      /scratch/MOryzae/REF/MOryzae_genomic.fna plus the index files
#              written by `bwa-mem2 index`.
# Exit status: 0 on success, 1 as soon as any step fails, so that SLURM
#              reports the job as FAILED instead of COMPLETED.

############# SLURM Configuration ##############

### Define Job name
#SBATCH --job-name=genome_download_index

### Define partition to use
#SBATCH -p normal

### Define number of CPUs to use
#SBATCH -c 8

### Specify the node to run on (the job is pinned to node20)
#SBATCH --nodelist=node20

#################################################

########### Execution Command ###################

# Print an error message on stderr and abort the job with status 1.
# Arguments: $* - message text (without the "Erreur : " prefix)
die() {
    echo "Erreur : $*" >&2
    exit 1
}

# Reference directory, download URL and local file names.
REF_DIR="/scratch/MOryzae/REF"
GENOME_URL="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/495/GCF_000002495.2_MG8/GCF_000002495.2_MG8_genomic.fna.gz"
GENOME_FILE="$REF_DIR/MOryzae_genomic.fna.gz"
# gunzip strips the .gz suffix, so the FASTA already gets its final name;
# no rename step is needed afterwards.
GENOME_FASTA="${GENOME_FILE%.gz}"

# Create the REF directory if needed.
mkdir -p "$REF_DIR" || die "Impossible de créer le répertoire $REF_DIR."

# Download the reference genome.
echo "Téléchargement du génome de référence..."
if ! wget -O "$GENOME_FILE" "$GENOME_URL"; then
    # wget -O creates the output file before transferring; drop the leftover.
    rm -f -- "$GENOME_FILE"
    die "Le téléchargement du génome a échoué."
fi

# Decompress the archive. -f lets a re-run replace an existing FASTA instead
# of stopping on the "already exists" check.
echo "Décompression du fichier..."
gunzip -f "$GENOME_FILE" || die "La décompression du génome a échoué."

# Load bwa-mem2 and make sure the executable is really available.
module load bwamem2/2.2.1
command -v bwa-mem2 >/dev/null 2>&1 \
    || die "bwa-mem2 introuvable après le chargement du module bwamem2/2.2.1."

# Index the reference genome.
echo "Indexation du génome de référence..."
bwa-mem2 index "$GENOME_FASTA" || die "L'indexation du génome a échoué."
echo "Indexation du génome de référence réussie."


In [None]:
## Submit the script to SLURM (the file was created as Refseq.sh, names are case-sensitive)

sbatch Refseq.sh

In [None]:
## Check the generated index files and their extensions

ls -lh /scratch/MOryzae/REF/

## __III.2. Mapping__

In [None]:
## Create the MAPPING directory in the working directory (-p: no error if it already exists)

mkdir -p /scratch/MOryzae/MAPPING


In [None]:
## Open the nano text editor on mapping_pipeline.sh (paste the script of the next cell into it)

nano mapping_pipeline.sh

In [None]:
#!/bin/bash
#
# mapping_pipeline.sh - map every trimmed paired-end sample on the reference
# genome and produce a sorted, indexed BAM of properly paired reads.
#
# For each sample the script runs, in order:
#   1. bwa-mem2 mem          -> sam_files/<sample>.sam
#   2. samtools view -b      -> bam_raw/<sample>.bam
#   3. samtools flagstat     -> bam_stats/<sample>.flagstat
#   4. samtools view -f 0x02 -> bam_filtered/<sample>.mappedpaired.bam
#   5. samtools sort         -> bam_mapped_sort/<sample>.mappedpaired.sorted.bam
#   6. samtools index        -> index of the sorted BAM
#
# Usage:       sbatch mapping_pipeline.sh
# Exit status: 0 when every sample went through all steps, 1 otherwise.
#              A failing sample is reported and skipped; the remaining
#              samples are still processed.

############# SLURM Configuration ##############

### Set the job name
#SBATCH --job-name=mapping_pipeline

### Set the partition to use
#SBATCH -p normal

### Set the number of CPUs to use
#SBATCH -c 8

### Specify the node on which the job should run (the job is pinned to node20)
#SBATCH --nodelist=node20

#################################################

########### Path Variables ##################

# Define paths for data and output directories
REF_PATH="/scratch/MOryzae/REF/MOryzae_genomic.fna"
TRIM_DATA_PATH="/scratch/MOryzae/DATA/Trimming"
OUTPUT_PATH="/scratch/MOryzae/MAPPING"
SAM_PATH="${OUTPUT_PATH}/sam_files"
BAM_PATH="${OUTPUT_PATH}/bam_raw"
STATS_PATH="${OUTPUT_PATH}/bam_stats"
FILTERED_PATH="${OUTPUT_PATH}/bam_filtered"
SORTED_PATH="${OUTPUT_PATH}/bam_mapped_sort"

# Thread count handed to bwa-mem2 and samtools: follow the SLURM allocation
# when it is exported, otherwise fall back to the 8 CPUs requested above.
THREADS="${SLURM_CPUS_PER_TASK:-8}"

# Load bwa-mem2 and samtools modules
module load bwamem2/2.2.1
module load samtools/1.18

# Stop early if a tool is missing rather than failing once per sample.
for tool in bwa-mem2 samtools; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        echo "Error: ${tool} not found after loading the modules." >&2
        exit 1
    fi
done

if [[ ! -f "$REF_PATH" ]]; then
    echo "Error: reference genome ${REF_PATH} not found." >&2
    exit 1
fi

# List of sequences
sequences=("AG0004" "BN0123" "CH0461" "G22" "IE1K" "IR0015" "ML33" "PH42" "TN0057"
           "Arcadia" "BN0202" "CH0533" "GFSI1-7-2" "IN0017" "IR0083" "NG0012" "PL2-1" "TN0065"
           "B2" "BN0252" "CH1103" "GG11" "IN0054" "IR0084" "NG0054" "SSFL02" "TN0090"
           "B71" "Br7" "CH1164" "GN0001" "IN0059" "IR0088" "NP0058" "SSFL14-3" "TR0025"
           "Bd8401" "Br80" "CHRF" "GY0040" "IN0114" "IR0095" "P28" "T25" "US0041"
           "BdBar" "CD0065" "CHW" "HO" "IN0115" "IT0010" "P29" "TG0004" "US0064"
           "BF0072" "CD0142" "CM0028" "IA1" "IN0116" "JP0091" "P3" "TG0032" "VT0027"
           "Bm88324" "CH0043" "FR1067" "IB33" "INA168" "LpKY-97-1" "Pg1213-22" "TN0001" "VT0030"
           "BN0019" "CH0072" "FR1069" "IB49" "IR00102" "ML0060" "PgKY4OV2-1" "TN0002" "Z2-1"
           "BN0119" "CH0452" "G17" "IC17" "IR0013" "ML0062" "PgPA18C-02" "TN0050")

#################################################

########### Create output directories ##################

# Create the output directories if they do not exist
if ! mkdir -p "$SAM_PATH" "$BAM_PATH" "$STATS_PATH" "$FILTERED_PATH" "$SORTED_PATH"; then
    echo "Error: cannot create the output directories under ${OUTPUT_PATH}." >&2
    exit 1
fi

########### Mapping, SAM to BAM conversion, Statistics, Filtering, Sorting, Indexing ##################

#######################################
# Run the six pipeline steps for one sample, stopping at the first failure.
# Globals:   REF_PATH, TRIM_DATA_PATH, SAM_PATH, BAM_PATH, STATS_PATH,
#            FILTERED_PATH, SORTED_PATH, THREADS (all read)
# Arguments: $1 - sample name (prefix of the trimmed FASTQ files)
# Outputs:   progress messages on stdout, error messages on stderr
# Returns:   0 when all steps succeeded, 1 otherwise
#######################################
process_sample() {
    local sequence=$1
    local r1="${TRIM_DATA_PATH}/${sequence}_R1_paired.fastq.gz"
    local r2="${TRIM_DATA_PATH}/${sequence}_R2_paired.fastq.gz"
    local sam_file="${SAM_PATH}/${sequence}.sam"
    local bam_file="${BAM_PATH}/${sequence}.bam"
    local flagstat_file="${STATS_PATH}/${sequence}.flagstat"
    local filtered_bam="${FILTERED_PATH}/${sequence}.mappedpaired.bam"
    local sorted_bam="${SORTED_PATH}/${sequence}.mappedpaired.sorted.bam"

    if [[ ! -f "$r1" || ! -f "$r2" ]]; then
        echo "Error: trimmed reads not found for ${sequence} (${r1}, ${r2})" >&2
        return 1
    fi

    # Step 1: Mapping with bwa-mem2
    if ! bwa-mem2 mem -t "$THREADS" -o "$sam_file" "$REF_PATH" "$r1" "$r2"; then
        echo "Error: mapping failed for ${sequence}" >&2
        return 1
    fi
    echo "Mapping completed for ${sequence}"

    # Step 2: Convert SAM to BAM
    if ! samtools view -b -@ "$THREADS" -o "$bam_file" "$sam_file"; then
        echo "Error: SAM to BAM conversion failed for ${sequence}" >&2
        return 1
    fi
    echo "SAM to BAM conversion successful for ${sequence}"

    # Step 3: Generate statistics using flagstat
    if ! samtools flagstat -@ "$THREADS" "$bam_file" > "$flagstat_file"; then
        # Do not leave an empty or partial report behind for later parsing.
        rm -f -- "$flagstat_file"
        echo "Error: flagstat failed for ${sequence}" >&2
        return 1
    fi
    echo "Statistics generated for ${sequence}: ${flagstat_file}"

    # Step 4: Filter BAM files (-f 0x02 keeps reads flagged as properly paired)
    if ! samtools view -bh -@ "$THREADS" -f 0x02 -o "$filtered_bam" "$bam_file"; then
        echo "Error: filtering failed for ${sequence}" >&2
        return 1
    fi
    echo "Filtered BAM created for ${sequence}: ${filtered_bam}"

    # Step 5: Sort the filtered BAM files
    if ! samtools sort -@ "$THREADS" -o "$sorted_bam" "$filtered_bam"; then
        echo "Error: sorting failed for ${sequence}" >&2
        return 1
    fi
    echo "Sorted BAM created for ${sequence}: ${sorted_bam}"

    # Step 6: Index the sorted BAM file
    if ! samtools index "$sorted_bam"; then
        echo "Error: indexing failed for ${sequence}" >&2
        return 1
    fi
    echo "Indexing completed for ${sequence}"
}

# Loop over each sequence to perform all steps, remembering the failures.
failed=()
for sequence in "${sequences[@]}"; do
    echo -e "######################\nProcessing for ${sequence}..."
    if ! process_sample "$sequence"; then
        failed+=("$sequence")
    fi
done

# Make the job status reflect what really happened.
if (( ${#failed[@]} > 0 )); then
    echo "Error: ${#failed[@]} sample(s) failed: ${failed[*]}" >&2
    exit 1
fi
echo "All ${#sequences[@]} samples processed successfully."


In [None]:
## Submit the script to SLURM

sbatch mapping_pipeline.sh

In [None]:
## Grouper les fichiers flagstat en un seul fichier csv

In [None]:
## Open the nano text editor on flagstat.sh (paste the script of the next cell into it)

nano flagstat.sh

In [None]:
#!/bin/bash
#
# flagstat.sh - gather the per-sample `samtools flagstat` reports into a
# single CSV file (one row per sample, QC-passed counts only).
#
# Usage: sbatch flagstat.sh [flagstat_dir [stat_file]]
#   flagstat_dir  directory holding the <sample>.flagstat files
#                 (default: /scratch/MOryzae/MAPPING/bam_stats)
#   stat_file     CSV file to write, overwritten if it exists
#                 (default: <flagstat_dir>/all_stat.csv)
#
# Exit status: 0 on success, 1 when the input directory is missing or the
#              CSV cannot be written. Samples without a flagstat file are
#              reported on stderr and skipped.

############# SLURM Configuration ##############

### Set the job name
#SBATCH --job-name=concatenate_flagstat

### Set the partition to use
#SBATCH -p normal

### Set the number of CPUs to use
#SBATCH -c 8

### Specify the node on which the job should run (the job is pinned to node20)
#SBATCH --nodelist=node20

#################################################

# List of prefix (sample names; each one maps to <prefix>.flagstat)
prefix=("AG0004" "BN0123" "CH0461" "G22" "IE1K" "IR0015" "ML33" "PH42" "TN0057"
           "Arcadia" "BN0202" "CH0533" "GFSI1-7-2" "IN0017" "IR0083" "NG0012" "PL2-1" "TN0065"
           "B2" "BN0252" "CH1103" "GG11" "IN0054" "IR0084" "NG0054" "SSFL02" "TN0090"
           "B71" "Br7" "CH1164" "GN0001" "IN0059" "IR0088" "NP0058" "SSFL14-3" "TR0025"
           "Bd8401" "Br80" "CHRF" "GY0040" "IN0114" "IR0095" "P28" "T25" "US0041"
           "BdBar" "CD0065" "CHW" "HO" "IN0115" "IT0010" "P29" "TG0004" "US0064"
           "BF0072" "CD0142" "CM0028" "IA1" "IN0116" "JP0091" "P3" "TG0032" "VT0027"
           "Bm88324" "CH0043" "FR1067" "IB33" "INA168" "LpKY-97-1" "Pg1213-22" "TN0001" "VT0030"
           "BN0019" "CH0072" "FR1069" "IB49" "IR00102" "ML0060" "PgKY4OV2-1" "TN0002" "Z2-1"
           "BN0119" "CH0452" "G17" "IC17" "IR0013" "ML0062" "PgPA18C-02" "TN0050")

# Column names of the CSV, in the order parse_flagstat prints the values.
CSV_HEADER="Sequence,total,primary,secondary,supplementary,duplicates,primary_duplicates,mapped,primary_mapped,paired_in_sequencing,read1,read2,properly_paired,singletons,unmapped"

#######################################
# Turn one flagstat report into one CSV row.
# Only the first number of each "<passed> + <failed> <label>" line is kept.
# Counters whose line is absent from the report stay at 0.
# Arguments: $1 - path to the flagstat file
#            $2 - sample name written in the first column
# Outputs:   the CSV row on stdout
#######################################
parse_flagstat() {
    local flagstat_file=$1 sample=$2
    local total=0 primary=0 secondary=0 supplementary=0
    local duplicates=0 primary_duplicates=0 mapped=0 primary_mapped=0
    local paired_in_sequencing=0 read1=0 read2=0
    local properly_paired=0 singletons=0 unmapped=0
    local line count label
    # Captures the QC-passed count and the text following the two numbers.
    local -r line_re='^([0-9]+) \+ [0-9]+ (.*)$'

    # "|| [[ -n $line ]]" keeps a last line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        [[ $line =~ $line_re ]] || continue
        count=${BASH_REMATCH[1]}
        label=${BASH_REMATCH[2]}

        # The two "primary ..." labels must be tested before plain "primary":
        # matching "primary" first would swallow those lines and overwrite
        # the primary count with theirs.
        case "$label" in
            "in total"*)             total=$count ;;
            "primary duplicates"*)   primary_duplicates=$count ;;
            "primary mapped"*)       primary_mapped=$count ;;
            "primary"*)              primary=$count ;;
            "secondary"*)            secondary=$count ;;
            "supplementary"*)        supplementary=$count ;;
            "duplicates"*)           duplicates=$count ;;
            "mapped"*)               mapped=$count ;;
            "paired in sequencing"*) paired_in_sequencing=$count ;;
            "read1"*)                read1=$count ;;
            "read2"*)                read2=$count ;;
            "properly paired"*)      properly_paired=$count ;;
            "singletons"*)           singletons=$count ;;
        esac
    done < "$flagstat_file"

    # flagstat has no "unmapped" line; derive it from total and mapped.
    unmapped=$((total - mapped))

    printf '%s\n' "$sample,$total,$primary,$secondary,$supplementary,$duplicates,$primary_duplicates,$mapped,$primary_mapped,$paired_in_sequencing,$read1,$read2,$properly_paired,$singletons,$unmapped"
}

#######################################
# Write the header, then one row per sample that has a flagstat file.
# Globals:   prefix, CSV_HEADER (read)
# Arguments: $1 - flagstat directory (optional, see Usage)
#            $2 - output CSV file (optional, see Usage)
# Returns:   0 on success, 1 if the directory is missing or the CSV
#            cannot be written
#######################################
main() {
    local flagstat_dir="${1:-/scratch/MOryzae/MAPPING/bam_stats}"
    local stat_file="${2:-${flagstat_dir}/all_stat.csv}"
    local sample flagstat_file

    if [[ ! -d "$flagstat_dir" ]]; then
        echo "Error: flagstat directory $flagstat_dir not found." >&2
        return 1
    fi

    # Create the output file and write the headers
    if ! printf '%s\n' "$CSV_HEADER" > "$stat_file"; then
        echo "Error: cannot write $stat_file." >&2
        return 1
    fi

    # Loop through each sequence
    for sample in "${prefix[@]}"; do
        flagstat_file="${flagstat_dir}/${sample}.flagstat"
        if [[ -f "$flagstat_file" ]]; then
            parse_flagstat "$flagstat_file" "$sample" >> "$stat_file"
        else
            echo "Warning: File for sequence $sample not found. Skipping." >&2
        fi
    done

    echo "Data extraction complete. Results saved in $stat_file."
}

main "$@"


In [None]:
## Delete the sam_files, bam_raw and bam_filtered subdirectories of MAPPING to free up disk space

rm -rf /scratch/MOryzae/MAPPING/sam_files
rm -rf /scratch/MOryzae/MAPPING/bam_raw
rm -rf /scratch/MOryzae/MAPPING/bam_filtered

In [None]:
### Récupérer tous les outputs générés dans le répertoire MAPPING sur le NAS

In [None]:
## Copy the whole MAPPING directory to the NAS (scp copies recursively; the directory on /scratch is left in place)

scp -r /scratch/MOryzae/MAPPING san:/projects/medium/CIBiG_MOryzae/