In [None]:
# ## A02_demultiplexing overall commands

# qsub Scripts/A02a_demultiplex_fastq.sub # *
# qsub Scripts/A02b_check_demultip.sub # ‡
# qsub Scripts/A02c_fastqc_demultip_fastq.sub

# # * = job array based on "platenum"
# # † = job array based on "batchnum" (two rows at a time)
# # ‡ fast enough to run interactively

## (A02a) demultiplex .fastq.gz files

In [None]:
%%bash
cat > ../Scripts/A02a_demultiplex_fastq.pl

#!/usr/bin/perl -w
use strict;
use warnings;

# A02a_demultiplex_fastq.pl, v0.1 ==============================================
# perl script originally written by Dr. Chongyuan Luo (@luogenomics)
# minor modifications for readability/documentation by Choo Liu (@chooliu)
# inputs: - R1 and R2 .fastq.gz file in 'fastq_raw'
#           containing all cells, first 8bp of Read 1 assumed to be cell barcode
#         - index .fasta (> 384-well plate location / cell barcode sequence)
#           two lines per well; the first character of BOTH lines is dropped
#           (e.g. '>' on the well name, '^' on the barcode sequence)
# outputs: - R1 and R2 .fastq.gz for each cell
#            (<prefix>_<well>_indexed_R1<suffix>, 'fastq_raw' -> 'fastq_demultip')
#          - summary .txt showing # of reads per cell barcode
# typical usage: perl A02a_demultiplex_fastq.pl r1.fq.gz r2.fq.gz index.fa
# ==============================================================================

# number of leading Read 1 bases compared against the barcode table
my $BARCODE_LENGTH = 8;

@ARGV == 3
    or die "usage: perl A02a_demultiplex_fastq.pl r1.fq.gz r2.fq.gz index.fa\n";
my ($r1_path, $r2_path, $index_path) = @ARGV;

# opening a pipe from 'gzip -cd' succeeds even when the file does not exist,
# so check the inputs explicitly instead of reporting "total reads - 0"
for my $input ($r1_path, $r2_path) {
    -r $input or die "error: cannot read input '$input'\n";
}


# Read one 4-line FASTQ record from $fh and return its lines (newline removed).
# Returns an empty list at end of file; a trailing partial record is dropped
# with a warning ($label is only used in that message).
sub read_fastq_record {
    my ($fh, $label) = @_;
    my @record;
    while (@record < 4) {
        my $line = <$fh>;
        if (!defined $line) {
            warn "warning: '$label' ends with an incomplete FASTQ record (ignored)\n"
                if @record;
            return;
        }
        chomp $line;    # chomp, not chop: never eats a base if the newline is absent
        push @record, $line;
    }
    return @record;
}


# --- load the barcode index -----------------------------------------------------

my %well_of_barcode;    # barcode sequence -> well name
my %barcode_of_well;    # well name        -> barcode sequence
my @wells;              # well names in index-file order

open my $index_fh, '<', $index_path
    or die "error: cannot open index file '$index_path': $!\n";
while (my $header = <$index_fh>) {
    $header =~ s/\r?\n\z//;
    next if $header eq '';    # a blank line must not become a nameless well
    my $sequence = <$index_fh>;
    defined $sequence
        or die "error: '$index_path': no sequence line after '$header'\n";
    $sequence =~ s/\r?\n\z//;

    # drop the one-character marker at the start of each line
    my $well    = substr($header,   1);
    my $barcode = substr($sequence, 1);

    $well_of_barcode{$barcode} = $well;
    $barcode_of_well{$well}    = $barcode;
    push @wells, $well;
}
close $index_fh;


# --- derive output names and open one gzip writer per well and mate --------------

my ($r1_prefix, $r1_suffix) = split /_R1/, $r1_path;
my ($r2_prefix, $r2_suffix) = split /_R2/, $r2_path;
for ($r1_prefix, $r1_suffix, $r2_prefix, $r2_suffix) {
    $_ = '' unless defined $_;
}
$r1_prefix =~ s/fastq_raw/fastq_demultip/ig;
$r2_prefix =~ s/fastq_raw/fastq_demultip/ig;

my (%r1_file, %r2_file);    # well name -> output path
my (%r1_out,  %r2_out);     # well name -> write handle (pipe into gzip)
for my $well (@wells) {
    $r1_file{$well} = "${r1_prefix}_${well}_indexed_R1$r1_suffix";
    $r2_file{$well} = "${r2_prefix}_${well}_indexed_R2$r2_suffix";

    # NOTE: the redirect needs a shell, so output paths must not contain
    # whitespace or shell metacharacters (same restriction as before)
    open my $r1_fh, '|-', "gzip -c > $r1_file{$well}"
        or die "error: cannot start gzip for '$r1_file{$well}': $!\n";
    open my $r2_fh, '|-', "gzip -c > $r2_file{$well}"
        or die "error: cannot start gzip for '$r2_file{$well}': $!\n";
    $r1_out{$well} = $r1_fh;
    $r2_out{$well} = $r2_fh;
}


# --- stream both mates in lockstep and route each read pair by barcode -----------

open my $r1_in, '-|', 'gzip', '-cd', $r1_path
    or die "error: cannot start gzip -cd for '$r1_path': $!\n";
open my $r2_in, '-|', 'gzip', '-cd', $r2_path
    or die "error: cannot start gzip -cd for '$r2_path': $!\n";

my $total_reads = 0;
my %reads_in;    # well name -> read count; key 0 collects unassigned reads

while (my @r1_record = read_fastq_record($r1_in, $r1_path)) {
    my @r2_record = read_fastq_record($r2_in, $r2_path)
        or die "error: '$r2_path' has fewer records than '$r1_path'\n";
    $total_reads++;

    my $barcode = substr($r1_record[1], 0, $BARCODE_LENGTH);
    if (exists $well_of_barcode{$barcode}) {
        my $well = $well_of_barcode{$barcode};
        $reads_in{$well}++;
        print { $r1_out{$well} } join("\n", @r1_record), "\n";
        print { $r2_out{$well} } join("\n", @r2_record), "\n";
    }
    else {
        $reads_in{0}++;
    }
}

# pairs already written are valid, so extra R2 records only merit a warning
my $r2_has_extra = defined(scalar <$r2_in>);
warn "warning: '$r2_path' has more records than '$r1_path' (extra ignored)\n"
    if $r2_has_extra;

close $r1_in
    or die "error: 'gzip -cd $r1_path' failed (status $?)\n";
close $r2_in
    or $r2_has_extra    # gzip may be killed by SIGPIPE when we stop reading early
    or die "error: 'gzip -cd $r2_path' failed (status $?)\n";


# --- finish the writers; delete outputs of wells that received no reads ----------

for my $well (@wells) {
    close $r1_out{$well}
        or die "error: gzip failed writing '$r1_file{$well}' (status $?)\n";
    close $r2_out{$well}
        or die "error: gzip failed writing '$r2_file{$well}' (status $?)\n";
    next if exists $reads_in{$well};

    for my $file ($r1_file{$well}, $r2_file{$well}) {
        # the "rm <file>" log line is kept verbatim: A02b counts these lines
        print "rm $file\n";
        unlink $file or warn "warning: could not remove '$file': $!\n";
    }
}


# --- per-barcode summary ----------------------------------------------------------

# 4th character from the end of the index filename, e.g. '1' in '..._subset1.fa'
my $index_label  = substr($index_path, -4, 1);
my $summary_path = "${r1_prefix}_summary_$index_label.txt";

open my $summary_fh, '>', $summary_path
    or die "error: cannot write summary '$summary_path': $!\n";
print {$summary_fh} "total reads - $total_reads\n";
for my $key (sort { $a cmp $b } keys %reads_in) {
    # %reads_in is empty when $total_reads is 0, so this never divides by zero
    my $percentage = sprintf('%.1f', 100 * ($reads_in{$key} / $total_reads)) . '%';
    if (!$key) {
        print {$summary_fh} "undetermined index\t\t$reads_in{$key}\t$percentage\n";
    }
    else {
        print {$summary_fh} "index $key\t$barcode_of_well{$key}\t$reads_in{$key}\t$percentage\n";
    }
}
close $summary_fh
    or die "error: cannot finish writing '$summary_path': $!\n";

In [None]:
%%bash
cat > ../Scripts/A02a_cellbarcodes_subset1.fa
>A1
^ACGATCAG
>A3
^TCGAGAGT
>A5
^CTAGCTCA
>A7
^ATCGTCTC
>A9
^TCGACAAG
>A11
^CCTTGGAA
>A13
^ATCATGCG
>A15
^TGTTCCGT
>A17
^ATTAGCCG
>A19
^CGATCGAT
>A21
^GATCTTGC
>A23
^AGGATAGC
>C1
^GTAGCGTA
>C3
^AGAGTCCA
>C5
^GCTACTCT
>C7
^CTCTGGAT
>C9
^AGATCGTC
>C11
^GCTCAGTT
>C13
^GTCCTAAG
>C15
^TATGGCAC
>C17
^TCGGATTC
>C19
^AACAGCGA
>C21
^CCAACGAA
>C23
^CAGTGCTT
>E1
^GATCAAGG
>E3
^TCTTCGAC
>E5
^ATCGTGGT
>E7
^CGGTAATC
>E9
^AGTTGTGC
>E11
^AATGACGC
>E13
^TACCGGAT
>E15
^TTGCAACG
>E17
^CACTTCAC
>E19
^TAGCCATG
>E21
^ACAGGCAT
>E23
^AGGTGTTG
>G1
^CAGTCACA
>G3
^TCGATGAC
>G5
^GAAGTGCT
>G7
^CTTCCTTC
>G9
^CGAACAAC
>G11
^AACAACCG
>G13
^ACCTCAGT
>G15
^CGTCTTCA
>G17
^TGCGTAAC
>G19
^AACACGCT
>G21
^ACTCGATC
>G23
^TGAGCTGT
>I1
^TACTGCTC
>I3
^GACGAACT
>I5
^CTTCGCAA
>I7
^ATGGCGAT
>I9
^ACATGCCA
>I11
^GTCAACAG
>I13
^GTGGTATG
>I15
^CCAACTTC
>I17
^GACGTCAT
>I19
^ACGTCCAA
>I21
^GATCCACT
>I23
^AGCCTATC
>K1
^AGCTACCA
>K3
^AGATTGCG
>K5
^CACACATC
>K7
^GAGCAATC
>K9
^ATAGAGCG
>K11
^GACCGATA
>K13
^CAGACGTT
>K15
^CTGAACGT
>K17
^TTGGACTG
>K19
^GTCTGCAA
>K21
^CCACATTG
>K23
^GATGGAGT
>M1
^AGGTCAAC
>M3
^TACACACG
>M5
^CAAGTCGT
>M7
^AGCTAGTG
>M9
^CTCCTAGT
>M11
^ACTCCTAC
>M13
^CAATCAGG
>M15
^TCGTGCAT
>M17
^TAACGTCG
>M19
^AAGGCGTA
>M21
^TCTTACGG
>M23
^CGTGTGAT
>O1
^AACAGGTG
>O3
^AGTCGAAG
>O5
^TGGAAGCA
>O7
^CTCGTTCT
>O9
^ACGAGAAC
>O11
^AAGCCTGA
>O13
^CTACAAGG
>O15
^CGATGTTC
>O17
^ACCGGTTA
>O19
^GAACGGTT
>O21
^CTGTACCA
>O23
^GCGCATAT
>A2
^TGATAGGC
>A4
^CATCCAAG
>A6
^GTGAGACT
>A8
^CTGATGAG
>A10
^ACGGTACA
>A12
^CTCGACTT
>A14
^ACAACGTG
>A16
^TGCTGTGA
>A18
^CCAAGTAG
>A20
^AACTGAGG
>A22
^AGGTAGGA
>A24
^TTCGCCAT
>C2
^CAGGTAAG
>C4
^GTATCGAG
>C6
^TTCACGGA
>C8
^GAGCTCTA
>C10
^GTCAGTCA
>C12
^CACGTCTA
>C14
^AATTCCGG
>C16
^TCTAGGAG
>C18
^ATCCGTTG
>C20
^GATAGCCA
>C22
^TATGACCG
>C24
^CGATTGGA
>E2
^ACAAGCTC
>E4
^GAACCTTC
>E6
^AGCGAGAT
>E8
^CCGTAACT
>E10
^TCAGACAC
>E12
^CGAAGTCA
>E14
^GTGATCCA
>E16
^ACTGGTGT
>E18
^CTAACCTG
>E20
^AGCCAACT
>E22
^CCAGTTGA
>E24
^AAGTGCAG
>G2
^AACCGTGT
>G4
^CGCGTATT
>G6
^AGTTCGCA
>G8
^TAGTCAGC
>G10
^AACACCAC
>G12
^GTAAGCAC
>G14
^GTCCTTGA
>G16
^CAGGTTCA
>G18
^CCAACACT
>G20
^GAGAGTAC
>G22
^AGATACGG
>G24
^GTTCTTCG
>I2
^ATTCCGCT
>I4
^AAGCTCAC
>I6
^TGATCACG
>I8
^CAATGCGA
>I10
^ATGCGTCA
>I12
^TACATCGG
>I14
^ACTGCGAA
>I16
^TCTGTCGT
>I18
^CTCAAGCT
>I20
^AACCACTC
>I22
^CTTACAGC
>I24
^AGTCTTGG
>K2
^CACGCAAT
>K4
^AGCTTCAG
>K6
^CCTCGTTA
>K8
^TGAGACGA
>K10
^CACAGGAA
>K12
^ACTCAACG
>K14
^AAGCGACT
>K16
^CCTACCTA
>K18
^ATCTCCTG
>K20
^TCACGATG
>K22
^CCACAACA
>K24
^AGGTCTGT
>M2
^AGAAGGAC
>M4
^GCGTATCA
>M6
^CAACACAG
>M8
^TCCACGTT
>M10
^ATCGCAAC
>M12
^ACGTCGTT
>M14
^CGAATACG
>M16
^TGCTTGCT
>M18
^CTCGAACA
>M20
^ACATGGAG
>M22
^ACAAGACG
>M24
^CGCCTTAT
>O2
^AGCAGACA
>O4
^GTTAAGCG
>O6
^CATGGATC
>O8
^ACAGAGGT
>O10
^TAAGTGGC
>O12
^AGTCAGGT
>O14
^GCCTTAAC
>O16
^GTTGGCAT
>O18
^CAACCTCT
>O20
^TGGATGGT
>O22
^CTATCCAC
>O24
^GATCTCAG

In [None]:
%%bash
cat > ../Scripts/A02a_cellbarcodes_subset2.fa
>B1
^GAACGAAG
>B3
^ACCTAGAC
>B5
^TACGACGT
>B7
^TTGAGCTC
>B9
^AGTACACG
>B11
^TGTCAGTG
>B13
^GACTACGA
>B15
^TTACGTGC
>B17
^ACTGCTTG
>B19
^GCCTATGT
>B21
^GTACCACA
>B23
^TAGTGGTG
>D1
^ATACGCAG
>D3
^AAGACCGT
>D5
^CTCCAATC
>D7
^TCTGGACA
>D9
^AACACTGG
>D11
^TTGGTGCA
>D13
^CCTGTCAA
>D15
^CTATGCCT
>D17
^TTCGGCTA
>D19
^ACCGACAA
>D21
^CGTAGATG
>D23
^CTGTATGC
>F1
^GTTGCTGT
>F3
^AGAACCAG
>F5
^GATGTCGA
>F7
^AGGAGGTT
>F9
^AATCGCTG
>F11
^AGTGACCT
>F13
^CGAATTGC
>F15
^CAAGAAGC
>F17
^CACCAGTT
>F19
^GTATTCCG
>F21
^TTCGAAGC
>F23
^AGACCTTG
>H1
^CCAAGGTT
>H3
^ACGTATGG
>H5
^AAGGACCA
>H7
^TATGCGGT
>H9
^AAGGAAGG
>H11
^AGCGTGTA
>H13
^TCTACGCA
>H15
^TGGCTCTT
>H17
^CCTTCCAT
>H19
^ATACTGGC
>H21
^AACCTACG
>H23
^CATACTCG
>J1
^TGCACTTG
>J3
^TCACTCGA
>J5
^CACTGTAG
>J7
^GTACGATC
>J9
^TGGTGAAG
>J11
^TAGCTGAG
>J13
^AGAGCAGA
>J15
^CTTCGGTT
>J17
^ACAACAGC
>J19
^AGCCGTAA
>J21
^CTCTTGTC
>J23
^CAGATCCT
>L1
^GATGCTAC
>L3
^AGGAACAC
>L5
^ACCATCCT
>L7
^GAACGTGA
>L9
^TAGAACGC
>L11
^AACCAGAG
>L13
^CGACCTAA
>L15
^CTCTCAGA
>L17
^AGGCTGAA
>L19
^ATCGGAGA
>L21
^GATACCTG
>L23
^TCCTGACT
>N1
^TCAGCCTT
>N3
^AAGCATCG
>N5
^GCCAATAC
>N7
^GACACAGT
>N9
^AAGAGGCA
>N11
^GAAGACTG
>N13
^CCGTTATG
>N15
^CTAGCAGT
>N17
^GCCAGAAT
>N19
^CGAGAGAA
>N21
^AACTCGGA
>N23
^ACAGTTCG
>P1
^TGACCGTT
>P3
^CATCTGCT
>P5
^CGCTGATA
>P7
^TCGTCTGA
>P9
^CACATGGT
>P11
^CGAGTTAG
>P13
^AGCTAAGC
>P15
^GTTCCATG
>P17
^GCATCCTA
>P19
^CCATGAAC
>P21
^ATCCACGA
>P23
^GAGAAGGT
>B2
^AGGCAATG
>B4
^TCACCTAG
>B6
^CATACGGA
>B8
^GTCATCGT
>B10
^TTACCGAC
>B12
^ACCTTCGA
>B14
^ACGCTTCT
>B16
^GAGTAGAG
>B18
^ATGCCTAG
>B20
^CAACTCCA
>B22
^AAGTCCTC
>B24
^GTCGATTG
>D2
^GCGTTAGA
>D4
^TTGCGAGA
>D6
^ACACCGAT
>D8
^CGTATCTC
>D10
^AAGGAGAC
>D12
^TGTCGACT
>D14
^TCAATCCG
>D16
^GACTTGTG
>D18
^CCGATGTA
>D20
^TAGGAGCT
>D22
^CAACGAGT
>D24
^TGTGTCAG
>F2
^CTAGGTTG
>F4
^GTGTCCTT
>F6
^TACCTGCA
>F8
^CCTTAGGT
>F10
^CACAGACT
>F12
^TCGAACCT
>F14
^GCATAGTC
>F16
^CTCCTGAA
>F18
^AACGCACA
>F20
^TAGTCTCG
>F22
^ACTCTGAG
>F24
^GTTATGGC
>H2
^CTCGGTAA
>H4
^TACAGAGC
>H6
^GCATAACG
>H8
^GATCAGAC
>H10
^CGCAACTA
>H12
^TCCGATCA
>H14
^CAACTTGG
>H16
^TCAGTAGG
>H18
^ACAGCAAG
>H20
^GAATGGCA
>H22
^CGGATCAA
>H24
^ACTGCACT
>J2
^CCTAAGTC
>J4
^TTCGTACG
>J6
^TCCTGGTA
>J8
^CATTGACG
>J10
^ACCTCTTC
>J12
^CATTCGTC
>J14
^TTCCTCCT
>J16
^GCTGTAAG
>J18
^GACATCTC
>J20
^CAACCGTA
>J22
^TGCGATAG
>J24
^TGGTTCGA
>L2
^AAGCGTTC
>L4
^CGATTCTG
>L6
^GCAACCAT
>L8
^AATCCAGC
>L10
^AGTGCATC
>L12
^GCATTGGT
>L14
^CTTAGGAC
>L16
^ATAGTCGG
>L18
^GAGACCAA
>L20
^AACAAGGC
>L22
^CCAGTATC
>L24
^CCTCGAAT
>N2
^CAACTGAC
>N4
^TGCTCTAC
>N6
^CATCACGT
>N8
^GCCACTTA
>N10
^GCTTCACA
>N12
^ACCGAATG
>N14
^CTCACCAA
>N16
^CAGAACTG
>N18
^AGAAGCCT
>N20
^CACGATTC
>N22
^AAGCTGGT
>N24
^GCAATGAG
>P2
^CTCTATCG
>P4
^ACTCTCCA
>P6
^CAGCATAC
>P8
^TACTCCAG
>P10
^GAGGCATT
>P12
^ACACCTCA
>P14
^CGCAATGT
>P16
^CCTAGAGA
>P18
^TACTAGCG
>P20
^CGTCCATT
>P22
^TCGCTATC
>P24
^AATGGTCG

In [None]:
%%bash
cat > ../Scripts/A02a_demultiplex_fastq.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A02a_demultiplex.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=6:00:00,h_data=24G
#$ -t 1-16
#$ -N A02a_demultip
#$ -hold_jid_ad A01a_merge_lanes

# A02a: demultiplex one plate per array task ($SGE_TASK_ID = "platenum").
# NOTE(review): '-t 1-16' presumably has to match the number of plates listed
# in $metadat_plate -- confirm when the plate count changes.

echo "Job $JOB_ID.$SGE_TASK_ID started on:   " `hostname -s`
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " `date `
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of the parameter file as an environment variable
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# extract target filepaths -----------------------------------------------------

# row of the plate-level metadata whose first column equals this task's number;
# its second column is the plate name used as the .fastq.gz prefix
platename=$(awk -v linenum="$SGE_TASK_ID" -F ',' '$1 == linenum' "$metadat_plate" | cut -d , -f 2)

if [[ -z "$platename" ]]
then
    echo "error: no row with platenum '$SGE_TASK_ID' found in '$metadat_plate'. exiting."
    exit 1
fi

r1file="fastq_raw/${platename}_R1.fastq.gz"
r2file="fastq_raw/${platename}_R2.fastq.gz"

# fail early: demultiplexing a missing file would only yield an all-empty plate
for fastq in "$r1file" "$r2file"
do
    if [[ ! -s "$fastq" ]]
    then
        echo "error: input '$fastq' is missing or empty. exiting."
        exit 1
    fi
done



# apply perl script for each of two index files --------------------------------

for index_file in {1..2}
do
    echo -e "\n\ndemultiplexing ${r1file} and ${r2file} with index set ${index_file} of 2..."
    if ! perl Scripts/A02a_demultiplex_fastq.pl "$r1file" "$r2file" \
        "Scripts/A02a_cellbarcodes_subset${index_file}.fa"
    then
        # do not report completion when the perl script died part-way
        echo "error: demultiplexing failed for index set ${index_file} of 2. exiting."
        exit 1
    fi
done


echo -e "\n\n'A02a_demultip' completed.\n\n"




echo " "
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " `hostname -s`
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " `date `
echo " "


## (A02b) check demultiplexing output

In [None]:
%%bash
cat > ../Scripts/A02b_check_demultip.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A02b_check_demult.$JOB_ID
#$ -j y
#$ -l h_rt=1:00:00,h_data=4G
#$ -N A02b_check_demult
#$ -hold_jid A02a_demultip

# A02b: sanity checks on the demultiplexing output; starts once every
# 'A02a_demultip' task has finished (see -hold_jid above).

# record host and start time in the sublog
echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "


# environment init -------------------------------------------------------------

source /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of the parameter file as an environment variable
export $(grep -v '^#' snmCT_parameters.env | xargs) # <--




# extract target filepaths------------------------------------------------------

# query_metadat COLNAME
# Prints one column of the well-level metadata .csv ($metadat_well): the column
# whose header equals COLNAME. Output is the header itself followed by one value
# per data row, one per line.
# If no header matches, nothing is printed to stdout, a message goes to stderr
# and the function returns non-zero. (Previously the unset column index made
# awk print '$0', i.e. every full row.)
query_metadat () {
  awk -F',' -v targetcol="$1" \
      'NR==1 {
                for (i=1;i<=NF;i++) {
                    if ($i==targetcol) {assayout=i; break} }
                if (!assayout) {
                    print "column \"" targetcol "\" not found in " FILENAME > "/dev/stderr"
                    exit 1
                }
              }
      {
                print $assayout
            }' "$metadat_well"
}

# check_filepaths_in_assay PATH...
# Prints "missing '<path>'" for every argument that is not an existing,
# non-empty file; prints nothing for files that pass.
check_filepaths_in_assay() {
    local file
    # "$@" keeps each argument intact (no re-splitting or globbing of paths)
    for file in "$@"
        do
        if [[ ! -s "$file" ]]
            then
                echo "missing '$file'"
            fi
        done
}

# expected demultiplexed R1 / R2 paths, read from the named columns of
# $metadat_well. the unquoted $(...) is word-split into one array element per
# whitespace-separated token; the first element is the column header itself,
# because query_metadat prints the header row as well.
r1demultip=($(query_metadat "A02a_fqgz_demultip_R1"))
r2demultip=($(query_metadat "A02a_fqgz_demultip_R2"))



# apply checks for A02a output -------------------------------------------------
# (may later be deprecated due to explicit target filename change)



echo -e "\n\n-----------------------------------------------------------------"
echo "A. counting number of .fastq.gz files generated during demultiplexing step... "
echo "-----------------------------------------------------------------"

# running total of demultiplexed files over all plates and both mates
nfastq_tot_demultip=0

for mate in R1 R2
do
    echo -e "\n${mate}.fastq.gz files\n"
    # plate names = 2nd column of the plate metadata, header row skipped
    for prefix in $(cut -f 2 -d ',' "$metadat_plate" | tail -n +2)
    do
        # the pattern is quoted so 'find' (not the shell) expands it, and
        # anchored with '_' so plate 'X1' does not also count files of 'X10'
        nfastq_plate=$(
            find "fastq_demultip/" -maxdepth 1 -mindepth 1 \
                -name "${prefix}_*${mate}.fastq.gz" | wc -l)
        echo "$prefix : $nfastq_plate"
        nfastq_tot_demultip=$((nfastq_tot_demultip + nfastq_plate))
    done
done

num_platesin=$(cut -f 2 -d ',' "$metadat_plate" | tail -n +2 | wc -l)
# 384 wells per plate, one R1 and one R2 file per well
nfastq_expected=$((num_platesin * 384 * 2))
nfastq_demult_folder=$(find "fastq_demultip/" -maxdepth 1 -mindepth 1 -name '*.fastq.gz' | wc -l)
fastq_missing=$((nfastq_expected - nfastq_tot_demultip))

echo -e "* num demultiplexed .fastqs expected: ($num_platesin plates)*384*2 = $nfastq_expected R1 & R2"
echo -e "\n* num detected, summed from per-plate values above: ${nfastq_tot_demultip}"
echo "* num .fastqs in fastq_demultip: ${nfastq_demult_folder} (regardless of expected plate prefix; should be same as above)"
echo "* num .fastqs missing: ${fastq_missing} ($((${fastq_missing}/2)) empty wells?)"




echo -e "\n\n-----------------------------------------------------------------"
echo "B. tabulating number of empty wells detected by perl script per plate... "
echo -e "-----------------------------------------------------------------\n"

# per A02a sublog, count the lines containing "rm fastq": the perl script
# prints one "rm <file>" line for each R1 and each R2 file it deletes because
# the well's barcode received no reads (so the count is per file, not per well)
grep "rm fastq" -c sublogs/A02a*

echo -e "\n* # missing wells, based on # demultiplexing indices undetected in perl script."
echo "* should be consistent with [A], with same number of wells as detected cell barcodes."




echo -e "\n\n-----------------------------------------------------------------"
echo "C. checking expected output from well-level metadata file (A01c)..."
echo "-----------------------------------------------------------------"

# each array holds the column header followed by the expected file paths;
# the header is not a file, so it is always reported as "missing"
echo -e "\nchecking Read 1 .fastq.gz files:\n"
check_filepaths_in_assay ${r1demultip[@]}

echo -e "\nchecking Read 2 .fastq.gz files:\n"
check_filepaths_in_assay ${r2demultip[@]}

echo -e "\n* checks the A02a output columns of 'metadat_well' if the file exists and is non-empty."
echo "* if none missing, will only output target column names above."
echo "* if some declared 'missing' but all other checks OK, cell barcode may not exist."
# NOTE(review): the message below has an unbalanced single quote
echo "  (check 'fastq_demultip/plate_summary.txt report)"




echo -e "\n\n-----------------------------------------------------------------"
echo "D. checking that each .fastq in demultiplex folder has a paired-end (R1/R2) mate..."
echo "-----------------------------------------------------------------"

nfastq_no_pairmate=0
for mate in R1 R2
do
    if [[ $mate == R1 ]]; then other=R2; else other=R1; fi

    # process substitution keeps the counter in this shell (a pipe would not)
    while IFS= read -r fastq
    do
        # swap only the trailing 'R1.fastq.gz' / 'R2.fastq.gz'; a plain
        # ${fastq/R1/R2} would rewrite the first "R1" anywhere in the path,
        # e.g. inside the plate name
        matefile="${fastq%${mate}.fastq.gz}${other}.fastq.gz"
        if [ ! -e "$matefile" ]
        then
            echo "missing after demultiplexing: $matefile"
            nfastq_no_pairmate=$((nfastq_no_pairmate + 1))
        fi
    done < <(find "fastq_demultip/" -maxdepth 1 -mindepth 1 -name "*indexed_${mate}.fastq.gz")
done

echo -e "\n* $nfastq_no_pairmate cases where a well has their paired-end mate missing after demultiplexing."
echo "* if non-zero, there are wells with their R1 but no corresponding R2 reads, or their R2 but no R1."




echo -e "\n\n-----------------------------------------------------------------"
echo "E. checking that two summary .txt files were generated for each file... "
echo -e "-----------------------------------------------------------------\n"

summary_txt_missing=0
for prefix in $(cut -f 2 -d ',' "$metadat_plate" | tail -n +2)
    do
        # the demultiplexing script names its summaries
        # '<plate>_summary_<index set>.txt'; matching that exact shape keeps
        # plate 'X1' from also counting the summaries of plate 'X10'
        nsummary=$(find "fastq_demultip/" -maxdepth 1 -name "${prefix}_summary_*.txt" | wc -l)
        # numeric test: a string compare fails when 'wc -l' pads its output
        if [[ $nsummary -ne 2 ]]
            then
            summary_txt_missing=$((summary_txt_missing + 1))
            echo "summary files missing for : ${prefix}"
        fi
    done

echo -e "* $summary_txt_missing 'raw_fastq' files are missing their summary .txt file(s)."
echo "* if any summaries are missing, the demultiplexing jobs may have failed (historically, memory issues)"




echo -e "\n\n-----------------------------------------------------------------"
echo "F. printing percent barcodes unassigned..."
echo -e "-----------------------------------------------------------------\n"

for prefix in $(cut -f 2 -d ',' "$metadat_plate" | tail -n +2)
do
    echo "$prefix"
    for summaryfile in fastq_demultip/${prefix}_summary_*.txt
    do
        # an unmatched glob is left as the literal pattern; skip it rather
        # than handing a non-existent filename to awk
        if [[ ! -e "$summaryfile" ]]
        then
            echo "(no summary .txt found)"
            continue
        fi
        # select the unassigned row by its label instead of assuming it is
        # line 2: when every read was assigned there is no such row, and line
        # 2 would be the first well's percentage
        awk -F '\t' '$1 == "undetermined index" { print $4; found = 1 }
                     END { if (!found) print "0.0%" }' "$summaryfile"
    done
done

echo -e "\n* unassigned reads should be ~50%, assuming two barcode ref files (summary_1.txt, summary_2.txt)"
echo "  caveat: sum generally >100% b/c of reads 'double counted' as unassigned to both barcode lists."
echo "* total unassigned expected 1-3%, usually due to ambig bases (N) or sequencing errors;"
echo "  higher rates suggestive of contaminating library, issues in lane merging (fastq_raw input)"




echo -e "\n\n'A02b_check_demultip' completed.\n\n"


echo " "
echo "Job $JOB_ID ended on:   " `hostname -s`
echo "Job $JOB_ID ended on:   " `date `
echo " "



## (A02c) fastqc on random well subset

In [None]:
%%bash
cat > ../Scripts/A02c_fastqc_demultip_fastq.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A02c_fastqc_demultip.$JOB_ID
#$ -j y
#$ -N A02c_fastqc_demultip
#$ -l h_rt=8:00:00,h_data=8G
#$ -pe shared 4
#$ -hold_jid A02a_demultip

# A02c: run fastqc on a random subset of demultiplexed wells per plate, then
# aggregate the reports with multiqc.

echo "Job $JOB_ID started on:   " `hostname -s`
echo "Job $JOB_ID started on:   " `date `
echo " "




# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# export every non-comment line of the parameter file as an environment variable
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--


# fastqc specific parameters
n_wells=4  # <-- num wells/plate to sample
overwrite_random_wells=false # <-- overwrite target list of wells (false by default)

# -p: a re-run must not fail because the folder already exists
mkdir -p qc_demultip

# NOTE(review): 'wells_to_run' is not set in this script; presumably it comes
# from snmCT_parameters.env (the messages below name
# Scripts/A02c_random_fastqc_wells.txt) -- confirm
if [[ -z "$wells_to_run" ]]
then
    echo "error: 'wells_to_run' is not set. exiting."
    exit 1
fi




# choose random wells ------------------------------------------------------------

# randomly select $n_wells from each prefix associated with 'raw_fastq' R1s to run fastqc on
# & saves the names of the corresponding demultiplexed R1 .fastq in "$wells_to_run"

# an existing list is re-used unless overwrite_random_wells=true,
# to avoid re-running more files than expected / the same files multiple times

if [[ -e "$wells_to_run" && "$overwrite_random_wells" == true ]]
    then

        echo "warning: Scripts/A02c_random_fastqc_wells.txt already exists."
        echo "deleting and re-randomizing. (overwrite_random_wells=true)"

        rm "$wells_to_run"
fi

# make random samples (otherwise skip)
if [[ ! -e "$wells_to_run" ]]
then
    for prefix in $(cut -f 2 -d ',' "$metadat_plate" | tail -n +2)
        do
        # piping straight into the list appends nothing when a plate has no
        # files (the former printf wrote an empty line, later run as fastqc "");
        # '_' after the prefix keeps plate 'X1' from sampling wells of 'X10'
        find "fastq_demultip" -maxdepth 1 -mindepth 1 -name "${prefix}_*R1.fastq.gz" \
            | shuf -n "$n_wells" \
            >> "$wells_to_run"
        done
else
    echo "warning: Scripts/A02c_random_fastqc_wells.txt already exists."
    echo "using existing random wells. (overwrite_random_wells=false)"
fi

# print wells for record in sublog
echo -e "\nnum wells sampled: $(wc -l "$wells_to_run")\n"
cat "$wells_to_run"



# apply fastqc -------------------------------------------------------------------

# run fastqc on the randomly selected wells
echo "running fastqc on each well."
while read -r r1file
    do
        # tolerate blank lines in a list written by an earlier version
        if [[ -z "$r1file" ]]; then continue; fi
        # replace the read tag only; ${r1file/R1/R2} would rewrite the first
        # "R1" anywhere in the path, e.g. inside the plate name
        r2file="${r1file/_indexed_R1/_indexed_R2}"
        fastqc -t 4 "${r1file}" -o qc_demultip/
        fastqc -t 4 "${r2file}" -o qc_demultip/
    done < "$wells_to_run"

# compile across wells
echo "aggregating via multiqc."
multiqc -d qc_demultip -o qc_demultip -n multiqc_fastqc \
    --cl-config "sp: { fastqc/zip: {fn: '*_fastqc.zip' } }" -m "fastqc"




echo -e "\n\n'A02c_fastqc_demultip_fastq' completed.\n\n"


echo " "
echo "Job $JOB_ID ended on:   " `hostname -s`
echo "Job $JOB_ID ended on:   " `date `
echo " "
