# Step 6. Second round of Orthogrouping

In [1]:
# Activate the conda environment for this orthogrouping step.
conda activate dl20-orthogrouping
# gff2bed must be installed in this environment (it is called in the cells below).

# See Luisa if you have questions.

Aims:

- Redo the orthogrouping based on the new annotations and perform several more rigorous orthogrouping analyses:
- OrthoFinder for proteins
- OrthoFinder for nucleotide sequences of the coding region (including introns?)
- OrthoFinder for nucleotide sequences of the flanking regions (on their own)
- BLAST for inclusive clusters
- k-mer-based clustering for finer-scale clusters
- Graph-based clustering?

### Part 1: Output nucleotide sequences for the coding regions and translate into aa

Input:
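- Fixed gffs (output/02_annotation/02_manual-curation-updated/${accession}/${accession}.manualcuration.fixed.gff3; note that the conversion cell below reads ${accession}.manualcuration.gff3 from the same directory)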
- Scaffolds (output/01_assembly/01_pansn-named/${accession}.scaffolds-v2.3.fasta)

In [2]:
# Accession identifiers iterated over by the loops below (18 in total).
dl20=(
  at6137
  at6923
  at6929
  at7143
  at8285
  at9104
  at9336
  at9503
  at9578
  at9744
  at9762
  at9806
  at9830
  at9847
  at9852
  at9879
  at9883
  at9900
)

In [11]:
# First step towards the FASTA output: convert each accession's manually
# curated GFF3 into a BED file with gff2bed.
curated_gff_root=output/02_annotation/02_manual-curation-updated
curated_bed_dir=tmp/02_annotation/06_orthogrouping/bed
mkdir -p "${curated_bed_dir}"
for accession in "${dl20[@]}"; do
  gff2bed \
    < "${curated_gff_root}/${accession}/${accession}.manualcuration.gff3" \
    > "${curated_bed_dir}/${accession}_sorted.bed"
done

In [22]:
# Keep the BED rows that contain "gene" and extract their sequences from the
# scaffolds (-s: strand-aware, -name: FASTA headers taken from the BED name column).
# NOTE(review): grep matches "gene" anywhere on the line, not only in the
# feature-type column — confirm that rows matching via their attributes are wanted.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
scaffold_dir=output/01_assembly/01_pansn-named
mkdir -p "${orthogrouping_dir}/fasta"
for accession in "${dl20[@]}"; do
  all_rows_bed="${orthogrouping_dir}/bed/${accession}_sorted.bed"
  gene_rows_bed="${orthogrouping_dir}/bed/${accession}_sorted_genes.bed"
  grep gene "${all_rows_bed}" > "${gene_rows_bed}"
  bedtools getfasta -s -name \
    -fi "${scaffold_dir}/${accession}.scaffolds-v2.3.fasta" \
    -bed "${gene_rows_bed}" \
    > "${orthogrouping_dir}/fasta/${accession}_all_features.fasta"
done

In [31]:
# Output the original NLR orthogroups as orthogroup FASTA files.
# The formatter script and the ortholog table are copied next to the
# per-accession FASTA files, run there, and removed again afterwards; the
# resulting *.fa files are moved to original_orthogroup_fasta/.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
formatter_script=fasta_formatter_general_with_an_orthofinder_list_whole_genome.py
mkdir -p "${orthogrouping_dir}/original_orthogroup_fasta"
cp "src/${formatter_script}" input/NLR_orthologs.csv "${orthogrouping_dir}/fasta/"
# Run in a subshell so the notebook's working directory is never changed, and
# stop before any python3/mv/rm if the FASTA directory cannot be entered.
(
  cd "${orthogrouping_dir}/fasta" || exit 1
  # *.fasta is deliberately unquoted: every per-accession FASTA is an input.
  python3 "${formatter_script}" -t NLR_orthologs.csv *.fasta
  # ../original_orthogroup_fasta is the sibling directory created above.
  mv -- *.fa ../original_orthogroup_fasta/
  rm -- "${formatter_script}" NLR_orthologs.csv
)

In [38]:
# Widen every gene interval by 500 bp on both sides (bedtools slop, with the
# scaffold lengths read from the .fai index) and extract the widened,
# strand-aware sequences.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
scaffold_dir=output/01_assembly/01_pansn-named
mkdir -p "${orthogrouping_dir}/bed_flanking_500" "${orthogrouping_dir}/fasta_flanking_500"
for accession in "${dl20[@]}"; do
  scaffolds_fasta="${scaffold_dir}/${accession}.scaffolds-v2.3.fasta"
  widened_bed="${orthogrouping_dir}/bed_flanking_500/${accession}_sorted_genes_500_flanking.bed"
  bedtools slop \
    -i "${orthogrouping_dir}/bed/${accession}_sorted_genes.bed" \
    -g "${scaffolds_fasta}.fai" \
    -b 500 \
    > "${widened_bed}"
  bedtools getfasta -s -name \
    -fi "${scaffolds_fasta}" \
    -bed "${widened_bed}" \
    > "${orthogrouping_dir}/fasta_flanking_500/${accession}_all_features_flanking_500.fasta"
done


In [39]:
# Output the original NLR orthogroups as orthogroup FASTA files with 500 bp of
# flanking sequence on either side of the gene.
# The formatter script and the ortholog table are copied next to the flanking
# FASTA files, run there, and removed again afterwards; the resulting *.fa
# files are moved to original_orthogroup_fasta_500bp_flanking/.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
formatter_script=fasta_formatter_general_with_an_orthofinder_list_whole_genome.py
mkdir -p "${orthogrouping_dir}/original_orthogroup_fasta_500bp_flanking"
cp "src/${formatter_script}" input/NLR_orthologs.csv "${orthogrouping_dir}/fasta_flanking_500/"
# Run in a subshell so the notebook's working directory is never changed, and
# stop before any python3/mv/rm if the FASTA directory cannot be entered.
(
  cd "${orthogrouping_dir}/fasta_flanking_500" || exit 1
  # *.fasta is deliberately unquoted: every per-accession FASTA is an input.
  python3 "${formatter_script}" -t NLR_orthologs.csv *.fasta
  # ../original_orthogroup_fasta_500bp_flanking is the sibling directory created above.
  mv -- *.fa ../original_orthogroup_fasta_500bp_flanking/
  rm -- "${formatter_script}" NLR_orthologs.csv
)

In [6]:
# Convert each accession's Liftoff annotation (GFF) to BED; only the mRNA rows
# of these BED files are used afterwards to pull out the FASTA sequences.
liftoff_gff_dir=output/01_assembly/01_pansn-named
liftoff_bed_dir=tmp/02_annotation/06_orthogrouping/bed_liftoff
mkdir -p "${liftoff_bed_dir}"
for accession in "${dl20[@]}"; do
  gff2bed \
    < "${liftoff_gff_dir}/${accession}.liftoff-v2.3.gff" \
    > "${liftoff_bed_dir}/${accession}_sorted.bed"
done


In [7]:
# Keep the Liftoff BED rows that contain "mRNA" and extract their strand-aware
# sequences from the scaffolds.
# NOTE(review): grep matches "mRNA" anywhere on the line, not only in the
# feature-type column — confirm that this is the intended selection.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
scaffold_dir=output/01_assembly/01_pansn-named
mkdir -p "${orthogrouping_dir}/fasta_liftoff"
for accession in "${dl20[@]}"; do
  liftoff_bed="${orthogrouping_dir}/bed_liftoff/${accession}_sorted.bed"
  mrna_bed="${orthogrouping_dir}/bed_liftoff/${accession}_sorted_mRNA.bed"
  grep mRNA "${liftoff_bed}" > "${mrna_bed}"
  bedtools getfasta -s -name \
    -fi "${scaffold_dir}/${accession}.scaffolds-v2.3.fasta" \
    -bed "${mrna_bed}" \
    > "${orthogrouping_dir}/fasta_liftoff/${accession}_all_features.fasta"
done

In [17]:
# Use our own Python editor (flanking_regions_bed_editor.py) on the gene BED
# files to get the flanking regions:
#   tmp/02_annotation/06_orthogrouping/bed/${accession}_sorted_genes.bed
# The script is called with -g <gene BED>, -f <scaffold .fai> and -m 500; its
# stdout becomes the new BED file.
# NOTE(review): going by the output directory name, -m 500 presumably caps the
# flank at 500 bp without overlapping neighbouring genes — confirm in the script.
# NOTE(review): the script is copied from the current directory here, whereas
# the formatter script used in other cells lives in src/ — confirm its location.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
mkdir -p "${orthogrouping_dir}/bed_with_max500_but_non-overlaping"
cp flanking_regions_bed_editor.py "${orthogrouping_dir}/bed/"

# Run in a subshell: the notebook's working directory stays at the project
# root, so no machine-specific absolute path is needed to get back, and the
# loop is skipped entirely if the BED directory cannot be entered.
(
  cd "${orthogrouping_dir}/bed" || exit 1
  for accession in "${dl20[@]}"; do
    python3 flanking_regions_bed_editor.py \
      -g "${accession}_sorted_genes.bed" \
      -f "../../../../output/01_assembly/01_pansn-named/${accession}.scaffolds-v2.3.fasta.fai" \
      -m 500 > "../bed_with_max500_but_non-overlaping/${accession}_sorted_genes_with_flanking.bed"
  done
)



In [19]:
# Extract strand-aware sequences for the gene intervals that carry the
# non-overlapping flanking regions.
# NOTE(review): the output directory is named "max100" although the input BED
# files come from bed_with_max500_but_non-overlaping — confirm which is meant.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
scaffold_dir=output/01_assembly/01_pansn-named
flanking_fasta_dir="${orthogrouping_dir}/fasta_with_max100_flanking_but_not_overlaping"
mkdir -p "${flanking_fasta_dir}"
for accession in "${dl20[@]}"; do
  flanking_bed="${orthogrouping_dir}/bed_with_max500_but_non-overlaping/${accession}_sorted_genes_with_flanking.bed"
  bedtools getfasta -s -name \
    -fi "${scaffold_dir}/${accession}.scaffolds-v2.3.fasta" \
    -bed "${flanking_bed}" \
    > "${flanking_fasta_dir}/${accession}_all_features_flanking.fasta"
done


In [20]:
# Output the original NLR orthogroups as orthogroup FASTA files built from the
# sequences with non-overlapping flanking regions.
# The formatter script and the ortholog table are copied next to those FASTA
# files, run there, and removed again afterwards.
orthogrouping_dir=tmp/02_annotation/06_orthogrouping
formatter_script=fasta_formatter_general_with_an_orthofinder_list_whole_genome.py
flanking_fasta_dir="${orthogrouping_dir}/fasta_with_max100_flanking_but_not_overlaping"
# Create the destination where mv below expects it, i.e. under
# tmp/02_annotation/06_orthogrouping/ rather than in the current directory.
mkdir -p "${orthogrouping_dir}/original_orthogroup_fasta_flanking_but_not_overlaping_500"
cp "src/${formatter_script}" input/NLR_orthologs.csv "${flanking_fasta_dir}/"
# Run in a subshell so the notebook's working directory is never changed, and
# stop before any python3/mv/rm if the FASTA directory cannot be entered.
(
  cd "${flanking_fasta_dir}" || exit 1
  # *.fasta is deliberately unquoted: every per-accession FASTA is an input.
  python3 "${formatter_script}" -t NLR_orthologs.csv *.fasta
  # The destination is the sibling directory created above.
  mv -- *.fa ../original_orthogroup_fasta_flanking_but_not_overlaping_500/
  rm -- "${formatter_script}" NLR_orthologs.csv
)