-
Notifications
You must be signed in to change notification settings - Fork 0
10_GENE_TABLES
eolesin edited this page Jul 1, 2021
·
46 revisions
Switched back over to kjempefuru for this purpose. Creating a table of gene coverages from each sample assembly to itself. The idea here is to get the coverages from each sample for genes and later match up these gene calls based on the KO number annotated for each. This will require pandas dataframe joining.
This will make one synthetic "bin".
for i in `cat AMOR_2020_Good`; do anvi-script-add-default-collection \
-c 03_INDIV_ASSEMBLY/${i}/${i}.prefixed.contigs.db \
-p 04_INDIV_PROFILES/06_PROFILES/${i}/${i}_to_${i}/PROFILE.db \
-C ALL_CONTIGS -b ALL_CONTIGS; done
for i in `cat AMOR_2020_Good`; do anvi-summarize \
-c 03_INDIV_ASSEMBLY/${i}/${i}.prefixed.contigs.db \
-p 04_INDIV_PROFILES/06_PROFILES/${i}/${i}_to_${i}/PROFILE.db \
-C ALL_CONTIGS -o 10_GENE_TABLES/${i} --init-gene-coverages \
--report-aa-seqs-for-gene-calls; done
# Gene coverages w. unique gene call number:
/export/dahlefs/work/Metagenomes_chimneys_2020_workfolder/10_GENE_TABLES/${sample}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_coverages.txt
# Gene calls w. unique gene call number found at:
/export/dahlefs/work/Metagenomes_chimneys_2020_workfolder/10_GENE_TABLES/${sample}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_calls.txt
# header like so:
gene_callers_id contig start stop direction KEGG_Class KEGG_Class (ACCESSION) KOfam KOfam (ACCESSION) KEGG_ModuleKEGG_Module (ACCESSION) Pfam Pfam (ACCESSION) dna_sequence aa_sequence
4. Edit the tables to include sample names as columns, and then to only consist of columns you want to process.
# add filename as a column
for i in `cat AMOR_2020_Good`; do words=$(wc -l < ${i}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_calls.txt);
yes ${i} | head -n ${words} > ${i}_out.txt;
paste ${i}_out.txt ${i}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_calls.txt > ${i}_gene_calls.txt ;
done
# get rid of the filepath info behind sample name.
for i in `cat AMOR_2020_Good`;
do sed -i 's#/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_calls.txt##g' ${i}_gene_calls.txt ;
done
# replace the header with SampleName instead of the name of the individual sample
for i in `cat AMOR_2020_Good`;
do sed '/'"${i}"'/{s//SampleName/;:p;n;bp}' ${i}_gene_calls.txt >> ${i}_short.txt;
done
# Fill blank fields with "NaN"
for j in `cat AMOR_2020_Good`;
do awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = "NaN" }; \
1' ${j}_short.txt > ${j}_full.txt;
done
# Join all the tables into one large table
awk '
FNR==1 && NR!=1 { while (/^<header>/) getline; }
1 {print}
' *full.txt >all_genes.txt
# put samplename as first column
for i in `cat AMOR_2020_Good`; do awk '{print FILENAME (NF?" ":"") $0}' \
${i}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_coverages.txt > ${i}_gene_coveragenew.txt ; done
# get rid of filepath
for i in `cat AMOR_2020_Good`; do sed -i 's#/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_coverages.txt##g' \
${i}_gene_coveragenew.txt ; done
# replace the header with column names
for i in `cat AMOR_2020_Good`; do sed -i '1s/.*/SampleName gene_callers_id coverage/'\
${i}_coverage_short.txt; done
paste all_gene.txt all_gene_coverages.txt > all_all.txt