-
Notifications
You must be signed in to change notification settings - Fork 0
10_GENE_TABLES
eolesin edited this page Aug 26, 2021
·
46 revisions
Switched back over to kjempefuru for this purpose. Creating a table of gene coverages from each sample assembly to itself. The idea here is to get the coverages from each sample for genes and later match up these gene calls based on the KO number annotated for each. This will require pandas dataframe joining.
This will make one synthetic "bin".
for i in `cat AMOR_2020_Good`; do anvi-script-add-default-collection \
-c 03_INDIV_ASSEMBLY/${i}/${i}.prefixed.contigs.db \
-p 04_INDIV_PROFILES/06_PROFILES/${i}/${i}_to_${i}/PROFILE.db \
-C ALL_CONTIGS -b ALL_CONTIGS; done
for i in `cat AMOR_2020_Good`; do anvi-summarize \
-c 03_INDIV_ASSEMBLY/${i}/${i}.prefixed.contigs.db \
-p 04_INDIV_PROFILES/06_PROFILES/${i}/${i}_to_${i}/PROFILE.db \
-C ALL_CONTIGS -o 10_GENE_TABLES/${i} --init-gene-coverages \
--report-aa-seqs-for-gene-calls; done
# Gene coverages w. unique gene call number:
/export/dahlefs/work/Metagenomes_chimneys_2020_workfolder/10_GENE_TABLES/${sample}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_coverages.txt
# Gene calls w. unique gene call number found at:
/export/dahlefs/work/Metagenomes_chimneys_2020_workfolder/10_GENE_TABLES/${sample}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_calls.txt
# header like so:
gene_callers_id contig start stop direction KEGG_Class KEGG_Class (ACCESSION) KOfam KOfam (ACCESSION) KEGG_ModuleKEGG_Module (ACCESSION) Pfam Pfam (ACCESSION) dna_sequence aa_sequence
# COVERAGE TABLE
# Remove the header from all coverage files.
for i in `cat AMOR_2020_Good`; do tail -n +2 ${i}_gene_coverages.txt > ${i}_gene_coveragenew.txt; done
# add sample names as a column
for i in `cat AMOR_2020_Good`; do words=$(wc -l < ${i}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_calls.txt);
yes ${i} | head -n ${words} > ${i}_out.txt;
paste ${i}_out.txt ${i}_gene_coveragenew.txt > ${i}_gene_covint.txt ;
done
######################
# GENE CALLS TABLE
# fill in blanks with "NaN". This will hopefully prevent sliding around of columns later.
for i in `cat AMOR_2020_Good`;
awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = "NaN" }1' ${i}/bin_by_bin/ALL_CONTIGS/ALL_CONTIGS-gene_calls.txt > ${i}_full.txt;
done
# Make a separate file that includes just the header as header.txt
# Remove the header from all gene_calls.txt files.
for i in `cat AMOR_2020_Good`; do tail -n +2 ${i}_gene_calls.txt > ${i}_gene_int.txt; done
# Combine coverage and gene tables for each sample
# include only columns we care about.
for i in `cat AMOR_2020_Good`;
do
awk 'BEGIN { FS = OFS = "\t" } {print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13}' \
${i}_gene_int.txt > ${i}_gene_int_squash.txt;
paste ${i}_gene_covint.txt ${i}_gene_int_squash.txt > ${i}_everything.txt;
done
# Concat all the gene tables
cat *_everything.txt > everything_nohead.txt
# Handle the header
awk 'BEGIN { FS = OFS = "\t" } {print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13}' header.txt > squash_header.txt
paste cov_header.txt squash_header.txt > final_header.txt
# Add the final header back on.
cat final_header.txt everything_nohead.txt > all_final.txt
# Add unique identifier at the front of each line.
awk 'BEGIN { FS = OFS = "\t" } {
if(NR==1)
{
print "UNIQID", $s;
}
else
{
print "UNIQ"NR, $s;
}}' all_final.txt > all_final_w_ID.txt
# Print the totals of coverages from each sample
tail all_final.txt -n+2 | awk -F '\t' '{a[$1]+= $3} END{for (i in a) print i, a[i]}' > coverage_totals.txt
# Make a table that just represents knowns
awk -v OFS="\t" '$1=$1' coverage_totals_justknown.txt > coverage_totals_justknown2.txt # replace annoying space separators with tabs.
sort coverage_totals_justknown.txt | join - coverage_totals.txt | uniq > cov_all_test # sort and join with totals table.
# Make a file that will become a new column containing the total coverage per sample in each line of the
# main gene info file. Håkon helped figure this out.
cut -f2 all_final_w_ID.txt > outtest.txt
while read a b; do sed "s/$a/$b/" outtest.txt > outnew.tmp; echo $a; mv outnew.tmp outtest.txt; done < coverage_totals.txt
sed 's/SampleName/coverage_total/g' outtest.txt > outtest2.txt
# from there we can paste this into the main file.
paste all_final_w_ID.txt outtest2.txt > final_w_totals.txt
cut -f1,2,
awk -F '\t' '(NR>1) {$10 = ($3/$9); print}' final_for_R.txt > final_for_R2.txt
Edit the table to exclude unknowns and evaluate a decent cutoff for % abundance to include
awk '{if ($5 !="NaN" || $6 !="NaN" || $7 !="NaN" || $8 !="NaN") print $0}' \
final_for_R2_sorted.txt > for_R_justknown.txt
# Remove the header from all gene_calls.txt files.
for i in `cat AMOR_2020_Good`; do tail -n +2 ${i}_full.txt > ${i}_gene_int.txt; done
# print just the bits we need
for i in `cat AMOR_2020_Good`;
do awk 'BEGIN { FS = OFS = "\t" } \
{print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $15}'\
${i}_gene_int.txt > ${i}_gene_int_squash.txt;
paste ${i}_gene_covint.txt ${i}_gene_int_squash.txt > ${i}_everything.txt;
done
#Concatenate gene table files together
cat *_everything.txt > everything_nohead.txt
# Remove last line, which curiously has no entry
sed -i '$ d' everything_nohead.txt
# Keep only names in header that still exist in table
awk 'BEGIN { FS = OFS = "\t" } \
{print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $15}' \
header.txt > squash_header.txt
# make final header
paste cov_header.txt squash_header.txt > final_header.txt
# put header on file
cat final_header.txt everything_nohead.txt > all_final.txt
# Create unique ID for each entry
1086 awk 'BEGIN { FS = OFS = "\t" } {
if(NR==1)
{
print "UNIQID", $s;
}
else
{
print "UNIQ"NR, $s;
}}' all_final.txt > all_final_w_ID.txt
# Get rid of lines that have no amino acid sequence
awk '{if ($18 !="NaN") print $0}' all_final_w_ID.txt > Has_AA.txt
# Check how many entries you have.
wc -l Has_AA.txt