# Assigning GO Slim Terms

In this notebook, I'll assign GO Slim terms to differentially expressed genes for *Zostera marina* and *Labyrinthula zosterae*. This will help with downstream interpretation of biological processes impacted by infection.

## 0. Set working directory

In [1]:
pwd

'/Users/yaaminivenkataraman/Documents/project-EWD-transcriptomics/scripts'

In [2]:
cd ../analyses/GO-MWU/DE-GO-MWU/

/Users/yaaminivenkataraman/Documents/project-EWD-transcriptomics/analyses/GO-MWU/DE-GO-MWU


## 1. Format differentially expressed gene lists

### *Z. marina*

In [46]:
#Check current file formats
!head 2019-07-15-Zostera-DEG-GOterms.tab

seq	term
TRINITY_DN296376_c0_g1	GO:0016407
TRINITY_DN314167_c0_g1	GO:0003993
TRINITY_DN298173_c0_g1	GO:0022804
TRINITY_DN299750_c3_g1	GO:0022804
TRINITY_DN312095_c1_g3	GO:0022804
TRINITY_DN231504_c0_g1	GO:0016411;GO:0008374
TRINITY_DN256326_c0_g1	GO:0016411;GO:0008374
TRINITY_DN292999_c3_g1	GO:0016411;GO:0008374
TRINITY_DN223830_c0_g1	GO:0033218


In [76]:
#Remove header line
# Sort file and save as new file
!tail -n +2 2019-07-15-Zostera-DEG-GOterms.tab \
| sort > 2019-07-15-Zostera-DEG-GOterms-noHead.tab

In [3]:
!head 2019-07-15-Zostera-DEG-GOterms-noHead.tab

TRINITY_DN102431_c0_g1	GO:0000049
TRINITY_DN104822_c0_g1	GO:0003677
TRINITY_DN111591_c0_g1	GO:0016829
TRINITY_DN111591_c0_g1	GO:0016836;GO:0016835
TRINITY_DN151705_c0_g1	GO:0003756;GO:0016864
TRINITY_DN151705_c0_g1	GO:0016853
TRINITY_DN151705_c0_g1	GO:0016860
TRINITY_DN172940_c0_g1	GO:0008168
TRINITY_DN172940_c0_g1	GO:0016741
TRINITY_DN175296_c0_g1	GO:0016491


In [43]:
#Add space after each ; delimiter and save as a few file
!sed 's/;/; /g' 2019-07-15-Zostera-DEG-GOterms-noHead.tab > Zostera_blast-annot.tab

In [44]:
#Check output
!head -20 Zostera_blast-annot.tab

TRINITY_DN102431_c0_g1	GO:0000049
TRINITY_DN104822_c0_g1	GO:0003677
TRINITY_DN111591_c0_g1	GO:0016829
TRINITY_DN111591_c0_g1	GO:0016836; GO:0016835
TRINITY_DN151705_c0_g1	GO:0003756; GO:0016864
TRINITY_DN151705_c0_g1	GO:0016853
TRINITY_DN151705_c0_g1	GO:0016860
TRINITY_DN172940_c0_g1	GO:0008168
TRINITY_DN172940_c0_g1	GO:0016741
TRINITY_DN175296_c0_g1	GO:0016491
TRINITY_DN175296_c0_g1	GO:0016616; GO:0016614
TRINITY_DN2119_c1_g1	GO:0005507
TRINITY_DN2119_c1_g1	GO:0046872; GO:0043169
TRINITY_DN2119_c1_g1	GO:0046914
TRINITY_DN223640_c0_g1	GO:0004518
TRINITY_DN223640_c0_g1	GO:0016788
TRINITY_DN223640_c0_g1	GO:0140098
TRINITY_DN223830_c0_g1	GO:0033218
TRINITY_DN226183_c0_g1	GO:0004372; GO:0016742
TRINITY_DN226183_c0_g1	GO:0016741


### *L. zosterae*

In [71]:
#Check current file formats
!head 2019-07-15-nonZostera-DEG-GOterms.tab

seq	term
TRINITY_DN274805_c0_g2	GO:0003779;GO:0008092
TRINITY_DN276416_c0_g1	GO:0005524;GO:0008144;GO:0032559;GO:0035639;GO:0030554
TRINITY_DN449830_c0_g1	GO:0005524;GO:0008144;GO:0032559;GO:0035639;GO:0030554
TRINITY_DN312726_c1_g4	GO:0008509
TRINITY_DN316900_c0_g1	GO:0004190;GO:0070001
TRINITY_DN251134_c1_g1	GO:0015085
TRINITY_DN259150_c0_g1	GO:0016836;GO:0016835
TRINITY_DN292324_c0_g1	GO:0004806;GO:0052689
TRINITY_DN277209_c0_g1	GO:0004180


In [72]:
#Remove header line and save as new file
!tail -n +2 2019-07-15-nonZostera-DEG-GOterms.tab > 2019-07-15-nonZostera-DEG-GOterms-noHead.tab

In [34]:
!head 2019-07-15-nonZostera-DEG-GOterms-noHead.tab

TRINITY_DN274805_c0_g2	GO:0003779;GO:0008092
TRINITY_DN276416_c0_g1	GO:0005524;GO:0008144;GO:0032559;GO:0035639;GO:0030554
TRINITY_DN449830_c0_g1	GO:0005524;GO:0008144;GO:0032559;GO:0035639;GO:0030554
TRINITY_DN312726_c1_g4	GO:0008509
TRINITY_DN316900_c0_g1	GO:0004190;GO:0070001
TRINITY_DN251134_c1_g1	GO:0015085
TRINITY_DN259150_c0_g1	GO:0016836;GO:0016835
TRINITY_DN292324_c0_g1	GO:0004806;GO:0052689
TRINITY_DN277209_c0_g1	GO:0004180
TRINITY_DN22073_c0_g1	GO:0005261


In [35]:
#Add space after each ; delimiter and save as a few file
!sed 's/;/; /g' 2019-07-15-nonZostera-DEG-GOterms-noHead.tab > nonZostera_blast-annot.tab

In [36]:
#Check output
!head -20 nonZostera_blast-annot.tab

TRINITY_DN274805_c0_g2	GO:0003779; GO:0008092
TRINITY_DN276416_c0_g1	GO:0005524; GO:0008144; GO:0032559; GO:0035639; GO:0030554
TRINITY_DN449830_c0_g1	GO:0005524; GO:0008144; GO:0032559; GO:0035639; GO:0030554
TRINITY_DN312726_c1_g4	GO:0008509
TRINITY_DN316900_c0_g1	GO:0004190; GO:0070001
TRINITY_DN251134_c1_g1	GO:0015085
TRINITY_DN259150_c0_g1	GO:0016836; GO:0016835
TRINITY_DN292324_c0_g1	GO:0004806; GO:0052689
TRINITY_DN277209_c0_g1	GO:0004180
TRINITY_DN22073_c0_g1	GO:0005261
TRINITY_DN251134_c1_g1	GO:0005261
TRINITY_DN22073_c0_g1	GO:0005216; GO:0022838; GO:0015267; GO:0022803
TRINITY_DN251134_c1_g1	GO:0005216; GO:0022838; GO:0015267; GO:0022803
TRINITY_DN314831_c1_g5	GO:0008234
TRINITY_DN314232_c4_g4	GO:0003677
TRINITY_DN316900_c0_g1	GO:0004175
TRINITY_DN277209_c0_g1	GO:0008238
TRINITY_DN293894_c0_g2	GO:0008238
TRINITY_DN312093_c0_g3	GO:0003924
TRINITY_DN293386_c0_g1	GO:0017111; GO:0016462; GO:0016818; GO:0016817


## 2. Match to GO Slim terms

### *Z. marina*

In [45]:
%%bash 

# This script was originally written to address a specific problem that Rhonda was having

# input_file is the initial, "problem" file
# file is an intermediate file that most of the program works upon
# output_file is the final file produced by the script
input_file="Zostera_blast-annot.tab"
file="Zostera_intermediate.file"
output_file="Zostera_blast-GO-unfolded.tab"

# sed command substitutes the "; " sequence to a tab and writes the new format to a new file.
# This character sequence is how the GO terms are delimited in their field.
sed $'s/; /\t/g' "$input_file" > "$file"

# Identify first field containing a GO term.
# Search file with grep for "GO:" and pipe to awk.
# Awk sets tab as field delimiter (-F'\t'), runs a for loop that looks for "GO:" (~/GO:/), and then prints the field number).
# Awk results are piped to sort, which sorts unique by number (-ug).
# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; "-n1").
begin_goterms=$(grep "GO:" "$file" | awk -F'\t' '{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}' | sort -ug | head -n1)

# While loop to process each line of the input file.
while read -r line
	do
	
	# Send contents of the current line to awk.
	# Set the field separator as a tab (-F'\t') and print the number of fields in that line.
	# Save the results of the echo/awk pipe (i.e. number of fields) to the variable "max_field".
	max_field=$(echo "$line" | awk -F'\t' '{print NF}')

	# Send contents of current line to cut.
	# Cut fields (i.e. retain those fields) 1-12.
	# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable "fixed_fields"
	fixed_fields=$(echo "$line" | cut -f1-2)

	# Since not all the lines contain the same number of fields (e.g. may not have GO terms),
	# evaluate the number of fields in each line to determine how to handle current line.

	# If the value in max_field is less than the field number where the GO terms begin,
	# then just print the current line (%s) followed by a newline (\n).
	if (( "$max_field" < "$begin_goterms" ))
		then printf "%s\n" "$line"
			else

			# Send contents of current line (which contains GO terms) to cut.
			# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.
			# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable "goterms".
			goterms=$(echo "$line" | cut -f"$begin_goterms"-"$max_field")
			
			# Assign values in the variable "goterms" to a new indexed array (called "array"), 
			# with tab delimiter (IFS=$'\t')
			IFS=$'\t' read -r -a array <<<"$goterms"
			
			# Iterate through each element of the array.
			# Print the first 12 fields (i.e. the fields stored in "fixed_fields") followed by a tab (%s\t).
			# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\n).
			for element in "${!array[@]}"	
				do printf "%s\t%s\n" "$fixed_fields" "${array[$element]}"
			done
	fi

# Send the input file into the while loop and send the output to a file named "rhonda_fixed.txt".
done < "$file" > "$output_file"

In [46]:
#It was unfolded correctly but the second column is not correct
!head -20 Zostera_blast-GO-unfolded.tab

TRINITY_DN102431_c0_g1	GO:0000049	GO:0000049
TRINITY_DN104822_c0_g1	GO:0003677	GO:0003677
TRINITY_DN111591_c0_g1	GO:0016829	GO:0016829
TRINITY_DN111591_c0_g1	GO:0016836	GO:0016836
TRINITY_DN111591_c0_g1	GO:0016836	GO:0016835
TRINITY_DN151705_c0_g1	GO:0003756	GO:0003756
TRINITY_DN151705_c0_g1	GO:0003756	GO:0016864
TRINITY_DN151705_c0_g1	GO:0016853	GO:0016853
TRINITY_DN151705_c0_g1	GO:0016860	GO:0016860
TRINITY_DN172940_c0_g1	GO:0008168	GO:0008168
TRINITY_DN172940_c0_g1	GO:0016741	GO:0016741
TRINITY_DN175296_c0_g1	GO:0016491	GO:0016491
TRINITY_DN175296_c0_g1	GO:0016616	GO:0016616
TRINITY_DN175296_c0_g1	GO:0016616	GO:0016614
TRINITY_DN2119_c1_g1	GO:0005507	GO:0005507
TRINITY_DN2119_c1_g1	GO:0046872	GO:0046872
TRINITY_DN2119_c1_g1	GO:0046872	GO:0043169
TRINITY_DN2119_c1_g1	GO:0046914	GO:0046914
TRINITY_DN223640_c0_g1	GO:0004518	GO:0004518
TRINITY_DN223640_c0_g1	GO:0016788	GO:0016788


In [47]:
#Only retain the first and third columns
#Sort
#Save as a new file
!awk '{print $1"\t"$3}' Zostera_blast-GO-unfolded.tab \
| sort \
> Zostera_blast-GO-unfolded-correct.tab

In [48]:
#Confirm output
!head Zostera_blast-GO-unfolded-correct.tab

TRINITY_DN102431_c0_g1	GO:0000049
TRINITY_DN104822_c0_g1	GO:0003677
TRINITY_DN111591_c0_g1	GO:0016829
TRINITY_DN111591_c0_g1	GO:0016835
TRINITY_DN111591_c0_g1	GO:0016836
TRINITY_DN151705_c0_g1	GO:0003756
TRINITY_DN151705_c0_g1	GO:0016853
TRINITY_DN151705_c0_g1	GO:0016860
TRINITY_DN151705_c0_g1	GO:0016864
TRINITY_DN172940_c0_g1	GO:0008168


### *L. zosterae*

In [38]:
#It was unfolded correctly but the second column is not correct
!head -20 nonZostera_blast-annot.tab

TRINITY_DN274805_c0_g2	GO:0003779; GO:0008092
TRINITY_DN276416_c0_g1	GO:0005524; GO:0008144; GO:0032559; GO:0035639; GO:0030554
TRINITY_DN449830_c0_g1	GO:0005524; GO:0008144; GO:0032559; GO:0035639; GO:0030554
TRINITY_DN312726_c1_g4	GO:0008509
TRINITY_DN316900_c0_g1	GO:0004190; GO:0070001
TRINITY_DN251134_c1_g1	GO:0015085
TRINITY_DN259150_c0_g1	GO:0016836; GO:0016835
TRINITY_DN292324_c0_g1	GO:0004806; GO:0052689
TRINITY_DN277209_c0_g1	GO:0004180
TRINITY_DN22073_c0_g1	GO:0005261
TRINITY_DN251134_c1_g1	GO:0005261
TRINITY_DN22073_c0_g1	GO:0005216; GO:0022838; GO:0015267; GO:0022803
TRINITY_DN251134_c1_g1	GO:0005216; GO:0022838; GO:0015267; GO:0022803
TRINITY_DN314831_c1_g5	GO:0008234
TRINITY_DN314232_c4_g4	GO:0003677
TRINITY_DN316900_c0_g1	GO:0004175
TRINITY_DN277209_c0_g1	GO:0008238
TRINITY_DN293894_c0_g2	GO:0008238
TRINITY_DN312093_c0_g3	GO:0003924
TRINITY_DN293386_c0_g1	GO:0017111; GO:0016462; GO:0016818; GO:0016817


In [31]:
#Only retain the first and third columns
#Sort
#Save as a new file
!awk '{print $1"\t"$3}' Zostera_blast-GO-unfolded.tab \
| sort \
> Zostera_blast-GO-unfolded-correct.tab

In [32]:
#Confirm output
!head Zostera_blast-GO-unfolded-correct.tab

TRINITY_DN102431_c0_g1	GO:0000049
TRINITY_DN104822_c0_g1	GO:0003677
TRINITY_DN111591_c0_g1	GO:0016829
TRINITY_DN111591_c0_g1	GO:0016835
TRINITY_DN111591_c0_g1	GO:0016836
TRINITY_DN151705_c0_g1	GO:0003756
TRINITY_DN151705_c0_g1	GO:0016853
TRINITY_DN151705_c0_g1	GO:0016860
TRINITY_DN151705_c0_g1	GO:0016864
TRINITY_DN172940_c0_g1	GO:0008168
