# Download all protein sequences in orthoDB v11 for one/multiple species

1. odb11v0_OG2genes.tab.gz was downloaded from https://data.orthodb.org/download/
wget https://data.orthodb.org/download/odb11v0_OG2genes.tab.gz
2. get the OG ids that are related to oil palm (taxid 51953)
# Keep only rows whose gene id (column 2) starts with "51953_0:"; a bare grep for "51953_0" would also match other taxa such as "151953_0:".
zcat odb11v0_OG2genes.tab.gz | awk -F'\t' '$2 ~ /^51953_0:/ {print $1}' | sort -u > OG_51953_0.tab
3. Download the oil palm protein sequences from the clusters that contain oil palm sequences
nohup sh download.sh > logfile 2>&1 < /dev/null &

download.sh:
#!/bin/bash
# For every orthologous-group id listed in OG_51953_0.tab, request the FASTA
# records for species 51953 and append the response to one combined file.
# Failed ids are reported on stderr and the loop carries on with the next id.
output_file="combined_data.tsv"   # NOTE: the content is FASTA despite the .tsv suffix

# "|| [ -n "$line" ]" also processes a final line that lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
    # Skip blank lines so no request is sent with an empty id.
    [ -z "$line" ] && continue
    # --fail: on an HTTP error curl writes no body and exits non-zero, so
    # server error pages are not appended into the FASTA output.
    # -k (kept from the original) disables TLS certificate verification;
    # remove it if the server certificate validates on this machine.
    if ! curl --fail -L -k "https://data.orthodb.org/current/fasta?id=${line}&species=51953" >> "$output_file"; then
        echo "download failed for ${line}" >&2
    fi
done < "OG_51953_0.tab"

## Parse search.dat and orthologs.dat

In [23]:
import json

# Locations of the two JSON files read by this cell.
file_path = "/bioinfo/palm/ref/orthoDBv11/search.dat"
ortholog_path = "/bioinfo/palm/ref/orthoDBv11/orthologs.dat"

# Decode the search result file into a Python object.
with open(file_path, "r") as json_file:
    data_dict = json.loads(json_file.read())

# Decode the orthologs file the same way.
with open(ortholog_path, "r") as json_file:
    ortholog_dict = json.loads(json_file.read())

# Parse the downloaded protein sequences to remove the redundant ones

In [37]:
from Bio import SeqIO
import json

# sequence_dic: record id -> last sequence seen for that id
# tag_dic:      gene -> cluster, filled only when a repeated id has a different sequence
# level_dic:    declared here but not filled by this cell
sequence_dic = {}
tag_dic = {}
level_dic = {}

combined_fasta = '/bioinfo/palm/ref/orthoDBv11/combined_data.tsv'
with open('/bioinfo/palm/ref/orthoDBv11/oil_palm_protein.fa', 'w') as fh:
    for record in SeqIO.parse(combined_fasta, 'fasta'):
        if record.id not in sequence_dic:
            # First time this id is seen: write it out unchanged.
            SeqIO.write(record, fh, 'fasta')
        elif record.seq != sequence_dic[record.id]:
            # Same id with a different sequence: the header is
            # "<gene> <json>", and the JSON carries the cluster id that is
            # appended to the record id before writing.
            gene, description = record.description.split(' ', 1)
            description = description.strip()
            description_dict = json.loads(description)
            cluster = description_dict['pub_og_id']
            record.id = '_'.join([record.id, cluster])
            tag_dic[gene] = cluster
            SeqIO.write(record, fh, 'fasta')
            # Nothing was printed on the real data, i.e. a gene's protein
            # sequence stayed the same no matter the node.
            print(gene)
        # Remember the sequence under the (possibly renamed) record id.
        sequence_dic[record.id] = record.seq
        

# Now a simpler version of the code, relying on the finding that the protein sequence of a given gene is the same no matter the node

In [44]:
from Bio import SeqIO
import json

# sequence_dic: record id -> sequence, used to write each id only once
# level_dic:    level (text after 'at' in pub_og_id) -> list of genes seen there
sequence_dic = {}
level_dic = {}

with open('/bioinfo/palm/ref/orthoDBv11/oil_palm_protein.fa', 'w') as fh:
    for record in SeqIO.parse('/bioinfo/palm/ref/orthoDBv11/combined_data.tsv', 'fasta'):
        # The header is "<gene> <json>"; the JSON part carries 'pub_og_id'.
        gene, description = record.description.split(' ', 1)
        description = description.strip()
        description_dict = json.loads(description)
        cluster = description_dict['pub_og_id']

        # Group the gene under the part of the cluster id that follows 'at'.
        level = cluster.split('at')[1]
        level_dic.setdefault(level, []).append(gene)

        # Each record id is written once; later repeats are skipped.
        if record.id in sequence_dic:
            continue
        SeqIO.write(record, fh, 'fasta')
        sequence_dic[record.id] = record.seq
        
        

In [45]:
# Number of distinct entries stored under each key of level_dic.
lengths_dict = {level: len(set(genes)) for level, genes in level_dic.items()}

print(lengths_dict)

{'2759': 25657, '33090': 25541, '3193': 25459, '4447': 24754}


# This means each gene is assigned to at most one cluster at each level.

# Then we used orthomapper to map all dura proteins to orthodb v11. Results are in /bioinfo/tools/orthologer/Results

# Now let's get the protein sequences that weren't placed in any OrthoDB cluster (14632).

# Note that if a gene was not placed in orthoDBv11 by orthomapper, it won't be found in the reciprocal mapping between dura and EG5inOrthoDBv11 either.

# Now what do we do? Map to EG5 just to see what they could have done?

In [19]:
import pandas as pd
from Bio import SeqIO

# Annotation table; its '#query' column is renamed to 'query'.
file_path = 'dura.og.annotations'
df = pd.read_csv(file_path, sep='\t', header=0)
df.rename(columns={'#query': 'query'}, inplace=True)
# Headerless table; column 0 is compared against the protein ids below.
df_eg5 = pd.read_csv('dura_EG5_orthoDB11.tbl', sep='\t', header=None)

# Build both lookup sets once so each membership test is O(1).
annotated_ids = set(df['query'])
# `x in Series` tests the index labels, not the values, so the original
# `record.id in df_eg5[0]` could never match a protein id. Compare against
# the column values instead.
eg5_ids = set(df_eg5[0])

i = 0  # number of proteins written
with open('dura_proteins_not_in_orthodbv11.fasta', 'w') as fh:
    for record in SeqIO.parse('../dura/dura_proteins.fasta', 'fasta'):
        if record.id in annotated_ids:
            continue
        # Report proteins that are absent from the annotations but present
        # in column 0 of the EG5 table.
        if record.id in eg5_ids:
            print(record.id)
        SeqIO.write(record, fh, 'fasta')
        i += 1
print(i)


14632


# Merge the orthomapper results from all three levels, with priority to lower levels.

In [8]:
import pandas as pd
from Bio import SeqIO

# The three annotation tables, in order of priority (file1 wins over file2,
# which wins over file3).
file1 = 'dura4447_Results/dura4447.og.annotations'
file2 = 'dura3193_Results/dura3193.og.annotations'
file3 = 'dura33090_Results/dura33090.og.annotations'

# Load each table and rename its '#query' column to 'query'.
df1 = pd.read_csv(file1, sep='\t', header=0)
df1.rename(columns={'#query': 'query'}, inplace=True)
df2 = pd.read_csv(file2, sep='\t', header=0)
df2.rename(columns={'#query': 'query'}, inplace=True)
df3 = pd.read_csv(file3, sep='\t', header=0)
df3.rename(columns={'#query': 'query'}, inplace=True)

# Add from file2 only the queries that file1 does not already contain.
already_in_1 = df2['query'].isin(df1['query'])
df2_new = df2.loc[~already_in_1]
df12 = pd.concat([df1, df2_new], ignore_index=True)

# Add from file3 only the queries missing from the combined file1 + file2 table.
already_in_12 = df3['query'].isin(df12['query'])
df3_new = df3.loc[~already_in_12]
df123 = pd.concat([df12, df3_new], ignore_index=True)

df123.to_csv('dura_orthoDBv11_4447_3193_33090.og.annotations', sep='\t', index=False)

# Now we need to go from gene accession to clusterID, then dura gene ID.
Play with the API

In [45]:
import json
import shlex
import subprocess

import pandas as pd
from Bio import SeqIO

# Query the OrthoDB search API for one gene name at one level, then write the
# dura proteins whose annotated cluster (ODB_OG) is among the returned ids.
gene_name = 'ABI5'
level = 4447

search_file = "{}_at{}.dat".format(gene_name, level)
url = 'https://data.orthodb.org/current/search?query={}&level={}'.format(gene_name, level)
# Pass curl an argument list (no shell) so the URL's '&' and '?' need no
# quoting and nothing in gene_name can be interpreted by a shell.
cmd_args = ['curl', url, '-L', '-o', search_file]
# Keep a printable, shell-quoted form of the command for the log.
cmd = ' '.join(shlex.quote(arg) for arg in cmd_args)
print(cmd)
subprocess.run(cmd_args, check=True)

with open(search_file, "r") as json_file:
    data_dict = json.load(json_file)

anno_file = 'dura_orthoDBv11_4447_3193_33090.og.annotations'
df = pd.read_csv(anno_file, sep='\t', header=0)
# Non-inplace rename returns a new frame, which avoids modifying a filtered
# slice of df in place (pandas SettingWithCopyWarning).
selected_rows = df[df['ODB_OG'].isin(data_dict['data'])].rename(columns={'#query': 'query'})
# Build the id set once so each membership test is O(1).
selected_ids = set(selected_rows['query'])

i = 0  # number of proteins written
protein_file = '/bioinfo/palm/ref/dura/dura_proteins.fasta'
with open('dura_proteins_{}_at{}.aa'.format(gene_name, level), 'w') as fh:
    for record in SeqIO.parse(protein_file, 'fasta'):
        if record.id not in selected_ids:
            continue
        is_hit = selected_rows['query'] == record.id
        cluster = selected_rows.loc[is_hit, 'ODB_OG']
        tag = selected_rows.loc[is_hit, 'Description']
        if len(tag) == 1:
            # Header description becomes "<cluster>:<description>".
            record.description = ':'.join([cluster.iloc[0], tag.iloc[0]])
        else:
            # The record is still written below, with its original description.
            print(record.id + ' appeared in more than one cluster?')

        SeqIO.write(record, fh, 'fasta')
        i += 1
print(i)

curl 'https://data.orthodb.org/current/search?query=ABI5&level=4447' -L -o ABI5_at4447.dat
9


In [37]:
# Number of selected rows whose query is 'Egu023084-mRNA-1'.
int((selected_rows['query'] == 'Egu023084-mRNA-1').sum())

1

In [39]:
# NOTE(review): `f` is not assigned anywhere in these notes; presumably it is a
# DataFrame left over from a cell that was not kept. Confirm before re-running.
f

Unnamed: 0,query,ODB_OG,evalue,score,COG_category,Description,GOs_mf,GOs_bp,EC,KEGG_ko,Interpro
1739,Egu024817-mRNA-1,16748at4447,5.3599999999999996e-263,195.9,-,RNA-binding protein 10 isoform X1,-,-,"3.1.3.12,3.1.4.4","ko00500,ko00564,ko00565,ko03010,ko04144,osa011...","IPR000467,IPR041591"
6466,Egu007769-mRNA-1,56001at4447,3.11e-180,172.15,T,E3 ubiquitin-protein ligase AIP2,-,"GO:0016567,GO:0009737,GO:0009788",3.1.3.2,ko00905,"IPR001841,IPR013083"
21018,Egu004902-mRNA-1,128216at4447,7e-66,175.62,"O,M",Prefoldin subunit 4,GO:0051082,-,-,"ko04120,sita04016","IPR002777,IPR009053,IPR016661"
21494,Egu002268-mRNA-1,129070at4447,0.0,170.907,-,FRIGIDA-like protein,-,GO:0030154,2.1.1.-,"ko03018,ko00270,ko00920,ko01200,ko01230,ko0301...",IPR012474
21495,Egu023084-mRNA-1,129070at4447,1.12e-277,180.37,-,FRIGIDA-like protein,-,GO:0030154,2.1.1.-,"ko03018,ko00270,ko00920,ko01200,ko01230,ko0301...",IPR012474
23299,Egu019493-mRNA-1,132091at4447,0.0,100.0,-,Regulatory protein viviparous-1,-,GO:0009737,1.11.1.7,"ko00010,ko00710,ko00910,ko00940,ko01200,ko0123...","IPR003340,IPR015300,IPR044800"
23300,Egu019493-mRNA-2,132091at4447,5.6e-232,106.54,-,Regulatory protein viviparous-1,-,GO:0009737,1.11.1.7,"ko00010,ko00710,ko00910,ko00940,ko01200,ko0123...","IPR003340,IPR015300,IPR044800"
26837,Egu029266-mRNA-1,138207at4447,0.0,99.1897,"J,K,T","Zinc finger, RanBP2-type",GO:0046872,-,-,-,"IPR000467,IPR000504,IPR001876,IPR012677,IPR035..."
26838,Egu029266-mRNA-2,138207at4447,0.0,99.1791,"J,K,T","Zinc finger, RanBP2-type",GO:0046872,-,-,-,"IPR000467,IPR000504,IPR001876,IPR012677,IPR035..."
26839,Egu029266-mRNA-3,138207at4447,0.0,158.08,"J,K,T","Zinc finger, RanBP2-type",GO:0046872,-,-,-,"IPR000467,IPR000504,IPR001876,IPR012677,IPR035..."


In [13]:
# Row count of the merged annotation table.
df123.shape[0]

36026

In [14]:
# Rows of df123 whose 'query' value already appeared in an earlier row.
duplicates = df123.loc[df123.duplicated(subset='query')]


In [1]:
import pandas as pd

# Input tables: the merged annotations (tab-separated) and the TPM table
# (comma-separated); the first line of each file is the header.
anno_file = 'dura_orthoDBv11_4447_3193_33090.og.annotations'
tpm_file = 'counts_transcript_TPM_concise.txt'

df = pd.read_csv(anno_file, delimiter='\t', header=0)
tpm = pd.read_csv(tpm_file, header=0)


In [3]:
# Keep only the id and description columns, then outer-join with the TPM
# table on 'query' so rows present in either table are retained.
df_1 = df.loc[:, ['query', 'Description']]
df_2 = df_1.merge(tpm, how='outer', on='query')

In [4]:
# Save the merged table as CSV without the row index.
df_2.to_csv(path_or_buf='output.csv', index=False)