# Download all protein sequences in orthoDB v11 for one/multiple species

1. odb11v0_OG2genes.tab.gz was downloaded from https://data.orthodb.org/download/
wget https://data.orthodb.org/download/odb11v0_OG2genes.tab.gz
2. get the OG ids that are related to oil palm (taxid 51953)
zcat odb11v0_OG2genes.tab.gz | awk -F'\t' '$2 ~ /^51953_0:/ {print $1}' | sort -u > OG_51953_0.tab  # match the taxid only at the start of the gene-id column, so e.g. 151953_0 is not picked up
3. Download the oil palm protein sequences from the clusters that contain oil palm sequences
nohup sh download.sh > logfile 2>&1 < /dev/null &

download.sh:
#!/bin/bash
# Fetch the species-51953 FASTA for every cluster id listed in OG_51953_0.tab
# and append each response to one combined output file.
output_file="combined_data.tsv"

# `|| [ -n "$line" ]` keeps the last id even when the list lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
    # Skip blank lines so an empty cluster id is never requested.
    [ -n "$line" ] || continue
    # --fail makes curl exit non-zero on an HTTP error instead of appending the
    # error body to the FASTA output; the failed id is reported on stderr.
    # NOTE(review): -k disables TLS certificate verification - drop it if the
    # host's certificate validates in this environment.
    if ! curl --fail --silent --show-error -L -k \
        "https://data.orthodb.org/current/fasta?id=$line&species=51953" >> "$output_file"; then
        echo "download failed for cluster $line" >&2
    fi
done < "OG_51953_0.tab"

## Parse search.dat

In [23]:
import json

# Paths of the two files that are decoded as JSON below.
file_path = "/bioinfo/palm/ref/orthoDBv11/search.dat"
ortholog_path = "/bioinfo/palm/ref/orthoDBv11/orthologs.dat"

# Read each file in full and decode its text into a Python object.
with open(file_path, "r") as search_handle:
    data_dict = json.loads(search_handle.read())

with open(ortholog_path, "r") as ortholog_handle:
    ortholog_dict = json.loads(ortholog_handle.read())

# Parse the downloaded protein sequences to remove the redundant ones

In [37]:
from Bio import SeqIO
import json

# sequence_dic: record id -> most recently seen sequence for that id
# tag_dic:      gene -> cluster id, filled only when a conflicting sequence is found
# level_dic:    declared here but not used in this cell
sequence_dic = {}
tag_dic = {}
level_dic = {}
with open('/bioinfo/palm/ref/orthoDBv11/oil_palm_protein.fa','w') as fh:
    for record in SeqIO.parse('/bioinfo/palm/ref/orthoDBv11/combined_data.tsv','fasta'):
        if record.id not in sequence_dic:
            # First occurrence of this id: write it out unchanged.
            SeqIO.write(record, fh, 'fasta')
        elif record.seq != sequence_dic[record.id]:
            # The id was already written but with a different sequence:
            # write this copy too, with the cluster id appended to its id.
            header_parts = record.description.split(' ', 1)
            gene, description = header_parts
            # The text after the first space is decoded as JSON.
            description_dict = json.loads(description.strip())
            cluster = description_dict['pub_og_id']
            record.id = record.id + '_' + cluster
            tag_dic[gene] = cluster
            SeqIO.write(record, fh, 'fasta')
            # Nothing was printed when this was run, i.e. the protein sequence
            # of a gene was the same at every node.
            print(gene)
        # Remember the sequence under the (possibly renamed) id.
        sequence_dic[record.id] = record.seq
        

# Now a simpler version, given that the protein sequence of a gene stays the same no matter the node

In [44]:
from Bio import SeqIO
import json

# sequence_dic: record id -> sequence, used to write each id only once
# level_dic:    text after 'at' in the cluster id -> list of genes seen with it
sequence_dic = {}
level_dic = {}
with open('/bioinfo/palm/ref/orthoDBv11/oil_palm_protein.fa','w') as fh:
    for record in SeqIO.parse('/bioinfo/palm/ref/orthoDBv11/combined_data.tsv','fasta'):
        header_parts = record.description.split(' ', 1)
        gene, description = header_parts
        # The text after the first space is decoded as JSON.
        description_dict = json.loads(description.strip())
        cluster = description_dict['pub_og_id']
        level = cluster.split('at')[1]
        # Record the gene under its level, creating the list on first use.
        level_dic.setdefault(level, []).append(gene)
        if record.id in sequence_dic:
            # This id was already written; skip the repeated record.
            continue
        SeqIO.write(record, fh, 'fasta')
        sequence_dic[record.id] = record.seq
        
        

In [45]:
# For every level, count the distinct genes collected in level_dic
# (duplicates within a level's list are collapsed by the set).
lengths_dict = {level: len(set(genes)) for level, genes in level_dic.items()}

print(lengths_dict)

{'2759': 25657, '33090': 25541, '3193': 25459, '4447': 24754}


# This means that, at any given level, a gene is classified into at most one cluster.

# Then we used orthomapper to map all dura proteins to orthodb v11. Results are in /bioinfo/tools/orthologer/Results

# Now let's get the protein sequences that weren't placed in any OrthoDB cluster (14632).

# Note that if a gene was not placed in OrthoDB v11 by orthomapper, it won't be found in the reciprocal mapping between dura and EG5inOrthoDBv11 either.

# Now what do we do? Map to EG5 just to see what they could have done?

In [19]:
import pandas as pd
from Bio import SeqIO

# Write every dura protein whose id is absent from the 'query' column of
# dura.og.annotations to a separate FASTA file, and count them.
file_path = 'dura.og.annotations'
df = pd.read_csv(file_path, sep='\t', header=0)
df.rename(columns={'#query': 'query'}, inplace=True)
df_eg5 = pd.read_csv('dura_EG5_orthoDB11.tbl', sep='\t', header=None)

# Build both lookups once as sets. `x in series` tests the Series *index*
# (the integer row labels here), not its values, so the membership test must
# run against the values explicitly; a set also avoids a full column scan
# for every FASTA record.
placed_ids = set(df['query'])
# NOTE(review): assumes column 0 of dura_EG5_orthoDB11.tbl holds ids that are
# comparable to the dura FASTA record ids - confirm against the table layout.
eg5_ids = set(df_eg5[0])

i = 0
with open('dura_proteins_not_in_orthodbv11.fasta','w') as fh:
    for record in SeqIO.parse('../dura/dura_proteins.fasta','fasta'):
        if record.id in placed_ids:
            # Already present in the annotations table: nothing to write.
            continue
        if record.id in eg5_ids:
            # Missing from the annotations but present in the EG5 table.
            print(record.id)
        SeqIO.write(record, fh, 'fasta')
        i += 1
print(i)


14632


  """Entry point for launching an IPython kernel.


In [17]:
df_eg5[0]  # first column of the table read from dura_EG5_orthoDB11.tbl

0          Egu032749-mRNA-1
1          Egu032749-mRNA-1
2          Egu032749-mRNA-1
3          Egu032749-mRNA-1
4          Egu032749-mRNA-1
5          Egu032749-mRNA-1
6          Egu032749-mRNA-1
7          Egu032749-mRNA-1
8          Egu032749-mRNA-1
9          Egu032749-mRNA-1
10         Egu032749-mRNA-1
11         Egu032749-mRNA-1
12         Egu032749-mRNA-1
13         Egu032749-mRNA-1
14         Egu032749-mRNA-1
15         Egu032749-mRNA-1
16         Egu032749-mRNA-1
17         Egu032749-mRNA-1
18         Egu032749-mRNA-1
19         Egu032749-mRNA-1
20         Egu032749-mRNA-1
21         Egu032749-mRNA-1
22         Egu032749-mRNA-1
23         Egu032749-mRNA-1
24         Egu032749-mRNA-1
25         Egu032749-mRNA-1
26         Egu032749-mRNA-1
27         Egu032749-mRNA-1
28         Egu032749-mRNA-1
29         Egu032749-mRNA-1
                 ...       
1210785    Egu032211-mRNA-2
1210786    Egu032211-mRNA-2
1210787    Egu032211-mRNA-2
1210788    Egu032211-mRNA-2
1210789    Egu032211