# Download all protein sequences in orthoDB v11 for one/multiple species

1. odb11v0_OG2genes.tab.gz was downloaded from https://data.orthodb.org/download/
wget https://data.orthodb.org/download/odb11v0_OG2genes.tab.gz
2. Get the OG IDs that are related to oil palm (taxid 51953)
zcat odb11v0_OG2genes.tab.gz | grep 51953_0 | cut -f 1 | sort | uniq > OG_51953_0.tab
3. Download the oil palm protein sequences from the clusters that contain oil palm sequences
nohup sh download.sh > logfile 2>&1 < /dev/null &

download.sh:
#!/bin/bash
# Append the species-51953 sequences of every orthologous group id listed in
# OG_51953_0.tab (one id per line) to a single combined file.
# Only POSIX shell constructs are used, so the script also runs under plain `sh`.
output_file="combined_data.tsv"

while IFS= read -r line; do
    # A blank line would request an empty id; skip it.
    [ -n "$line" ] || continue
    # --fail: on an HTTP error curl writes no body, so no error page is appended to the output.
    # NOTE(review): -k disables TLS certificate verification; drop it if the server certificate validates.
    if ! curl "https://data.orthodb.org/current/fasta?id=$line&species=51953" -L -k --fail >> "$output_file"; then
        # Report the failing id on stderr and keep going with the remaining ids.
        echo "download failed for $line" >&2
    fi
done < "OG_51953_0.tab"

In [None]:
curl 'https://data.orthodb.org/current/blast?seq=MGQMGGPDGDGPHHQYHYQALLAAVQNPSQGLHVPLHAGAGAPAAGPGPRPGADADASSTHNANATPHSQPPRAFTDWSASNSAFAAQPAPATTNTPFHYNLSQSYALWTHYMLNKNVSYSTYSTPHEPLRHTHIPDKYSGCAFSLGFDSFTTMSLGPNICANMTPMERSISAKEPENSEDLPTVVRSSDEMDTRNSGDVRRDTVDTLPESKQSHESCASVSNKFDSGEYQVILRKELTKSDVANSGRIVLPKKDAEAGLPPLVQGDPLILQMDDMVLPIIWKFKYRFWPNNKSRMYILEAAGEFVKTHGPSGRGYAHYLQKLRTWQIYYPWGEVHSADNP' -L -o blast.dat

In [1]:
!curl 'https://data.orthodb.org/current/blast?seq=MGQMGGPDGDGPHHQYHYQALLAAVQNPSQGLHVPLHAGAGAPAAGPGPRPGADADASSTHNANATPHSQPPRAFTDWSASNSAFAAQPAPATTNTPFHYNLSQSYALWTHYMLNKNVSYSTYSTPHEPLRHTHIPDKYSGCAFSLGFDSFTTMSLGPNICANMTPMERSISAKEPENSEDLPTVVRSSDEMDTRNSGDVRRDTVDTLPESKQSHESCASVSNKFDSGEYQVILRKELTKSDVANSGRIVLPKKDAEAGLPPLVQGDPLILQMDDMVLPIIWKFKYRFWPNNKSRMYILEAAGEFVKTHGPSGRGYAHYLQKLRTWQIYYPWGEVHSADNP' -L -o blast.dat

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2368    0  2368    0     0   1121      0 --:--:--  0:00:02 --:--:--  1122


In [2]:
# Print the current working directory of the notebook's shell (relative paths below resolve against it).
!pwd

/home/huan/GitHub/script


## Parse the search.dat file

In [23]:
import json

# Locations of the two saved responses; both files are parsed as JSON below.
file_path = "/bioinfo/palm/ref/orthoDBv11/search.dat"
ortholog_path = "/bioinfo/palm/ref/orthoDBv11/orthologs.dat"

# Search response -> data_dict
with open(file_path, "r") as search_handle:
    data_dict = json.loads(search_handle.read())

# Orthologs response -> ortholog_dict
with open(ortholog_path, "r") as ortholog_handle:
    ortholog_dict = json.loads(ortholog_handle.read())

# Parse the downloaded protein sequences to remove the redundant ones

In [37]:
from Bio import SeqIO
import json

# Exploratory pass over the combined download: write each record id once, and
# check whether a repeated id ever comes with a different sequence.
sequence_dic = {}  # original record id -> first sequence seen for it, as a plain string
tag_dic = {}       # gene -> cluster id, filled only when a repeated id has a different sequence
level_dic = {}     # not populated in this cell
with open('/bioinfo/palm/ref/orthoDBv11/oil_palm_protein.fa','w') as fh:
    for record in SeqIO.parse('/bioinfo/palm/ref/orthoDBv11/combined_data.tsv','fasta'):
        # Capture the id before it can be rewritten, so the cache is always keyed
        # by the id found in the input file.
        gene_id = record.id
        # Compare plain strings so the check does not depend on Seq equality semantics.
        sequence = str(record.seq)

        if gene_id not in sequence_dic:
            # First time this id is seen: write it and remember its sequence.
            SeqIO.write(record, fh, 'fasta')
            sequence_dic[gene_id] = sequence
            continue

        if sequence == sequence_dic[gene_id]:
            # Same id, same sequence: redundant, nothing to write.
            continue

        # Same id but a different sequence: write it under an id made unique
        # with the cluster id. The header text after the id is parsed as JSON
        # and must contain 'pub_og_id'.
        gene, description = record.description.split(' ', 1)
        description_dict = json.loads(description.strip())
        cluster = description_dict['pub_og_id']
        record.id = gene_id + '_' + cluster
        tag_dic[gene] = cluster
        SeqIO.write(record, fh, 'fasta')
        print(gene) #nothing was printed, which means the protein sequences always stay the same no matter the node
        

# Now a simpler version, given that the protein sequence of a gene always stays the same no matter the node

In [44]:
from Bio import SeqIO
import json

# Write every record id once, and collect per level the genes seen in the headers.
sequence_dic = {}  # record id -> sequence of the record that was written
level_dic = {}     # level (text after 'at' in the cluster id) -> list of genes, repeats included
with open('/bioinfo/palm/ref/orthoDBv11/oil_palm_protein.fa','w') as fh:
    for record in SeqIO.parse('/bioinfo/palm/ref/orthoDBv11/combined_data.tsv','fasta'):
        # Header layout used here: "<gene> <json>", where the JSON carries 'pub_og_id'.
        gene, header_json = record.description.split(' ', 1)
        cluster = json.loads(header_json.strip())['pub_og_id']
        level = cluster.split('at')[1]
        level_dic.setdefault(level, []).append(gene)

        # Skip ids that were already written out.
        if record.id in sequence_dic:
            continue
        SeqIO.write(record, fh, 'fasta')
        sequence_dic[record.id] = record.seq
        
        
        
        

In [45]:
# For each level key in level_dic, count the distinct genes in its list.
lengths_dict = {level_key: len(set(genes)) for level_key, genes in level_dic.items()}

print(lengths_dict)

{'2759': 25657, '33090': 25541, '3193': 25459, '4447': 24754}


# This means each gene is classified into at most one cluster at any one level. 

# Then we used orthomapper to map all dura proteins to orthodb v11. Results are in /bioinfo/tools/orthologer/Results

# Now let's get the protein sequences that weren't placed in any OrthoDB cluster (14632).

# Note that if a gene was not placed in orthoDBv11 by orthomapper, it won't be found in the reciprocal mapping between dura and EG5inOrthoDBv11 either. 

# Now what do we do? Map to EG5 just to see what they could have done?

In [19]:
import pandas as pd
from Bio import SeqIO

# Write every dura protein that has no row in the orthomapper annotation table,
# and print any such protein id that still appears in the dura/EG5 table.
file_path = 'dura.og.annotations'
df = pd.read_csv(file_path, sep='\t', header=0)
df.rename(columns={'#query': 'query'}, inplace=True)
df_eg5 = pd.read_csv('dura_EG5_orthoDB11.tbl', sep='\t', header=None)

# Sets give constant-time membership tests inside the loop.
annotated_ids = set(df['query'])
# `x in series` tests the Series *index*, not its values, so the ids have to be
# pulled out explicitly for the check to mean anything.
# NOTE(review): assumes column 0 of the .tbl file holds the dura protein ids - confirm.
eg5_ids = set(df_eg5[0])

i = 0  # number of proteins written
with open('dura_proteins_not_in_orthodbv11.fasta','w') as fh:
    for record in SeqIO.parse('../dura/dura_proteins.fasta','fasta'):
        if record.id in annotated_ids:
            continue
        if record.id in eg5_ids:
            print(record.id)
        SeqIO.write(record, fh, 'fasta')
        i += 1
print(i)


14632


# Merge the orthomapper results from all three levels, with priority to lower levels.

In [8]:
import pandas as pd
from Bio import SeqIO

def _read_annotations(path):
    """Read one tab-separated annotation table and rename its '#query' column to 'query'."""
    return pd.read_csv(path, sep='\t', header=0).rename(columns={'#query': 'query'})


file1 = 'dura4447_Results/dura4447.og.annotations'
file2 = 'dura3193_Results/dura3193.og.annotations'
file3 = 'dura33090_Results/dura33090.og.annotations'

df1 = _read_annotations(file1)
df2 = _read_annotations(file2)
df3 = _read_annotations(file3)

# Stack the tables in the order 4447, 3193, 33090. A query already present in
# an earlier table is not taken again from a later one.
df2_new = df2.loc[~df2['query'].isin(df1['query'])]
df12 = pd.concat([df1, df2_new], ignore_index=True)

df3_new = df3.loc[~df3['query'].isin(df12['query'])]
df123 = pd.concat([df12, df3_new], ignore_index=True)

df123.to_csv('dura_orthoDBv11_4447_3193_33090.og.annotations', sep='\t', index=False)

# Now we need to go from gene name to clusterID, then dura gene ID.
Play with the API

In [45]:
import json, shlex, subprocess
from urllib.parse import urlencode

gene_name = 'ABI5'
level = 4447

# Ask the OrthoDB search endpoint for the clusters matching the gene name at this level.
# The query string is URL-encoded and curl is run from an argument list (no shell),
# so a gene name containing spaces, quotes or '&' cannot break the request or the command.
search_file = "{}_at{}.dat".format(gene_name, level)
url = 'https://data.orthodb.org/current/search?' + urlencode({'query': gene_name, 'level': level})
cmd = ['curl', url, '-L', '-o', search_file]
print(shlex.join(cmd))
subprocess.run(cmd, check=True)
with open(search_file, "r") as json_file:
    data_dict = json.load(json_file)

# Keep the annotation rows whose cluster id is listed under 'data' in the response.
anno_file = 'dura_orthoDBv11_4447_3193_33090.og.annotations'
df = pd.read_csv(anno_file, sep='\t', header=0)
# rename() returns a new frame, so the filtered slice of df is never modified in place.
selected_rows = df[df['ODB_OG'].isin(data_dict['data'])].rename(columns={'#query': 'query'})

# Group the selected rows by query once, instead of scanning the frame for every protein.
rows_by_query = {query: rows for query, rows in selected_rows.groupby('query')}

i = 0  # number of proteins written
protein_file = '/bioinfo/palm/ref/dura/dura_proteins.fasta'
with open('dura_proteins_{}_at{}.aa'.format(gene_name, level),'w') as fh:
    for record in SeqIO.parse(protein_file,'fasta'):
        rows = rows_by_query.get(record.id)
        if rows is None:
            continue
        if len(rows) == 1:
            # Label the sequence with "<cluster id>:<description>".
            record.description = ':'.join([rows['ODB_OG'].iloc[0], rows['Description'].iloc[0]])
        else:
            # The record is still written below, with its original description.
            print(record.id + ' appeared in more than one cluster?')

        SeqIO.write(record, fh, 'fasta')
        i += 1
print(i)

curl 'https://data.orthodb.org/current/search?query=ABI5&level=4447' -L -o ABI5_at4447.dat
9


In [37]:
# Number of selected annotation rows whose query is this transcript.
is_target = selected_rows['query'] == 'Egu023084-mRNA-1'
selected_rows.loc[is_target, 'ODB_OG'].shape[0]

1

# We ran orthoDB on four other species and compared them

In [12]:
import pandas as pd
from collections import Counter
level = 4447
def og_dic(level, name, base_dir="/bioinfo2/palm/ref/orthoDBv11"):
    """Count how many annotation rows fall into each cluster for one species.

    Reads the tab-separated file ``<base_dir>/<name>_<level>.og.annotations``
    and counts the occurrences of every value in its ``ODB_OG`` column.

    Parameters
    ----------
    level : value used to build the file name (e.g. 4447).
    name : species label; used in the file name and as the name of the count column.
    base_dir : directory holding the annotation files; defaults to the
        location this function always read from.

    Returns
    -------
    pandas.DataFrame indexed by ``key`` (the ``ODB_OG`` values, in order of
    first appearance) with a single column called ``name`` holding the counts.
    """
    anno_file = "{}/{}_{}.og.annotations".format(base_dir, name, level)
    annotations = pd.read_csv(anno_file, sep='\t', header=0)
    cluster_counts = Counter(annotations['ODB_OG'])
    # One row per cluster id, so the per-species frames can be aligned on the index later.
    counts_frame = pd.DataFrame(list(cluster_counts.items()), columns=['key', name]).set_index('key')
    return counts_frame

# Per-cluster counts at level 4447 for each of the five annotation sets.
df_dura, df_coco, df_date, df_picifera, df_rice = [
    og_dic(4447, species) for species in ('dura', 'coco', 'date', 'EG5', 'rice')
]

# Align the five count columns side by side on the cluster id index.
df = pd.concat([df_dura, df_coco, df_date, df_picifera, df_rice], axis=1)

df.to_csv("/bioinfo2/palm/ref/orthoDBv11/combined_og.csv")


In [11]:
# Display the frame currently bound to `df` as the cell's output.
df

Unnamed: 0_level_0,dura,coco,date,EG5,rice
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6at4447,1.0,1.0,1.0,1.0,1.0
12at4447,1.0,1.0,3.0,3.0,2.0
20at4447,2.0,1.0,4.0,3.0,1.0
40at4447,2.0,1.0,3.0,3.0,2.0
71at4447,2.0,1.0,5.0,3.0,1.0
...,...,...,...,...,...
160426at4447,,,,,7.0
160446at4447,,,,,1.0
160453at4447,,,,,1.0
160476at4447,,,,,1.0


In [14]:
# Rows of df123 whose 'query' value already occurred in an earlier row.
duplicates = df123.loc[df123.duplicated('query')]


In [1]:
import pandas as pd
# Input tables: the merged annotations (tab-separated) and the TPM table (comma-separated).
anno_file = 'dura_orthoDBv11_4447_3193_33090.og.annotations'
tpm_file = 'counts_transcript_TPM_concise.txt'

# The first row of each file supplies the column names (the read_csv default).
df = pd.read_csv(anno_file, sep='\t')
tpm = pd.read_csv(tpm_file)


In [3]:
# Keep only the query and its description, then outer-join with the TPM table
# on 'query' so rows present in only one of the two tables are kept as well.
df_1 = df.loc[:, ['query', 'Description']]
df_2 = df_1.merge(tpm, on='query', how='outer')

In [4]:
# Save df_2 to output.csv; index=False leaves the row index out of the file.
df_2.to_csv('output.csv', index=False)