In [1]:
import pandas as pd
import os

In [49]:
# To link RBHs between species_a AND species_b, the RBH output with XP_/NP_ identifiers (from OnubDmel_rbh_v2.txt)
# needs to be associated with species_b's NCBI gene ids/descriptions

# Define base_paths for supporting files & RBH output
base_path = "/cluster/tufts/dopmanlab/Jacob/onub_ortholog_id/supporting_files/"
output_path = "/cluster/tufts/dopmanlab/Jacob/onub_ortholog_id/output/"

a_gp_mapping = 'onub_gp_mapping.txt'
b_gp_mapping = 'bmor_gp_mapping.txt'
b_genes_all = 'bmor_genes_all.tsv'
rbh_results = 'OnubBmor_rbh_v2.txt'
output_matches = 'onub2bmor_rbh_matches.txt'
output_all = 'onub2bmor_rbh_all.txt'

# Read in files containing...

# Species A GeneID/Protein mappings (output from extract_gp_mappings.sh)
a_gp_mapping = pd.read_csv(base_path + a_gp_mapping, sep='\t')
a_gp_mapping.head(n=5)

# Species B GeneID/Protein mappings (output from extract_gp_mappings.sh)
b_gp_mapping = pd.read_csv(base_path + b_gp_mapping, sep='\t')
b_gp_mapping.head(n=5)

# Species B GeneID/GeneName_Symbol_Description mappings (downloaded from NCBI Datasets)
b_gene_names = pd.read_csv(base_path + b_genes_all, sep='\t')
b_gene_names.head(n=5)

# RBH results file (output from find_rbh_onub2dmel.ipynb)
output_rbh = pd.read_csv(output_path + rbh_results, sep='\t')
output_rbh.head(n=5)

Unnamed: 0,ostrinia_nubilalis,drosophila_melanogaster
0,XP_063820880.1,NP_649286.2
1,XP_063820881.1,NP_524647.2
2,XP_063820883.1,NP_647982.1
3,XP_063820886.1,NP_609739.2
4,XP_063820887.1,NP_609939.1


In [50]:
# Perform a left join on 'NCBI GeneID' from dmel_gene_names and 'Gene ID' from dmel_gp_mapping
combined_table = pd.merge(b_gene_names, b_gp_mapping, left_on='NCBI GeneID', right_on='Gene ID', how='left')
combined_table.head(n=5)

Unnamed: 0,NCBI GeneID,Symbol,Description,Taxonomic Name,Common Name,Gene Type,Transcripts,Gene Group Identifier,Gene Group Method,Gene ID,Protein accession
0,31271,w,white,Drosophila melanogaster,fruit fly,PROTEIN_CODING,1.0,31271.0,NCBI Ortholog,31271.0,NP_476787.1
1,31293,N,Notch,Drosophila melanogaster,fruit fly,PROTEIN_CODING,2.0,31293.0,NCBI Ortholog,31293.0,NP_476859.2
2,34009,wg,wingless,Drosophila melanogaster,fruit fly,PROTEIN_CODING,1.0,34009.0,NCBI Ortholog,34009.0,NP_523502.1
3,33432,dpp,decapentaplegic,Drosophila melanogaster,fruit fly,PROTEIN_CODING,4.0,33432.0,NCBI Ortholog,33432.0,NP_477311.1
4,36240,en,engrailed,Drosophila melanogaster,fruit fly,PROTEIN_CODING,2.0,36240.0,NCBI Ortholog,36240.0,NP_523700.2


In [51]:
# Subset the combined table to include only the desired columns
subset_table = combined_table[['NCBI GeneID', 'Protein accession', 'Symbol', 'Description', 'Taxonomic Name']]
subset_table.head(n=5)

Unnamed: 0,NCBI GeneID,Protein accession,Symbol,Description,Taxonomic Name
0,31271,NP_476787.1,w,white,Drosophila melanogaster
1,31293,NP_476859.2,N,Notch,Drosophila melanogaster
2,34009,NP_523502.1,wg,wingless,Drosophila melanogaster
3,33432,NP_477311.1,dpp,decapentaplegic,Drosophila melanogaster
4,36240,NP_523700.2,en,engrailed,Drosophila melanogaster


In [52]:
# Join subset_table with output_rbh (remember, python is 0-based indexing)
final_rbh = pd.merge(output_rbh, subset_table, left_on=output_rbh.columns[1], right_on='Protein accession', how='left')

# Subset table to retain columns 0 (species a), 1 (species b), 2 (species b geneID), 4 (species b gene symbol), 5 (description)
final_rbh = final_rbh.iloc[:, [0,1,2,4,5]]
#final_rbh = final_rbh[['ostrinia_nubilalis', 'drosophila_melanogaster', 'NCBI GeneID', 'Symbol', 'Description']]

final_rbh.head(n=25)

Unnamed: 0,ostrinia_nubilalis,drosophila_melanogaster,NCBI GeneID,Symbol,Description
0,XP_063820880.1,NP_649286.2,40335,CG42337,uncharacterized protein
1,XP_063820881.1,NP_524647.2,43892,sif,still life
2,XP_063820883.1,NP_647982.1,38638,Blimp-1,Blimp-1
3,XP_063820886.1,NP_609739.2,34887,CG15269,uncharacterized protein
4,XP_063820887.1,NP_609939.1,35179,Dus4,Dihydrouridine synthase 4
5,XP_063820889.1,NP_611890.1,45021,Zfrp8,Zinc finger protein RP-8
6,XP_063820893.1,NP_572728.1,32101,CG11699,uncharacterized protein
7,XP_063820894.1,NP_648056.1,38748,velo,veloren
8,XP_063820896.1,NP_648944.1,39903,CG6664,uncharacterized protein
9,XP_063820898.1,NP_524031.2,39349,byn,brachyenteron


In [53]:
# need to add the Onub gene IDs so that I can link the peptides BACK to the RNA-seq results
a_gp_mapping.head(n=10)

# Add in species A (Onub) gene IDs, retain ONLY rows with a match to Dmel
matched_rbh = pd.merge(a_gp_mapping, final_rbh, left_on=a_gp_mapping.columns[1], right_on=final_rbh.columns[0], how='right')

# Retain ALL rows regardless of match
all_rbh = pd.merge(a_gp_mapping, final_rbh, left_on=a_gp_mapping.columns[1], right_on=final_rbh.columns[0], how='left')


# Subset
matched_rbh = matched_rbh.iloc[:, [0,1,3,4,5,6]]
all_rbh = all_rbh.iloc[:, [0,1,3,4,5,6]]
all_rbh.head(n=5)

Unnamed: 0,Gene ID,Protein accession,drosophila_melanogaster,NCBI GeneID,Symbol,Description
0,135074699,XP_063825140.1,,,,
1,135074877,XP_063825357.1,,,,
2,135074055,XP_063824464.1,NP_476866.1,31357.0,Xpac,Xeroderma pigmentosum group A-like
3,135074348,XP_063824713.1,NP_648808.1,39726.0,CG15715,uncharacterized protein
4,135075295,XP_063825797.1,NP_648950.1,39910.0,beg,bad egg


In [54]:
# Save the final RBH results to a new file
matched_rbh.to_csv(output_path + output_matches, sep='\t', index=False)

all_rbh.to_csv(output_path + output_all, sep='\t', index=False)

