# Imports

In [1]:
import math
import gzip 
import json
import os
import glob


import pandas as pd
import screed
from tqdm import tqdm
# import seaborn as sns

# %matplotlib inline

In [2]:
from path_constants import (
    QFO_FOLDER,
    QFO_EUKARYOTA_FOLDER,
    BUSCO_MAMMALIA_FOLDER,
    ORTHODB_FOLDER,
    SIMULATED_READS_FASTQ,
    SIMULATED_READS_BUSCO_ORTHODB_FOLDER
)

In [3]:
# Per-species metadata table stored in the Quest for Orthologs folder.
species_metadata_csv = os.path.join(QFO_FOLDER, "species_metadata.csv")
species_metadata = pd.read_csv(species_metadata_csv)
print(species_metadata.shape)
species_metadata.head()

(78, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
1,UP000000798,224324,AQUAE,1553,0,1557,Aquifex aeolicus (strain VF5),4290.0,Aquifex aeolicus,strain VF5
2,UP000006548,3702,ARATH,27475,14123,41920,Arabidopsis thaliana (Mouse-ear cress),1496.0,Arabidopsis thaliana,Mouse-ear cress
3,UP000001570,224308,BACSU,4260,7,4268,Bacillus subtilis (strain 168),4290.0,Bacillus subtilis,strain 168
4,UP000001414,226186,BACTN,4782,0,4823,Bacteroides thetaiotaomicron (strain ATCC 2914...,4290.0,Bacteroides thetaiotaomicron,strain ATCC 29148 / DSM 2079 / NCTC 10582 / E5...


In [4]:
# OrthoDB -> UniProt cross-references for BUSCO mammalia, with species columns.
orthodb_busco_mammalia_csv = os.path.join(
    BUSCO_MAMMALIA_FOLDER, "busco_mammalia__orthodb__to__uniprot__with_species.csv"
)
orthodb_busco_mammalia = pd.read_csv(orthodb_busco_mammalia_csv)
print(orthodb_busco_mammalia.shape)
orthodb_busco_mammalia.head()

(64199, 6)


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name,orthodb_id,species,species_name
0,10090_0:00000d,Q91X05,UniProt,150380at40674,10090,Mus musculus
1,10090_0:00000d,C0LL94,UniProt,150380at40674,10090,Mus musculus
2,10090_0:000012,Q9R1A8,UniProt,80051at40674,10090,Mus musculus
3,10090_0:000015,Q8BX09,UniProt,73776at40674,10090,Mus musculus
4,10090_0:000018,A9ZNB6,UniProt,194736at40674,10090,Mus musculus


## Subset OrthoDB to BUSCO mammalia

### If re-running later, simply read the previously saved CSV

In [5]:
# OrthoDB gene id -> external database (UniProt) id table for BUSCO mammalia.
odb_xrefs_uniprot_busco_mammalia_csv = os.path.join(
    BUSCO_MAMMALIA_FOLDER, "busco_mammalia__orthodb__to__uniprot.csv"
)
odb_xrefs_uniprot_busco_mammalia = pd.read_csv(odb_xrefs_uniprot_busco_mammalia_csv)
print(odb_xrefs_uniprot_busco_mammalia.shape)
odb_xrefs_uniprot_busco_mammalia.head()

(64198, 3)


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
0,10090_0:00000d,Q91X05,UniProt
1,10090_0:00000d,C0LL94,UniProt
2,10090_0:000012,Q9R1A8,UniProt
3,10090_0:000015,Q8BX09,UniProt
4,10090_0:000018,A9ZNB6,UniProt


# Subset input reads (human fastq gz) to only BUSCO mammalia ids

In [6]:
# Unique external (UniProt) ids from the BUSCO mammalia cross-reference table;
# a set gives constant-time membership tests in the filtering loops below.
busco_mammalia_uniprot_ids = set(
    odb_xrefs_uniprot_busco_mammalia["external_db_gene_id"].values
)
len(busco_mammalia_uniprot_ids)

64197

## Read in simulated human reads and subset to only ones with busco IDs

In [7]:
# Keep only the simulated reads whose UniProt id is in the BUSCO mammalia set.
simulated_reads_busco_mammalia_records = []

with screed.open(SIMULATED_READS_FASTQ) as records:
    for record in tqdm(records):
        name = record['name']
        # Everything before the first ';' carries the id fields.
        # NOTE(review): presumably "<read_id>/<db>|<uniprot>|<entry>" — confirm
        # against the read simulator's naming scheme.
        read_id_uniprot_ensembl = name.split(';')[0]
        # The field between the first two '|' is what gets matched against the
        # BUSCO mammalia UniProt ids. (The previously computed '/'-split value
        # was never used, so it is no longer calculated per read.)
        uniprot = read_id_uniprot_ensembl.split('|')[1]
        if uniprot in busco_mammalia_uniprot_ids:
            simulated_reads_busco_mammalia_records.append(record)
len(simulated_reads_busco_mammalia_records)

1015001it [00:12, 82569.32it/s]


445051

## Write new human fastq gz

In [8]:
# Destination for the BUSCO-mammalia-only subset of the simulated human reads.
fastq_output = os.path.join(
    SIMULATED_READS_BUSCO_ORTHODB_FOLDER, "Homo_sapiens_9606_qfo_dna_01.fq.gz"
)
with gzip.open(fastq_output, "wb") as f:
    for record in tqdm(simulated_reads_busco_mammalia_records):
        # One four-line FASTQ entry: @name, sequence, '+' separator, quality.
        output_str = (
            f'@{record["name"]}\n{record["sequence"]}\n+\n{record["quality"]}\n'
        )
        # The gzip handle is opened in binary mode, so encode before writing.
        f.write(output_str.encode("utf-8"))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 445051/445051 [00:17<00:00, 26120.04it/s]


# Subset protein fasta files for only BUSCO mammalia ids

## Read human protein fasta


Human taxonomy id is 9606

In [9]:
ls -lha $QFO_EUKARYOTA_FOLDER/*9606*

-rw------- 1 olgabot  14M Jul 22 18:48 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000005640_9606.fasta
-rw------- 1 olgabot 2.7M Jul 22 18:48 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000005640_9606.gene2acc
-rw------- 1 olgabot  67M Jul 22 18:48 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000005640_9606.idmapping
-rw------- 1 olgabot  34M Jul 22 18:47 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_DNA.fasta
-rw------- 1 olgabot  20K Jul 22 18:48 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_DNA.miss
-rw------- 1 olgabot  33M Jul 22 11:53 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_DNA__startswit

In [10]:
# Path to the human proteome FASTA inside the QfO Eukaryota folder.
human_fasta = os.path.join(QFO_EUKARYOTA_FOLDER, "UP000005640_9606.fasta")
# Peek at the first lines to see what the record headers look like.
! head $human_fasta

>tr|A0A024R1R8|A0A024R1R8_HUMAN HCG2014768, isoform CRA_a OS=Homo sapiens OX=9606 GN=hCG_2014768 PE=4 SV=1
MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAKVVGKGPLATGGIKK
SGKK
>sp|A0A024RBG1|NUD4B_HUMAN Diphosphoinositol polyphosphate phosphohydrolase NUDT4B OS=Homo sapiens OX=9606 GN=NUDT4B PE=3 SV=1
MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQWIVPGGGMEPEEEPG
GAAVREVYEEAGVKGKLGRLLGIFEQNQDRKHRTYVYVLTVTEILEDWEDSVNIGRKREW
FKVEDAIKVLQCHKPVHAEYLEKLKLGCSPANGNSTVPSLPDNNALFVTAAQTSGLPSSV
R
>tr|A0A075B6H5|A0A075B6H5_HUMAN T cell receptor beta variable 20/OR9-2 (non-functional) (Fragment) OS=Homo sapiens OX=9606 GN=TRBV20OR9-2 PE=4 SV=1
METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVICKSGTSVNIECRSLD


In [11]:
# Show the first lines of the human FASTA that contain "sp|".
! grep 'sp|' $human_fasta | head

>sp|A0A024RBG1|NUD4B_HUMAN Diphosphoinositol polyphosphate phosphohydrolase NUDT4B OS=Homo sapiens OX=9606 GN=NUDT4B PE=3 SV=1
>sp|A0A075B6H9|LV469_HUMAN Immunoglobulin lambda variable 4-69 OS=Homo sapiens OX=9606 GN=IGLV4-69 PE=1 SV=1
>sp|A0A075B6I0|LV861_HUMAN Immunoglobulin lambda variable 8-61 OS=Homo sapiens OX=9606 GN=IGLV8-61 PE=3 SV=7
>sp|A0A075B6I1|LV460_HUMAN Immunoglobulin lambda variable 4-60 OS=Homo sapiens OX=9606 GN=IGLV4-60 PE=3 SV=1
>sp|A0A075B6I4|LVX54_HUMAN Immunoglobulin lambda variable 10-54 OS=Homo sapiens OX=9606 GN=IGLV10-54 PE=3 SV=1
>sp|A0A075B6I9|LV746_HUMAN Immunoglobulin lambda variable 7-46 OS=Homo sapiens OX=9606 GN=IGLV7-46 PE=3 SV=4
>sp|A0A075B6J1|LV537_HUMAN Immunoglobulin lambda variable 5-37 OS=Homo sapiens OX=9606 GN=IGLV5-37 PE=3 SV=1
>sp|A0A075B6J6|LV322_HUMAN Immunoglobulin lambda variable 3-22 OS=Homo sapiens OX=9606 GN=IGLV3-22 PE=3 SV=1
>sp|A0A075B6J9|LV218_HUMAN Immunoglobulin lambda variable 2-18 OS=Homo sapiens OX=9606 GN=IGLV2-18 PE=3 SV=2

In [12]:
# Spot-check that a single accession (P61981) appears in the human FASTA.
! grep P61981 $human_fasta

>sp|P61981|1433G_HUMAN 14-3-3 protein gamma OS=Homo sapiens OX=9606 GN=YWHAG PE=1 SV=2


## Get Human Uniprot IDs from QFO that are also present in the BUSCO Mapping

In [13]:

def read_ids_filter_orthodb(filename):
    """Load a tab-separated id-mapping file and keep only its OrthoDB rows.

    The file is read without a header as three columns (UniProt id, id type,
    database id); every column name is then prefixed with ``source__``.

    Parameters
    ----------
    filename : str
        Path to the headerless, tab-delimited mapping file.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``source__id_type`` equals ``"OrthoDB"``, with columns
        ``source__uniprot_id``, ``source__id_type`` and ``source__db_id``.
        The row labels of the full file are kept (no index reset).
    """
    column_names = ["uniprot_id", "id_type", "db_id"]
    mapping = pd.read_csv(filename, sep="\t", header=None, names=column_names)
    mapping = mapping.add_prefix("source__")

    is_orthodb = mapping["source__id_type"] == "OrthoDB"
    return mapping[is_orthodb]


# OrthoDB cross-references from the human (UP000005640, 9606) id-mapping file.
human_idmapping = os.path.join(QFO_EUKARYOTA_FOLDER, "UP000005640_9606.idmapping")
human_ids_busco = read_ids_filter_orthodb(human_idmapping)
print(human_ids_busco.shape)
human_ids_busco.head()

(22539, 3)


Unnamed: 0,source__uniprot_id,source__id_type,source__db_id
102,P61981,OrthoDB,1176818at2759
200,P27348,OrthoDB,1176818at2759
300,P30443,OrthoDB,1390181at2759
357,Q96QU6,OrthoDB,1156861at2759
786,P10321,OrthoDB,1390181at2759


In [14]:
# Save the human OrthoDB-mapped UniProt ids, leaving out the DataFrame index.
# NOTE(review): human_ids_busco is only filtered to id_type == "OrthoDB";
# confirm whether it should also be intersected with the BUSCO mammalia ids
# that the output file name implies.
csv = os.path.join(QFO_EUKARYOTA_FOLDER, "busco_mammalia_human_uniprot_ids_in_qfo.csv")
human_ids_busco.to_csv(csv, index=False)

In [15]:
# NOTE(review): looks like a leftover scratch cell (result is not assigned) — consider deleting.
2+1

3


### How many fasta entries in the human proteome fasta?

In [16]:
# Count the lines containing ">" in the human FASTA (one per record header).
! grep -c '>' $human_fasta

20874


## Perform subsetting of QFO fasta files from uniprot IDs present in BUSCO

In [17]:
def subset_by_uniprot_ids(fasta, uniprot_ids):
    """Collect the records of a FASTA file whose accession is in ``uniprot_ids``.

    The accession is taken from each record name as the second
    ``|``-separated field of the first whitespace-delimited word, e.g.
    ``sp|P61981|1433G_HUMAN 14-3-3 ...`` -> ``P61981``.

    Parameters
    ----------
    fasta : str
        Path to a FASTA file readable by ``screed.open``.
    uniprot_ids : iterable of str
        Accessions to keep. Any iterable is accepted: it is converted to a
        frozenset once, so membership is always tested by value and in
        constant time (``in`` on a list is linear, and on a pandas Series it
        tests the index labels rather than the values).

    Returns
    -------
    list
        The matching screed records, in file order.

    Raises
    ------
    IndexError
        If a record name is empty or has no ``|``-delimited second field.
    """
    wanted_ids = frozenset(uniprot_ids)

    records_subset = []
    with screed.open(fasta) as records:
        for record in records:
            # First word of the header holds the "db|accession|entry" triple.
            record_id = record["name"].split()[0]
            uniprot_id = record_id.split("|")[1]
            if uniprot_id in wanted_ids:
                records_subset.append(record)
    return records_subset


def write_fasta(output_fasta, records):
    """Write ``records`` to ``output_fasta`` as FASTA, one line per sequence.

    Parameters
    ----------
    output_fasta : str
        Destination path; opened in text write mode, so an existing file is
        overwritten.
    records : iterable of mapping
        Each item is unpacked with ``**`` and must provide ``name`` and
        ``sequence`` keys.
    """
    entry_template = ">{name}\n{sequence}\n"
    with open(output_fasta, "w") as handle:
        handle.writelines(entry_template.format(**entry) for entry in records)


# human fasta = protein sequences from quest for orthologs
# Reuse the BUSCO mammalia UniProt id set built when subsetting the reads
# instead of rebuilding the same set from the cross-reference table.
human_busco_mammalia_records = subset_by_uniprot_ids(
    human_fasta, busco_mammalia_uniprot_ids
)
print(len(human_busco_mammalia_records))
human_busco_mammalia_records[:3]

8904


[{'name': 'sp|A0A087WTH1|TM265_HUMAN Transmembrane protein 265 OS=Homo sapiens OX=9606 GN=TMEM265 PE=3 SV=1', 'sequence': 'MEDEEKAVEILGNTEAAHPPSPIRCCWLRLRCLAATSIICGCSCLGVMALVFAIKAEERHKAGRSEEAVRWGARARKLILASFAVWLAVLILGPLLLWLLSYAIAQAE', 'description': ''},
 {'name': 'sp|A0A087WTH5|KCE1B_HUMAN Potassium voltage-gated channel subfamily E member 1B OS=Homo sapiens OX=9606 GN=KCNE1B PE=3 SV=2', 'sequence': 'MPRMILSNTTAVTPFLTKLWQETVQQGGNMSGLARRSPRSGDGKLEALYVLMVLGFFGFFTLGIMLSYIRSKKLEHSNDPFNVYIESNAWQEKDKAYVQARVLESYRSCYVVENHLAIEQPNTHLPETKPSP', 'description': ''},
 {'name': 'sp|A0A087WWA1|P3URF_HUMAN PIK3R3 upstream open reading frame protein OS=Homo sapiens OX=9606 GN=P3R3URF PE=4 SV=1', 'sequence': 'MGPSRLVRGPRPQGMRSPYRRPGMGWPRPRFPRMFKCSRRRYQQGLRGRTASSAAINPATRAMGINNTHTDTTIVWIFPPQVLRHLRQPGIFLIL', 'description': ''}]

In [18]:
# Save the BUSCO mammalia subset of the human proteome in the QfO Eukaryota folder.
human_new_fasta = os.path.join(
    QFO_EUKARYOTA_FOLDER, "UP000005640_9606__busco_mammlia_odbv10.fasta"
)
write_fasta(human_new_fasta, human_busco_mammalia_records)

## Mouse

### Get mouse proteome fasta from QfO
Mouse taxonomy id is `10090`

In [19]:
ls -lha $QFO_EUKARYOTA_FOLDER/*10090*

-rw-r--r-- 1 olgabot  14M Jul 25 12:46 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000000589_10090.fasta
-rw-r--r-- 1 olgabot  44M Jul 25 12:45 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000000589_10090.idmapping
-rw-r--r-- 1 olgabot 5.9M Jul 25 12:47 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000000589_10090__busco_mammlia_odbv10.fasta


In [20]:
# NOTE(review): the recorded output shows this file does not exist, so the
# command fails with "No such file or directory" — TODO fetch the mouse DNA
# FASTA or drop this cell.
! head $QFO_EUKARYOTA_FOLDER/UP000000589_10090_DNA.fasta

head: cannot open '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/quest-for-orthologs/2019/Eukaryota/UP000000589_10090_DNA.fasta' for reading: No such file or directory


In [21]:
# OrthoDB cross-references from the mouse (UP000000589, 10090) id-mapping file.
mouse_idmapping = os.path.join(QFO_EUKARYOTA_FOLDER, "UP000000589_10090.idmapping")
mouse_ids_busco = read_ids_filter_orthodb(mouse_idmapping)
print(mouse_ids_busco.shape)
mouse_ids_busco.head()

(19931, 3)


Unnamed: 0,source__uniprot_id,source__id_type,source__db_id
51,Q9CQV8,OrthoDB,EOG091G0VKY
115,O70456,OrthoDB,EOG091G0VKY
212,P63101,OrthoDB,EOG091G0VKY
259,A2AIG8,OrthoDB,EOG091G04H3
327,Q6ZWR4,OrthoDB,EOG091G09BB


### Subset mouse protein fasta on BUSCO Mammalia UniProt IDs

In [22]:
# Mouse proteome FASTA from the QfO Eukaryota folder.
mouse_fasta = os.path.join(QFO_EUKARYOTA_FOLDER, "UP000000589_10090.fasta")
# Reuse the BUSCO mammalia UniProt id set built when subsetting the reads
# instead of rebuilding the same set from the cross-reference table.
mouse_busco_mammalia_records = subset_by_uniprot_ids(
    mouse_fasta, busco_mammalia_uniprot_ids
)
print(len(mouse_busco_mammalia_records))
mouse_busco_mammalia_records[:3]

8670


[{'name': 'tr|A0A087WP46|A0A087WP46_MOUSE cDNA sequence BC034090 OS=Mus musculus OX=10090 GN=BC034090 PE=1 SV=1', 'sequence': 'MEGMEAAAKPARRSQASRPGSTTSPTQVTPAMARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPSVLESKVKALKEKMTAGKQGTDPRPTSYERPSPTKSKCHQVKTGVAWSLPDALVVPHAQNPNDGHLARHVNEKKPARNSGSKPSTPESWNEQSPWSPEAVWMPADHEEDPVAGSGSLQESPNNRVSAVQPQRPGPCKTTPLSSQKKRRQYPLGDGIVTKEDLDSTTLTSKEDLIPRTDQPETFWRAQGLEALGSVANALSLSDRVERNRLLLQEILKVSRQSPPKAGSPDWTSWDRDASERPAGDVDWDSGTPQQDSGQSRTFVPKLEPTLSARHEEAKNLLRRARMKAKTQPLRASHDIVPTIAQGCRNGQRSPAPELRTTSAYRENLQNGNLNDPSSIESSNGQWPKQEMPLSHVRFEDESAHEAEFRYLDRLQQRQRQVLSTVLHAVDQGPLRSKPDLTNYINRIVGNSSFHRAVGCLDQSNFPVPPPPWDNERKCPACGSCLEERCPAEERAASDLRVLRSLQAACEAEAVLLGPCNSHGLSSPFPGLHTEWIRETHITDTVATHPEEEEDSALDSTHSSDSWKDGTDARTSQSSRAGEQIRVSSPQQRWHGSRPQGGPRWSRKAEAELPCGLQAWSHLPQLDDGVVGGEGREASGHIPQGTLFPEEDAVPKPALEPKRPWSQGQLGPRSGSHCAHPEDCRSPCRTAYAVPFSKTRGSSGSGQPPDQVPESHESLKTLCTSPLQRSHEEPSAPLPASQSTLTLPEEVPTPPSLRKSLCPMPPRKSVQKGHHWQEHQAEHMDSPLPVSPPRTVVLTRPQPQPCSPQVKHPLLDLFNNNSSSSIPLGLQGPSGVA

In [23]:
# Save the BUSCO mammalia subset of the mouse proteome in the QfO Eukaryota folder.
mouse_new_fasta = os.path.join(
    QFO_EUKARYOTA_FOLDER, "UP000000589_10090__busco_mammlia_odbv10.fasta"
)
write_fasta(mouse_new_fasta, mouse_busco_mammalia_records)

In [24]:
# Look at the first lines of the newly written mouse subset FASTA.
! head $mouse_new_fasta

>tr|A0A087WP46|A0A087WP46_MOUSE cDNA sequence BC034090 OS=Mus musculus OX=10090 GN=BC034090 PE=1 SV=1
MEGMEAAAKPARRSQASRPGSTTSPTQVTPAMARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPSVLESKVKALKEKMTAGKQGTDPRPTSYERPSPTKSKCHQVKTGVAWSLPDALVVPHAQNPNDGHLARHVNEKKPARNSGSKPSTPESWNEQSPWSPEAVWMPADHEEDPVAGSGSLQESPNNRVSAVQPQRPGPCKTTPLSSQKKRRQYPLGDGIVTKEDLDSTTLTSKEDLIPRTDQPETFWRAQGLEALGSVANALSLSDRVERNRLLLQEILKVSRQSPPKAGSPDWTSWDRDASERPAGDVDWDSGTPQQDSGQSRTFVPKLEPTLSARHEEAKNLLRRARMKAKTQPLRASHDIVPTIAQGCRNGQRSPAPELRTTSAYRENLQNGNLNDPSSIESSNGQWPKQEMPLSHVRFEDESAHEAEFRYLDRLQQRQRQVLSTVLHAVDQGPLRSKPDLTNYINRIVGNSSFHRAVGCLDQSNFPVPPPPWDNERKCPACGSCLEERCPAEERAASDLRVLRSLQAACEAEAVLLGPCNSHGLSSPFPGLHTEWIRETHITDTVATHPEEEEDSALDSTHSSDSWKDGTDARTSQSSRAGEQIRVSSPQQRWHGSRPQGGPRWSRKAEAELPCGLQAWSHLPQLDDGVVGGEGREASGHIPQGTLFPEEDAVPKPALEPKRPWSQGQLGPRSGSHCAHPEDCRSPCRTAYAVPFSKTRGSSGSGQPPDQVPESHESLKTLCTSPLQRSHEEPSAPLPASQSTLTLPEEVPTPPSLRKSLCPMPPRKSVQKGHHWQEHQAEHMDSPLPVSPPRTVVLTRPQPQPCSPQVKHPLLDLFNNNSSSSIPLGLQGPSGVAVHRNRSEKDQCCQEPVLPLESNGDG