# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import gzip 
import json
import os
import glob


import pandas as pd
import screed
from tqdm import tqdm
# import seaborn as sns

# %matplotlib inline

In [3]:
from path_constants import (
    QFO_FOLDER,
    QFO_EUKARYOTA_FOLDER,
    BUSCO_MAMMALIA_FOLDER,
    ORTHODB_FOLDER,
    SIMULATED_READS_FOLDER,
)

In [4]:
# Load the species metadata table stored as CSV in QFO_FOLDER, then show its
# shape and first rows
species_metadata = pd.read_csv(os.path.join(QFO_FOLDER, "species_metadata.csv"))
print(species_metadata.shape)
species_metadata.head()

(78, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
1,UP000000798,224324,AQUAE,1553,0,1557,Aquifex aeolicus (strain VF5),4290.0,Aquifex aeolicus,strain VF5
2,UP000006548,3702,ARATH,27475,14123,41920,Arabidopsis thaliana (Mouse-ear cress),1496.0,Arabidopsis thaliana,Mouse-ear cress
3,UP000001570,224308,BACSU,4260,7,4268,Bacillus subtilis (strain 168),4290.0,Bacillus subtilis,strain 168
4,UP000001414,226186,BACTN,4782,0,4823,Bacteroides thetaiotaomicron (strain ATCC 2914...,4290.0,Bacteroides thetaiotaomicron,strain ATCC 29148 / DSM 2079 / NCTC 10582 / E5...


## All mammals are within 200 million years (mya) of divergence

In [5]:
# Keep only the species whose divergence from human is below 200 million years
species_metadata.loc[species_metadata["divergence_from_human_mya"] < 200]

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
11,UP000002254,9615,CANLF,20271,5269,25631,Canis lupus familiaris (Dog) (Canis familiaris),96.0,Canis lupus familiaris,Dog
28,UP000001519,9595,GORGO,21795,22932,44943,Gorilla gorilla gorilla (Western lowland gorilla),9.06,Gorilla gorilla gorilla,Western lowland gorilla
32,UP000005640,9606,HUMAN,21146,74769,96332,Homo sapiens (Human),0.0,Homo sapiens,Human
40,UP000002280,13616,MONDO,21276,968,22319,Monodelphis domestica (Gray short-tailed opossum),159.0,Monodelphis domestica,Gray short-tailed opossum
42,UP000000589,10090,MOUSE,22287,40369,63134,Mus musculus (Mouse),90.0,Mus musculus,Mouse
52,UP000002277,9598,PANTR,23006,25788,49266,Pan troglodytes (Chimpanzee),6.7,Pan troglodytes,Chimpanzee
60,UP000002494,10116,RAT,21678,9885,32629,Rattus norvegicus (Rat),90.0,Rattus norvegicus,Rat


# Read BUSCO + OrthoDB info

In [6]:
! ls -lha $BUSCO_MAMMALIA_FOLDER

total 12M
drwxr-xr-x 2 olga czb 4.0K Oct 14 17:33 .
drwxr-xr-x 3 olga czb 4.0K Oct 14 16:37 ..
-rw-r--r-- 1 olga czb 1.9M Oct 14 17:33 busco_mammalia__orthodb__to__uniprot.csv
-rw-r--r-- 1 olga czb 4.1M Oct 14 17:16 busco_mammalia__orthodb__to__uniprot__with_species.csv
-rwxr-xr-x 1 olga czb 5.9M Oct 14 16:37 ogs.id.info
-rwxr-xr-x 1 olga czb  602 Oct 14 16:37 species.info


## Read species info

In [7]:
# Read the tab-separated BUSCO Mammalia `species.info` table (column names are
# supplied here) as a Series of species names indexed by taxonomy id.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0, so the single remaining column is squeezed explicitly instead.
busco_mammalia_species = pd.read_csv(
    os.path.join(BUSCO_MAMMALIA_FOLDER, "species.info"),
    sep="\t",
    index_col=0,
    names=["tax_id", "species_name"],
).squeeze("columns")
print(busco_mammalia_species.shape)
busco_mammalia_species.head()

(24,)


tax_id
1026970     Nannospalax galili
246437        Tupaia chinensis
9837        Camelus bactrianus
34839      Chinchilla lanigera
42254            Sorex araneus
Name: species_name, dtype: object

In [8]:
busco_mammalia_species.dtypes

dtype('O')

## Read BUSCO Mammalia orthogroups info, containing both BUSCO + OrthoDB IDs

In [9]:
# Read the tab-separated, header-less `ogs.id.info` table; its two columns are
# named `busco_id` and `orthodb_id` here
busco_mammalia_orthogroups = pd.read_csv(
    os.path.join(BUSCO_MAMMALIA_FOLDER, "ogs.id.info"),
    sep="\t",
    header=None,
    names=["busco_id", "orthodb_id"],
)
print(busco_mammalia_orthogroups.shape)
busco_mammalia_orthogroups.head()

(218112, 2)


Unnamed: 0,busco_id,orthodb_id
0,34839_0:0032fe,6011at40674
1,118797_0:00467b,6011at40674
2,29078_0:0007f1,6011at40674
3,9986_0:00087d,6011at40674
4,9837_0:000f1d,6011at40674


### Extract taxonomy id from busco ID

In [10]:
# The text before the first "_" of each busco_id is converted to an integer and
# stored as a new "species" column (the frame is modified in place)
busco_mammalia_orthogroups["species"] = (
    busco_mammalia_orthogroups.busco_id.str.split("_").str[0].astype(int)
)
busco_mammalia_orthogroups.head()

Unnamed: 0,busco_id,orthodb_id,species
0,34839_0:0032fe,6011at40674,34839
1,118797_0:00467b,6011at40674,118797
2,29078_0:0007f1,6011at40674,29078
3,9986_0:00087d,6011at40674,9986
4,9837_0:000f1d,6011at40674,9837


### Join on taxonomy id to get scientific name from BUSCO `species.info` file

In [11]:
# Attach the species name to every orthogroup row: the "species" column is
# matched against the index (tax_id) of the species Series; `join` defaults to
# a left join, so every orthogroup row is kept
busco_mammalia_orthogroups_species = busco_mammalia_orthogroups.join(
    busco_mammalia_species, on="species"
)
print(busco_mammalia_orthogroups_species.shape)
busco_mammalia_orthogroups_species.head()

(218112, 4)


Unnamed: 0,busco_id,orthodb_id,species,species_name
0,34839_0:0032fe,6011at40674,34839,Chinchilla lanigera
1,118797_0:00467b,6011at40674,118797,Lipotes vexillifer
2,29078_0:0007f1,6011at40674,29078,Eptesicus fuscus
3,9986_0:00087d,6011at40674,9986,Oryctolagus cuniculus
4,9837_0:000f1d,6011at40674,9837,Camelus bactrianus


In [12]:
busco_mammalia_orthogroups_species.species_name.value_counts()

Enhydra lutris                    9191
Ceratotherium simum simum         9187
Peromyscus maniculatus bairdii    9186
Rhinolophus sinicus               9185
Panthera pardus                   9183
Tupaia chinensis                  9179
Nannospalax galili                9178
Mus musculus                      9177
Capra hircus                      9173
Chrysochloris asiatica            9167
Aotus nancymaae                   9165
Camelus bactrianus                9161
Chinchilla lanigera               9160
Eptesicus fuscus                  9158
Homo sapiens                      9157
Rousettus aegyptiacus             9151
Marmota marmota marmota           9149
Macaca mulatta                    9147
Lipotes vexillifer                9119
Erinaceus europaeus               9113
Sorex araneus                     9098
Phascolarctos cinereus            9053
Oryctolagus cuniculus             8874
Ornithorhynchus anatinus          7801
Name: species_name, dtype: int64

In [13]:
busco_mammalia_orthogroups_species.species_name.value_counts().sort_index()

Aotus nancymaae                   9165
Camelus bactrianus                9161
Capra hircus                      9173
Ceratotherium simum simum         9187
Chinchilla lanigera               9160
Chrysochloris asiatica            9167
Enhydra lutris                    9191
Eptesicus fuscus                  9158
Erinaceus europaeus               9113
Homo sapiens                      9157
Lipotes vexillifer                9119
Macaca mulatta                    9147
Marmota marmota marmota           9149
Mus musculus                      9177
Nannospalax galili                9178
Ornithorhynchus anatinus          7801
Oryctolagus cuniculus             8874
Panthera pardus                   9183
Peromyscus maniculatus bairdii    9186
Phascolarctos cinereus            9053
Rhinolophus sinicus               9185
Rousettus aegyptiacus             9151
Sorex araneus                     9098
Tupaia chinensis                  9179
Name: species_name, dtype: int64

### How many of the BUSCO species overlap with QFO species?

In [14]:
# Boolean mask of BUSCO rows whose species name also occurs among the
# scientific names in the QfO species metadata
rows = busco_mammalia_orthogroups_species.species_name.isin(
    species_metadata.scientific_name
)
# Row counts per overlapping species
busco_mammalia_orthogroups_species.loc[rows].species_name.value_counts()

Mus musculus    9177
Homo sapiens    9157
Name: species_name, dtype: int64

## [this takes a while] Get mapping OrthoDB IDs to UniProt (cross-references or xrefs)
Don't need to decompress/ungzip, pandas will do that for us. Need UniProt IDs because that's the ID that's present in the protein fasta file

In [15]:
%%time

# Load the gzipped, tab-separated, header-less OrthoDB gene cross-reference
# table; the three columns are named here
odb_xrefs = pd.read_csv(
    os.path.join(ORTHODB_FOLDER, "odb10v1_gene_xrefs.tab.gz"),
    sep="\t",
    header=None,
    names=["orthodb_gene_id", "external_db_gene_id", "external_db_name"],
)
print(odb_xrefs.shape)
odb_xrefs.head()

(241929290, 3)
CPU times: user 2min 39s, sys: 10.4 s, total: 2min 49s
Wall time: 2min 49s


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
0,1000373_0:000000,374504755,NCBIproteinGI
1,1000373_0:000001,374504763,NCBIproteinGI
2,1000373_0:000002,374504767,NCBIproteinGI
3,1000373_0:000003,374504758,NCBIproteinGI
4,1000373_1:000000,RnQV1s4_gp1,NCBIgenename


In [16]:
odb_xrefs.external_db_name.value_counts()

InterPro          83329152
NCBIproteinAcc    36509095
NCBIproteinGI     25640331
GOterm            25426254
UniProt           23196410
ENSEMBL           19931366
NCBIgid           14246104
NCBIgenename      13650578
Name: external_db_name, dtype: int64

In [17]:
# Restrict the cross-references to the rows whose external database is UniProt
odb_xrefs_uniprot = odb_xrefs.loc[odb_xrefs["external_db_name"] == "UniProt"]
print(odb_xrefs_uniprot.shape)
odb_xrefs_uniprot.head()

(23196410, 3)


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
3359,1000565_0:000000,F5RH80,UniProt
3367,1000565_0:000001,F5RH81,UniProt
3372,1000565_0:000002,F5RH82,UniProt
3376,1000565_0:000003,F5RH83,UniProt
3389,1000565_0:000004,F5RH84,UniProt


## Get only OrthoDB xref UNIPROT IDs for BUSCO Mammalia

In [18]:
busco_mammalia_orthogroups.head()

Unnamed: 0,busco_id,orthodb_id,species
0,34839_0:0032fe,6011at40674,34839
1,118797_0:00467b,6011at40674,118797
2,29078_0:0007f1,6011at40674,29078
3,9986_0:00087d,6011at40674,9986
4,9837_0:000f1d,6011at40674,9837


In [19]:
busco_mammalia_orthogroups_species.head()

Unnamed: 0,busco_id,orthodb_id,species,species_name
0,34839_0:0032fe,6011at40674,34839,Chinchilla lanigera
1,118797_0:00467b,6011at40674,118797,Lipotes vexillifer
2,29078_0:0007f1,6011at40674,29078,Eptesicus fuscus
3,9986_0:00087d,6011at40674,9986,Oryctolagus cuniculus
4,9837_0:000f1d,6011at40674,9837,Camelus bactrianus


In [20]:
# Re-index the orthogroup/species table by busco_id so it can be joined on that key
busco_mammalia_orthogroups_species_index = busco_mammalia_orthogroups_species.set_index('busco_id')
busco_mammalia_orthogroups_species_index.head()

Unnamed: 0_level_0,orthodb_id,species,species_name
busco_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34839_0:0032fe,6011at40674,34839,Chinchilla lanigera
118797_0:00467b,6011at40674,118797,Lipotes vexillifer
29078_0:0007f1,6011at40674,29078,Eptesicus fuscus
9986_0:00087d,6011at40674,9986,Oryctolagus cuniculus
9837_0:000f1d,6011at40674,9837,Camelus bactrianus


In [21]:
odb_xrefs_uniprot.head()

Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
3359,1000565_0:000000,F5RH80,UniProt
3367,1000565_0:000001,F5RH81,UniProt
3372,1000565_0:000002,F5RH82,UniProt
3376,1000565_0:000003,F5RH83,UniProt
3389,1000565_0:000004,F5RH84,UniProt


In [22]:
%%time

# Inner join: keep UniProt cross-references whose `orthodb_gene_id` equals a
# `busco_id` (the index of the orthogroup table), pulling in the orthogroup and
# species columns for the matching rows
odb_xrefs_uniprot_busco_mammalia_species = odb_xrefs_uniprot.join(
    busco_mammalia_orthogroups_species_index,
    on="orthodb_gene_id",
    how="inner",
)
print(odb_xrefs_uniprot_busco_mammalia_species.shape)
odb_xrefs_uniprot_busco_mammalia_species.head()

(64199, 6)
CPU times: user 19.4 s, sys: 392 ms, total: 19.8 s
Wall time: 19.8 s


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name,orthodb_id,species,species_name
2401526,10090_0:00000d,Q91X05,UniProt,150380at40674,10090,Mus musculus
2401527,10090_0:00000d,C0LL94,UniProt,150380at40674,10090,Mus musculus
2401670,10090_0:000012,Q9R1A8,UniProt,80051at40674,10090,Mus musculus
2401732,10090_0:000015,Q8BX09,UniProt,73776at40674,10090,Mus musculus
2401801,10090_0:000018,A9ZNB6,UniProt,194736at40674,10090,Mus musculus


### Write to file

In [23]:
# Save the joined table as CSV in BUSCO_MAMMALIA_FOLDER; the row index is not written
odb_xrefs_uniprot_busco_mammalia_species.to_csv(
    os.path.join(
        BUSCO_MAMMALIA_FOLDER, "busco_mammalia__orthodb__to__uniprot__with_species.csv"
    ),
    index=False,
)

In [24]:
# Read back the CSV written in the previous cell
orthodb_busco_mammalia = pd.read_csv(
    os.path.join(
        BUSCO_MAMMALIA_FOLDER, "busco_mammalia__orthodb__to__uniprot__with_species.csv"
    )
)
print(orthodb_busco_mammalia.shape)
orthodb_busco_mammalia.head()

(64199, 6)


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name,orthodb_id,species,species_name
0,10090_0:00000d,Q91X05,UniProt,150380at40674,10090,Mus musculus
1,10090_0:00000d,C0LL94,UniProt,150380at40674,10090,Mus musculus
2,10090_0:000012,Q9R1A8,UniProt,80051at40674,10090,Mus musculus
3,10090_0:000015,Q8BX09,UniProt,73776at40674,10090,Mus musculus
4,10090_0:000018,A9ZNB6,UniProt,194736at40674,10090,Mus musculus


## Subset OrthoDB to BUSCO Mammalia

In [25]:
busco_mammalia_orthogroups.head()

Unnamed: 0,busco_id,orthodb_id,species
0,34839_0:0032fe,6011at40674,34839
1,118797_0:00467b,6011at40674,118797
2,29078_0:0007f1,6011at40674,29078
3,9986_0:00087d,6011at40674,9986
4,9837_0:000f1d,6011at40674,9837


In [26]:
odb_xrefs_uniprot.head()

Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
3359,1000565_0:000000,F5RH80,UniProt
3367,1000565_0:000001,F5RH81,UniProt
3372,1000565_0:000002,F5RH82,UniProt
3376,1000565_0:000003,F5RH83,UniProt
3389,1000565_0:000004,F5RH84,UniProt


In [27]:
# Keep the UniProt cross-references whose OrthoDB gene id is one of the BUSCO
# Mammalia ids
odb_xrefs_uniprot_busco_mammalia = odb_xrefs_uniprot.loc[
    odb_xrefs_uniprot["orthodb_gene_id"].isin(busco_mammalia_orthogroups["busco_id"])
]
print(odb_xrefs_uniprot_busco_mammalia.shape)
odb_xrefs_uniprot_busco_mammalia.head()

(64198, 3)


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
2401526,10090_0:00000d,Q91X05,UniProt
2401527,10090_0:00000d,C0LL94,UniProt
2401670,10090_0:000012,Q9R1A8,UniProt
2401732,10090_0:000015,Q8BX09,UniProt
2401801,10090_0:000018,A9ZNB6,UniProt


In [28]:
odb_xrefs_uniprot_busco_mammalia.sample(10)

Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
2757392,10090_0:00343f,Q80TN5,UniProt
2533638,10090_0:001341,Q8R3G1,UniProt
228230370,9365_0:000624,A0A1S3WQE2,UniProt
2427408,10090_0:0003c8,Q3UI57,UniProt
231485434,9544_0:003cff,H9FND0,UniProt
2502961,10090_0:000ebe,Q3TUE1,UniProt
233938799,9606_0:000fa3,C9JLB7,UniProt
241447783,9986_0:003a4b,G1SSX7,UniProt
228307330,9365_0:00244f,A0A1S2ZW29,UniProt
2530138,10090_0:0012ba,G3UXC1,UniProt


### Write to csv for later!

In [29]:
# Save the BUSCO Mammalia subset of the UniProt cross-references as CSV; the
# row index is not written
odb_xrefs_uniprot_busco_mammalia.to_csv(
    os.path.join(BUSCO_MAMMALIA_FOLDER, "busco_mammalia__orthodb__to__uniprot.csv"),
    index=False,
)

### If using later, simply read the csv

In [30]:
# Re-load the subset from the CSV written above. This rebinds
# `odb_xrefs_uniprot_busco_mammalia`; the original row labels were not saved,
# so the frame gets a fresh default index
odb_xrefs_uniprot_busco_mammalia = pd.read_csv(
    os.path.join(BUSCO_MAMMALIA_FOLDER, "busco_mammalia__orthodb__to__uniprot.csv")
)
print(odb_xrefs_uniprot_busco_mammalia.shape)
odb_xrefs_uniprot_busco_mammalia.head()

(64198, 3)


Unnamed: 0,orthodb_gene_id,external_db_gene_id,external_db_name
0,10090_0:00000d,Q91X05,UniProt
1,10090_0:00000d,C0LL94,UniProt
2,10090_0:000012,Q9R1A8,UniProt
3,10090_0:000015,Q8BX09,UniProt
4,10090_0:000018,A9ZNB6,UniProt


# Subset input reads (human fastq gz) to only BUSCO mammalia ids

In [31]:
! zcat $SIMULATED_READS_FOLDER/Homo_sapiens_9606_qfo_dna_01.fq.gz | head

@read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1
ATGTCCAGCCACGAAGGTGGCAAGAAGAAGGCACTGAAACAGCCCAAGAAGCAGGCCAAGGAGATGGACGAGGAAGAGAAGGCTTTCAAGCAGAAACAAAAAGAGGAGCAGAAGAAACTCGAGGTGCTAAAAGCGAAGGTCGTGGGGAAG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read2/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1
ATTTCCAGCCACGAAGGTGGCAAGAAGAAGGCACTGAAACAGCCCAAGAAGCAGGCCAAGGAGATGGACGAGGAAGAGAAGGCTTTCAAGCAGAAACAAAAAGAGGAGCAGAAGAAACTCGAGGTGCTAAAAGCGAAGGTCGTGGGGAAG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@read3/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1
ATGTCCAGCCACGAAGGTGGCAAGAAGAAGGCACTGAAACAGCCCAAGAAGCNGGCCAAGGAGATGGACGAGGAAGAGAAGGCTTTCAAGCAGAAACAAAAAGAGGAGCAGAAGAAACTCGAGGTGCTAAAAGCGAAGGTCGTGGGGAAG

gzip: stdout: Broken pipe


In [32]:
# Folder under SIMULATED_READS_FOLDER for the subsetted reads.
# NOTE(review): this name is not referenced by any later cell visible in this
# notebook (the write step uses a separately defined `fastq_outdir`) -- confirm
# which output location is intended.
fastq_subset_outdir = os.path.join(
    SIMULATED_READS_FOLDER, "reads_from_genes_in_busco_mammalia__orthodb-v10"
)

In [33]:
# Distinct external (UniProt) ids of the BUSCO Mammalia subset, as a set for
# fast membership tests while scanning reads
busco_mammalia_uniprot_ids = set(odb_xrefs_uniprot_busco_mammalia.external_db_gene_id.values)
len(busco_mammalia_uniprot_ids)

64197

## Read in simulated human reads and subset to only ones with busco IDs

In [34]:
# Simulated human reads. The path is built from SIMULATED_READS_FOLDER (the
# same file previewed with `zcat` above) instead of a machine-specific
# absolute path, so the cell runs wherever path_constants is configured.
fastq_in = os.path.join(SIMULATED_READS_FOLDER, "Homo_sapiens_9606_qfo_dna_01.fq.gz")

# Reads whose source protein is one of the BUSCO Mammalia UniProt ids
simulated_reads_busco_mammalia_records = []

with screed.open(fastq_in) as records:
    for record in tqdm(records):
        # Read names look like
        #   read1/tr|A0A024R1R8|ENSP00000491117;mate1Start:1;mate2Start:1
        # so the UniProt accession is the second "|"-separated field of the
        # part before the first ";"
        read_id_uniprot_ensembl = record["name"].split(";")[0]
        uniprot = read_id_uniprot_ensembl.split("|")[1]
        if uniprot in busco_mammalia_uniprot_ids:
            simulated_reads_busco_mammalia_records.append(record)
len(simulated_reads_busco_mammalia_records)

1015001it [00:15, 63906.68it/s]


445051

## Write new human fastq gz

In [35]:
# Output folder for the subsetted human reads.
# NOTE(review): machine-specific absolute path. `fastq_subset_outdir`, built
# earlier from SIMULATED_READS_FOLDER, is unused in the visible cells and may
# have been the intended portable location -- confirm before switching.
fastq_outdir = '/home/olga/data_lg/data_sm_copy/immune-evolution/rawdata/simulated/human/busco_mammalia__orthodb-v10'

In [36]:
# Write the retained reads as a gzip-compressed FASTQ file named like the input.
# gzip.open() does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs(fastq_outdir, exist_ok=True)
fastq_output = f'{fastq_outdir}/Homo_sapiens_9606_qfo_dna_01.fq.gz'
with gzip.open(fastq_output, 'wb') as f:
    for record in tqdm(simulated_reads_busco_mammalia_records):
        # Four-line FASTQ entry: "@name", sequence, "+", quality
        output_str = f'@{record["name"]}\n{record["sequence"]}\n+\n{record["quality"]}\n'
        f.write(output_str.encode('utf-8'))

100%|██████████| 445051/445051 [00:17<00:00, 25436.45it/s]


# Subset protein fasta files for only BUSCO mammalia ids

## Read human protein fasta


Human taxonomy id is 9606

In [40]:
ls -lha $QFO_EUKARYOTA_FOLDER/*9606*

-rw-r--r-- 1 olga czb  36M Oct 13 12:59 ../kmer-homology-data/00--rawdata/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_additional.fasta
-rw-r--r-- 1 olga czb  34M Oct 13 12:59 ../kmer-homology-data/00--rawdata/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_DNA.fasta
-rw-r--r-- 1 olga czb  20K Oct 13 12:59 ../kmer-homology-data/00--rawdata/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_DNA.miss
-rw-r--r-- 1 olga czb  33M Oct 13 18:51 ../kmer-homology-data/00--rawdata/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_DNA__startswith_atg_and_protein_startswith_m.fasta
-rw-r--r-- 1 olga czb  14M Oct 13 12:59 ../kmer-homology-data/00--rawdata/quest-for-orthologs/2019/Eukaryota/UP000005640_9606.fasta
-rw-r--r-- 1 olga czb  50K Oct 13 12:59 ../kmer-homology-data/00--rawdata/quest-for-orthologs/2019/Eukaryota/UP000005640_9606_gcoord.miss
-rw-r--r-- 1 olga czb 2.7M Oct 13 12:59 ../kmer-homology-data/00--rawdata/quest-for-orthologs/2019/Eukaryota/UP000005640_9606.gene2acc
-rw-

In [41]:
human_fasta = os.path.join(QFO_EUKARYOTA_FOLDER, "UP000005640_9606.fasta")
! head $human_fasta

>tr|A0A024R1R8|A0A024R1R8_HUMAN HCG2014768, isoform CRA_a OS=Homo sapiens OX=9606 GN=hCG_2014768 PE=4 SV=1
MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAKVVGKGPLATGGIKK
SGKK
>sp|A0A024RBG1|NUD4B_HUMAN Diphosphoinositol polyphosphate phosphohydrolase NUDT4B OS=Homo sapiens OX=9606 GN=NUDT4B PE=3 SV=1
MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQWIVPGGGMEPEEEPG
GAAVREVYEEAGVKGKLGRLLGIFEQNQDRKHRTYVYVLTVTEILEDWEDSVNIGRKREW
FKVEDAIKVLQCHKPVHAEYLEKLKLGCSPANGNSTVPSLPDNNALFVTAAQTSGLPSSV
R
>tr|A0A075B6H5|A0A075B6H5_HUMAN T cell receptor beta variable 20/OR9-2 (non-functional) (Fragment) OS=Homo sapiens OX=9606 GN=TRBV20OR9-2 PE=4 SV=1
METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVICKSGTSVNIECRSLD


In [42]:
! grep 'sp|' $human_fasta | head

>sp|A0A024RBG1|NUD4B_HUMAN Diphosphoinositol polyphosphate phosphohydrolase NUDT4B OS=Homo sapiens OX=9606 GN=NUDT4B PE=3 SV=1
>sp|A0A075B6H9|LV469_HUMAN Immunoglobulin lambda variable 4-69 OS=Homo sapiens OX=9606 GN=IGLV4-69 PE=1 SV=1
>sp|A0A075B6I0|LV861_HUMAN Immunoglobulin lambda variable 8-61 OS=Homo sapiens OX=9606 GN=IGLV8-61 PE=3 SV=7
>sp|A0A075B6I1|LV460_HUMAN Immunoglobulin lambda variable 4-60 OS=Homo sapiens OX=9606 GN=IGLV4-60 PE=3 SV=1
>sp|A0A075B6I4|LVX54_HUMAN Immunoglobulin lambda variable 10-54 OS=Homo sapiens OX=9606 GN=IGLV10-54 PE=3 SV=1
>sp|A0A075B6I9|LV746_HUMAN Immunoglobulin lambda variable 7-46 OS=Homo sapiens OX=9606 GN=IGLV7-46 PE=3 SV=4
>sp|A0A075B6J1|LV537_HUMAN Immunoglobulin lambda variable 5-37 OS=Homo sapiens OX=9606 GN=IGLV5-37 PE=3 SV=1
>sp|A0A075B6J6|LV322_HUMAN Immunoglobulin lambda variable 3-22 OS=Homo sapiens OX=9606 GN=IGLV3-22 PE=3 SV=1
>sp|A0A075B6J9|LV218_HUMAN Immunoglobulin lambda variable 2-18 OS=Homo sapiens OX=9606 GN=IGLV2-18 PE=3 SV=2

In [None]:
! grep P61981 $fasta

## Get Human Uniprot IDs from QFO that are also present in the BUSCO Mapping

In [None]:

def read_ids_filter_orthodb(filename):
    """Load an id-mapping table and keep only its OrthoDB rows.

    Parameters
    ----------
    filename : str
        Path to a tab-separated, header-less file with three columns
        (UniProt id, id type, database id).

    Returns
    -------
    pandas.DataFrame
        The rows whose id type is exactly "OrthoDB", with columns
        ``source__uniprot_id``, ``source__id_type`` and ``source__db_id``.
        Row labels are those of the unfiltered table.
    """
    column_names = [
        "source__" + field for field in ("uniprot_id", "id_type", "db_id")
    ]
    id_mapping = pd.read_csv(filename, sep="\t", header=None, names=column_names)
    is_orthodb = id_mapping["source__id_type"] == "OrthoDB"
    return id_mapping[is_orthodb]


# OrthoDB rows of the human (taxonomy id 9606) id-mapping table in the QfO
# Eukaryota folder
human_ids_busco = read_ids_filter_orthodb(
    os.path.join(QFO_EUKARYOTA_FOLDER, "UP000005640_9606.idmapping")
)
print(human_ids_busco.shape)
human_ids_busco.head()

In [None]:
# Destination CSV for the human OrthoDB id-mapping rows.
# NOTE(review): `human_ids_busco` holds every OrthoDB row of the human
# id-mapping table; no filtering against the BUSCO Mammalia tables happens in
# the visible cells, despite the "busco_mammalia" file name -- confirm.
csv = os.path.join(
    QFO_EUKARYOTA_FOLDER,
    "busco_mammalia_human_uniprot_ids_in_qfo.csv",
)

# The row index is not written
human_ids_busco.to_csv(
    csv,
    index=False,
)

In [None]:
2+1


### How many fasta entries in the human proteome fasta?

In [None]:
! grep -c '>' $human_fasta

## Perform subsetting of QFO fasta files from uniprot IDs present in BUSCO

In [None]:
def subset_by_uniprot_ids(fasta, uniprot_ids):
    """Collect the records of a sequence file whose UniProt accession is wanted.

    Record names are expected to start with a UniProt-style identifier such as
    ``sp|A0A024RBG1|NUD4B_HUMAN``: the accession is the second ``|``-separated
    field of the first whitespace-delimited token of the name.

    Parameters
    ----------
    fasta : str
        Path of a sequence file readable by ``screed.open``.
    uniprot_ids : iterable of str
        Accessions to keep. Any iterable works (set, list, generator, pandas
        Series of values, ...); it is copied into a frozenset once, so lookups
        are constant-time and always test the *values* of the iterable.

    Returns
    -------
    list
        The matching records, in file order.

    Raises
    ------
    IndexError
        If a record name is empty or its first token has no second
        ``|``-separated field.
    """
    # Testing `in` directly on the caller's object is wrong for one-shot
    # iterables (exhausted after the first record) and for a pandas Series
    # (which tests index labels), and is a linear scan for lists.
    wanted = frozenset(uniprot_ids)
    with screed.open(fasta) as records:
        return [
            record
            for record in records
            if record["name"].split()[0].split("|")[1] in wanted
        ]


def write_fasta(output_fasta, records):
    """Write ``records`` to ``output_fasta`` in FASTA format.

    Each record must be a mapping that provides ``name`` and ``sequence``. It
    is emitted as a ``>name`` header line followed by the whole sequence on a
    single line. The output file is created or overwritten.
    """
    with open(output_fasta, "w") as handle:
        handle.writelines(
            ">{name}\n{sequence}\n".format(**entry) for entry in records
        )


# human_fasta = human protein sequences from Quest for Orthologs (QfO).
# Keep the records whose UniProt accession is among the external ids of the
# BUSCO Mammalia cross-reference table.
human_busco_mammalia_records = subset_by_uniprot_ids(
    human_fasta, set(odb_xrefs_uniprot_busco_mammalia.external_db_gene_id.values)
)
print(len(human_busco_mammalia_records))
# Peek at the first three retained records
human_busco_mammalia_records[:3]

In [None]:
# Write the retained human records as a FASTA file in the QfO Eukaryota folder.
# NOTE(review): the file name spells "mammlia"; left as is because other steps
# may already read the file under this name -- confirm before renaming.
write_fasta(
    os.path.join(QFO_EUKARYOTA_FOLDER, "UP000005640_9606__busco_mammlia_odbv10.fasta"),
    human_busco_mammalia_records,
)

## Mouse

### Get mouse proteome fasta from QfO
Mouse taxonomy id is `10090`

In [None]:
ls -lha /home/olga/data_lg/data_sm_copy/kmer-hashing/quest-for-orthologs/data/2019/Eukaryota/*10090*

In [None]:
! head /home/olga/data_lg/data_sm_copy/kmer-hashing/quest-for-orthologs/data/2019/Eukaryota/UP000000589_10090_DNA.fasta

In [None]:
# OrthoDB rows of the mouse (taxonomy id 10090) id-mapping table. The path is
# built from QFO_EUKARYOTA_FOLDER, as for the human table; `qfo_eukaryota` is
# not defined anywhere in this notebook and raised a NameError.
mouse_ids_busco = read_ids_filter_orthodb(
    os.path.join(QFO_EUKARYOTA_FOLDER, "UP000000589_10090.idmapping")
)
print(mouse_ids_busco.shape)
mouse_ids_busco.head()

### Subset mouse protein fasta on BUSCO Mammalia UniProt IDs

In [None]:
# Mouse protein FASTA from the QfO Eukaryota folder. The path is built from
# QFO_EUKARYOTA_FOLDER, as for `human_fasta`; `qfo_eukaryota` is not defined
# anywhere in this notebook and raised a NameError.
mouse_fasta = os.path.join(QFO_EUKARYOTA_FOLDER, "UP000000589_10090.fasta")
# Keep the records whose UniProt accession is among the external ids of the
# BUSCO Mammalia cross-reference table
mouse_busco_mammalia_records = subset_by_uniprot_ids(
    mouse_fasta, set(odb_xrefs_uniprot_busco_mammalia.external_db_gene_id.values)
)
print(len(mouse_busco_mammalia_records))
mouse_busco_mammalia_records[:3]

In [None]:
# Write the retained mouse records as a FASTA file in the QfO Eukaryota folder,
# next to the human subset. `qfo_2019` is not defined anywhere in this notebook
# and raised a NameError, so the path is built from QFO_EUKARYOTA_FOLDER.
write_fasta(
    os.path.join(QFO_EUKARYOTA_FOLDER, "UP000000589_10090__busco_mammlia_odbv10.fasta"),
    mouse_busco_mammalia_records,
)

In [None]:
! head '/home/olga/data_lg/data_sm_copy/kmer-hashing/quest-for-orthologs/data/2019/Eukaryota/UP000000589_10090__busco_mammlia_odbv10.fasta'