In [None]:
from glob import iglob
import os

import pandas as pd
import screed
import seaborn as sns
from tqdm import tqdm

# Change to Quest for Orthologs 2019 data directory

In [2]:
cd ~/data_sm/kmer-hashing/quest-for-orthologs/data/2019/

/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019


In [3]:
ls -lha

total 2.7G
drwxr-xr-x 5 olga root 4.0K Jan 15 18:10 [0m[01;34m.[0m/
drwxr-xr-x 3 olga root 4.0K Dec 25 17:48 [01;34m..[0m/
drwxr-xr-x 5 olga czb  4.0K Dec 26 19:44 [01;34mArchaea[0m/
drwxr-xr-x 5 olga czb   16K Dec 26 19:44 [01;34mBacteria[0m/
drwxr-xr-x 9 olga czb   32K Jan 10 15:35 [01;34mEukaryota[0m/
-rw-r--r-- 1 olga czb  754K Jan 10 07:50 human_transcription_factors_with_uniprot_ids.csv
-rw-r--r-- 1 olga czb   68K Jan 10 07:50 [01;31mhuman_transcription_factors_with_uniprot_ids.csv.gz[0m
-rw-r--r-- 1 olga czb  133K Jan 10 07:50 human_transcription_factors_with_uniprot_ids.parquet
-rw-r--r-- 1 olga czb   76K Jan 15 10:57 human_visual_transduction_with_uniprot_ids.csv
-rw-r--r-- 1 olga czb   19K Jan 15 10:57 [01;31mhuman_visual_transduction_with_uniprot_ids.csv.gz[0m
-rw-r--r-- 1 olga czb   28K Jan 15 10:57 human_visual_transduction_with_uniprot_ids.parquet
-rw-r--r-- 1 olga czb   64M Jan 10 07:43 opisthokont_not_human_transcription_factors_ensembl_compara.csv
-rw-r-

# Download orthology and transcription factor data

## Read orthologous transcription factors 

In [4]:
visual = pd.read_csv('opisthokont_not_human_visual_system_ensembl_compara.csv')
print(visual.shape)
visual.head()

(17828, 16)


Unnamed: 0,dn_ds,method_link_type,source__id,source__perc_id,source__perc_pos,source__protein_id,source__species,source__taxon_id,target__id,target__perc_id,target__perc_pos,target__protein_id,target__species,target__taxon_id,taxonomy_level,type
0,,ENSEMBL_PARALOGUES,ENSG00000198515,50.5929,62.8458,ENSP00000384264,homo_sapiens,9606,ENSG00000183862,57.8313,71.8373,ENSP00000328478,homo_sapiens,9606,Bilateria,other_paralog
1,,ENSEMBL_PARALOGUES,ENSG00000198515,18.1818,35.9684,ENSP00000384264,homo_sapiens,9606,ENSG00000055118,11.9068,23.5548,ENSP00000262186,homo_sapiens,9606,Bilateria,other_paralog
2,,ENSEMBL_PARALOGUES,ENSG00000198515,17.3913,35.1779,ENSP00000384264,homo_sapiens,9606,ENSG00000143630,17.0543,34.4961,ENSP00000357342,homo_sapiens,9606,Bilateria,other_paralog
3,,ENSEMBL_PARALOGUES,ENSG00000198515,55.0725,67.0619,ENSP00000384264,homo_sapiens,9606,ENSG00000144191,60.2305,73.3429,ENSP00000272602,homo_sapiens,9606,Bilateria,other_paralog
4,,ENSEMBL_PARALOGUES,ENSG00000198515,24.7694,40.8432,ENSP00000384264,homo_sapiens,9606,ENSG00000170289,23.2386,38.3189,ENSP00000316605,homo_sapiens,9606,Bilateria,other_paralog


# Go to Quest for Orthologs fastas

## Read species metadata

In [5]:
species_metadata = pd.read_csv("species_metadata.csv")
print(species_metadata.shape)
species_metadata.head()

(78, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
1,UP000000798,224324,AQUAE,1553,0,1557,Aquifex aeolicus (strain VF5),4290.0,Aquifex aeolicus,strain VF5
2,UP000006548,3702,ARATH,27475,14123,41920,Arabidopsis thaliana (Mouse-ear cress),1496.0,Arabidopsis thaliana,Mouse-ear cress
3,UP000001570,224308,BACSU,4260,7,4268,Bacillus subtilis (strain 168),4290.0,Bacillus subtilis,strain 168
4,UP000001414,226186,BACTN,4782,0,4823,Bacteroides thetaiotaomicron (strain ATCC 2914...,4290.0,Bacteroides thetaiotaomicron,strain ATCC 29148 / DSM 2079 / NCTC 10582 / E5...


### Subset to opisthokonts

In [6]:
# Estimated opisthokonta divergence time from http://timetree.org/
t = 1105
opisthokonts = species_metadata.query('divergence_from_human_mya <= @t')
print(opisthokonts.shape)
opisthokonts.head()

(35, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


In [7]:
opisthokonts.query('scientific_name == "Homo sapiens"')

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
32,UP000005640,9606,HUMAN,21146,74769,96332,Homo sapiens (Human),0.0,Homo sapiens,Human


## Read Gene Accession file

```
Gene mapping files (*.gene2acc)
===============================

Column 1 is a unique gene symbol that is chosen with the following order of
preference from the annotation found in:
1) Model Organism Database (MOD)
2) Ensembl or Ensembl Genomes database
3) UniProt Ordered Locus Name (OLN)
4) UniProt Open Reading Frame (ORF)
5) UniProt Gene Name
A dash symbol ('-') is used when the gene encoding a protein is unknown.

Column 2 is the UniProtKB accession or isoform identifier for the given gene
symbol. This column may have redundancy when two or more genes have identical
translations.

Column 3 is the gene symbol of the canonical accession used to represent the
respective gene group and the first row of the sequence is the canonical one.
```

In [10]:

def read_gene2acc(gene2acc, names=['maybe_ensembl_id', 'uniprot_id', 'canonical_accession']):
    df = pd.read_csv(gene2acc, sep='\t', header=None, na_values='-', names=names)
    return df

gene2acc = read_gene2acc('Eukaryota/UP000005640_9606.gene2acc')
# gene2acc = pd.read_csv('Eukaryota/UP000005640_9606.gene2acc', sep='\t', header=None, na_values='-', names=columns)
print(gene2acc.shape)
gene2acc.head()

(96332, 3)


Unnamed: 0,maybe_ensembl_id,uniprot_id,canonical_accession
0,,A0A075B7B6,
1,,A0A075B714,
2,,A0A075B713,
3,,A0A075B712,
4,,A0A075B711,


In [11]:
gene2acc.dropna()

Unnamed: 0,maybe_ensembl_id,uniprot_id,canonical_accession
600,ACOT7L,Q6ZUV0,ACOT7L
601,BARGIN,Q6ZT62,BARGIN
602,BARGIN,Q6ZT62-2,BARGIN
603,BCE1,O60756,BCE1
604,C1orf140,Q5VVS0,C1orf140
...,...,...,...
96327,UNQ6190/PRO20217,Q6UXQ8,UNQ6190/PRO20217
96328,UNQ6493/PRO21345,Q6UXR8,UNQ6493/PRO21345
96329,UNQ6494/PRO21346,Q6UXR6,UNQ6494/PRO21346
96330,UNQ9165/PRO28630,Q6UXU0,UNQ9165/PRO28630


## Read ID mapping file

```
Database mapping files (*.idmapping)
====================================

These files contain mappings from UniProtKB to other databases for each
reference proteome.
The format consists of three tab-separated columns:

1. UniProtKB accession
2. ID_type:
   Database name as shown in UniProtKB cross-references and supported by the ID
   mapping tool on the UniProt web site (http://www.uniprot.org/mapping)
3. ID:
   Identifier in the cross-referenced database.

```

In [12]:
opisthokonts.head()

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


In [13]:
opisthokonts.query('proteome_id == "UP000000437"')

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
17,UP000000437,7955,DANRE,25939,21219,47517,Danio rerio (Zebrafish) (Brachydanio rerio),435.0,Danio rerio,Zebrafish


# Get ID Mapping for uniprot ids from ENSMBL

In [14]:
dfs = []

for filename in tqdm(sorted(iglob("Eukaryota/*.idmapping"))):
#     print(filename)
    basename = os.path.basename(filename)
    prefix = basename.split('.')[0]
    species_id, taxa_id = prefix.split("_")
#     print(f"{species_id=} {taxa_id=}")
    
    if species_id in opisthokonts.proteome_id.values:

        df = pd.read_csv(filename, sep='\t', header=None, names=['uniprot_id', 'id_type', 'db_id'])
        df['species_id'] = species_id
        df['taxa_id'] = species_id

        # Use only Ensembl data
#         df = df.query('id_type == "Ensembl"')
        print(df.shape)
        dfs.append(df)
    
id_mapping = pd.concat(dfs, ignore_index=True)
print(id_mapping.shape)
id_mapping.head()

  2%|▏         | 1/48 [00:00<00:40,  1.15it/s]

(1036083, 5)


  8%|▊         | 4/48 [00:01<00:24,  1.83it/s]

(555421, 5)
(149092, 5)


 10%|█         | 5/48 [00:01<00:18,  2.31it/s]

(141961, 5)


 12%|█▎        | 6/48 [00:02<00:27,  1.53it/s]

(1631036, 5)


 17%|█▋        | 8/48 [00:03<00:22,  1.80it/s]

(809657, 5)


 19%|█▉        | 9/48 [00:03<00:20,  1.86it/s]

(575653, 5)


 23%|██▎       | 11/48 [00:04<00:13,  2.75it/s]

(295804, 5)
(124774, 5)


 27%|██▋       | 13/48 [00:04<00:10,  3.29it/s]

(277055, 5)
(181143, 5)


 33%|███▎      | 16/48 [00:05<00:08,  3.57it/s]

(839553, 5)


 40%|███▉      | 19/48 [00:06<00:07,  4.12it/s]

(565651, 5)


 42%|████▏     | 20/48 [00:06<00:09,  3.06it/s]

(549426, 5)


 44%|████▍     | 21/48 [00:07<00:09,  2.78it/s]

(518273, 5)


 46%|████▌     | 22/48 [00:07<00:08,  3.17it/s]

(205999, 5)


 50%|█████     | 24/48 [00:07<00:07,  3.28it/s]

(765068, 5)
(146229, 5)


 54%|█████▍    | 26/48 [00:08<00:06,  3.51it/s]

(594466, 5)


 56%|█████▋    | 27/48 [00:09<00:09,  2.18it/s]

(1143307, 5)


 58%|█████▊    | 28/48 [00:09<00:08,  2.30it/s]

(424236, 5)


 62%|██████▎   | 30/48 [00:09<00:05,  3.20it/s]

(268306, 5)
(149870, 5)


 65%|██████▍   | 31/48 [00:10<00:06,  2.44it/s]

(873723, 5)


 67%|██████▋   | 32/48 [00:10<00:05,  2.88it/s]

(214092, 5)


 71%|███████   | 34/48 [00:12<00:06,  2.04it/s]

(2668934, 5)


 81%|████████▏ | 39/48 [00:12<00:02,  3.26it/s]

(340838, 5)
(151313, 5)


 83%|████████▎ | 40/48 [00:13<00:02,  3.26it/s]

(315939, 5)


 88%|████████▊ | 42/48 [00:13<00:01,  3.46it/s]

(637633, 5)


 90%|████████▉ | 43/48 [00:14<00:01,  3.48it/s]

(281658, 5)


 92%|█████████▏| 44/48 [00:14<00:01,  3.46it/s]

(307893, 5)


 94%|█████████▍| 45/48 [00:14<00:01,  2.60it/s]

(773011, 5)


 96%|█████████▌| 46/48 [00:15<00:00,  2.47it/s]

(561122, 5)


100%|██████████| 48/48 [00:15<00:00,  3.03it/s]

(390075, 5)





(19464294, 5)


Unnamed: 0,uniprot_id,id_type,db_id,species_id,taxa_id
0,Q5PRD0,UniProtKB-ID,143BA_DANRE,UP000000437,UP000000437
1,Q5PRD0,Gene_Name,ywhaba,UP000000437,UP000000437
2,Q5PRD0,Gene_Synonym,ywhab1,UP000000437,UP000000437
3,Q5PRD0,Gene_ORFName,wu:fb80c08,UP000000437,UP000000437
4,Q5PRD0,GI,82592598,UP000000437,UP000000437


# Merge id mapping with ensembl compara tfs

In [15]:
id_mapping_for_merging = id_mapping.copy()
id_mapping_for_merging.columns = "target__" + id_mapping_for_merging.columns
id_mapping_for_merging.head()

Unnamed: 0,target__uniprot_id,target__id_type,target__db_id,target__species_id,target__taxa_id
0,Q5PRD0,UniProtKB-ID,143BA_DANRE,UP000000437,UP000000437
1,Q5PRD0,Gene_Name,ywhaba,UP000000437,UP000000437
2,Q5PRD0,Gene_Synonym,ywhab1,UP000000437,UP000000437
3,Q5PRD0,Gene_ORFName,wu:fb80c08,UP000000437,UP000000437
4,Q5PRD0,GI,82592598,UP000000437,UP000000437


In [17]:
visual.shape

(17828, 16)

In [42]:
%%time

visual_uniprot_merge_proteins = visual.merge(id_mapping_for_merging, left_on='target__protein_id', right_on='target__db_id')
print(visual_uniprot_merge_proteins.shape)
visual_uniprot_merge_proteins.tail()

(1303, 21)
CPU times: user 12 s, sys: 712 ms, total: 12.7 s
Wall time: 12.7 s


Unnamed: 0,dn_ds,method_link_type,source__id,source__perc_id,source__perc_pos,source__protein_id,source__species,source__taxon_id,target__id,target__perc_id,...,target__protein_id,target__species,target__taxon_id,taxonomy_level,type,target__uniprot_id,target__id_type,target__db_id,target__species_id,target__taxa_id
1298,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,82.4009,91.3753,ENSP00000360502,homo_sapiens,9606,ENSGALG00000006626,82.0186,...,ENSGALP00000010688,gallus_gallus,9031,Euteleostomi,ortholog_one2one,F1P429,Ensembl_PRO,ENSGALP00000010688,UP000000539,UP000000539
1299,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,76.5734,87.8788,ENSP00000360502,homo_sapiens,9606,ENSXETG00000017515,75.6912,...,ENSXETP00000046201,xenopus_tropicalis,8364,Gnathostomata,ortholog_one2one,Q28GF9,Ensembl_PRO,ENSXETP00000046201,UP000008143,UP000008143
1300,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,76.2238,86.4802,ENSP00000360502,homo_sapiens,9606,ENSLOCG00000005849,76.2238,...,ENSLOCP00000007066,lepisosteus_oculatus,7918,Gnathostomata,ortholog_one2one,W5MFA7,Ensembl_PRO,ENSLOCP00000007066,UP000018468,UP000018468
1301,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,69.1142,81.9347,ENSP00000360502,homo_sapiens,9606,ENSDARG00000100397,69.6009,...,ENSDARP00000138871,danio_rerio,7955,Gnathostomata,ortholog_one2one,A0A0R4IUY2,Ensembl_PRO,ENSDARP00000138871,UP000000437,UP000000437
1302,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,67.4825,80.5361,ENSP00000360502,homo_sapiens,9606,ENSORLG00000005135,67.8781,...,ENSORLP00000006480,oryzias_latipes,8090,Gnathostomata,ortholog_one2one,H2LKC9,Ensembl_PRO,ENSORLP00000006480,UP000001038,UP000001038


In [44]:
visual_uniprot_merge_proteins.type.value_counts()

ortholog_one2one          552
other_paralog             365
ortholog_one2many         218
ortholog_many2many         95
within_species_paralog     63
alt_allele                 10
Name: type, dtype: int64

In [41]:
%%time

visual_uniprot_merge_genes = visual.merge(id_mapping_for_merging, left_on='target__id', right_on='target__db_id')
print(visual_uniprot_merge_genes.shape)
visual_uniprot_merge_genes.tail()

(4322, 21)
CPU times: user 12.1 s, sys: 780 ms, total: 12.9 s
Wall time: 12.8 s


Unnamed: 0,dn_ds,method_link_type,source__id,source__perc_id,source__perc_pos,source__protein_id,source__species,source__taxon_id,target__id,target__perc_id,...,target__protein_id,target__species,target__taxon_id,taxonomy_level,type,target__uniprot_id,target__id_type,target__db_id,target__species_id,target__taxa_id
4317,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,82.4009,91.3753,ENSP00000360502,homo_sapiens,9606,ENSGALG00000006626,82.0186,...,ENSGALP00000010688,gallus_gallus,9031,Euteleostomi,ortholog_one2one,F1P429,Ensembl,ENSGALG00000006626,UP000000539,UP000000539
4318,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,76.5734,87.8788,ENSP00000360502,homo_sapiens,9606,ENSXETG00000017515,75.6912,...,ENSXETP00000046201,xenopus_tropicalis,8364,Gnathostomata,ortholog_one2one,F7B9P1,Ensembl,ENSXETG00000017515,UP000008143,UP000008143
4319,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,76.2238,86.4802,ENSP00000360502,homo_sapiens,9606,ENSLOCG00000005849,76.2238,...,ENSLOCP00000007066,lepisosteus_oculatus,7918,Gnathostomata,ortholog_one2one,W5MFA7,Ensembl,ENSLOCG00000005849,UP000018468,UP000018468
4320,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,69.1142,81.9347,ENSP00000360502,homo_sapiens,9606,ENSDARG00000100397,69.6009,...,ENSDARP00000138871,danio_rerio,7955,Gnathostomata,ortholog_one2one,A0A0R4IUY2,Ensembl,ENSDARG00000100397,UP000000437,UP000000437
4321,,ENSEMBL_ORTHOLOGUES,ENSG00000095464,67.4825,80.5361,ENSP00000360502,homo_sapiens,9606,ENSORLG00000005135,67.8781,...,ENSORLP00000006480,oryzias_latipes,8090,Gnathostomata,ortholog_one2one,H2LKC9,Ensembl,ENSORLG00000005135,UP000001038,UP000001038


In [45]:
visual_uniprot_merge_genes.type.value_counts()

other_paralog             2391
ortholog_one2one           976
ortholog_one2many          337
ortholog_many2many         326
within_species_paralog     258
alt_allele                  34
Name: type, dtype: int64

## Read in QfO human uniprot ids

In [22]:
human_id_mapping = pd.read_csv('Eukaryota/UP000005640_9606.idmapping', sep='\t', header=None, names=['uniprot_id', 'id_type', 'db_id'])
human_id_mapping.columns = 'source__' + human_id_mapping.columns
print(human_id_mapping.shape)
human_id_mapping.head()

(2668934, 3)


Unnamed: 0,source__uniprot_id,source__id_type,source__db_id
0,P62258-2,UniParc,UPI00001E6021
1,P62258-2,Ensembl_TRS,ENST00000571732
2,P62258-2,Ensembl_PRO,ENSP00000461762
3,P62258-2,Ensembl_TRS,ENST00000616643
4,P62258-2,Ensembl_PRO,ENSP00000481059


In [76]:
visual_uniprot_merge_proteins.head()

Unnamed: 0,dn_ds,method_link_type,source__id,source__perc_id,source__perc_pos,source__protein_id,source__species,source__taxon_id,target__id,target__perc_id,...,target__protein_id,target__species,target__taxon_id,taxonomy_level,type,target__uniprot_id,target__id_type,target__db_id,target__species_id,target__taxa_id
0,,ENSEMBL_PARALOGUES,ENSG00000198515,50.5929,62.8458,ENSP00000384264,homo_sapiens,9606,ENSG00000183862,57.8313,...,ENSP00000328478,homo_sapiens,9606,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640
1,,ENSEMBL_PARALOGUES,ENSG00000170289,22.2497,37.5773,ENSP00000316605,homo_sapiens,9606,ENSG00000183862,27.1084,...,ENSP00000328478,homo_sapiens,9606,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640
2,,ENSEMBL_PARALOGUES,ENSG00000070729,14.948,26.0592,ENSP00000251102,homo_sapiens,9606,ENSG00000183862,28.1627,...,ENSP00000328478,homo_sapiens,9606,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640
3,,ENSEMBL_PARALOGUES,ENSG00000198515,50.5929,62.8458,ENSP00000384264,homo_sapiens,9606,ENSG00000183862,57.8313,...,ENSP00000328478,homo_sapiens,9606,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640
4,,ENSEMBL_PARALOGUES,ENSG00000144191,59.2219,72.6225,ENSP00000272602,homo_sapiens,9606,ENSG00000183862,61.8976,...,ENSP00000328478,homo_sapiens,9606,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640


In [77]:
human_id_mapping.head()

Unnamed: 0,source__uniprot_id,source__id_type,source__db_id
0,P62258-2,UniParc,UPI00001E6021
1,P62258-2,Ensembl_TRS,ENST00000571732
2,P62258-2,Ensembl_PRO,ENSP00000461762
3,P62258-2,Ensembl_TRS,ENST00000616643
4,P62258-2,Ensembl_PRO,ENSP00000481059


## Merge  with human id mapping

In [78]:
%%time 
visual_uniprot_merge_proteins_with_human = visual_uniprot_merge_proteins.merge(
    human_id_mapping, left_on='source__protein_id', right_on='source__db_id', how='outer')
visual_uniprot_merge_proteins_with_human.columns = visual_uniprot_merge_proteins_with_human.columns.str.replace("source__", 'human__')
print(visual_uniprot_merge_proteins_with_human.shape)
visual_uniprot_merge_proteins_with_human.query('type == "ortholog_one2one"').head()

(2670214, 24)
CPU times: user 2.31 s, sys: 416 ms, total: 2.72 s
Wall time: 2.72 s


Unnamed: 0,dn_ds,method_link_type,human__id,human__perc_id,human__perc_pos,human__protein_id,human__species,human__taxon_id,target__id,target__perc_id,...,taxonomy_level,type,target__uniprot_id,target__id_type,target__db_id,target__species_id,target__taxa_id,human__uniprot_id,human__id_type,human__db_id
27,0.35714,ENSEMBL_ORTHOLOGUES,ENSG00000198515,95.2569,95.5204,ENSP00000384264,homo_sapiens,9606.0,ENSGGOG00000013840,99.3132,...,Homininae,ortholog_one2one,G3RDA4,Ensembl_PRO,ENSGGOP00000013502,UP000001519,UP000001519,P29973-2,Ensembl_PRO,ENSP00000384264
28,0.35714,ENSEMBL_ORTHOLOGUES,ENSG00000198515,95.2569,95.5204,ENSP00000384264,homo_sapiens,9606.0,ENSGGOG00000013840,99.3132,...,Homininae,ortholog_one2one,G3RDA4,Ensembl_PRO,ENSGGOP00000013502,UP000001519,UP000001519,P29973-2,Ensembl_PRO,ENSP00000384264
29,0.35714,ENSEMBL_ORTHOLOGUES,ENSG00000198515,95.2569,95.5204,ENSP00000384264,homo_sapiens,9606.0,ENSGGOG00000013840,99.3132,...,Homininae,ortholog_one2one,G3RDA4,Ensembl_PRO,ENSGGOP00000013502,UP000001519,UP000001519,P29973-2,Ensembl_PRO,ENSP00000384264
36,0.09675,ENSEMBL_ORTHOLOGUES,ENSG00000198515,81.1594,85.639,ENSP00000384264,homo_sapiens,9606.0,ENSMUSG00000067220,90.0585,...,Euarchontoglires,ortholog_one2one,P29974,Ensembl_PRO,ENSMUSP00000084464,UP000000589,UP000000589,P29973-2,Ensembl_PRO,ENSP00000384264
37,0.09675,ENSEMBL_ORTHOLOGUES,ENSG00000198515,81.1594,85.639,ENSP00000384264,homo_sapiens,9606.0,ENSMUSG00000067220,90.0585,...,Euarchontoglires,ortholog_one2one,P29974,Ensembl_PRO,ENSMUSP00000084464,UP000000589,UP000000589,P29973-2,Ensembl_PRO,ENSP00000384264


## Spot check known orthologs

```
tr|W5NNY8|W5NNY8_LEPOC Rhodopsin OS=Lepisosteu...	sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens...
```

In [83]:
# Lepisosteus oculatus (Spotted gar)
spotted_gar_rhodopsin = 'W5NNY8'

human_rhodopsin = 'P08100'

In [88]:
human_id_mapping.query('source__uniprot_id == @human_rhodopsin & source__id_type == "Ensembl_PRO"')

Unnamed: 0,source__uniprot_id,source__id_type,source__db_id
1572356,P08100,Ensembl_PRO,ENSP00000296271


In [85]:
id_mapping.head()

Unnamed: 0,uniprot_id,id_type,db_id,species_id,taxa_id
0,Q5PRD0,UniProtKB-ID,143BA_DANRE,UP000000437,UP000000437
1,Q5PRD0,Gene_Name,ywhaba,UP000000437,UP000000437
2,Q5PRD0,Gene_Synonym,ywhab1,UP000000437,UP000000437
3,Q5PRD0,Gene_ORFName,wu:fb80c08,UP000000437,UP000000437
4,Q5PRD0,GI,82592598,UP000000437,UP000000437


In [86]:
id_mapping.query('uniprot_id == @spotted_gar_rhodopsin')

Unnamed: 0,uniprot_id,id_type,db_id,species_id,taxa_id
19284098,W5NNY8,UniProtKB-ID,W5NNY8_LEPOC,UP000018468,UP000018468
19284099,W5NNY8,GI,573885637,UP000018468,UP000018468
19284100,W5NNY8,UniRef100,UniRef100_W5NNY8,UP000018468,UP000018468
19284101,W5NNY8,UniRef90,UniRef90_W5NNY8,UP000018468,UP000018468
19284102,W5NNY8,UniRef50,UniRef50_P35359,UP000018468,UP000018468
19284103,W5NNY8,UniParc,UPI0003CD90EB,UP000018468,UP000018468
19284104,W5NNY8,EMBL,AHAT01020702,UP000018468,UP000018468
19284105,W5NNY8,EMBL-CDS,-,UP000018468,UP000018468
19284106,W5NNY8,NCBI_TaxID,7918,UP000018468,UP000018468
19284107,W5NNY8,RefSeq,XP_006630688.1,UP000018468,UP000018468


In [91]:
visual.query('source__protein_id == "ENSP00000296271" & target__protein_id == "ENSLOCP00000022347"')

Unnamed: 0,dn_ds,method_link_type,source__id,source__perc_id,source__perc_pos,source__protein_id,source__species,source__taxon_id,target__id,target__perc_id,target__perc_pos,target__protein_id,target__species,target__taxon_id,taxonomy_level,type
10276,,ENSEMBL_ORTHOLOGUES,ENSG00000163914,79.5977,91.954,ENSP00000296271,homo_sapiens,9606,ENSLOCG00000018246,78.2486,90.3955,ENSLOCP00000022347,lepisosteus_oculatus,7918,Gnathostomata,ortholog_one2many


In [94]:
visual_uniprot_merge_proteins_with_human.query('human__protein_id == "ENSP00000296271" & target__protein_id == "ENSLOCP00000022347"')

Unnamed: 0,dn_ds,method_link_type,human__id,human__perc_id,human__perc_pos,human__protein_id,human__species,human__taxon_id,target__id,target__perc_id,...,taxonomy_level,type,target__uniprot_id,target__id_type,target__db_id,target__species_id,target__taxa_id,human__uniprot_id,human__id_type,human__db_id
1234,,ENSEMBL_ORTHOLOGUES,ENSG00000163914,79.5977,91.954,ENSP00000296271,homo_sapiens,9606.0,ENSLOCG00000018246,78.2486,...,Gnathostomata,ortholog_one2many,W5NNY8,Ensembl_PRO,ENSLOCP00000022347,UP000018468,UP000018468,P08100,Ensembl_PRO,ENSP00000296271


In [80]:
visual_uniprot_merge_proteins_with_human.type.value_counts()

ortholog_one2one          552
other_paralog             365
ortholog_one2many         218
ortholog_many2many         95
within_species_paralog     63
alt_allele                 10
Name: type, dtype: int64

## Write merged TFs to disk

In [81]:
pwd

'/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019'

In [82]:
%time visual_uniprot_merge_proteins_with_human.to_csv('opisthokont_not_human_visual_transduction_ensembl_compara_merged_uniprot.csv.gz', index=False)
%time visual_uniprot_merge_proteins_with_human.to_parquet('opisthokont_not_human_visual_transduction_ensembl_compara_merged_uniprot.parquet', index=False)

CPU times: user 27.7 s, sys: 306 ms, total: 28 s
Wall time: 28.2 s
CPU times: user 2.35 s, sys: 184 ms, total: 2.53 s
Wall time: 2.59 s


## Make set variable for quick membership evalution

In [51]:
visual_orthologs = set(visual_uniprot_merge_proteins_with_human.target__uniprot_id)

### Prove that the set `tf_orthologs` is faster

In [52]:
%timeit 'Q7Z761' in visual_orthologs

26.8 ns ± 0.103 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [53]:
%timeit 'Q7Z761' in visual_uniprot_merge_proteins_with_human.target__uniprot_id

7.43 µs ± 57.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


#### Yep, sets are 3 orders of magnitude faster!

# Read non-human proteins and subset if they are an ortholog of a TF

## Make outdir

In [54]:
# ls Eukaryota/

In [55]:
not_human_outdir = 'Eukaryota/not-human-visual-transduction-fastas/'
! mkdir $not_human_outdir


mkdir: cannot create directory ‘Eukaryota/not-human-visual-transduction-fastas/’: File exists


## How much compute is this?

Number of human transcription factor proteins in the quest for orthologs database


In [56]:
visual_uniprot_merge_proteins_with_human.human__uniprot_id.nunique()

95907

In [57]:
n_human_qfo = 93

In [58]:
visual_uniprot_merge_proteins_with_human.head()

Unnamed: 0,dn_ds,method_link_type,human__id,human__perc_id,human__perc_pos,human__protein_id,human__species,human__taxon_id,target__id,target__perc_id,...,taxonomy_level,type,target__uniprot_id,target__id_type,target__db_id,target__species_id,target__taxa_id,human__uniprot_id,human__id_type,human__db_id
0,,ENSEMBL_PARALOGUES,ENSG00000198515,50.5929,62.8458,ENSP00000384264,homo_sapiens,9606.0,ENSG00000183862,57.8313,...,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640,Q16280,Ensembl_PRO,ENSP00000328478
1,,ENSEMBL_PARALOGUES,ENSG00000170289,22.2497,37.5773,ENSP00000316605,homo_sapiens,9606.0,ENSG00000183862,27.1084,...,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640,Q16280,Ensembl_PRO,ENSP00000328478
2,,ENSEMBL_PARALOGUES,ENSG00000070729,14.948,26.0592,ENSP00000251102,homo_sapiens,9606.0,ENSG00000183862,28.1627,...,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640,Q16280,Ensembl_PRO,ENSP00000328478
3,,ENSEMBL_PARALOGUES,ENSG00000198515,50.5929,62.8458,ENSP00000384264,homo_sapiens,9606.0,ENSG00000183862,57.8313,...,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640,Q16280,Ensembl_PRO,ENSP00000328478
4,,ENSEMBL_PARALOGUES,ENSG00000144191,59.2219,72.6225,ENSP00000272602,homo_sapiens,9606.0,ENSG00000183862,61.8976,...,Bilateria,other_paralog,Q16280,Ensembl_PRO,ENSP00000328478,UP000005640,UP000005640,Q16280,Ensembl_PRO,ENSP00000328478


In [59]:
visual_uniprot_merge_proteins_with_human.type.value_counts()

ortholog_one2one          552
other_paralog             365
ortholog_one2many         218
ortholog_many2many         95
within_species_paralog     63
alt_allele                 10
Name: type, dtype: int64

In [64]:
visual_uniprot_merge_proteins_with_human.query('target__species != "homo_sapiens"').target__uniprot_id.nunique()

344

In [65]:
n_not_human_qfo = 344

In [66]:
n_human_qfo * n_not_human_qfo * 0.0006 / 60 / 60

0.005332


### Whoa so this should take less than an hour?

## Read in protein fastas with screed

In [68]:


for filename in iglob('Eukaryota/not-human-protein-fastas/*.fasta'):
    tf_records = []

    basename = os.path.basename(filename)
    with screed.open(filename) as records:
        for record in records:
            name = record['name']
            record_id = name.split()[0]
            uniprot_id = record_id.split('|')[1]
            if uniprot_id in visual_orthologs:
                tf_records.append(record)
                
    if len(tf_records) > 0:
        print(filename)
        print(f"\tlen(tf_records): {len(tf_records)}")
        with open(f'{not_human_outdir}/{basename}', 'w') as f:
            for record in tf_records:
                f.write(">{name}\n{sequence}\n".format(**record))


Eukaryota/not-human-protein-fastas/UP000000437_7955_additional.fasta
	len(tf_records): 2
Eukaryota/not-human-protein-fastas/UP000008143_8364.fasta
	len(tf_records): 7
Eukaryota/not-human-protein-fastas/UP000001038_8090.fasta
	len(tf_records): 35
Eukaryota/not-human-protein-fastas/UP000002280_13616.fasta
	len(tf_records): 21
Eukaryota/not-human-protein-fastas/UP000000539_9031_additional.fasta
	len(tf_records): 1
Eukaryota/not-human-protein-fastas/UP000000803_7227_additional.fasta
	len(tf_records): 4
Eukaryota/not-human-protein-fastas/UP000018468_7918.fasta
	len(tf_records): 26
Eukaryota/not-human-protein-fastas/UP000002254_9615.fasta
	len(tf_records): 22
Eukaryota/not-human-protein-fastas/UP000001940_6239.fasta
	len(tf_records): 1
Eukaryota/not-human-protein-fastas/UP000000437_7955.fasta
	len(tf_records): 38
Eukaryota/not-human-protein-fastas/UP000002311_559292.fasta
	len(tf_records): 1
Eukaryota/not-human-protein-fastas/UP000000589_10090.fasta
	len(tf_records): 23
Eukaryota/not-human-p

# Script to run

In [69]:
ll /mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019/Eukaryota/human-visual-transduction-fastas/

total 32
-rw-r--r-- 1 olga 30517 Jan 15 10:58 human_visual_transduction_proteins.fasta


In [70]:
ll /mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019/Eukaryota/not-human-visual-transduction-fastas/ | head

total 238
-rw-r--r-- 1 olga   778 Jan 15 18:25 UP000000437_7955_additional.fasta
-rw-r--r-- 1 olga 18778 Jan 15 18:25 UP000000437_7955.fasta
-rw-r--r-- 1 olga   442 Jan 15 18:25 UP000000539_9031_additional.fasta
-rw-r--r-- 1 olga  9789 Jan 15 18:25 UP000000539_9031.fasta
-rw-r--r-- 1 olga  2463 Jan 15 18:25 UP000000589_10090_additional.fasta
-rw-r--r-- 1 olga 11751 Jan 15 18:25 UP000000589_10090.fasta
-rw-r--r-- 1 olga  2391 Jan 15 18:25 UP000000803_7227_additional.fasta
-rw-r--r-- 1 olga  5988 Jan 15 18:25 UP000000803_7227.fasta
-rw-r--r-- 1 olga 18152 Jan 15 18:25 UP000001038_8090.fasta


In [64]:
%%file qfo_human_vs_opisthokont_tfs.sh
#!/bin/bash
OUTDIR=$HOME/data_sm/kmer-hashing/quest-for-orthologs/analysis/2019/visual-transduction/
mkdir -p $OUTDIR/intermediates
cd $OUTDIR/intermediates

PARQUET=$OUTDIR/qfo-eukaryota-visual-transduction-protein.parquet

EUKARYOTA=/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019/Eukaryota
HUMAN=$EUKARYOTA/human-visual-transduction-fastas/human_visual_transduction_proteins.fasta
NOT_HUMAN=$EUKARYOTA/not-human-visual-transduction-fastas/

conda activate khtools--encodings--compare-cli


time khtools compare-kmers \
    --processes 120 \
    --ksize-min 3 \
    --ksize-max 45 \
    --parquet $PARQUET \
    --intermediate-parquet \
    --fastas2 $HUMAN \
    $NOT_HUMAN/* | tee khtools_compare-kmers.log

Writing qfo_human_vs_opisthokont_tfs.sh


In [65]:
pwd

'/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019'

## Time estimation

taking ~1000 seconds per non-human sequence

In [70]:
n_not_human_qfo_tfs

48947

In [69]:
n_not_human_qfo_tfs * 1000 / 120 / 60 / 60 / 24

4.720968364197531

Okay, so this will take ~4.7 days to compute running on `lrrr`