In [26]:
from glob import iglob
import os
from time import sleep


from joblib import Parallel, delayed
import pandas as pd
import screed
import seaborn as sns
from tqdm import tqdm

In [2]:
cd ~/data_sm/kmer-hashing/quest-for-orthologs/analysis/2019/

/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/analysis/2019


In [3]:
ls -lha

total 6.3M
drwxr-xr-x 3 olga czb 4.0K Jan  8 07:43 [0m[01;34m.[0m/
drwxr-xr-x 3 olga czb 4.0K Dec 25 19:12 [01;34m..[0m/
-rw-r--r-- 1 olga czb 6.2M Sep  6  2018 [01;31mGO.83ebff415cfec35b9ae1888e.tsv.gz[0m
drwxr-xr-x 2 olga czb 128K Jan  8 07:42 [01;34mintermediates[0m/


# Read transcription factors 

In [4]:
# Scrape the Wikipedia list of human transcription factors. read_html returns
# a list of the tables parsed from the page; only the first one is kept.
# NOTE(review): this reads a live web page, so the result can change over time.
tf_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_human_transcription_factors')[0]
print(tf_df.shape)
tf_df.head()

(1639, 5)


Unnamed: 0,Gene,ID,DBD,Motif status (Feb 2018)(Link to human TFs annotation),IUPAC consensus (from selected PWM)
0,AC008770.3,ENSG00000267179,C2H2 ZF,Likely sequence specific TF according to liter...,
1,AC023509.3,ENSG00000267281,bZIP,Known motif – from protein with 100% identical...,RTGACGTCAY
2,AC092835.1,ENSG00000233757,C2H2 ZF,Likely sequence specific TF according to liter...,
3,AC138696.1,ENSG00000264668,C2H2 ZF,Known motif – from protein with 100% identical...,RYGGAGAGTTAGC
4,ADNP,ENSG00000101126,Homeodomain,Likely sequence specific TF according to liter...,


# Go to Quest for Orthologs fastas

In [5]:
cd /home/olga/data_sm/kmer-hashing/quest-for-orthologs/data/2019

/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019


In [6]:
ls -lha

total 2.6G
drwxr-xr-x 5 olga root 4.0K Jan  7 11:13 [0m[01;34m.[0m/
drwxr-xr-x 3 olga root 4.0K Dec 25 17:48 [01;34m..[0m/
drwxr-xr-x 5 olga czb  4.0K Dec 26 19:44 [01;34mArchaea[0m/
drwxr-xr-x 5 olga czb   16K Dec 26 19:44 [01;34mBacteria[0m/
drwxr-xr-x 8 olga czb   32K Jan  8 08:13 [01;34mEukaryota[0m/
-rw-r--r-- 1 olga czb  2.6G Dec 25 18:46 [01;31mQfO_release_2019_04.tar.gz[0m
-rw-r--r-- 1 olga czb   18K May 10  2019 README
-rw-r--r-- 1 olga czb   12K Jan  8 07:47 species_metadata.csv


## Read species metadata

In [7]:
# Load the per-species metadata table. The path is relative, so this reads
# from the current working directory.
species_metadata = pd.read_csv("species_metadata.csv")
print(species_metadata.shape)
species_metadata.head()

(78, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
1,UP000000798,224324,AQUAE,1553,0,1557,Aquifex aeolicus (strain VF5),4290.0,Aquifex aeolicus,strain VF5
2,UP000006548,3702,ARATH,27475,14123,41920,Arabidopsis thaliana (Mouse-ear cress),1496.0,Arabidopsis thaliana,Mouse-ear cress
3,UP000001570,224308,BACSU,4260,7,4268,Bacillus subtilis (strain 168),4290.0,Bacillus subtilis,strain 168
4,UP000001414,226186,BACTN,4782,0,4823,Bacteroides thetaiotaomicron (strain ATCC 2914...,4290.0,Bacteroides thetaiotaomicron,strain ATCC 29148 / DSM 2079 / NCTC 10582 / E5...


### Subset to opisthokonts

In [8]:
# Cutoff is the estimated opisthokonta divergence time, taken from
# http://timetree.org/; species at or below it are kept.
t = 1105
opisthokonts = species_metadata.loc[species_metadata['divergence_from_human_mya'] <= t]
print(opisthokonts.shape)
opisthokonts.head()

(35, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


In [9]:
opisthokonts.query('scientific_name == "Homo sapiens"')

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
32,UP000005640,9606,HUMAN,21146,74769,96332,Homo sapiens (Human),0.0,Homo sapiens,Human


In [10]:
# Drop the human row so that only the non-human opisthokont species remain
opisthokonts_not_human = opisthokonts.loc[opisthokonts['scientific_name'] != "Homo sapiens"]
print(opisthokonts_not_human.shape)
opisthokonts_not_human.head()

(34, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


# Use ENSEMBL Rest API to get homologous TFs

## Experimenting with ensembl_rest

In [11]:
import ensembl_rest

In [12]:
opisthokonts.head()

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


In [13]:
# Trial homology request for a single human gene ID, passing the tax_id column
# of the opisthokont table as target_taxon
response = ensembl_rest.homology_ensemblgene('ENSG00000143437', target_taxon=opisthokonts.tax_id, cigar_line=False, sequence=None)
# The records of interest sit under the 'data' key of the response
data = response['data']
len(data)

1

In [14]:
from pprint import pprint

In [15]:
# Pull the list of homology records out of the first 'data' entry
homologies = data[0]['homologies']
# Not the last expression of the cell, so this length is never displayed
len(homologies)
# Show one full record to see which fields are available
pprint(homologies[0])

{'dn_ds': None,
 'method_link_type': 'ENSEMBL_ORTHOLOGUES',
 'source': {'align_seq': 'MAATTANPEMTSDVPSLGPAIASGNSGPGIQGGGAI---VQRAIK-RR-PG-LD---FDDDGE----GN-S---KF--LRCDDDQMSNDKERFARSDDEQSSADKERLA--RENHSEIERRRRNKMTAYITELSDMVPTCSALARKPDKLTILRMAVSHMKSL--------RG-----T--G-----NTSTDGSY--KPSF-LTDQELKHLILEAADGFLFIVSCETGRVVYVSDSVTPV---L-NQPQSEWFGSTLYDQVHPDDVDKLREQLSTSE-NALT-----------------------GRILDLK-----T-----G--TVKKEGQQ---------SSMRMC----------MGSRRSFICRMRCGSSSVDPVSVNRLSFVRNRCRNG-LGSVKDG-EP----HFVVVHCTGYIKAW---PPAG-------VSLPD---DD----------------------PE-------A-GQGSKFCLVAIGRLQVTS-SPNCTDMS-NV-----CQPTE--FISRHNIEG--IFTFVD--HRCVATVGYQPQELLGKNIVEFCHPED-------QQLLRDSFQQVVKLKGQVLSVMFRFRSKNQEWLWMRTSSFTFQNP--YSDEI-EYIICTNTNV-----KNSSQEPRPTLSNTIQRPQLGPTANLPLEMGS--GQLA-PRQQQQQTELDMVPGRDGL-ASYNHS-------Q-V-----V-Q-PVTTTGPEHSKPLEKSDGLFAQDRDPRFSEIYHNINADQSKGISSST-VPATQQLFS---QGNTFPPTPRPAENF-R----NSG-----LA-PPVTIVQP-SA-SAGQMLAQI--------S-----RHSNPTQGATPTWTPTTRS-GF-----SAQ----QVAT-QATAKTR---T-SQ--FGVG

In [16]:
# Keys of a homology record that are not wanted as table columns: the long
# alignment strings and the nested 'target'/'source' dicts themselves
ignore_fields = ['align_seq', 'cigar_line', 'target', 'source']

In [17]:
homologies[0]['target']

{'species': 'drosophila_melanogaster',
 'perc_id': 17.1788,
 'protein_id': 'FBpp0310182',
 'id': 'FBgn0002723',
 'cigar_line': '36D16M2D3M5D16M31D63MD19M3D10M4D3MD12MD17MDM3D112MD10M12DM10D2M2DM3D11M2D14MD12MD91M4D48M4D4M3D18M2D14MD26M18D7M3D7M9DM4D22M11D9M5DM12DM2D6M5DM2D4M10D22M8D11M2D24M7D18M3DM6D11M12D11M5D6M4D3MD4M7D2M7D3M17D17M43D',
 'align_seq': '------------------------------------MAAPETGNTGSTGSAG--STG-----SGSGSGSGSGSSSDPA-------------------------------NGREARNLAEKQRRDKLNASIQELATMVPHAAESSRRLDKTAVLRFATHGLRLQYVFGKSASR-RRKKTGLKGTGMSASPVGD---LPNPSLHLTD----TLM-QLLDCCFLTLTC-SGQIVLVSTSVEQLLGH-C---QSDLYGQNLLQITHPDDQDLLRQQLIPRDIETLFYQHQHHQQQGHNPQQHSTSTSASTSGSDLEEEEMETEEHRLGRQQGEADDDEDHPYNRRTPSPRRMAHLATIDDRLRM-DRRCFTVRLA------------R----------AS--T---RAEATRHYERV--KIDGCFRRSDSSLT-GGAAANYPIVSQ-LIRRSRNNNMLAAAAAVAAEAATVPPQHDAIAQAALHGISGNDIVLVAMARVLREERPPEETEGTVGLTIYRQPEPYQLEYHTRHLIDGSI----IDCDQRIGLVAGYMKDEVRNLSPFCFMHLDDVRWVIVALRQMYDCNSD----YGES---CYRLLSRNGRFIYLHTKG--FLEVDRGSNKVHSF-LCVNTLLDEEAG

In [18]:
homology = homologies[0]

In [19]:
def single_homology_to_series(homology, ignore_fields=('align_seq', 'cigar_line', 'target', 'source')):
    """Flatten one homology record into a single-level pandas Series.

    Top-level entries are copied as-is. Entries of the nested 'target' and
    'source' dicts are copied under "<parent>__<key>" labels (for example
    "target__species"). Any key listed in ``ignore_fields`` is left out, at
    both levels.

    Parameters
    ----------
    homology : dict
        One homology record. If it has 'target' or 'source' keys, their
        values must be dicts.
    ignore_fields : collection of str, optional
        Keys to leave out. The default matches the notebook-level
        ``ignore_fields`` list, so the function no longer depends on that
        cell having been run.

    Returns
    -------
    pandas.Series
        Flattened record, in the iteration order of ``homology``.
    """
    homology_for_series = {}
    for key, value in homology.items():
        if key not in ignore_fields:
            homology_for_series[key] = value
        # 'target' and 'source' are always expanded, even though the dicts
        # themselves are normally excluded by ignore_fields
        if key in ('target', 'source'):
            for k, v in value.items():
                if k not in ignore_fields:
                    homology_for_series[f"{key}__{k}"] = v

    return pd.Series(homology_for_series)

In [20]:
# One row per homology record, using the flattened Series as the row
pd.DataFrame([single_homology_to_series(record) for record in homologies])

Unnamed: 0,dn_ds,target__species,target__perc_id,target__protein_id,target__id,target__taxon_id,target__perc_pos,taxonomy_level,method_link_type,type,source__id,source__protein_id,source__perc_id,source__species,source__perc_pos,source__taxon_id
0,,drosophila_melanogaster,17.1788,FBpp0310182,FBgn0002723,7227,30.4469,Bilateria,ENSEMBL_ORTHOLOGUES,ortholog_many2many,ENSG00000143437,ENSP00000351407,15.58940,homo_sapiens,27.6299,9606
1,,drosophila_melanogaster,13.6601,FBpp0292296,FBgn0261703,7227,24.5047,Bilateria,ENSEMBL_ORTHOLOGUES,ortholog_many2many,ENSG00000143437,ENSP00000351407,16.60330,homo_sapiens,29.7845,9606
2,,ciona_savignyi,49.8175,ENSCSAVP00000018549,ENSCSAVG00000010893,51511,66.7883,Bilateria,ENSEMBL_ORTHOLOGUES,ortholog_one2many,ENSG00000143437,ENSP00000351407,34.60080,homo_sapiens,46.3878,9606
3,,drosophila_melanogaster,48.5981,FBpp0081483,FBgn0264075,7227,62.6168,Bilateria,ENSEMBL_ORTHOLOGUES,ortholog_many2many,ENSG00000143437,ENSP00000351407,39.54370,homo_sapiens,50.9506,9606
4,,ciona_intestinalis,50.8257,ENSCINP00000016558,ENSCING00000008102,7719,67.8899,Bilateria,ENSEMBL_ORTHOLOGUES,ortholog_one2many,ENSG00000143437,ENSP00000351407,35.10770,homo_sapiens,46.8948,9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,,kryptolebias_marmoratus,55.9740,ENSKMAP00000001530,ENSKMAG00000001221,37003,69.7403,Euteleostomi,ENSEMBL_ORTHOLOGUES,ortholog_one2one,ENSG00000143437,ENSP00000351407,54.62610,homo_sapiens,68.0608,9606
207,,eptatretus_burgeri,45.6140,ENSEBUP00000019653,ENSEBUG00000012206,7764,63.7427,Vertebrata,ENSEMBL_ORTHOLOGUES,ortholog_one2many,ENSG00000143437,ENSP00000351407,9.88593,homo_sapiens,13.8150,9606
208,,eptatretus_burgeri,61.2360,ENSEBUP00000009538,ENSEBUG00000006087,7764,73.4551,Vertebrata,ENSEMBL_ORTHOLOGUES,ortholog_one2many,ENSG00000143437,ENSP00000351407,55.25980,homo_sapiens,66.2864,9606
209,,eptatretus_burgeri,64.0379,ENSEBUP00000023036,ENSEBUG00000014190,7764,75.0789,Vertebrata,ENSEMBL_ORTHOLOGUES,ortholog_one2many,ENSG00000143437,ENSP00000351407,25.72880,homo_sapiens,30.1648,9606


## Actually run it

### Separate transcription factors with ENSG IDs from those without

In [21]:
# Boolean mask: True where the ID column starts with "ENSG"
ensg_rows = tf_df['ID'].str.startswith("ENSG")
# Rows whose ID does not start with "ENSG"
tfs_non_ensg = tf_df[~ensg_rows]
print(tfs_non_ensg.shape)
tfs_non_ensg.head()

(2, 5)


Unnamed: 0,Gene,ID,DBD,Motif status (Feb 2018)(Link to human TFs annotation),IUPAC consensus (from selected PWM)
165,DUX1,DUX1_HUMAN,Homeodomain,Known motif – In vivo/Misc source [166],ATAATCTGATTAT
166,DUX3,DUX3_HUMAN,Homeodomain,Known motif – In vivo/Misc source [167],TTAATTAAATTAA


In [22]:
# Rows whose ID starts with "ENSG"
tfs_ensg = tf_df[ensg_rows]
print(tfs_ensg.shape)
tfs_ensg.head()

(1637, 5)


Unnamed: 0,Gene,ID,DBD,Motif status (Feb 2018)(Link to human TFs annotation),IUPAC consensus (from selected PWM)
0,AC008770.3,ENSG00000267179,C2H2 ZF,Likely sequence specific TF according to liter...,
1,AC023509.3,ENSG00000267281,bZIP,Known motif – from protein with 100% identical...,RTGACGTCAY
2,AC092835.1,ENSG00000233757,C2H2 ZF,Likely sequence specific TF according to liter...,
3,AC138696.1,ENSG00000264668,C2H2 ZF,Known motif – from protein with 100% identical...,RYGGAGAGTTAGC
4,ADNP,ENSG00000101126,Homeodomain,Likely sequence specific TF according to liter...,


In [23]:
from urllib.error import HTTPError

In [None]:

# Fetch homologies for every ENSG-identified transcription factor, one REST
# request per gene, then stack the per-gene tables into one DataFrame.

# Upper bound on requests per gene, so a failing server cannot loop forever
max_attempts = 5
# Loop-invariant: looked up once rather than on every request
target_taxa = opisthokonts_not_human.tax_id

dfs = []
# IDs whose response carried no 'data' entries; kept for later inspection
ensembl_ids_without_data = []

for ensembl_id in tqdm(tfs_ensg.ID):
    # Pause between genes to go easy on the server
    sleep(1)
    for attempt in range(1, max_attempts + 1):
        try:
            response = ensembl_rest.homology_ensemblgene(ensembl_id, target_taxon=target_taxa, cigar_line=False, sequence=None)
            break
        except HTTPError:
            # Probably a 503 error, meaning the server is busy. Give up only
            # after the last attempt, so one bad stretch does not discard
            # everything collected so far.
            if attempt == max_attempts:
                raise
            # Wait longer after each failure: 2, 4, 6, ... seconds
            sleep(2 * attempt)

    data = response['data']
    if not data:
        # data[0] would raise IndexError on an empty payload
        ensembl_ids_without_data.append(ensembl_id)
        continue
    homologies = data[0]['homologies']
    df = pd.DataFrame(map(single_homology_to_series, homologies))
    dfs.append(df)
tfs_opsithokonts_ensembl = pd.concat(dfs, ignore_index=True)
print(tfs_opsithokonts_ensembl.shape)
tfs_opsithokonts_ensembl.head()


  0%|          | 0/1637 [00:00<?, ?it/s][A
  0%|          | 1/1637 [00:08<3:45:22,  8.27s/it][A
  0%|          | 2/1637 [00:15<3:40:47,  8.10s/it][A
  0%|          | 3/1637 [00:21<3:15:34,  7.18s/it][A
  0%|          | 4/1637 [00:28<3:17:04,  7.24s/it][A
  0%|          | 5/1637 [00:37<3:28:09,  7.65s/it][A
  0%|          | 6/1637 [00:42<3:07:36,  6.90s/it][A
  0%|          | 7/1637 [00:47<2:55:26,  6.46s/it][A
  0%|          | 8/1637 [00:53<2:48:33,  6.21s/it][A
  1%|          | 9/1637 [01:00<3:01:02,  6.67s/it][A
  1%|          | 10/1637 [01:06<2:55:16,  6.46s/it][A
  1%|          | 11/1637 [01:13<2:52:50,  6.38s/it][A
  1%|          | 12/1637 [01:18<2:48:08,  6.21s/it][A
  1%|          | 13/1637 [01:23<2:31:23,  5.59s/it][A
  1%|          | 14/1637 [01:26<2:12:23,  4.89s/it][A
  1%|          | 15/1637 [01:30<2:09:05,  4.78s/it][A
  1%|          | 16/1637 [01:36<2:14:51,  4.99s/it][A
  1%|          | 17/1637 [01:41<2:13:00,  4.93s/it][A
  1%|          | 18/1637 [01:

In [28]:
tfs_opsithokonts_ensembl

Unnamed: 0,dn_ds,method_link_type,source__id,source__perc_id,source__perc_pos,source__protein_id,source__species,source__taxon_id,target__id,target__perc_id,target__perc_pos,target__protein_id,target__species,target__taxon_id,taxonomy_level,type
0,,ENSEMBL_PARALOGUES,ENSG00000267179,24.87920,37.19810,ENSP00000467286,homo_sapiens,9606,ENSG00000175787,17.0813,25.5390,ENSP00000378792,homo_sapiens,9606,Bilateria,other_paralog
1,,ENSEMBL_PARALOGUES,ENSG00000267179,25.60390,37.19810,ENSP00000467286,homo_sapiens,9606,ENSG00000187815,20.1521,29.2776,ENSP00000361791,homo_sapiens,9606,Bilateria,other_paralog
2,0.35552,ENSEMBL_PARALOGUES,ENSG00000267179,26.32850,38.16420,ENSP00000467286,homo_sapiens,9606,ENSG00000172006,20.2602,29.3680,ENSP00000321132,homo_sapiens,9606,Bilateria,other_paralog
3,0.33996,ENSEMBL_PARALOGUES,ENSG00000267179,27.29470,38.88890,ENSP00000467286,homo_sapiens,9606,ENSG00000188629,23.4927,33.4719,ENSP00000415070,homo_sapiens,9606,Bilateria,other_paralog
4,0.18553,ENSEMBL_PARALOGUES,ENSG00000267179,20.28990,25.84540,ENSP00000467286,homo_sapiens,9606,ENSG00000270011,26.1682,33.3333,ENSP00000445323,homo_sapiens,9606,Bilateria,other_paralog
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343148,,ENSEMBL_ORTHOLOGUES,ENSG00000036549,36.43410,52.27020,ENSP00000359837,homo_sapiens,9606,ENSEBUG00000002277,39.2133,56.2575,ENSEBUP00000003097,eptatretus_burgeri,7764,Vertebrata,ortholog_one2one
343149,,ENSEMBL_ORTHOLOGUES,ENSG00000036549,26.91030,33.44410,ENSP00000359837,homo_sapiens,9606,ENSPMAG00000000497,56.6434,70.3963,ENSPMAP00000000552,petromyzon_marinus,7757,Vertebrata,ortholog_one2one
343150,,ENSEMBL_ORTHOLOGUES,ENSG00000036549,28.90370,45.29350,ENSP00000359837,homo_sapiens,9606,ENSCING00000002486,25.1203,39.3648,ENSCINP00000005056,ciona_intestinalis,7719,Chordata,ortholog_one2one
343151,,ENSEMBL_ORTHOLOGUES,ENSG00000036549,7.08749,9.30233,ENSP00000359837,homo_sapiens,9606,ENSCSAVG00000000992,48.4848,63.6364,ENSCSAVP00000001713,ciona_savignyi,51511,Chordata,ortholog_one2one


In [None]:
%debug

### Failed attempt: parallelize fetching orthologs by Ensembl ID

In [None]:
# from khtools.ensembl import get_orthologs

In [None]:
# from time import sleep
# from random import randint

# def get_homologies(ensembl_id):
#     # sleep for  random amount of time so we don't get rate-limited
#     sleep(randint(0, 5))
#     response = ensembl_rest.homology_ensemblgene(ensembl_id, target_taxon=opisthokonts_not_human.tax_id, cigar_line=False, sequence=None)
#     data = response['data']
#     homologies = data[0]['homologies']
#     df = pd.DataFrame(map(single_homology_to_series, homologies))
#     return df

# # Maximum of 15 connections to ENSEMBL rest
# dfs = Parallel(n_jobs=8)(delayed(get_homologies)(ensembl_id) for ensembl_id in tqdm(tfs_ensg.ID))
# tfs_opsithokonts_ensembl = pd.concat(dfs, ignore_index=True)
# print(tfs_opsithokonts_ensembl.shape)
# tfs_opsithokonts_ensembl.head()

In [None]:
%debug

In [None]:
df.shape

In [None]:
pd.concat(dfs, ignore_index=True, sort=False)

# Write to disk!

In [29]:
# tfs_opisthokonts = pd.concat([tfs_opsithokonts_symbol, tfs_opsithokonts_ensembl])

In [33]:
# Save the combined homology table twice, as CSV and as Parquet, both without
# the row index. The paths are relative, so the files land in the current
# working directory.
tfs_opsithokonts_ensembl.to_csv('opisthokont_not_human_transcription_factors_ensembl_compara.csv', index=False)
tfs_opsithokonts_ensembl.to_parquet('opisthokont_not_human_transcription_factors_ensembl_compara.parquet', index=False)

In [32]:
pwd

'/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019'

In [None]:
ls -lha