In [1]:
from glob import iglob
import os

import pandas as pd
import screed
import seaborn as sns
from tqdm import tqdm

In [2]:
cd ~/data_sm/kmer-hashing/quest-for-orthologs/analysis/2019/

/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/analysis/2019


In [3]:
ls -lha

total 6.3M
drwxr-xr-x 3 olga czb 4.0K Jan  8 07:43 ./
drwxr-xr-x 3 olga czb 4.0K Dec 25 19:12 ../
-rw-r--r-- 1 olga czb 6.2M Sep  6  2018 GO.83ebff415cfec35b9ae1888e.tsv.gz
drwxr-xr-x 2 olga czb 128K Jan  8 07:42 intermediates/


# Read transcription factors 

In [4]:
# Parse the HTML tables on the Wikipedia page and keep the first one
tf_df = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_human_transcription_factors'
)[0]
print(tf_df.shape)
tf_df.head()

(1639, 5)


Unnamed: 0,Gene,ID,DBD,Motif status (Feb 2018)(Link to human TFs annotation),IUPAC consensus (from selected PWM)
0,AC008770.3,ENSG00000267179,C2H2 ZF,Likely sequence specific TF according to liter...,
1,AC023509.3,ENSG00000267281,bZIP,Known motif – from protein with 100% identical...,RTGACGTCAY
2,AC092835.1,ENSG00000233757,C2H2 ZF,Likely sequence specific TF according to liter...,
3,AC138696.1,ENSG00000264668,C2H2 ZF,Known motif – from protein with 100% identical...,RYGGAGAGTTAGC
4,ADNP,ENSG00000101126,Homeodomain,Likely sequence specific TF according to liter...,


# Go to Quest for Orthologs fastas

In [5]:
cd /home/olga/data_sm/kmer-hashing/quest-for-orthologs/data/2019

/mnt/data_sm/olga/kmer-hashing/quest-for-orthologs/data/2019


In [6]:
ls -lha

total 2.6G
drwxr-xr-x 5 olga root 4.0K Jan  7 11:13 ./
drwxr-xr-x 3 olga root 4.0K Dec 25 17:48 ../
drwxr-xr-x 5 olga czb  4.0K Dec 26 19:44 Archaea/
drwxr-xr-x 5 olga czb   16K Dec 26 19:44 Bacteria/
drwxr-xr-x 8 olga czb   32K Jan  8 08:13 Eukaryota/
-rw-r--r-- 1 olga czb  2.6G Dec 25 18:46 QfO_release_2019_04.tar.gz
-rw-r--r-- 1 olga czb   18K May 10  2019 README
-rw-r--r-- 1 olga czb   12K Jan  8 07:47 species_metadata.csv


## Read species metadata

In [7]:
# Per-species metadata table, read from the current working directory
species_metadata = pd.read_csv(filepath_or_buffer="species_metadata.csv")
print(species_metadata.shape)
species_metadata.head()

(78, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
1,UP000000798,224324,AQUAE,1553,0,1557,Aquifex aeolicus (strain VF5),4290.0,Aquifex aeolicus,strain VF5
2,UP000006548,3702,ARATH,27475,14123,41920,Arabidopsis thaliana (Mouse-ear cress),1496.0,Arabidopsis thaliana,Mouse-ear cress
3,UP000001570,224308,BACSU,4260,7,4268,Bacillus subtilis (strain 168),4290.0,Bacillus subtilis,strain 168
4,UP000001414,226186,BACTN,4782,0,4823,Bacteroides thetaiotaomicron (strain ATCC 2914...,4290.0,Bacteroides thetaiotaomicron,strain ATCC 29148 / DSM 2079 / NCTC 10582 / E5...


### Subset to opisthokonts

In [8]:
# Opisthokonta divergence-time estimate, looked up on http://timetree.org/
t = 1105
# Keep only the species whose divergence from human does not exceed that estimate
opisthokonts = species_metadata.loc[species_metadata["divergence_from_human_mya"] <= t]
print(opisthokonts.shape)
opisthokonts.head()

(35, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


In [9]:
# Show the human row of the opisthokont subset
opisthokonts.loc[opisthokonts["scientific_name"] == "Homo sapiens"]

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
32,UP000005640,9606,HUMAN,21146,74769,96332,Homo sapiens (Human),0.0,Homo sapiens,Human


In [10]:
# Every opisthokont species except human
opisthokonts_not_human = opisthokonts.loc[opisthokonts["scientific_name"] != "Homo sapiens"]
print(opisthokonts_not_human.shape)
opisthokonts_not_human.head()

(34, 10)


Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


# Use the Ensembl REST API to get homologous TFs

## Experimenting with ensembl_rest

In [11]:
import ensembl_rest

In [12]:
# Reminder of what the opisthokont table looks like (first five rows)
opisthokonts.head(n=5)

Unnamed: 0,proteome_id,tax_id,oscode,n_main_fasta_canonical,n_additional_fasta_isoforms,n_gene2acc,species_name,divergence_from_human_mya,scientific_name,common_name_or_strain
0,UP000007062,7165,ANOGA,12553,971,13619,Anopheles gambiae (African malaria mosquito),797.0,Anopheles gambiae,African malaria mosquito
5,UP000007241,684364,BATDJ,8610,0,8685,Batrachochytrium dendrobatidis (strain JAM81 /...,1105.0,Batrachochytrium dendrobatidis,strain JAM81 / FGSC 10211
6,UP000009136,9913,BOVIN,23774,14534,38438,Bos taurus (Bovine),96.0,Bos taurus,Bovine
8,UP000001554,7739,BRAFL,28542,2,28608,Branchiostoma floridae (Florida lancelet) (Amp...,684.0,Branchiostoma floridae,Florida lancelet
9,UP000001940,6239,CAEEL,19986,8309,28507,Caenorhabditis elegans,797.0,Caenorhabditis elegans,


In [13]:
# Query a single gene first to see what the homology lookup returns
response = ensembl_rest.homology_ensemblgene(
    'ENSG00000143437',
    target_taxon=opisthokonts.tax_id,
    cigar_line=False,
    sequence=None,
)
data = response['data']
len(data)

1

In [14]:
from pprint import pprint

In [15]:
# The homology records sit under the first element of the response's 'data' list
homologies = data[0]['homologies']
# A bare `len(homologies)` in the middle of a cell is evaluated and thrown away,
# so print it to actually see how many homologies came back
print(len(homologies))
# Inspect the structure of one record
pprint(homologies[0])

{'dn_ds': None,
 'method_link_type': 'ENSEMBL_ORTHOLOGUES',
 'source': {'align_seq': 'MAATTANPEMTSDVPSLGPAIASGNSGPGIQGGGAI---VQRAIK-RR-PG-LD---FDDDGE----GN-S---KF--LRCDDDQMSNDKERFARSDDEQSSADKERLA--RENHSEIERRRRNKMTAYITELSDMVPTCSALARKPDKLTILRMAVSHMKSL--------RG-----T--G-----NTSTDGSY--KPSF-LTDQELKHLILEAADGFLFIVSCETGRVVYVSDSVTPV---L-NQPQSEWFGSTLYDQVHPDDVDKLREQLSTSE-NALT-----------------------GRILDLK-----T-----G--TVKKEGQQ---------SSMRMC----------MGSRRSFICRMRCGSSSVDPVSVNRLSFVRNRCRNG-LGSVKDG-EP----HFVVVHCTGYIKAW---PPAG-------VSLPD---DD----------------------PE-------A-GQGSKFCLVAIGRLQVTS-SPNCTDMS-NV-----CQPTE--FISRHNIEG--IFTFVD--HRCVATVGYQPQELLGKNIVEFCHPED-------QQLLRDSFQQVVKLKGQVLSVMFRFRSKNQEWLWMRTSSFTFQNP--YSDEI-EYIICTNTNV-----KNSSQEPRPTLSNTIQRPQLGPTANLPLEMGS--GQLA-PRQQQQQTELDMVPGRDGL-ASYNHS-------Q-V-----V-Q-PVTTTGPEHSKPLEKSDGLFAQDRDPRFSEIYHNINADQSKGISSST-VPATQQLFS---QGNTFPPTPRPAENF-R----NSG-----LA-PPVTIVQP-SA-SAGQMLAQI--------S-----RHSNPTQGATPTWTPTTRS-GF-----SAQ----QVAT-QATAKTR---T-SQ--FGVG

In [16]:
# Keys that are dropped when a homology record is flattened into a Series
ignore_fields = [
    'align_seq',
    'cigar_line',
    'target',
    'source',
]

In [17]:
# Look at the nested 'target' mapping of the first homology record
homologies[0]['target']

{'protein_id': 'FBpp0310182',
 'taxon_id': 7227,
 'id': 'FBgn0002723',
 'perc_id': 17.1788,
 'align_seq': '------------------------------------MAAPETGNTGSTGSAG--STG-----SGSGSGSGSGSSSDPA-------------------------------NGREARNLAEKQRRDKLNASIQELATMVPHAAESSRRLDKTAVLRFATHGLRLQYVFGKSASR-RRKKTGLKGTGMSASPVGD---LPNPSLHLTD----TLM-QLLDCCFLTLTC-SGQIVLVSTSVEQLLGH-C---QSDLYGQNLLQITHPDDQDLLRQQLIPRDIETLFYQHQHHQQQGHNPQQHSTSTSASTSGSDLEEEEMETEEHRLGRQQGEADDDEDHPYNRRTPSPRRMAHLATIDDRLRM-DRRCFTVRLA------------R----------AS--T---RAEATRHYERV--KIDGCFRRSDSSLT-GGAAANYPIVSQ-LIRRSRNNNMLAAAAAVAAEAATVPPQHDAIAQAALHGISGNDIVLVAMARVLREERPPEETEGTVGLTIYRQPEPYQLEYHTRHLIDGSI----IDCDQRIGLVAGYMKDEVRNLSPFCFMHLDDVRWVIVALRQMYDCNSD----YGES---CYRLLSRNGRFIYLHTKG--FLEVDRGSNKVHSF-LCVNTLLDEEAGRQKVQEMKEKFSTI------------------IKAEMPT---QSSSPDL---------P----ASQAPQQLERIVLYLIENLQKS-----------VDSAETVGG-----Q------------G--MESLMD-----D--GYSS----------PANTLTLEELAPSPTPALALVP--------PAPSSVKSSIS--KSVSVVNVTAARKFQQEHQKQRER-------DREQLKERTNSTQGVIRQ---

In [18]:
# Keep the first homology record around as a sample for the flattening function defined next
homology = homologies[0]

In [19]:
def single_homology_to_series(homology, skip_fields=None):
    """Flatten one homology record into a single-level ``pd.Series``.

    Top-level entries are copied unchanged. Entries of the nested ``'target'``
    and ``'source'`` mappings are copied under ``"<parent>__<child>"`` labels
    (for example ``target__perc_id``). Any key, top-level or nested, that
    appears in the skip list is left out.

    Parameters
    ----------
    homology : dict
        One element of the ``'homologies'`` list; its ``'target'`` and
        ``'source'`` values (when present) must be mappings.
    skip_fields : collection of str, optional
        Keys to drop. When omitted, the notebook-level ``ignore_fields`` list
        is used, which is the behaviour of the original one-argument call.

    Returns
    -------
    pd.Series
        One value per kept key, in the order the keys were encountered.
    """
    if skip_fields is None:
        # Fall back to the notebook global so existing calls keep working
        skip_fields = ignore_fields

    flattened = {}
    for key, value in homology.items():
        if key not in skip_fields:
            flattened[key] = value
        # Nested mappings are expanded regardless of whether the parent key itself is kept
        if key in ('target', 'source'):
            for subkey, subvalue in value.items():
                if subkey not in skip_fields:
                    flattened[f"{key}__{subkey}"] = subvalue

    return pd.Series(flattened)

In [20]:
# One row per homology record of the sample gene
pd.DataFrame([single_homology_to_series(record) for record in homologies])

Unnamed: 0,dn_ds,target__protein_id,target__taxon_id,target__id,target__perc_id,target__perc_pos,target__species,source__species,source__perc_pos,source__perc_id,source__taxon_id,source__id,source__protein_id,method_link_type,taxonomy_level,type
0,,FBpp0310182,7227,FBgn0002723,17.1788,30.4469,drosophila_melanogaster,homo_sapiens,27.6299,15.58940,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Bilateria,ortholog_many2many
1,,FBpp0292296,7227,FBgn0261703,13.6601,24.5047,drosophila_melanogaster,homo_sapiens,29.7845,16.60330,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Bilateria,ortholog_many2many
2,,ENSCSAVP00000018549,51511,ENSCSAVG00000010893,49.8175,66.7883,ciona_savignyi,homo_sapiens,46.3878,34.60080,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Bilateria,ortholog_one2many
3,,FBpp0081483,7227,FBgn0264075,48.5981,62.6168,drosophila_melanogaster,homo_sapiens,50.9506,39.54370,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Bilateria,ortholog_many2many
4,,ENSCINP00000016558,7719,ENSCING00000008102,50.8257,67.8899,ciona_intestinalis,homo_sapiens,46.8948,35.10770,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Bilateria,ortholog_one2many
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,,ENSKMAP00000001530,37003,ENSKMAG00000001221,55.9740,69.7403,kryptolebias_marmoratus,homo_sapiens,68.0608,54.62610,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Euteleostomi,ortholog_one2one
207,,ENSEBUP00000019653,7764,ENSEBUG00000012206,45.6140,63.7427,eptatretus_burgeri,homo_sapiens,13.8150,9.88593,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Vertebrata,ortholog_one2many
208,,ENSEBUP00000009538,7764,ENSEBUG00000006087,61.2360,73.4551,eptatretus_burgeri,homo_sapiens,66.2864,55.25980,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Vertebrata,ortholog_one2many
209,,ENSEBUP00000023036,7764,ENSEBUG00000014190,64.0379,75.0789,eptatretus_burgeri,homo_sapiens,30.1648,25.72880,9606,ENSG00000143437,ENSP00000351407,ENSEMBL_ORTHOLOGUES,Vertebrata,ortholog_one2many


## Actually run it

In [None]:
# Collect one DataFrame of flattened homology records per human transcription factor.
# NOTE(review): the single-gene response shown earlier in this notebook still contains
# 'align_seq' even though `sequence=None` was passed, so the optional parameters may not
# be reaching the REST API as intended -- confirm how ensembl_rest forwards them.
dfs = []
failed_ids = {}  # Ensembl gene ID -> reason the gene was skipped

for ensembl_id in tqdm(tf_df.ID):
    try:
        response = ensembl_rest.homology_ensemblgene(ensembl_id, target_taxon=opisthokonts_not_human.tax_id, cigar_line=False, sequence=None)
    except Exception as error:
        # One bad lookup should not throw away a long run of requests;
        # record it and move on (KeyboardInterrupt is not caught here)
        failed_ids[ensembl_id] = repr(error)
        continue
    data = response['data']
    if not data:
        # Nothing to index into; `data[0]` would raise IndexError
        failed_ids[ensembl_id] = 'empty "data" in response'
        continue
    homologies = data[0]['homologies']
    df = pd.DataFrame(map(single_homology_to_series, homologies))
    dfs.append(df)

if failed_ids:
    print(f"Skipped {len(failed_ids)} of {len(tf_df.ID)} gene IDs; details are in `failed_ids`")

# sort=False keeps the column order as first seen instead of sorting non-aligned columns
tfs_opsithokonts = pd.concat(dfs, ignore_index=True, sort=False)
print(tfs_opsithokonts.shape)
tfs_opsithokonts.head()

  1%|          | 10/1639 [00:38<1:54:05,  4.20s/it]

In [None]:
# Shape of `df` as left by the loop (the most recently built per-gene table)
df.shape

In [None]:
# Stack everything collected so far into one table, without sorting the columns
pd.concat(objs=dfs, sort=False, ignore_index=True)

In [None]:
pwd

In [None]:
# Write the combined homology table to a CSV in the current working directory
tfs_opsithokonts.to_csv(path_or_buf='opisthokont_not_human_transcription_factors.csv')