In [1]:
import time 
import numpy as np
import pandas as pd

from tqdm import tqdm
from Bio import Entrez
from bs4 import BeautifulSoup
# Contact address Biopython attaches to Entrez (NCBI) requests; set it to your
# own address before running any query from this notebook.
Entrez.email = 'savandara.besse@gmail.com' #Use your own email

In [2]:
from IPython.display import display, HTML

In [3]:
# Load the tissue/stage-binned table that also carries the oskar search result.
oskar_hits_tissue_mapping = pd.read_csv(
    '../Data/02_Oskar_analyses/2.12/tissue_stage_binned_with_oskar_results.csv'
)

In [5]:
# Number of rows in the loaded table.
oskar_hits_tissue_mapping.shape[0]

2224

In [6]:
#### Number of distinct bioprojects in the table
np.unique(oskar_hits_tissue_mapping['bioproject']).size

914

In [7]:
#### Number of distinct species available from the transcriptome analysis
np.unique(oskar_hits_tissue_mapping['species']).size

949

In [8]:
def oskar_double_hits(x, mapping=None):
    """Summarise the brain / germline sample counts of one bioproject.

    Parameters
    ----------
    x : str
        Bioproject identifier to summarise.
    mapping : pandas.DataFrame, optional
        Table to read from. It must provide the columns 'bioproject',
        'tax_id', 'species', 'order_name', 'family_name', 'brain',
        'germline' and 'tsa_id'. When omitted, the notebook-level
        ``oskar_hits_tissue_mapping`` is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    list
        ``[bioproject, tax_id, species, order_name, family_name,
        brain_count, germline_count, tsa_ids]`` where the two counts are
        the number of rows whose 'brain' / 'germline' value equals 1 and
        ``tsa_ids`` is the comma-joined, sorted set of distinct 'tsa_id'
        values of the bioproject.

    Raises
    ------
    IndexError
        If no row of the table matches ``x``.
    """
    if mapping is None:
        # Fall back to the table loaded earlier in the notebook.
        mapping = oskar_hits_tissue_mapping

    rows = mapping[mapping['bioproject'] == x]

    # Identity / taxonomy fields are taken from the first matching row only.
    metadata = list(rows[['bioproject', 'tax_id', 'species', 'order_name', 'family_name']].values[0])

    # Summing the boolean mask counts matching rows without building a
    # filtered copy of the frame; int() keeps the plain-int return type.
    metadata.append(int((rows['brain'] == 1).sum()))
    metadata.append(int((rows['germline'] == 1).sum()))

    # np.unique both de-duplicates and sorts the identifiers.
    metadata.append(','.join(np.unique(rows['tsa_id'].values)))
    return metadata

In [9]:
%%time 
# One summary record per distinct bioproject (tqdm shows progress).
oskar_tissue_combined_count = list(
    map(oskar_double_hits, tqdm(np.unique(oskar_hits_tissue_mapping['bioproject'])))
)
# Assemble the records into a frame ordered by taxonomic order, then family.
oskar_per_tissue = pd.DataFrame(
    oskar_tissue_combined_count,
    columns=[
        'bioproject', 'tax_id', 'species', 'order_name', 'family_name',
        'combined_brain', 'combined_germline', 'tsa_ids',
    ],
).sort_values(by=['order_name', 'family_name'])

100%|██████████| 914/914 [00:02<00:00, 436.51it/s]CPU times: user 2.25 s, sys: 0 ns, total: 2.25 s
Wall time: 2.15 s



In [10]:
#### Bioprojects (with their species) having at least one brain sample and at least one germline sample
species_with_double_hits = oskar_per_tissue.loc[
    oskar_per_tissue['combined_brain'].ge(1) & oskar_per_tissue['combined_germline'].ge(1)
]
# Persist the selection without the frame's index column.
species_with_double_hits.to_csv(
    '../Data/02_Oskar_analyses/2.12/oskar_double_hits.csv',
    index=False,
)

In [11]:
# Print the (rows, columns) shape, then let the cell render the selection itself.
print(species_with_double_hits.shape)
species_with_double_hits

(4, 8)


Unnamed: 0,bioproject,tax_id,species,order_name,family_name,combined_brain,combined_germline,tsa_ids
190,PRJNA236239,7159,Aedes aegypti,Diptera,Culicidae,19,7,GFNA00000000.1
202,PRJNA240197,139649,Teleopsis dalmanni,Diptera,Diopsidae,2,1,GBBP00000000.1
695,PRJNA282469,115081,Megalopta genalis,Hymenoptera,Halictidae,1,1,GELL00000000.1
913,PRJNA81039,7130,Manduca sexta,Lepidoptera,Sphingidae,11,12,GETI00000000.1


In [13]:
#### Oskar presence in these species

In [12]:
# For each species retained above, show the samples in which oskar was found
# and that are flagged as brain and/or germline.
custom_columns = ['tsa_id', 'bioproject', 'biosample', 'raw_tissue', 'brain', 'germline', 'oskar']
for species in species_with_double_hits['species'].values:
    # Every row of this species, narrowed to the columns worth displaying.
    TMP = oskar_hits_tissue_mapping.loc[oskar_hits_tissue_mapping['species'] == species, custom_columns]
    OSKAR = TMP.loc[TMP['oskar'] == 'found']
    # Discard rows where both tissue flags are 0.
    WHERE_IS_OSKAR = OSKAR.loc[~((OSKAR['germline'] == 0) & (OSKAR['brain'] == 0))]
    if len(WHERE_IS_OSKAR) == 0:
        # Nothing to report for this species.
        continue
    print(species)
    display(HTML(WHERE_IS_OSKAR.to_html()))

Aedes aegypti


Unnamed: 0,tsa_id,bioproject,biosample,raw_tissue,brain,germline,oskar
1893,GFNA00000000.1,PRJNA236239,SAMN02628830,brains,1,0,found
1894,GFNA00000000.1,PRJNA236239,SAMN02628831,brains,1,0,found
1895,GFNA00000000.1,PRJNA236239,SAMN02628832,brains,1,0,found
1896,GFNA00000000.1,PRJNA236239,SAMN02628833,brains,1,0,found
1897,GFNA00000000.1,PRJNA236239,SAMN02628834,brains,1,0,found
1898,GFNA00000000.1,PRJNA236239,SAMN02628835,brains,1,0,found
1899,GFNA00000000.1,PRJNA236239,SAMN02628836,brains,1,0,found
1900,GFNA00000000.1,PRJNA236239,SAMN02628837,brains,1,0,found
1901,GFNA00000000.1,PRJNA236239,SAMN02628879,brains,1,0,found
1902,GFNA00000000.1,PRJNA236239,SAMN02628880,brains,1,0,found
