# Combine binary TISSUES and Bgee datasets

In [29]:
import io
import gzip

import pandas
import requests

## Read TISSUES

In [34]:
# Load the merged table from disk and replace every missing value with 0.
tissues_df = pandas.read_table('data/merged.tsv.gz').fillna(0)

In [35]:
# Keep rows where either the unbiased-experiment score or the integrated
# score reaches the cutoff, then flag the rows whose unbiased-experiment
# score alone reaches it (1) versus those that pass only on the
# integrated score (0).
cutoff = 3
passes_cutoff = (
    (tissues_df.score_experiment_unbiased >= cutoff)
    | (tissues_df.score_integrated >= cutoff)
)
# .copy() makes the filtered frame independent of the original, so the
# column assignment below does not raise SettingWithCopyWarning.
tissues_df = tissues_df[passes_cutoff].copy()
tissues_df['unbiased'] = (tissues_df.score_experiment_unbiased >= cutoff).astype(int)

In [36]:
# Number of rows remaining after the cutoff filter.
len(tissues_df.index)

321516

In [37]:
# Reduce the table to the two identifier columns plus the unbiased flag,
# then preview the first rows.
tissues_df = tissues_df.loc[:, ['uberon_id', 'entrez_gene_id', 'unbiased']]
tissues_df.head()

Unnamed: 0,uberon_id,entrez_gene_id,unbiased
26,UBERON:0000002,60,1
90,UBERON:0000002,218,0
96,UBERON:0000002,226,1
104,UBERON:0000002,250,0
119,UBERON:0000002,288,0


## Read Bgee

In [38]:
def open_gz_url(url):
    """
    Download a gzip-compressed resource and open it for reading as text.

    The entire response body is held in memory; nothing is written to disk.

    Parameters
    ----------
    url : str
        Address of the gzipped resource to fetch.

    Returns
    -------
    file object
        Text-mode handle (as returned by ``gzip.open(..., 'rt')``) that
        yields the decompressed content.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx or 5xx status code.
    """
    response = requests.get(url)
    # Fail here on an HTTP error rather than handing an error page to gzip,
    # which would only surface later as a confusing decompression failure.
    response.raise_for_status()
    bytes_io = io.BytesIO(response.content)
    return gzip.open(bytes_io, 'rt')

In [39]:
# Download the table at a pinned commit and reshape it from wide to long:
# every column other than GeneID becomes a value in 'uberon_id', with the
# cell content stored in 'present'.
url = 'https://raw.githubusercontent.com/dhimmel/bgee/bd50da6c931675a0316a71ab5c6d7d1bbd35f8bd/data/present-in-adult.tsv.gz'
# Close the decompression handle as soon as the table has been parsed.
with open_gz_url(url) as read_file:
    bgee_df = pandas.read_table(read_file)
bgee_df = pandas.melt(bgee_df, id_vars='GeneID', var_name='uberon_id', value_name='present')
# .copy() makes the filtered frame independent of the melted one, so the
# column assignment below does not raise SettingWithCopyWarning.
bgee_df = bgee_df.query('present == 1').copy()
# Every retained row is flagged as unbiased.
bgee_df['unbiased'] = 1
bgee_df = bgee_df.rename(columns={'GeneID': 'entrez_gene_id'})
# Same column order as tissues_df so the two frames can be concatenated.
bgee_df = bgee_df[['uberon_id', 'entrez_gene_id', 'unbiased']]
bgee_df.head()

Unnamed: 0,uberon_id,entrez_gene_id,unbiased
1,CL:0000000,2,1
2,CL:0000000,9,1
5,CL:0000000,13,1
6,CL:0000000,14,1
8,CL:0000000,16,1


In [43]:
# Number of rows kept from the Bgee table.
len(bgee_df.index)

5202507

## Concatenate Bgee and TISSUES

In [47]:
# Stack the two tables, order rows so that unbiased == 1 comes first, and
# count how many rows are exact duplicates of an earlier row.
concat_df = pandas.concat([tissues_df, bgee_df])
# DataFrame.sort was removed from pandas (0.20); sort_values replaces it.
concat_df = concat_df.sort_values('unbiased', ascending=False)
# NOTE(review): duplicated() compares all three columns, so a pair present
# with both unbiased == 0 and unbiased == 1 is NOT counted as a duplicate
# and the sort above has no effect on what is dropped. If the intent is one
# row per (uberon_id, entrez_gene_id), a subset= argument is needed here and
# in drop_duplicates -- confirm before changing.
concat_df.duplicated().value_counts()

False    5406177
True      117846
dtype: int64

In [50]:
# Remove rows that are identical in every column, then order the result by
# tissue identifier and gene identifier.
concat_df = concat_df.drop_duplicates()
# DataFrame.sort was removed from pandas (0.20); sort_values replaces it.
concat_df = concat_df.sort_values(['uberon_id', 'entrez_gene_id'])
len(concat_df)

5406177

In [51]:
# Tally of rows per value of the unbiased flag.
concat_df['unbiased'].value_counts()

1    5238487
0     167690
dtype: int64

In [52]:
# Save the combined table as a gzip-compressed, tab-separated file,
# leaving out the index column.
output_path = 'data/tissues-bgee-combined.tsv.gz'
with gzip.open(output_path, mode='wt') as write_file:
    concat_df.to_csv(write_file, index=False, sep='\t')