# Describe tissue types of input data

**Gregory Way, 2019**

Load in previously identified tissue type counts and output a supplementary table.

In [1]:
import os
import pandas as pd

In [2]:
# Load TCGA data
# Read the TCGA sample-count table, relabel its `cancertype` column as `tissue`,
# and tag every row with the dataset it came from.
tcga_path = os.path.join('data', 'tcga_sample_counts.tsv')
tcga_count_df = (
    pd.read_table(tcga_path, sep='\t')
    .rename(columns={'cancertype': 'tissue'})
    .assign(dataset="TCGA")
)
tcga_count_df.head()

Unnamed: 0,tissue,n =,dataset
0,BRCA,1218,TCGA
1,KIRC,606,TCGA
2,LUAD,576,TCGA
3,THCA,572,TCGA
4,UCEC,567,TCGA


In [3]:
# Load GTEX data
# Read the GTEX sample-count table, relabel its `tissuetype` column as `tissue`,
# and tag every row with the dataset it came from.
gtex_path = os.path.join('data', 'gtex_sample_counts.tsv')
gtex_count_df = (
    pd.read_table(gtex_path, sep='\t')
    .rename(columns={'tissuetype': 'tissue'})
    .assign(dataset="GTEX")
)
gtex_count_df.head()

Unnamed: 0,tissue,n =,dataset
0,Muscle - Skeletal,564,GTEX
1,Skin - Sun Exposed (Lower leg),473,GTEX
2,Thyroid,446,GTEX
3,Adipose - Subcutaneous,442,GTEX
4,Artery - Tibial,441,GTEX


In [4]:
# Load TARGET data
# Read the TARGET sample-count table, relabel its `cancertype` column as `tissue`,
# and tag every row with the dataset it came from.
target_path = os.path.join('data', 'target_sample_counts.tsv')
target_count_df = (
    pd.read_table(target_path, sep='\t')
    .rename(columns={'cancertype': 'tissue'})
    .assign(dataset="TARGET")
)
target_count_df.head()

Unnamed: 0,tissue,n =,dataset
0,AML,196,TARGET
1,ALL,194,TARGET
2,NBL,162,TARGET
3,WT,132,TARGET
4,AML-IF,32,TARGET


In [5]:
# Combine all data to generate supplementary table
# Stack the TCGA, GTEX and TARGET count tables into one frame, order the rows by
# tissue name, and renumber the index so it runs 0..n-1 in the sorted order.
# The sort is case-sensitive, so upper-case abbreviations (e.g. 'AML-IF') come
# before mixed-case names (e.g. 'Adipose - Subcutaneous').
full_count_df = (
    pd.concat([tcga_count_df, gtex_count_df, target_count_df], axis='rows')
    .sort_values(by='tissue', ascending=True)
    .reset_index(drop=True)
)

# Create the output directory if it is missing; `to_csv` does not create parent
# directories, so the write would otherwise fail on a fresh checkout.
output_dir = 'results'
os.makedirs(output_dir, exist_ok=True)

# Write the combined table as tab-separated text without the row index
file = os.path.join(output_dir, 'full_sample_counts.tsv')
full_count_df.to_csv(file, sep='\t', index=False)

full_count_df

Unnamed: 0,tissue,n =,dataset
0,ACC,79,TCGA
1,ALL,194,TARGET
2,AML,196,TARGET
3,AML-IF,32,TARGET
4,Adipose - Subcutaneous,442,GTEX
5,Adipose - Visceral (Omentum),355,GTEX
6,Adrenal Gland,190,GTEX
7,Artery - Aorta,299,GTEX
8,Artery - Coronary,173,GTEX
9,Artery - Tibial,441,GTEX
