# Create a table with TCGA data

In [None]:
%load_ext watermark
%watermark -v -m  -u -n -p pandas,numpy,matplotlib,regex -a Filippo_Valle -g -r -b -w

In [None]:
# import libraries
import os
import sys
import pandas as pd
import numpy as np
import regex as re
from matplotlib import pyplot as plt
import time

Set *working_dir* to the directory that contains the *data* folder holding the downloaded files.

In [None]:
# Root folder of the dataset; it must contain a "data" sub-folder.
working_dir = "/home/jovyan/work/phd/datasets/cancers/lung/"
# Later cells open files through paths relative to this folder ("data/...").
os.chdir(working_dir)
# os.listdir returns entries in arbitrary order: sort them so that the order in
# which samples are processed (and hence the column order of the final table)
# is the same on every run.
dirs = sorted(os.listdir("data"))

In [None]:
# Number of entries listed in data/ (shown as the cell output).
len(dirs)

Prepare the list of genes that satisfy the filters described by *Dey et al.*, [Visualizing the structure of RNA-seq expression data using grade of membership models](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1006599)

In [None]:
# Fetch the gene list; the file is read without a header row and the table is
# indexed by its first column.
gene_list_url = "https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt"
gene_list = pd.read_csv(gene_list_url, header=None)
df_genes = gene_list.set_index(0)
# The same identifiers as a plain array.
select_genes = df_genes.index.values

In [None]:
# Start the output table as a copy of the gene list, so that columns can be
# added to `df` without modifying `df_genes`.
df = df_genes.copy()
# Commented-out alternatives, left disabled:
#print(len(df['gene']))
#df = df.loc[select_genes,:]
#df=pd.read_csv("miRNA.txt")
# Show the first rows as the cell output.
df.head()

In [None]:
# Number of rows of `df` (shown as the cell output).
len(df)

In [None]:
#fpkm
def getFilenameFromDir(directory):
    """Return the name of the FPKM file contained in ``directory``.

    NOTE: this notebook defines ``getFilenameFromDir`` in more than one cell;
    the definition from the cell that was executed last is the one in effect.

    Parameters
    ----------
    directory : str
        Path of the directory to search.

    Returns
    -------
    str or None
        Bare file name (without the directory part) of the first entry named
        ``<uuid>.FPKM.txt`` or ``<uuid>.FPKM.txt.gz``. ``None`` when
        ``directory`` itself contains ".DS_Store".

    Raises
    ------
    FileNotFoundError
        If no entry of ``directory`` has an FPKM file name.
    """
    if ".DS_Store" in directory:
        return None
    # The suffix is spelled out literally and the whole name has to match.
    # Character classes such as [\.FPKM]{5} would accept any five of those
    # characters, and an unanchored match would accept trailing junk.
    fpkm_name = r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9\-]{4}-[a-zA-Z0-9\-]{12}\.FPKM\.txt(\.gz)?"
    for element in os.listdir(directory):
        if re.fullmatch(fpkm_name, element):
            print(element)
            return element
    # FileNotFoundError is still caught by handlers written for BaseException.
    raise FileNotFoundError("Not found %s"%os.listdir(directory))

In [None]:
#counts
def getFilenameFromDir(directory):
    """Return the name of the HTSeq counts file contained in ``directory``.

    NOTE: this notebook defines ``getFilenameFromDir`` in more than one cell;
    the definition from the cell that was executed last is the one in effect.

    Parameters
    ----------
    directory : str
        Path of the directory to search.

    Returns
    -------
    str or None
        Bare file name of the first entry named ``<uuid>.htseq.counts`` or
        ``<uuid>.htseq_counts``, optionally followed by ``.txt`` and/or
        ``.gz``. ``None`` (after printing the directory listing) when no
        entry qualifies.
    """
    # Literal suffix and full-name match: the character classes used before
    # ([\.htseq]{6} ...) accepted any mix of those letters, and an unanchored
    # match accepted names with extra trailing text.
    counts_name = r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9\-]{4}-[a-zA-Z0-9\-]{12}\.htseq[._]counts(\.txt)?(\.gz)?"
    for element in os.listdir(directory):
        if re.fullmatch(counts_name, element):
            print(element)
            return element
    print("Nothing found")
    print(os.listdir(directory))
    return None

In [None]:
#fpkm-uq
def getFilenameFromDir(directory):
    """Return the name of the FPKM-UQ file contained in ``directory``.

    NOTE: this notebook defines ``getFilenameFromDir`` in more than one cell;
    the definition from the cell that was executed last is the one in effect.

    Parameters
    ----------
    directory : str
        Path of the directory to search.

    Returns
    -------
    str or None
        Bare file name of the first entry named ``<uuid>.FPKM-UQ``, optionally
        followed by ``.txt`` and/or ``.gz``. ``None`` when ``directory``
        contains ".DS_Store" or when no entry qualifies.
    """
    if ".DS_Store" in directory:
        return None
    # The ".FPKM-UQ" suffix is now mandatory and literal. With the previous
    # character classes every suffix part was optional, so any file starting
    # with a UUID (plain FPKM files included) was accepted.
    fpkm_uq_name = r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9\-]{4}-[a-zA-Z0-9\-]{12}\.FPKM-UQ(\.txt)?(\.gz)?"
    for element in os.listdir(directory):
        if re.fullmatch(fpkm_uq_name, element):
            print(element)
            return element
    return None

In [None]:
##miRNA
def getFilenameFromDir(directory):
    """Return the name of the miRNA quantification file contained in ``directory``.

    NOTE: this notebook defines ``getFilenameFromDir`` in more than one cell;
    the definition from the cell that was executed last is the one in effect.

    Parameters
    ----------
    directory : str
        Path of the directory to search.

    Returns
    -------
    str or None
        Bare file name of the first entry named
        ``<uuid>.mirbase21.mirnas.quantification``, optionally followed by
        ``.txt`` and/or ``.gz``. ``None`` when no entry qualifies.
    """
    # The suffix is matched literally. Written inside square brackets it was a
    # character class standing for ONE character, so every file starting with
    # a UUID was accepted.
    mirna_name = r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9\-]{4}-[a-zA-Z0-9\-]{12}\.mirbase21\.mirnas\.quantification(\.txt)?(\.gz)?"
    for element in os.listdir(directory):
        if re.fullmatch(mirna_name, element):
            print(element)
            return element
    return None

In [None]:
# Read the file selected by getFilenameFromDir in every sub-directory of data/
# and add it to `df` as one column named after the file.
maxacceptables = 15000  # upper bound on the number of columns of `df`
added = len(df.columns)  # columns already present plus the ones collected below
new_columns = {}  # file name -> 1-D array of values, in processing order
i = -1  # keeps the final print valid when `dirs` is empty
for i,cdirectory in enumerate(dirs):
    # Entries of data/ that are skipped instead of being read as samples.
    if re.match(r"manifest\.txt",cdirectory) or "Icon" in cdirectory or ".DS_Store" in cdirectory:
        print("SKIPPING %s "%cdirectory)
        continue
    cfile = getFilenameFromDir("data/%s"%cdirectory)
    if cfile is None:
        # Fail here with a clear message instead of letting read_csv complain
        # about the non-existent path "data/<dir>/None".
        raise FileNotFoundError("No quantification file found in data/%s"%cdirectory)
    if cfile in new_columns or cfile in df.columns:
        # A second column with the same name would be ambiguous.
        raise(Exception("Not able to add: %s"%cfile))
    #put header=0 for miRNA!
    #cdf = pd.read_csv(("%s/data/%s/%s"%(working_dir,cdirectory,cfile)), sep='\t')
    #cdf.drop(labels=["reads_per_million_miRNA_mapped","cross-mapped"], axis=1, inplace=True) #miRNA only
    cdf = pd.read_csv(("data/%s/%s"%(cdirectory,cfile)), sep='\t', header=None)
    cdf.columns = ["gene", cfile]
    # Keep the first 15 characters of each identifier
    # (presumably drops the version suffix of Ensembl ids - TODO confirm).
    cdf['gene'] = cdf['gene'].str[:15]
    # Align the rows on the reference gene list; genes missing from the file become NaN.
    cdf = cdf.set_index('gene').reindex(index=df_genes.index)
    new_columns[cfile] = cdf[cfile].to_numpy()
    added += 1
    if added >= maxacceptables:
        break
if new_columns:
    # Attach all collected columns in one step (repeated DataFrame.insert calls
    # fragment the frame). Values are placed positionally on df's index, and the
    # columns are reversed so that the most recently read file comes first,
    # exactly as inserting each one at position 0 would do.
    collected = pd.DataFrame(new_columns, index=df.index)
    df = pd.concat([collected.iloc[:, ::-1], df], axis=1)
print(added, i)

In [None]:
#df = pd.read_csv(("%s/mainTable.csv"%working_dir))

In [None]:
# Report the size of the table: rows are genes, columns are realizations.
n_genes, n_realizations = df.shape
print("genes:%d\trealizations:%d" % (n_genes, n_realizations))

In [None]:
# Preview of the rows without any missing value, rounded to whole numbers and
# cast to float. The result is only displayed; `df` itself is left unchanged.
complete_rows = df.dropna()
complete_rows.round().astype(float)

In [None]:
# Save the table: drop the rows that are empty in every column, round to two
# decimals and write the result (index included) to mainTable_all.csv.
table_to_save = df.dropna(how='all', axis=0)
table_to_save = table_to_save.round(decimals=2)
table_to_save.to_csv("mainTable_all.csv", index=True)