# Create a table with TCGA data

In [11]:
# Record author, date, interpreter/package versions, machine details and git state
# so that the environment used to produce this notebook can be reconstructed.
%load_ext watermark
%watermark -v -m  -u -n -p pandas,numpy,matplotlib,regex -a Filippo_Valle -g -r -b -w

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Filippo_Valle 
last updated: Thu Jun 18 2020 

CPython 3.7.6
IPython 7.15.0

pandas 1.0.4
numpy 1.18.5
matplotlib 3.2.1
regex 2020.6.8

compiler   : GCC 7.5.0
system     : Linux
release    : 4.19.76-linuxkit
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit
Git hash   : 3c8ae8f7082ca41e4f98ab084dc707834cbe547b
Git repo   : git@github.com:fvalle1/phd.git
Git branch : master
watermark 2.0.2


In [None]:
# import libraries
import os
import sys
import pandas as pd
import numpy as np
import regex as re
from matplotlib import pyplot as plt
import time

Set *working_dir* to the directory that contains the *data* folder with the downloaded files.

In [None]:
# Relative path: it is resolved against the directory the kernel is currently in.
working_dir = "lung/"
# NOTE: chdir changes the working directory of the whole kernel, so every relative
# path used afterwards (e.g. "data") is resolved inside working_dir.
# Re-running this cell without restarting the kernel looks for "lung/" inside "lung/".
os.chdir(working_dir)
# Names of all entries found in the data folder (returned in arbitrary order by listdir).
dirs = os.listdir("data")

In [None]:
# show the current working directory to check where relative paths will be resolved
os.getcwd()

In [None]:
# number of entries found in the data folder
len(dirs)

Prepare the list of genes that satisfy the filters described by *Dey et al.* in [Visualizing the structure of RNA-seq expression data using grade of membership models](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1006599)

In [None]:
# Download the gene list (read without a header row) and use its first column as the index.
# Presumably the file holds one gene identifier per line -- verify against the remote file.
df_genes = pd.read_csv("https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None).set_index(0)
# the same identifiers as a plain array
select_genes = df_genes.index.values

We will store the data in the `df` DataFrame, with genes as the *index* and samples as the *columns*.

In [None]:
# start from a copy of the gene table so that df has the same gene index
# while df_genes itself is left untouched by later insertions
df = df_genes.copy()
# preview the first two rows
df.head(2)

In [None]:
# filter only files with FPKM data
def getFilenameFromDir(directory):
    """Return the name of the FPKM file contained in ``directory``.

    A file qualifies when its whole name is a UUID-like identifier
    (8-4-4-4-12 characters) followed by the literal suffix ``.FPKM.txt``,
    optionally compressed (``.gz``).

    Parameters
    ----------
    directory : str
        Path of the folder to inspect.

    Returns
    -------
    str or None
        The matching file name (without the directory part), or ``None``
        when ``directory`` refers to a MacOS ``.DS_Store`` entry.

    Raises
    ------
    FileNotFoundError
        If no file in ``directory`` matches the expected name.
    """
    if ".DS_Store" in directory:
        return None

    # Raw string: the dots of the suffix are escaped so that they match literally.
    # The previous character classes ([\.FPKM]{5}, [\.gz]{0,3}) accepted any
    # permutation of those characters rather than the actual suffixes.
    fpkm_pattern = (
        r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}"
        r"-[a-zA-Z0-9\-]{4}-[a-zA-Z0-9\-]{12}"
        r"\.FPKM\.txt(?:\.gz)?"
    )

    for element in os.listdir(directory):
        # fullmatch rejects names with trailing extras (e.g. partial downloads)
        if re.fullmatch(fpkm_pattern, element):
            print(element)
            return element

    # FileNotFoundError is still caught by callers handling BaseException/Exception
    raise FileNotFoundError("Not found %s"%os.listdir(directory))

Create the dataframe; this may take a long time.

In [None]:
# set the maximum number of samples to insert in the dataset
maxacceptables = 15000

# count the number of added samples (starts from the columns already in df)
added = len(df.columns)

# index of the last visited entry; -1 keeps the final print valid when dirs is empty
i = -1

# iterate c(urrent)directory in downloaded directories
for i, cdirectory in enumerate(dirs):
    # manifest is not a data file; Icon and .DS_Store are MacOS files
    if (cdirectory.startswith("manifest.txt")
            or "Icon" in cdirectory
            or ".DS_Store" in cdirectory):
        print("SKIPPING %s "%cdirectory)
        continue

    # current file name
    cfile = getFilenameFromDir("data/%s"%cdirectory)

    # sample dataframe: first column is the gene, second the value for this sample
    cdf = pd.read_csv("data/%s/%s"%(cdirectory, cfile), sep='\t', header=None)
    cdf.columns = ["gene", cfile]

    # get only first 15 characters of gene name
    cdf["gene"] = cdf["gene"].str[:15]

    # set genes as index and keep only the filtered genes, in the order of df_genes
    cdf = cdf.set_index("gene").reindex(index=df_genes.index)

    # number of samples added so far
    old_L = len(df.columns)

    # insert new sample as a 1-D column; values are positional and follow the
    # order of df_genes.index after the reindex above
    df.insert(0, cfile, cdf[cfile].to_numpy())

    # if something went wrong and data was not added raise exception
    if len(df.columns) != old_L + 1:
        raise Exception("Not able to add: %s"%cfile)

    # keep the counter in sync with the table, otherwise the cap below never triggers
    added += 1

    # break if added more than acceptables
    if added >= maxacceptables:
        break
print(added, i)

In [None]:
# df.shape is (number of genes, number of samples)
print("genes:%d\tsamples:%d" % df.shape)

Save data to a .csv file

In [None]:
# Discard genes without a value in any sample, keep two decimals
# (smaller file on disk) and write the table together with its gene index.
(
    df
    .dropna(axis=0, how='all')
    .round(2)
    .to_csv("mainTable_all.csv", index=True)
)