# Create a table with TCGA data

In [1]:
# Record the environment this notebook was run in (Python/IPython version, machine,
# last-updated date, package versions, author, git hash/repo/branch, watermark version).
%load_ext watermark
%watermark -v -m  -u -n -p pandas,numpy,matplotlib,regex -a Filippo_Valle -g -r -b -w

Author: Filippo_Valle

Last updated: Mon Feb 15 2021

Python implementation: CPython
Python version       : 3.8.6
IPython version      : 7.20.0

pandas    : 1.2.1
numpy     : 1.20.1
matplotlib: 3.3.4
regex     : 2020.11.13

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 5.8.0-43-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

Git hash: 72556f60be055fc5e5014ec70101e134dc576832

Git repo: git@github.com:fvalle1/phd

Git branch: develop

Watermark: 2.1.0



In [2]:
# import libraries
import os
import sys
import pandas as pd
import numpy as np
import regex as re
from matplotlib import pyplot as plt
import time

Set *working_dir* to the directory that contains the *data* folder with the downloaded files.

In [5]:
# Project root: it must contain the "data" folder holding the downloaded files.
working_dir = "/home/jovyan/work/phd/miRNA/"
os.chdir(working_dir)
# os.listdir returns entries in arbitrary order; sort them so that anything built
# by iterating over `dirs` comes out in the same order on every run.
dirs = sorted(os.listdir("data"))

In [6]:
# Number of entries listed in the "data" folder.
len(dirs)

1223

Prepare the list of genes that satisfy the filters described by *Dey et al.* in [Visualizing the structure of RNA-seq expression data using grade of membership models](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1006599)

In [7]:
# Read the one-column gene list (no header row) and use that column as the index.
df_genes = pd.read_csv(
    "https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt",
    header=None,
)
df_genes = df_genes.set_index(0)
# Plain array of the gene identifiers held in the index.
select_genes = df_genes.index.values

In [9]:
# Start the output table as an independent copy of the gene list (same index).
df = df_genes.copy()
# Disabled alternatives, kept for reference:
#df = df.loc[select_genes,:]
#df=pd.read_csv("miRNA.txt", index_col=0)
#df.head()

In [10]:
# Number of rows (genes) in the table.
len(df)

16069

In [11]:
#fpkm
def getFilenameFromDir(directory):
    """Return the name of the FPKM file stored in ``directory``.

    A file qualifies when its whole name is ``<uuid>.FPKM.txt``, optionally
    followed by ``.gz``. The name that is found is printed and returned.

    Parameters
    ----------
    directory : str
        Path of the folder to scan.

    Returns
    -------
    str or None
        The matching file name (not the full path); ``None`` when
        ``directory`` contains ".DS_Store", in which case nothing is scanned.

    Raises
    ------
    FileNotFoundError
        If no entry of ``directory`` has the expected name.
    """
    if ".DS_Store" in directory:
        return None
    # Raw string with escaped dots and literal ".FPKM" / ".txt" / ".gz" parts:
    # a character class such as [\.FPKM]{5} would accept any mix of those letters.
    pattern = (
        r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}"
        r"\.FPKM\.txt(\.gz)?"
    )
    entries = os.listdir(directory)
    for element in entries:
        # fullmatch anchors both ends, so names with extra trailing text are rejected
        if re.fullmatch(pattern, element):
            print(element)
            return element
    raise FileNotFoundError("Not found %s" % entries)

In [None]:
##miRNA
# NOTE: this definition reuses the name of the FPKM version of getFilenameFromDir;
# running this cell replaces that function for the rest of the session.
def getFilenameFromDir(directory):
    """Return the name of the miRNA quantification file stored in ``directory``.

    A file qualifies when its whole name is
    ``<uuid>.mirbase21.mirnas.quantification``, optionally followed by ``.txt``
    and then optionally by ``.gz``. The name that is found is printed and returned.

    Parameters
    ----------
    directory : str
        Path of the folder to scan.

    Returns
    -------
    str or None
        The matching file name (not the full path), or ``None`` when no entry
        of ``directory`` matches.
    """
    # The suffix is spelled out literally: written as a character class
    # ([\.mirbase21...]) it would match a single character and accept any file
    # whose name merely starts with a UUID.
    pattern = (
        r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}"
        r"\.mirbase21\.mirnas\.quantification(\.txt)?(\.gz)?"
    )
    for element in os.listdir(directory):
        if re.fullmatch(pattern, element):
            print(element)
            return element
    return None

In [12]:
# Upper bound on the number of sample columns the table may hold.
maxacceptables = 15000
added = len(df.columns)
for i,cdirectory in enumerate(dirs):
    # Entries of "data" that are not sample folders are skipped.
    if (
        re.match(r"manifest\.txt", cdirectory)
        or "Icon" in cdirectory
        or ".DS_Store" in cdirectory
    ):
        print("SKIPPING %s "%cdirectory)
        continue
    ## mi RNA (disabled alternative to the FPKM section below)
    #cfile = getFilenameFromDir("data_miRNA/%s"%cdirectory) # miRNA only!
    #cdf = pd.read_csv(("%s/data_miRNA/%s/%s"%(working_dir,cdirectory,cfile)), sep='\t') # miRNA only!
    #cdf.drop(labels=["reads_per_million_miRNA_mapped","cross-mapped"], axis=1, inplace=True) #miRNA only
    #cdf.set_index("miRNA_ID", inplace=True)
    #cdf.rename(columns={"read_count": cfile}, inplace=True)
    #old_L = len(df.columns)
    #df = df.join(cdf, on="miRNA_ID", how="outer")

    ## FPKM
    cfile = getFilenameFromDir("data/%s"%cdirectory)
    # Two tab-separated columns without a header row: gene identifier and value.
    cdf = pd.read_csv("data/%s/%s"%(cdirectory,cfile), sep='\t', header=None)
    cdf.columns = ["gene", cfile]
    # Keep only the first 15 characters of each gene identifier.
    cdf['gene'] = cdf['gene'].str[:15]
    # Restrict (and align) the sample to the genes of the reference list.
    cdf = cdf.set_index('gene').reindex(index=df_genes.index)
    old_L = len(df.columns)
    df = df.join(cdf, how="outer")

    ## common
    # Each sample must contribute exactly one new column.
    if len(df.columns) != old_L+1:
        raise Exception("Not able to add: %s"%cfile)
    # Keep the counter in sync with the table so the cap below can take effect.
    added = len(df.columns)
    if added >= maxacceptables:
        break
print(added, i)

1c072d05-bdd8-4d30-ba1d-099ed32ea33a.FPKM.txt.gz
03895e36-d4e0-4849-a0f1-26550373338d.FPKM.txt.gz
5c0f9cb8-cc53-49ce-8384-6acb54cb1855.FPKM.txt.gz
445b096b-502f-4e88-9338-4ad309a14425.FPKM.txt.gz
1f853089-c40d-4bee-9833-837f327b275a.FPKM.txt.gz
cf4d50b3-f391-43ee-8887-10de9bca126e.FPKM.txt.gz
a0c5137f-e9b4-413c-a0b3-96c2c11e68f2.FPKM.txt.gz
1f4c4871-667d-4310-a45a-4a5d5f6b60eb.FPKM.txt.gz
762e3c02-fbce-4778-8d79-8fa732fd399e.FPKM.txt.gz
ef647ec0-3322-4dd2-84ca-83bcd786b765.FPKM.txt.gz
5029a88f-ac46-4a7e-afd0-398cfb17bd5f.FPKM.txt.gz
be6d9906-45f4-4e32-83d0-327fa8c21666.FPKM.txt.gz
46ee3218-2a1f-4d44-8706-d44d7e4b980d.FPKM.txt.gz
07ad35fc-1e35-43b3-a252-193323d5e680.FPKM.txt.gz
8f994f43-e36e-4659-bb8c-ee3f0f91a2f2.FPKM.txt.gz
dc03b9a1-8313-4705-a185-c42ddf9870b7.FPKM.txt.gz
284f0774-1db5-4f86-a26c-0f967fa05d2f.FPKM.txt.gz
01a1c77a-3f93-4561-a079-c771ac054f56.FPKM.txt.gz
92891ee4-51a0-4e4d-b6db-70fbb2a756bc.FPKM.txt.gz
e54638eb-8b35-4cf1-af4e-3411dfbd1ba2.FPKM.txt.gz
d5639603-bb2f-4385-b

In [None]:
#df = pd.read_csv(("%s/mainTable.csv"%working_dir))

In [13]:
# Report the table size: df.shape is (number of rows, number of columns).
print("genes:%d\trealizations:%d" % df.shape)

genes:16069	realizations:1222


In [None]:
# Preview only: rows containing any missing value are dropped and the values are
# rounded to zero decimals and cast to float. The result is not assigned, so df is unchanged.
df.dropna().round().astype(float)

In [14]:
# Write the table to disk: rows that are empty in every column are dropped and
# values are rounded to two decimals; the index is written as the first column.
(
    df
    .dropna(axis=0, how='all')
    .round(2)
    .to_csv("mainTable_fpkm.csv", index=True)
)

In [15]:
# Show the first two rows of the table as a quick sanity check.
df.head(2)

Unnamed: 0_level_0,1c072d05-bdd8-4d30-ba1d-099ed32ea33a.FPKM.txt.gz,03895e36-d4e0-4849-a0f1-26550373338d.FPKM.txt.gz,5c0f9cb8-cc53-49ce-8384-6acb54cb1855.FPKM.txt.gz,445b096b-502f-4e88-9338-4ad309a14425.FPKM.txt.gz,1f853089-c40d-4bee-9833-837f327b275a.FPKM.txt.gz,cf4d50b3-f391-43ee-8887-10de9bca126e.FPKM.txt.gz,a0c5137f-e9b4-413c-a0b3-96c2c11e68f2.FPKM.txt.gz,1f4c4871-667d-4310-a45a-4a5d5f6b60eb.FPKM.txt.gz,762e3c02-fbce-4778-8d79-8fa732fd399e.FPKM.txt.gz,ef647ec0-3322-4dd2-84ca-83bcd786b765.FPKM.txt.gz,...,96833a09-14ce-4483-b17d-f4328e0b68f5.FPKM.txt.gz,cc55afdc-78e2-48c5-b1ac-386033852bca.FPKM.txt.gz,f9365670-8213-4263-80aa-1718e6d05dc8.FPKM.txt.gz,21b33b4b-04e9-4205-919f-8ec6b2e0cca0.FPKM.txt.gz,b6a39fab-afc2-4498-bb4c-95adeb388c6f.FPKM.txt.gz,bc4753f9-5ca7-4bc8-b25a-622e1a4cdf5d.FPKM.txt.gz,27a9c6bd-6e26-4055-867c-ab7276492464.FPKM.txt.gz,aa256c1a-55a8-43b1-a553-26444a950087.FPKM.txt.gz,5ab6232c-5cba-47ca-af29-3799b0263a58.FPKM.txt.gz,b4824c2d-fed2-4fad-bf6f-797119546006.FPKM.txt.gz
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,27.440919,61.200776,47.07095,41.191908,24.962862,33.060304,22.183771,18.937602,51.190399,28.24778,...,47.59886,24.580778,25.04981,49.08316,62.290031,41.61978,36.357274,53.215474,24.308677,25.955493
ENSG00000000457,6.066963,3.181323,10.836562,1.204924,4.116579,2.86092,6.463854,6.182857,4.465346,3.625035,...,7.829717,7.607851,8.21028,4.581966,4.892305,5.887027,5.262576,3.916315,7.550424,5.036148
