In [1]:
%matplotlib inline

from collections import Counter
import glob
import itertools
from itertools import izip
import os

from IPython.core.display import HTML, Image
from matplotlib import gridspec
from matplotlib_venn import venn3, venn2
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import flotilla
from gscripts.general import dataviz
from gscripts.general import parsers
from gscripts.general import region_helpers
from gscripts import GO
from gscripts import miso


# Absolute, machine-specific directory for this analysis. The count table and
# sample sheets handled further down live in this same directory, although
# those cells spell the path out literally instead of using this variable.
img_dir = "/nas3/gpratt/Dropbox/TAF15/Data/MN_data/GP_analysis_RNAseq/"



# Setup Basics

In [2]:
def counts_to_rpkm(featureCountsTable):
    """Convert a table of raw read counts into RPKM values.

    For every feature (row) and sample (column) the result is
    ``count * 10**9 / (total counts in that sample * feature length)``.

    Parameters
    ----------
    featureCountsTable : pandas.DataFrame
        One row per feature. It must contain a ``Length`` column, and every
        column from position 5 onward is treated as a per-sample count
        column (the first five columns are taken to be annotation columns).

    Returns
    -------
    pandas.DataFrame
        Same row index as the input, one column per sample column.
    """
    # Positional slice: ``.iloc`` states that explicitly and replaces ``.ix``,
    # which has been removed from pandas.
    counts = featureCountsTable.iloc[:, 5:]
    lengths = featureCountsTable['Length']
    # Per-sample normaliser: the sum of all counts in each sample column.
    mapped_reads = counts.sum()
    return (counts * pow(10, 9)).div(mapped_reads, axis=1).div(lengths, axis=0)

def gencode_to_ensembl(gene_list):
    """Lazily yield each ID in ``gene_list`` with its version suffix removed.

    Everything from the first "." onward is dropped, e.g.
    "ENSG00000223972.4" -> "ENSG00000223972". IDs without a "." are yielded
    unchanged.
    """
    for versioned_id in gene_list:
        unversioned_id, _, _ = versioned_id.partition(".")
        yield unversioned_id
        
def plot_go_enrichment(df, filter_value=None, **kwargs):
    """Draw a clustered heatmap of -log10 GO-enrichment p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        GO results with a MultiIndex and the columns ``'GO Term Description'``
        and ``'Bonferroni-corrected Hypergeometric p-Value'``. The last index
        level is replaced by the term description for plotting; all preceding
        levels become the heatmap columns. ``df`` itself is not modified.
    filter_value : float, optional
        If given, keep only terms (rows) whose largest -log10 p-value across
        all columns is strictly greater than this value.
    **kwargs
        Passed through to ``seaborn.clustermap``.
    """
    # Swap the last index level for the human-readable description. The new
    # index is attached to the derived Series only, so the caller's frame
    # keeps its original index and the function can be called repeatedly.
    new_index = [
        tuple(index[:-1]) + (description,)
        for index, description in zip(df.index, df['GO Term Description'])
    ]

    go_matrix = df['Bonferroni-corrected Hypergeometric p-Value'].apply(lambda x: -1 * np.log10(x))
    go_matrix.index = pd.MultiIndex.from_tuples(new_index)

    # Move every level except the last (the description) into the columns.
    n_levels = len(go_matrix.index.levels)
    go_matrix = go_matrix.unstack(list(range(n_levels - 1)))
    # Term/group combinations with no result are shown as "no enrichment".
    go_matrix = go_matrix.fillna(0)
    if filter_value is not None:
        go_matrix = go_matrix[go_matrix.max(axis=1) > filter_value]

    sns.clustermap(go_matrix, robust=True, **kwargs)

In [4]:
# Copy the *.final.out, *.metrics and *.rpkm files of the v2 analysis, plus the
# filtered MISO summary of the v1 analysis, from tscc-login.sdsc.edu.
# Both local destinations are created first: scp does not create a missing
# target directory, and the second copy goes to human_mn_v1, not human_mn_v2.
!mkdir -p /nas3/gpratt/projects/fet_family/analysis/human_mn_v2/ /nas3/gpratt/projects/fet_family/analysis/human_mn_v1/
!scp -q  tscc-login.sdsc.edu:~/projects/fet_family/analysis/human_mn_v2/*.{final.out,metrics,rpkm} /nas3/gpratt/projects/fet_family/analysis/human_mn_v2/
!scp -q  tscc-login.sdsc.edu:~/projects/fet_family/analysis/human_mn_v1/miso_summary_filtered.csv /nas3/gpratt/projects/fet_family/analysis/human_mn_v1/

In [3]:
# Gene-ID -> gene-name lookup built from the GENCODE v17 annotation database.
gene_id_to_name = region_helpers.gene_id_to_name(
    "/nas3/gpratt/gencode/gencode.v17.annotation.gtf.db"
)
# Reverse lookup. When several IDs share one name, the ID iterated last wins.
name_to_gene_id = dict(
    (name, gene_id) for gene_id, name in gene_id_to_name.items()
)

# MN Read Mapping QC


In [4]:
# Parse the RNA-seq mapping metrics found under the analysis directory.
analysis_dir = "/nas3/gpratt/projects/fet_family/analysis/human_mn_v2/"
fet_clip = parsers.rnaseq_metrics(analysis_dir)

# Keep only the read-count summary columns for the QC table
# ("Reads After Triming" is deliberately left out).
filtered_guttman_clip = fet_clip[[
    "Input Reads",
    "Uniquely Mapped Reads",
    "Uniquely mapped reads %",
]]

# Render as HTML, passing every read-count column through parsers.commas.
# Formatter keys that are not among the selected columns are simply unused.
HTML(filtered_guttman_clip.to_html(
    formatters={
        column: parsers.commas
        for column in (
            "Input Reads",
            "Reads After Triming",
            "Reads after Quality Filtering",
            "Uniquely Mapped Reads",
        )
    }
))

Unnamed: 0,Input Reads,Uniquely Mapped Reads,Uniquely mapped reads %
FUS_TAF15_shRNA_1,48453046,23319915,93.21%
FUS_TAF15_shRNA_2,42856351,35500133,94.19%
FUS_shRNA_1,48334790,39797138,94.29%
FUS_shRNA_2,46737651,38856955,94.29%
Scrm_1,22513222,18610246,94.27%
Scrm_2,33675077,27513846,94.05%
TAF15_TDP43_shRNA_1,46433859,38598286,94.06%
TAF15_TDP43_shRNA_2,38472528,32216211,94.29%
TAF15_shRNA_1,47587547,39865790,94.46%
TAF15_shRNA_2,39768330,33071863,94.36%


In [5]:
# Average number of uniquely mapped reads across all samples in the QC table.
filtered_guttman_clip.loc[:, 'Uniquely Mapped Reads'].mean()

32461185.916666668

In [6]:
# Average unique-mapping rate. The column holds strings, so the last character
# of each value is dropped before converting to float.
np.asarray(
    [float(item[:-1]) for item in filtered_guttman_clip['Uniquely mapped reads %']]
).mean()

94.143333333333331

# Load Counts

In [24]:
# Read the tab-separated count table. The first line of the file is skipped
# (presumably a comment/header line written by the counting tool -- confirm)
# and the first column becomes the row index.
all_counts = pd.read_csv(
    "/nas3/gpratt/Dropbox/TAF15/Data/MN_data/GP_analysis_RNAseq/all_counts.txt",
    sep="\t",
    skiprows=1,
    index_col=0,
)
# Shorten every column label to its file basename up to the first ".".
all_counts.columns = [
    os.path.basename(col).split(".", 1)[0] for col in all_counts.columns
]
# RPKM with samples as rows and features as columns.
rpkm = counts_to_rpkm(all_counts).T

# Make Metadata

In [25]:
def _parse_sample_name(sample_name):
    """Split a sample name into ``(knockdown, replicate)``.

    The replicate is the last "_"-separated token. The knockdown is everything
    before the last two tokens ("FUS_TAF15_shRNA_1" -> "FUS_TAF15"); when that
    leaves nothing ("Scrm_1"), everything before the last token is used.
    """
    tokens = sample_name.split("_")
    knockdown_tokens = tokens[:-2] or tokens[:-1]
    return "_".join(knockdown_tokens), tokens[-1]


# After this cell has run once, rpkm.index already holds
# (cell_type, knockdown, rep, exp) tuples. Recover the plain sample name from
# the last element in that case, so the cell can be re-run without
# ``str.split`` being called on a tuple.
sample_names = [
    label[-1] if isinstance(label, tuple) else label for label in rpkm.index
]

metadata = []
for item in sample_names:
    experiment, replicate = _parse_sample_name(item)
    metadata.append(["MN", experiment, replicate, item])

rpkm.index = pd.MultiIndex.from_tuples(metadata, names=['cell_type', "knockdown", "rep", 'exp'])
    
#metadata = pd.DataFrame(metadata).T
#metadata.index.name = "exp"
#metadata.to_csv("/nas3/gpratt/Dropbox/TAF15/Data/MN_data/GP_analysis_RNAseq/sampleInfo.csv")

In [26]:
# Display the RPKM table: one row per sample (the table was transposed when it
# was built), indexed by (cell_type, knockdown, rep, exp).
rpkm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Geneid,ENSG00000223972.4,ENSG00000227232.4,ENSG00000243485.1,ENSG00000237613.2,ENSG00000268020.1,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.2,ENSG00000239945.1,ENSG00000233750.3,ENSG00000237683.5,ENSG00000268903.1,ENSG00000269981.1,ENSG00000239906.1,ENSG00000241860.2,ENSG00000222623.1,ENSG00000241599.1,ENSG00000228463.4,ENSG00000241670.2,ENSG00000237094.6,Unnamed: 24_level_0
cell_type,knockdown,rep,exp,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
MN,FUS,1,FUS_shRNA_1,0.348521,38.587782,2.8913,0.059065,0.0,0.038298,0.078432,0.52452,0,0.075551,1.37994,0.19073,0,0.891646,2.469748,0.692312,0.078775,1.720799,0,1.72046,
MN,FUS,2,FUS_shRNA_2,0.34164,36.515108,1.866434,0.028949,0.56015,0.0,0.076883,0.474613,0,0.083317,1.419004,0.233705,0,1.748083,2.369718,0.0,0.0,1.704674,0,1.77108,
MN,FUS_TAF15,1,FUS_TAF15_shRNA_1,0.477444,38.657118,2.326583,0.0,0.0,0.0,0.0,0.411091,0,0.109967,1.916652,0.370151,0,2.811941,3.202893,0.0,0.0,1.554932,0,2.721369,
MN,FUS_TAF15,2,FUS_TAF15_shRNA_2,0.610676,38.052035,2.772769,0.140751,0.340428,0.045632,0.233627,0.528812,0,0.168785,1.837619,0.170439,0,1.460783,2.693418,0.412441,0.281579,1.887592,0,2.043484,
MN,Scrm,1,Scrm_1,0.239699,31.471945,1.855145,0.0,0.0,0.0,0.0,0.570022,0,0.036806,1.13361,0.185833,0,2.606266,2.038314,0.0,0.0,1.348394,0,1.650017,
MN,Scrm,2,Scrm_2,0.286792,34.302594,1.972994,0.041313,0.0,0.0,0.164577,0.409207,0,0.013211,1.362634,0.200109,0,1.403239,2.284318,0.0,0.0,1.311874,0,1.720185,
MN,TAF15,1,TAF15_shRNA_1,0.364871,34.805379,1.603703,0.0292,0.282502,0.075735,0.07755,0.458779,0,0.084039,1.310911,0.094292,0,1.432626,2.32705,0.342262,0.077889,1.557404,0,1.797093,
MN,TAF15,2,TAF15_shRNA_2,0.292417,35.214815,2.305062,0.0,0.339606,0.0,0.046613,0.347694,0,0.078576,1.591974,0.28338,0,1.854689,2.417533,0.0,0.0,1.590839,0,1.884698,
MN,TAF15_TDP43,1,TAF15_TDP43_shRNA_1,0.55753,33.610143,2.477124,0.100392,0.323751,0.0,0.133309,0.822939,0,0.235424,1.885567,0.16209,0,2.146981,2.699755,0.784474,0.0,1.887975,0,2.120606,
MN,TAF15_TDP43,2,TAF15_TDP43_shRNA_2,0.336254,36.152039,2.313267,0.223561,0.720954,0.048319,0.148432,0.64904,0,0.083405,1.672744,0.240636,0,1.828054,2.874041,0.0,0.0,1.550762,0,2.08218,


In [27]:
# Build the sample sheet: one row per sample, keyed by the last element of each
# index tuple (the 'exp' level), with one column per rpkm index level.
result = {}
for item in rpkm.index:
    result[item[-1]] = dict(zip(rpkm.index.names, item))
sampleInfo = pd.DataFrame(result).T
sampleInfo.to_csv("/nas3/gpratt/Dropbox/TAF15/Data/MN_data/GP_analysis_RNAseq/sampleInfo.csv")

# Export the raw counts for exactly the samples in the sheet, with columns in
# the same order as the sheet's rows. Label-based ``.loc`` replaces ``.ix``
# (removed from pandas) and makes the transpose round-trip unnecessary.
all_counts.loc[:, sampleInfo.index].to_csv("/nas3/gpratt/Dropbox/TAF15/Data/MN_data/GP_analysis_RNAseq/important_counts.csv")

In [28]:
# Display the sample sheet that was written to sampleInfo.csv.
sampleInfo

Unnamed: 0,cell_type,exp,knockdown,rep
FUS_TAF15_shRNA_1,MN,FUS_TAF15_shRNA_1,FUS_TAF15,1
FUS_TAF15_shRNA_2,MN,FUS_TAF15_shRNA_2,FUS_TAF15,2
FUS_shRNA_1,MN,FUS_shRNA_1,FUS,1
FUS_shRNA_2,MN,FUS_shRNA_2,FUS,2
Scrm_1,MN,Scrm_1,Scrm,1
Scrm_2,MN,Scrm_2,Scrm,2
TAF15_TDP43_shRNA_1,MN,TAF15_TDP43_shRNA_1,TAF15_TDP43,1
TAF15_TDP43_shRNA_2,MN,TAF15_TDP43_shRNA_2,TAF15_TDP43,2
TAF15_shRNA_1,MN,TAF15_shRNA_1,TAF15,1
TAF15_shRNA_2,MN,TAF15_shRNA_2,TAF15,2
