In [1]:
from Modules import generate_bed_files
from Modules import get_gene_exons
from Modules import exon_dist
import pandas as pd
import numpy as np
import glob
from matplotlib import pyplot as plt
import matplotlib as mpl
import time

In [2]:
def get_target_genes():
    """Return the gene names starting with 'Hb' from the Lab_Pvalb TPM table.

    The table is read from 'Datasets/Lab_Pvalb-tpm.tsv' (tab separated, first
    column used as the index); only the index labels are returned, as a list.
    """
    expression = pd.read_csv(
        'Datasets/Lab_Pvalb-tpm.tsv',
        sep='\t',
        header=0,
        index_col=0,
        usecols=[0,1,2],
    )
    # Keep only rows whose index label begins with 'Hb'.
    is_target = expression.index.str.startswith('Hb')
    selected = expression.loc[is_target,:]

    return selected.index.tolist()

def generate_references():
    """Create one gene reference file per target gene.

    The target gene list is passed to get_gene_exons.generate_genome, and the
    object it returns is then handed to
    get_gene_exons.create_gene_reference_file once for every gene.
    """
    target_genes = get_target_genes()
    genome = get_gene_exons.generate_genome(genelist=target_genes)

    for gene_name in target_genes:
        get_gene_exons.create_gene_reference_file(genome, gene_name)

def get_genes_data():
    """Return get_gene_exons.get_genes_data(...) evaluated for the target genes."""
    return get_gene_exons.get_genes_data(get_target_genes())

def get_targets():
    """Return the targets that generate_bed_files derives from the gene data table."""
    genes_table = get_genes_data()

    return generate_bed_files.get_targets_from_dataframe(genes_table)

def create_bed_files(dataset='Lab_Pvalb'):
    """Run generate_bed_files.generate_data on the current targets for ``dataset``.

    Existing output is overwritten (``overwrite=True``).
    """
    # NOTE(review): ``dataset`` is supplied both positionally and by keyword.
    # If the second positional parameter of generate_data is itself named
    # ``dataset`` this call raises TypeError -- confirm against the helper's
    # signature before changing it.
    generate_bed_files.generate_data(get_targets(), dataset, overwrite=True, dataset=dataset)

def get_cell_reads(df_data, fname, cell):
    """Fill column ``cell`` of ``df_data`` with the read coverage found in ``fname``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame whose index yields 6-tuples
        ``(gene, name, chromosome, strand, start, end)``; modified in place.
    fname : str
        Tab-separated, header-less file with the columns
        Chrome, Strand, Start, End, Size.
    cell : hashable
        Column of ``df_data`` that receives the per-row result.

    For every index row, the reads on the same chromosome and strand with
    ``Start < end`` and ``End > start`` are clipped to ``[start, end]``; each
    contributes ``(clipped span) / Size`` and the sum is stored. Rows without
    matching reads get 0.0.
    """
    reads = pd.read_csv(
        fname,
        sep='\t',
        header=None,
        names=['Chrome', 'Strand', 'Start', 'End', 'Size'],
        dtype={'Chrome': str},
    )
    # Remove only a leading 'chr'. str.strip('chr') treats its argument as a
    # character set and would also eat trailing c/h/r characters of a name.
    reads['Chrome'] = reads['Chrome'].str.replace(r'^chr', '', regex=True)

    for key in df_data.index:
        gene, name, chromosome, strand, start, end = key

        # NOTE(review): the strict inequalities exclude reads that touch the
        # interval only at its boundary base -- confirm this is intended.
        overlapping = (
            (reads['Start'] < end)
            & (reads['End'] > start)
            # Chrome is read as str, so compare as str; this is a no-op when
            # the index already stores chromosome names as strings.
            & (reads['Chrome'] == str(chromosome))
            & (reads['Strand'] == strand)
        )
        hits = reads.loc[overlapping]

        # Restrict each read to the part that lies inside [start, end].
        clipped_start = hits['Start'].clip(lower=start)
        clipped_end = hits['End'].clip(upper=end)
        span = clipped_end - clipped_start + 1

        ratio = span.values.astype(float) / hits['Size'].values.astype(float)
        df_data.loc[key, cell] = ratio.sum()

    return

def get_read_data(df_data, dataset='Lab_Pvalb', bed_dir='/media/soma/Storage1/BedFiles'):
    """Collect per-cell read coverage for every row of ``df_data``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Table with the columns Gene, Chromosome, Strand, Start and End; its
        index becomes the 'Index' level of the result.
    dataset : str
        Sub-directory of ``bed_dir`` holding one ``*.bed`` file per cell.
    bed_dir : str
        Root directory of the bed files. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Rows are indexed by (Gene, Index, Chromosome, Strand, Start, End) and
        there is one column per bed file, named after the file without its
        extension. Values are filled in by get_cell_reads.
    """
    import os

    fnames = sorted(glob.glob('%s/%s/*.bed' % (bed_dir, dataset)))
    cells = [os.path.splitext(os.path.basename(fname))[0] for fname in fnames]

    columns = [df_data.Gene, df_data.index, df_data.Chromosome, df_data.Strand, df_data.Start, df_data.End]
    names = ('Gene', 'Index', 'Chromosome', 'Strand', 'Start', 'End')
    index = pd.MultiIndex.from_arrays(columns, names=names)

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    df = pd.DataFrame(np.nan, index=index, columns=cells)

    # get_cell_reads writes its results into ``df`` in place.
    for fname, cell in zip(fnames, cells):
        get_cell_reads(df, fname, cell)

    return df

def adjust_sizes(genes, size):
    """Shorten, in place, the first and last entry of ``size`` for every gene.

    ``genes`` and ``size`` are parallel arrays. For each distinct gene the
    first matching entry is reduced by 35 and the last matching entry is
    reduced by 35 but never below 10. Nothing is returned.
    """
    positions = np.arange(size.size)

    for gene in np.unique(genes):
        members = positions[genes==gene]
        first, last = members[0], members[-1]

        # NOTE(review): only the last entry is clamped at 10; a first entry
        # shorter than 35 becomes non-positive, and a gene with a single entry
        # is reduced twice -- confirm this asymmetry is intended.
        size[first] -= 35
        size[last] = max(10, size[last]-35)

    return

def compile_read_data(dataset='Lab_Pvalb', base_dataset=''):
    """Compute per-exon read rates for ``dataset`` and write them to a TSV file.

    Parameters
    ----------
    dataset : str
        Dataset whose bed files are read; also used in the output file name
        'exon defs/exon_rates_<dataset>.tsv'.
    base_dataset : str
        Dataset whose SRR-to-cell table
        ('../../reference data/SRRreferences/SRR<base_dataset>.txt') is used
        to rename columns. An empty string means "same as ``dataset``".

    Steps: read coverage per bed file, rename the columns from SRR id to cell
    name, sum columns that map to the same cell, then divide every row by its
    adjusted interval length (see adjust_sizes) and multiply by 1000.
    """
    if len(base_dataset) == 0:
        base_dataset = dataset
    df_data = get_gene_exons.get_genes_data(get_target_genes())
    df = get_read_data(df_data, dataset=dataset)

    fname = '../../reference data/SRRreferences/SRR%s.txt' % base_dataset
    df_conv = pd.read_csv(fname, sep='\t', header=None, skiprows=1, index_col=0, names=['SRR', 'Cell'])
    df.columns = df_conv.loc[df.columns,'Cell']
    # Sum the columns that share a cell name. Grouping on the transposed frame
    # avoids groupby(axis=1), which recent pandas releases deprecate/remove.
    df = df.T.groupby(level=0).sum().T

    level = df.index.get_level_values
    # np.array(...) makes a private, writable copy: adjust_sizes mutates it.
    size = np.array(level('End') - level('Start') + 1)
    adjust_sizes(level('Gene'), size)
    # Row-wise division by the interval length, scaled by 1000.
    df = df.div(size, axis=0) * 1000

    df.to_csv('exon defs/exon_rates_%s.tsv' % dataset, sep='\t')

    return

def get_read_data_normalized_by_gene(fname='exon defs/exon_rates.tsv', index_col=(0, 1, 2, 3, 4)):
    """Load a rate table and express every row as a percentage of its gene total.

    Parameters
    ----------
    fname : str
        Tab-separated file with a header row. Defaults to the path that was
        previously hard-coded.
    index_col : sequence of int
        Positions of the columns forming the row index; one of the resulting
        levels must be named 'Gene'. Defaults to the previous hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Same shape and index as the loaded table; each value is
        ``100 * value / (column sum over rows sharing the same Gene)``.
        A gene whose total is 0 in a column yields NaN there (0/0).

    NOTE(review): compile_read_data in this notebook writes files named
    'exon_rates_<dataset>.tsv' with six index levels; reading one of those
    needs ``index_col=range(6)`` -- confirm which layout the default file has.
    """
    df = pd.read_csv(fname, sep='\t', header=0, index_col=list(index_col))

    gene_of_row = df.index.get_level_values('Gene')

    # Per-gene column totals, repeated so they line up row by row with ``df``.
    # The ``axis`` keyword of groupby is deprecated in recent pandas, so the
    # default (group rows) is relied on instead of passing axis=0.
    gene_totals = df.groupby(level='Gene').sum()
    gene_totals = gene_totals.loc[gene_of_row,:]
    gene_totals.index = df.index

    return df / gene_totals * 100

def get_stats(df):
    """Return row-wise summary statistics of ``df``.

    The result shares ``df``'s index and has the columns
    Mean, Std_Dev, Count (number of non-missing values in the row) and
    Std_Err (Std_Dev / sqrt(Count)).
    """
    columns = ['Mean', 'Std_Dev', 'Count', 'Std_Err']

    std_dev = df.std(axis=1)
    count = df.shape[1] - df.isna().sum(axis=1)

    # Built directly from the computed series; the previous NaN-filled
    # placeholder relied on np.NaN, which no longer exists in NumPy 2.0.
    df_stat = pd.DataFrame(
        {
            'Mean': df.mean(axis=1),
            'Std_Dev': std_dev,
            'Count': count,
            'Std_Err': std_dev / np.sqrt(count),
        },
        columns=columns,
    )

    return df_stat

def get_plot_data():
    """Return the gene-normalised rates and their row statistics for plotting.

    Rows with a Count of 5 or fewer non-missing values are dropped from both
    frames, and both are re-indexed by (Gene, Exon), where Exon is taken from
    the 'Index' level of the loaded table.
    """
    rates = get_read_data_normalized_by_gene()
    stats = get_stats(rates)

    # Keep only rows that have more than five non-missing values.
    enough_cells = stats.Count > 5
    rates = rates.loc[enough_cells,:].copy()
    stats = stats.loc[enough_cells,:].copy()

    exon_index = pd.MultiIndex.from_arrays(
        [rates.index.get_level_values('Gene'), rates.index.get_level_values('Index')],
        names=('Gene', 'Exon'),
    )
    rates.index = exon_index
    stats.index = exon_index

    return rates, stats

def plot_gene_map(df, df_stat, gene, ax):
    """Draw the per-row profile of one gene on ``ax``.

    Parameters
    ----------
    df : pandas.DataFrame
        Values per row and column, indexed with a 'Gene' level.
    df_stat : pandas.DataFrame
        Same index as ``df`` with at least the columns Mean and Std_Err.
    gene : hashable
        Label selected from the 'Gene' level of both frames.
    ax : matplotlib axes-like object
        Target axes; it is configured and drawn on in place.

    The mean is drawn in red with a shaded +/- Std_Err band, and every column
    of ``df`` without missing values for this gene is drawn as a thin grey line.
    """
    # trim to data of interest
    df = df.xs(gene, level='Gene')
    df_stat = df_stat.xs(gene, level='Gene')

    n_rows = df_stat.shape[0]
    # one x position per row, centred in unit-wide slots
    xvals = np.arange(n_rows)+.5

    # set up axis
    ax.set_xticks(xvals)
    ax.set_xticklabels(df_stat.index.tolist(), rotation=90, ha='center', fontsize=6)
    ax.axis([0, n_rows, 0, 100])
    ax.set_yticks([0,25,50,75,100])
    ax.set_yticklabels([0,25,50,75,100], fontsize=6)
    ax.set_title(gene, fontsize=10)
    ax.set_ylabel('Expression Rate (%)', fontsize=8)

    # plot statistics
    ax.plot(xvals, df_stat.Mean, linewidth=1, color='red', zorder=2)
    ax.fill_between(xvals, df_stat.Mean-df_stat.Std_Err, df_stat.Mean+df_stat.Std_Err,
                    color='#FF6666', alpha=.33, zorder=1, linewidth=0
                   )

    # plot individual values
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for column, data in df.items():
        # skip columns that have a missing value anywhere in this gene
        if data.isna().any():
            continue
        ax.plot(xvals, data, linewidth=.5, color='grey', zorder=0)

    return

def plot_gene_maps(df, df_stat):
    """Create a letter-sized figure with one plot_gene_map panel per gene.

    Panels are laid out in two columns, filling rows from the top of the page
    downwards in the sorted order of the unique 'Gene' labels. Returns the
    figure.
    """
    fig = plt.figure(figsize=(8.5,11))

    gene_labels = np.unique(df.index.get_level_values('Gene'))

    for position, gene in enumerate(gene_labels):
        row, col = divmod(position, 2)

        # [left, bottom, width, height] in figure coordinates
        panel = fig.add_axes([.17 + .4*col, .75 - .15 * row, .33, .10])
        plot_gene_map(df, df_stat, gene, panel)

    return fig

In [3]:
# Dataset identifiers that the timing loops in the following cells iterate over.
datasets = ['GSE60361', 'GSE99888', 'Lab_Pvalb', 'Lab_OLM',
            'Cadwell', 'GSE70844', 'GSE119248', 'Gouwens_Hb',
            'Tasic_Hb'
           ]
# Opening in 'w' mode creates Times.txt or truncates an existing one, so the
# timing lines appended by later cells start from an empty file.
with open('Times.txt','w') as w:
    pass

In [4]:
%%time

# Write the per-gene reference files once, then create bed files per dataset.
generate_references()
for dataset in datasets:
    t0 = time.time()
    create_bed_files(dataset=dataset)
    # Append the elapsed wall-clock seconds for this dataset to Times.txt.
    with open('Times.txt','a') as w:
        w.write('Create Bed Files (%s):\t' % dataset + str(time.time() - t0) + '\n')

CPU times: user 2min 14s, sys: 32.2 s, total: 2min 47s
Wall time: 32min 32s


In [5]:
%%time

# Compile the exon rate table of every dataset and log how long each one took.
for dataset in datasets:
    t0 = time.time()
    compile_read_data(dataset=dataset)
    # Append the elapsed wall-clock seconds for this dataset to Times.txt.
    with open('Times.txt','a') as w:
        w.write('Compile Read Data (%s):\t' % dataset + str(time.time() - t0) + '\n')

CPU times: user 2h 23min 54s, sys: 2.48 s, total: 2h 23min 56s
Wall time: 2h 23min 59s


In [6]:
%%time

# Reference generation is left disabled in this cell.
#generate_references()
# Run exon_dist.generate_data for every dataset and log how long each one took.
for dataset in datasets:
    t0 = time.time()
    exon_dist.generate_data(dataset=dataset)
    # Append the elapsed wall-clock seconds for this dataset to Times.txt.
    with open('Times.txt','a') as w:
        w.write('Generate Distribution (%s):\t' % dataset + str(time.time() - t0) + '\n')

CPU times: user 1d 6h 27min 7s, sys: 41min 46s, total: 1d 7h 8min 53s
Wall time: 1d 7h 9min 57s


In [3]:
# Load the gene data table for the target genes (the same call compile_read_data makes).
df_data = get_gene_exons.get_genes_data(get_target_genes())

In [4]:
# Display the table as the cell output to inspect its columns and index.
df_data

Unnamed: 0_level_0,Gene,Chromosome,Strand,Start,End
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Hba-a1,11,+,32283511,32283801
2a,Hba-a1,11,+,32283811,32283924
2b,Hba-a1,11,+,32283924,32284128
3a,Hba-a1,11,+,32284263,32284438
3b,Hba-a1,11,+,32284438,32284465
1,Hba-a2,11,+,32296489,32296618
2a,Hba-a2,11,+,32296628,32296741
2b,Hba-a2,11,+,32296741,32296945
3a,Hba-a2,11,+,32297080,32297255
3b,Hba-a2,11,+,32297255,32297298
