In [None]:
import os, re, matplotlib, pandas, collections, importlib, sys
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sys.path.append('/Users/dp/pma/')

import sameRiver
import sameRiver.heatmapMaker
import sameRiver.biotypeAdder
import sameRiver.scheme
import sameRiver.positiveCounts
from sameRiver.readsPerGene import *
import sameRiver.metadata.negative_metadata as negative_metadata
import sameRiver.metadata.positive_metadata_all as positive_metadata

# Hard-coded locations for this analysis: inputs and tables live under `top`,
# rendered figures are written to `figs_dir`.
figs_dir = '/Users/dp/pma/dataAndScripts/clip/figs/'
top = '/Users/dp/pma/dataAndScripts/clip/old_mapping_meta/'

# Later cells use relative paths, so make `top` the working directory and
# ensure the output folders exist.
os.chdir(top)
for output_folder in ('logs', 'tables'):
    os.makedirs(f'{top}/{output_folder}/', exist_ok=True)

# Useful function.
def numeric_columns(df):
    """Return the labels of the numeric columns of ``df``, in column order.

    A column counts as numeric when its dtype kind is boolean, signed or
    unsigned integer, float or complex. Dtypes are read from ``df.dtypes``
    rather than ``df[label].dtype`` so that frames with duplicated column
    labels are handled (each occurrence is judged on its own dtype).

    Args:
        df: A pandas DataFrame.

    Returns:
        A list of column labels; empty when ``df`` has no numeric columns.
    """
    # 'u' (unsigned int) is included so that e.g. uint8/uint32 count columns
    # are not silently dropped.
    return [label for label, dtype in zip(df.columns, df.dtypes)
            if dtype.kind in 'biufc']


In [None]:
importlib.reload(sameRiver.heatmapMaker)

#####################################################################
# Heatmap figure early in paper showing reads per gene for RBPs.
#####################################################################

# RBPs included in the heatmap ('RPL5' is currently commented out of the list).
positive_metadata.positive_proteins = [
    'FBL',
    'hnRNPC',
    'SF3B1',
    'PCBP1',
    'CELF1',
    'Rbfox1',
    'Rbfox2',
    'hnRNPD',
    'A1CF',
    'FUBP1',
    'KHDRBS2',
    #'RPL5',
]

# Previously used to build and save the positive counts object:
#positives = sameRiver.positiveCounts.positiveCounts(positive_metadata, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')
#positives.save(write_object=True, write_txt=True)

# The counts are read through a readsPerGene object to ensure the blacklist is applied.
rpg = readsPerGene(
    f'{top}/data/positive_counts_per_read.txt',
    scheme_filename=f'{top}/scheme.xlsx',
)
df = rpg.df

# Numeric columns only, then only those whose name mentions one of the RBPs.
numeric = df[numeric_columns(df)].copy()
rbp_columns = [
    column for column in numeric.columns
    if any(rbp in column for rbp in positive_metadata.positive_proteins)
]
numeric = numeric.loc[:, rbp_columns]

# Record which datasets went into the heatmap as an importable Python literal.
with open(f'{top}/logs/datasets_used_for_heatmap.py', 'w') as log_file:
    log_file.write(f'datasets_used_for_heatmap = {list(numeric.columns)}\n')
    

def figure_index(numeric, scheme_filename='scheme.xlsx'):
    """Rename the columns of ``numeric`` in place to "<protein> R<n>" labels.

    Column names are expected to start with an experiment field followed by
    an underscore (the first run of digits in that field is the experiment
    number). Within each protein, as reported by ``scheme.gene_from_fname``,
    columns are ranked by experiment number and the rank becomes the
    replicate number, which pairs WT and MUT samples by batch.

    Args:
        numeric: DataFrame whose ``columns`` are dataset file names; its
            ``columns`` attribute is overwritten.
        scheme_filename: Path of the scheme spreadsheet passed to
            ``sameRiver.scheme.scheme``. Defaults to ``'scheme.xlsx'``,
            resolved against the current working directory.

    Returns:
        None. ``numeric`` is modified in place.

    Raises:
        ValueError: If a column name has no digits in its experiment field.
    """
    scheme = sameRiver.scheme.scheme(scheme_filename)

    def relabel(cols_to_relabel):
        """Return "<protein> R<n>" labels, one per name in ``cols_to_relabel``."""
        cols_to_relabel = list(cols_to_relabel)

        # Look each protein up once instead of once per use.
        protein_of = {fname: scheme.gene_from_fname(fname) for fname in cols_to_relabel}

        # protein -> [(experiment number, fname), ...] in column order.
        by_gene = collections.defaultdict(list)
        for fname in cols_to_relabel:
            exp_field = fname.split('_')[0]
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            found = re.search(r'(\d+)', exp_field)
            if found is None:
                raise ValueError(
                    f'No experiment number in column name {fname!r} '
                    f'(first field {exp_field!r}).')
            by_gene[protein_of[fname]].append((int(found.group(1)), fname))

        # fname -> replicate number (1-based rank by experiment number).
        # The sort is stable and keyed on the number only, so ties keep column
        # order; setdefault keeps the first rank if a name occurs twice.
        replicate_of = {}
        for tagged_fnames in by_gene.values():
            ranked = sorted(tagged_fnames, key=lambda tagged: tagged[0])
            for replicate, (_, fname) in enumerate(ranked, start=1):
                replicate_of.setdefault(fname, replicate)

        # New column names in "A1CF:E34K R2" format.
        return [f'{protein_of[fname]} R{replicate_of[fname]}' for fname in cols_to_relabel]

    numeric.columns = relabel(numeric.columns)

# Change column names from Exp#_protein_BC_BC format to Protein R# format.
figure_index(numeric)

# Subset to the most bound RNAs: rank rows by their total across all datasets
# and keep the top ones. (Previously the per-row sum was computed but never
# sorted on, so the first rows in file order were kept instead.)
n_top_rnas = 1000  # Used 5000 for the figure for RBPs.
total_per_rna = numeric.sum(axis=1).to_numpy()
# Positional, stable ranking so duplicated index labels and ties are handled.
top_rows = np.argsort(-total_per_rna, kind='stable')[:n_top_rnas]
numeric = numeric.iloc[top_rows].copy()

# Make a correlation heatmap. Default is by spearman.
# NOTE(review): a later cell writes its table and figure to these same
# "..._randos..." file names, which overwrites this output - confirm the
# intended file names for the RBP heatmap.
hm = sameRiver.heatmapMaker.heatmapMaker()
hm.heatmap(
    numeric, table_filename=f'{top}/tables/spearman_correlations_randos.xlsx',
    fig_filename=figs_dir + '/spearman_correlations_randos_heatmap.pdf',
    edit_column_names=False, cutoff=10,
    cmap=sns.cubehelix_palette(100, start=.5, rot=-0.5, as_cmap=True))


In [None]:
importlib.reload(sameRiver.heatmapMaker)
#_scheme = sameRiver.scheme.scheme('/Users/dfporter/pma/dataAndScripts/clip/miseq/Runs/190326/meta/scheme.xls')

# Counts for the negative-control datasets.
top_dir = '/Users/dp/pma/dataAndScripts/clip/meta/'
df = pandas.read_csv(f'{top_dir}/data/negative_counts_per_read.txt', sep='\t', index_col=0)

#ba = sameRiver.biotypeAdder.biotypeAdder.load()
#df = ba.add_biotypes_column_from_gene_name(df)

#df['Gene type'] = [re.sub('.*/.*', 'Repetitive element', x) for x in df['Gene type']]
#df['Gene type'] = [re.sub('LTR/.*', 'LTR', x) for x in df['Gene type']]
#df['Gene type'] = [re.sub('DNA/hAT.*', 'DNA/hAT', x) for x in df['Gene type']]
# Drop rows annotated as repetitive elements.
df = df[df['Gene type']!='Repetitive element']

# Drop the Exp61 PCBP1 mutant datasets. drop() returns a new frame, which
# avoids deleting columns one at a time from the filtered frame above.
#to_del = [x for x in df.columns if not re.search('PCBP1', x)]
excluded_patterns = ('Exp61_PCBP1-100P', 'Exp61_PCBP1-dKH')
to_del = [
    col for col in df.columns
    if any(re.search(pattern, col) for pattern in excluded_patterns)
]
df = df.drop(columns=to_del)

numeric = df[numeric_columns(df)].copy()

hm = sameRiver.heatmapMaker.heatmapMaker()

figs_dir = '/Users/dp/pma/dataAndScripts/clip/figs/'
tables_dir = './tables/'

# Create the tables folder without shelling out; no-op when it already exists.
os.makedirs(tables_dir, exist_ok=True)

hm.heatmap(numeric, table_filename=tables_dir + '/spearman_correlations_randos.xlsx',
          fig_filename=figs_dir + '/spearman_correlations_randos_heatmap.pdf',
          edit_column_names=False, cutoff=10)


In [None]:
import sameRiver
from sameRiver import *

from sameRiver.stacked_bargraph import *


importlib.reload(sameRiver.stacked_bargraph)
from sameRiver.stacked_bargraph import *

In [None]:
"""
Figure 1.
FBL and hnRNPC were anti-correlated
"""
import sameRiver.heatmapMaker

importlib.reload(sameRiver.countsO)
importlib.reload(sameRiver.statsForCounts)
importlib.reload(sameRiver.heatmapMaker)


def figure_5b_heatmap(_df):
    """Cluster datasets by Spearman correlation and save the clustermap.

    Works on a copy of ``_df``. Keeps the columns returned by
    ``countsO.find_hits_columns``, keeps the rows whose average over those
    columns exceeds 100, drops rows with any missing value, and computes the
    Spearman correlation between columns. The correlation table is written
    to '../tables/spearman_correlations.xls' and the clustered heatmap to
    '../figs/heatmap_FBL_hnRNPC.pdf' (both relative to the working directory).

    Args:
        _df: DataFrame of counts; it is not modified.

    Returns:
        None.
    """
    df = _df.copy()
    # Return value is ignored, so this presumably edits `df` in place - TODO confirm.
    sameRiver.countsO.countsO.drop_odds(df)

    hits_columns = sameRiver.countsO.countsO.find_hits_columns(df, verbose=True)
    df = df[hits_columns]

    # Average per row over the hit columns; max(..., 1) guards against an
    # empty column list. Computed as a separate Series so no helper columns
    # have to be added to (and removed from) a slice of the frame.
    n_cols = len(hits_columns)
    average = df.sum(axis=1, numeric_only=True) / max(n_cols, 1)
    df = df[average > 100]
    print("Clustering using {0} RNAs.".format(len(df.index)))

    m = df.dropna(axis=0, how='any')
    corr_m = m.corr(method='spearman')

    # NOTE(review): writing '.xls' needs an Excel engine that recent pandas
    # releases may no longer provide; other cells write '.xlsx' - confirm.
    corr_m.to_excel('../tables/spearman_correlations.xls')

    sns.set(font_scale=1)

    # clustermap creates its own figure, so no figure is opened beforehand.
    # `square=True` is not passed: clustermap ignores it with a warning.
    hm = sns.clustermap(corr_m, yticklabels=1, xticklabels=1,
                        cmap=sns.light_palette('black', as_cmap=True))

    plt.savefig('../figs/heatmap_FBL_hnRNPC.pdf')
    plt.show()
    plt.close()


def is_usable(col):
    """Return False when ``col`` contains any excluded substring, else True."""
    excluded_substrings = ('nknown', 'xPCBP1', 'ox', 'xSF3B1', 'hnRNPC:', 'CSRP2')
    return not any(fragment in col for fragment in excluded_substrings)

def in_fig5b(col):
    """Return True when ``col`` mentions hnRNPC or FBL and contains no ':'."""
    mentions_target = any(
        re.search(pattern, col) is not None for pattern in ('hnRNPC', 'FBL'))
    has_colon = re.search(':', col) is not None
    return mentions_target and not has_colon

hm = sameRiver.heatmapMaker.heatmapMaker()

# NOTE(review): this home directory ('/Users/dfporter/') differs from the
# '/Users/dp/' paths used elsewhere in this notebook - confirm it is intended.
figs_dir = '/Users/dfporter/pma/dataAndScripts/clip/figs/'
tables_dir = './tables/'

# Create the tables folder without shelling out; no-op when it already exists.
os.makedirs(tables_dir, exist_ok=True)

# Heatmap of whatever `df` currently holds from the cells run before this one.
hm.heatmap(df, table_filename=tables_dir + '/spearman_correlations_randos.xlsx',
          fig_filename=figs_dir + '/spearman_correlations_randos_heatmap.pdf',
          edit_column_names=False)

# NOTE(review): no earlier cell in view defines `counts` as an object with a
# `counts_per_million_df` attribute - confirm where it comes from.
# FBL / hnRNPC columns only (see in_fig5b); skipped when none match.
df = counts.counts_per_million_df[
    [x for x in counts.counts_per_million_df.columns if in_fig5b(x)]]
if len(df.columns):
    hm.heatmap(df, table_filename=tables_dir + '/FBL_hnRNPC_spearman_correlations.xlsx',
              fig_filename=figs_dir + '/spearman_correlations_FBL_hnRNPC_heatmap.pdf',
              cutoff=50)

# PCBP1 columns whose name does not contain 'ox'.
df = counts.counts_per_million_df[
    [x for x in counts.counts_per_million_df.columns if (('PCBP1' in x) and ('ox' not in x))]
]
#print(df)
# NOTE(review): the PCBP1 subset above is replaced by the full table on the
# next line, so the "PCBP1" heatmap is drawn from every column - confirm
# whether this override is intended.
df = counts.counts_per_million_df
hm.heatmap(df, table_filename=tables_dir + '/PCBP1_spearman_correlations.xlsx',
          fig_filename=figs_dir + '/spearman_correlations_PCBP1_heatmap.pdf',
        edit_column_names=False,
          cutoff=100, annot=True)


In [None]:
# Show the column labels of the raw counts table.
# NOTE(review): no earlier cell in view defines `counts` as an object with a
# `raw_counts_df` attribute - presumably it comes from a removed cell or an
# earlier session; confirm before relying on Restart & Run All.
print(counts.raw_counts_df.columns)

In [None]:
"""
Figure 5A.
First, the data was consistent between replicates (Figure 5A). 
"""

def replicate_heatmap_figure_5a(
    _counts, proteins=set(['FBL', 'hnRNPC'])):
    """Plot replicate 1 against replicate 2 (log10 counts) for each protein.

    For every protein, the first two columns returned by
    ``_counts.cols_of_protein(protein)`` are log10-transformed and drawn as
    a seaborn jointplot, using only the rows where both log10 values are
    finite and greater than 1. Each figure is saved to
    '../figs/<protein>_replicates.pdf' relative to the working directory.
    Proteins with fewer than two columns, or with no rows passing the
    filter, are skipped.

    Args:
        _counts: Object exposing a ``counts_df`` DataFrame and a
            ``cols_of_protein(protein)`` method returning column labels.
        proteins: Iterable of protein names to plot.

    Returns:
        None.
    """
    df = _counts.counts_df.copy()
    # Return value is ignored, so this presumably edits `df` in place - TODO confirm.
    drop_odds(df)

    df = df.dropna(axis=0, how='any')

    for protein in proteins:

        cols = _counts.cols_of_protein(protein)
        print(cols)
        print("Jointplot for ", protein)
        print('---')

        if len(cols) < 2:
            continue

        # log10 of zero or negative counts gives -inf/nan; those rows are
        # removed by the filter below, so the warnings are silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            _x = np.log10(np.array(df[cols[0]], dtype=float))
            _y = np.log10(np.array(df[cols[1]], dtype=float))
        xy = np.array([_x, _y]).T

        # Keep rows where both replicates are finite and above 1 (log10 scale).
        usable = np.isfinite(xy).all(axis=1) & (xy > 1).all(axis=1)
        xy = xy[usable]
        if len(xy) == 0:
            continue

        # Discard any figure left open before drawing this one.
        plt.close()
        sns.set_style('ticks')

        # jointplot builds its own figure, so none is created beforehand.
        # x/y are passed by keyword: recent seaborn releases reject them
        # as positional arguments.
        hb = sns.jointplot(x=xy[:, 0], y=xy[:, 1],
                           alpha=0.2,
                           color='k')
        hb.set_axis_labels('Log10 reads/gene Rep 1', 'Log10 reads/gene Rep 2')

        fig = plt.gcf()
        fig.savefig('../figs/{0}_replicates.pdf'.format(protein))
        plt.show()
        plt.close(fig)
        print('==')

        
def keep(_df, to_keep=['hnRNPC', 'FBL']):
    """Drop, in place, the columns of ``_df`` matching none of ``to_keep``.

    Each entry of ``to_keep`` is used as a regular expression searched for
    in the column name. Columns with no match are relabelled 'None' and then
    deleted. The same (modified) frame is returned.
    """

    def label_for(column_name):
        # Keep the name when any pattern is found in it, else mark it 'None'.
        matched = any(
            re.search(pattern, column_name) is not None for pattern in to_keep)
        return column_name if matched else 'None'

    _df.columns = [label_for(column_name) for column_name in _df.columns]

    if 'None' in _df.columns:
        del _df['None']

    return _df


#df = pandas.read_csv('/Users/dfporter/pma/miseq/meta/counts.txt', sep='\t')
#_df = keep(df,
#          to_keep=['TTA']
          #to_keep=[ 'CDK4', 'TPGS2', 'UBA2', 'ITPA']
          #to_keep=protein_order)
          #to_keep=['SF3B1'])
#         )

# Draw the replicate jointplots for the default proteins.
# NOTE(review): no earlier cell in view defines `counts` as an object with
# `counts_df` / `cols_of_protein` - confirm where it comes from.
replicate_heatmap_figure_5a(counts)


In [None]:
import glob
def line_nums(scheme_filename='./scheme.xlsx', bed_glob='./beds/*bed'):
    """Count the lines of every bed file that is listed in the scheme.

    Args:
        scheme_filename: Path of the scheme spreadsheet passed to
            ``sameRiver.scheme.scheme``.
        bed_glob: Glob pattern selecting the bed files to count.

    Returns:
        A ``(total_counts, counts_by_fname)`` tuple.
        ``total_counts`` is a defaultdict(int) mapping
        ``scheme.gene_from_fname(basename)`` to the summed line count of its
        files. ``counts_by_fname`` maps each file's base name (the text
        before '.bed') to its line count. Files whose base name is not in
        ``scheme.long_basename_to_info`` are ignored.
    """
    scheme = sameRiver.scheme.scheme(scheme_filename)
    total_counts = collections.defaultdict(int)
    counts_by_fname = {}

    # Sorted so the result order does not depend on directory listing order.
    for bedname in sorted(glob.glob(bed_glob)):
        bname = os.path.basename(bedname).split('.bed')[0]
        if bname not in scheme.long_basename_to_info:
            # Skip before reading: files outside the scheme are never counted.
            continue

        # Context manager so the handle is closed after counting.
        with open(bedname, 'rb') as bed_file:
            n_lines = sum(1 for _ in bed_file)

        total_counts[scheme.gene_from_fname(bname)] += n_lines
        counts_by_fname[bname] = n_lines

    return total_counts, counts_by_fname
# Count bed-file lines (per gene and per file) and show both mappings.
total_counts, counts_by_fname = line_nums()
print(total_counts, '\n', counts_by_fname)

In [None]:
# One table row per bed file. The bar label is the file base name itself
# (the split to a protein name is disabled).
replicate = collections.defaultdict(int)
rows = []
for fname, n_reads in counts_by_fname.items():
    prot = fname  #.split('_')[1]
    replicate[prot] += 1
    rows.append({'Protein': prot, 'Unique mapped reads': n_reads})

df = pandas.DataFrame(rows)

# Names that the (currently disabled) filter below would exclude.
randos = negative_metadata.random_proteins + [
    'HCT116', 'CCIN', 'EPB41L5',
    'PCBP1-100P', 'PCBP1-100Q', 'PCBP1-dKH', 'SF3B1-K700E',
]
#df = df.loc[[(re.sub(':', '-', x) not in randos) for x in df['Protein']]]

# Largest libraries first.
df = df.sort_values(by='Unique mapped reads', ascending=False)
print(df)
df['Unique mapped reads (millions)'] = df['Unique mapped reads'] / 1E6

# Black bar chart of mapped reads per dataset, saved as PDF.
sns.set(font_scale=2)
sns.set_style('ticks')
plt.figure(figsize=(10, 10))
g = sns.barplot(data=df, x='Protein', y='Unique mapped reads (millions)', color='k')
plt.xticks(rotation=90)
plt.savefig('../../figs/num_mapped_reads_barchart.pdf')
plt.show()
plt.clf()

In [None]:
import dill

# Load the saved stats object from disk.
# NOTE: dill.load, like pickle, can execute arbitrary code contained in the
# file - only load files from a trusted source.
stats_path = 'data/stats.dill'
print("Loading {}...".format(stats_path))
with open(stats_path, 'rb') as stats_file:
    statsO = dill.load(stats_file)
print("...Loaded.")


In [None]:
# RNAs whose exon rows are examined. Currently disabled entries:
# 'CAPNS2', 'CMAS', 'CSRP2', 'FBL', 'HNRNPC', 'PCBP1'.
clipped_names_rna = {
    'ITPA',
    'CCIN',
    'CDK4',
    'TPGS2',
    'IDE',
    'UBA2',
    'EPB41L5',
    'DCTN6',
    'CHMP3',  #<- VPS24 RNA name
    'ETS2',
}
clipped_names = clipped_names_rna

# Row labels of the counts table use the "<gene>::exon" form.
exons = [f'{rna}::exon' for rna in clipped_names_rna]

# Numeric columns of the stored counts-per-million table (printed for reference).
numeric_cols = statsO.positives.numeric_columns(statsO.positives.counts_per_million_df)
print(numeric_cols)
print(os.getcwd())

# The counts actually used below are re-read from disk.
df = pandas.read_csv('./data/positive_counts_per_read.txt', sep='\t', index_col=0)
numeric_cols = statsO.positives.numeric_columns(df)

# Subset to the RNAs of interest: one dict per exon row, tagged with its name.
counts = [
    {**dict(df.loc[exon, numeric_cols]), 'gene_name': exon}
    for exon in exons
]

def to_prot(x):
    """Return the second '_'-separated field of ``x``."""
    fields = x.split('_')
    return fields[1]

# One row per RNA of interest, one column per dataset (plus 'gene_name').
df = pandas.DataFrame(counts)
df.index = df['gene_name']

# Proteins present in the column names that are also RNAs of interest.
# ('gene_name' maps to 'name' under to_prot and drops out of the intersection.)
proteins = set(to_prot(col) for col in df.columns) & set(clipped_names)

# Average the replicate columns of each protein. Proteins are visited in
# sorted order so the column order of the table (and of the heatmap drawn
# from it) is the same in every kernel session.
protein_means = {}
for protein in sorted(proteins):
    replicate_cols = [col for col in df.columns if to_prot(col) == protein]
    protein_means[protein] = df.loc[:, replicate_cols].mean(axis=1)

# Rows are RNAs, columns are proteins. Missing averages become 0; the fill
# result is assigned here (it was previously computed and thrown away).
df = pandas.DataFrame(protein_means).fillna(0.)
df.index.name = None
df.columns.name = 'Protein'
print(df.columns)

skippy = """
pvals = [
    (x, dict(
        [(name, -np.log10(np.max([1E-6, np.mean(val)])) )  for name, val in statsO.pvals_kde[x].items()]
    )) for x in exons]

def to_df(counts):
    print(counts[0][1].keys())
    rows = []
    for rna, _dict in counts:

        to_del = set()
        for protein, pval in _dict.items():
            if protein not in clipped_names:
                to_del.add(protein)
            
        rows.append({
            'RNA': rna.split('::')[0]
        })
        
        rows[-1].update(_dict)
    df = pandas.DataFrame(rows)
    df = df[df.columns[[(x not in to_del) for x in df.columns]]]
    
    
    df.index = df.RNA
    print('.jsfad;lf,', df)
    return df

df = to_df(pvals)
df = df[df.index]

fig = plt.figure()
sns.heatmap(df, cmap='Greys')
fig.set_figwidth(5)
plt.title('-log10(P value)')
plt.xlabel('Protein')
fig.savefig('../../figs/randos_bind_their_own_rnas_pvalue.pdf')
plt.show()
plt.clf()
for _ in statsO.pvals_kde.items():
    print(_)
    break"""

#df = to_df(counts)
#df = df[df.index]

fig = plt.figure()
df = df.astype(np.float64)

# Order the RNA rows like the protein columns, so that each protein's own RNA
# lines up with it. Rows are labelled "<gene>::<feature>".
order = df.columns.to_list()
# name -> position of its first occurrence in `order`.
column_rank = {}
for position, name in enumerate(order):
    column_rank.setdefault(name, position)
# RNAs with no matching protein column sort last instead of raising ValueError.
r_order = sorted(
    df.index,
    key=lambda rna: column_rank.get(rna.split('::')[0], len(order)))
print(r_order)
df = df.reindex(r_order)
print(df)

# Cap the colour scale: values are limited to the range [0, 5000].
df = df.clip(lower=0, upper=5E3)

sns.heatmap(df, cmap='Greys')
fig.set_figwidth(5)
plt.xlabel('Protein')
plt.title('Counts per million')
fig.savefig('../../figs/randos_bind_their_own_rnas_counts.pdf')
plt.show()
# Close the figure so it is released rather than left open and empty.
plt.close(fig)
#df.replace(np.nan, 0.1, inplace=True)
#df.replace(0, 0.1, inplace=True)

#m = np.clip(df, a_min=0.1, a_max=1E3)
#print(m)
