In [None]:
import os, re, matplotlib, pandas, collections, importlib, sys, pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import __init__
sys.path = __init__.add_paths(sys.path)

from importlib import reload
import sameRiver
from sameRiver import *



In [None]:
def split_types(df, col):
    """Split ``df`` into its exon rows and its intron rows.

    Each value of ``df[col]`` is split on ``'::'`` and the second field
    decides the row's type: rows whose second field is ``'exon'`` go to the
    first returned frame, rows whose second field is ``'intron'`` go to the
    second. Values that are not strings, or that contain no ``'::'``, match
    neither type and are dropped from both results.

    Args:
        df: pandas DataFrame to split.
        col: name of the column in ``df`` holding the ``'::'``-separated labels.

    Returns:
        Tuple ``(exons, introns)``; each is an independent copy of the
        matching rows, with all of ``df``'s columns preserved (also when
        ``df`` has no rows).
    """
    def later_part(_str):
        # Second '::'-separated field, or '' when there is none.
        try:
            return _str.split('::')[1]
        except (AttributeError, IndexError, TypeError):
            # Not a string (e.g. NaN / number) or no '::' separator present.
            return ''

    kinds = [later_part(x) for x in df[col]]
    # Use explicit boolean arrays: an empty Python list would be interpreted
    # by DataFrame.__getitem__ as "select no columns" instead of "select no
    # rows", which silently strips every column from an empty input.
    is_exon = np.array([kind == 'exon' for kind in kinds], dtype=bool)
    is_intron = np.array([kind == 'intron' for kind in kinds], dtype=bool)
    return df[is_exon].copy(), df[is_intron].copy()

In [None]:
# Summarize, for every dataset column of the annotation count table, how the
# counts divide between exon rows and intron rows, then plot the intronic share.
top_dir = '/Users/dp/pma/dataAndScripts/clip/meta/'
df = pandas.read_csv(os.path.join(top_dir, 'ann_counts.txt'), sep='\t', index_col=0)
# split_types() works on a column, so expose the row labels (the index) as one.
df['gene'] = df.index

# Columns that are annotations rather than per-dataset counts.
non_count_columns = ['gene', 'gene_name', 'Gene type']
# A dataset column is left out when its name matches any of these regexes.
skip_patterns = ['PCBP', '100', 'SF3B', 'nknown', 'No vec', 'hnRNPC:']
# Datasets with fewer exonic + intronic counts than this are left out.
min_total_counts = 1E3

exons, introns = split_types(df, 'gene')
results = []
for col in df.columns:
    if col in non_count_columns:
        continue
    if any(re.search(pat, col) for pat in skip_patterns):
        continue

    exonic = exons[col].sum()
    intronic = introns[col].sum()
    total = exonic + intronic
    if total < min_total_counts:
        continue

    log_total = np.log10(total)
    results.append({
        '% Exonic': 100 * exonic/total,
        '% Intronic': 100 * intronic/total,
        'Total': total,
        'Total (log10)': log_total,
        # Intronic fraction scaled down by the log10 of the dataset size.
        'Intronic / total': (intronic/total) / log_total,
        'Protein_rep': col,
        # Second '_'-separated field of the column name; raises IndexError
        # for a column name without an underscore.
        'Protein': col.split('_')[1],
    })

if not results:
    # Without this, the sort below fails with an uninformative KeyError.
    raise ValueError('No dataset column passed the name and minimum-count filters.')

df = pandas.DataFrame(results)
df = df.sort_values(by='% Intronic')

_kwargs = {'palette': sns.cubehelix_palette(2*len(set(df['Protein'])), start=0, rot=-0.2)}

# lmplot is a figure-level function: it builds its own figure, so the size
# has to be passed to it directly (resizing a separately created figure has
# no effect on the plot). height=10, aspect=0.64 -> 6.4 x 10 inches.
grid = sns.lmplot(data=df, x='Total (log10)', y='% Intronic', height=10, aspect=0.64)
plt.show()
# close() rather than clf(): clf() would create a fresh empty figure if the
# backend already disposed of the shown one.
plt.close()

fig, ax = plt.subplots()
fig.set_figheight(12)
sns.barplot(data=df, y='Protein', x='% Intronic', ax=ax, **_kwargs)
plt.show()
plt.close(fig)

In [None]:
# For every sheet of the p-value workbook, count the significant rows that are
# exons vs. introns and the number of distinct target RNAs, then save bar plots.
excel_of_target_RNAs = '/Users/dp/pma/dataAndScripts/clip/miseq/meta/tables/pvals.xlsx'
# NOTE(review): figures are written under /Users/dfporter/ while the workbook
# above is read from /Users/dp/ -- confirm this output directory is intended.
figs_dir = '/Users/dfporter/pma/dataAndScripts/clip/figs/'

# A sheet is left out when its name matches any of these regexes.
skip_patterns = ['PCBP', '100', 'SF3B', 'nknown', 'No vec', 'hnRNPC:', 'hnRNPC ']
# Rows count as targets only when their 'P value' is below this cutoff.
max_p_value = 0.01

xl = pandas.ExcelFile(excel_of_target_RNAs)

results = []
for sheet_name in xl.sheet_names:
    print(sheet_name)

    if any(re.search(pat, sheet_name) for pat in skip_patterns):
        continue

    sheet_df = xl.parse(sheet_name)
    # Check for an empty sheet *before* peeking at its first row; iloc[0] on
    # an empty frame raises IndexError.
    if len(sheet_df.index) == 0:
        continue
    print(sheet_df.iloc[0])

    # NOTE(review): parse() is called without index_col; confirm the resulting
    # index really holds the '::'-separated labels that split_types() and the
    # '::' split below rely on.
    sheet_df['gene'] = sheet_df.index
    hits = sheet_df[sheet_df['P value'] < max_p_value]
    exons, introns = split_types(hits, 'gene')
    total = len(exons.index) + len(introns.index)

    if total < 1:
        # Nothing significant is typed as exon or intron; show what was left.
        print(hits)
        continue

    # Distinct values of the part of the label before '::', over all
    # significant rows (not only the exon/intron ones).
    n_rnas = len({x.split('::')[0] for x in hits.gene})

    results.append({
        'Gene': sheet_name,
        '% Exons': 100 * len(exons.index)/total,
        '% Introns': 100 * len(introns.index)/total,
        'Total': total,
        'Target RNAs': n_rnas,
    })

if not results:
    # Without this, the sort below fails with an uninformative KeyError.
    raise ValueError('No sheet produced any exon/intron rows below the P value cutoff.')

df = pandas.DataFrame(results)
print(df)

_kwargs = {'palette': sns.cubehelix_palette(2*len(set(df['Gene'])), start=0, rot=-0.2)}

# One horizontal bar plot per summary column, each ordered by that column.
for x_col, svg_name in [
    ('% Introns', 'Percent_introns_in_target_RNAs_bargraph.svg'),
    ('Target RNAs', 'Number_of_target_RNAs_bargraph.svg'),
]:
    df = df.sort_values(by=x_col)
    fig, ax = plt.subplots()
    sns.barplot(data=df, x=x_col, y='Gene', ax=ax, **_kwargs)
    fig.savefig(figs_dir + svg_name)
    plt.show()
    # close() rather than clf(): clf() would create a fresh empty figure if
    # the backend already disposed of the shown one.
    plt.close(fig)
