In [None]:
import sys, os, pandas, matplotlib, re, glob
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sys.path.append('/Users/dp/pma/')
sys.path.append('/Users/dp/pma/dataAndScripts/clip/')
import sameRiver
import sameRiver.metaExp
import sameRiver.scheme
import sameRiver.exp


# Root folder of the CLIP meta-analysis outputs; `top` is a short alias for the same path.
top = "/Users/dp/pma/dataAndScripts/clip/meta//"
top_dir = top


def figname(_str):
    """Return the PDF path inside the project's figs/ folder for the figure stem `_str`."""
    return f'/Users/dp/pma/dataAndScripts/clip/figs/{_str}.pdf'

# Figure export settings: font type 42 for both PDF and PS output, Arial as the sans-serif face.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})
matplotlib.rc('font', family='sans-serif', **{'sans-serif': ['Arial']})

## Write the input files and R scripts for the WT vs. mutant analysis of recurrently mutated RBPs (the scripts use edgeR; output tables keep the `DESeq2` file suffix).

In [None]:


# (wild-type label, mutant label) pairs to compare. Labels are matched as
# substrings of the count-table column names.
cf_lists = [
    ('KHDRBS2', 'KHDRBS2-R168C'),
    ('A1CF', 'A1CF-E34K'),
    # ('RPL5', 'RPL5-E82K'),
    ('FUBP1', 'FUBP1-R429C'),
    # ('CRNKL1', 'CRNKL1-S128F'),
    ('PCBP1_hp', 'PCBP1-100Q'),
]

# Proteins whose columns serve as the "randoms" comparison group.
random_proteins = [
    'CAPNS2',
    # 'CCIN',  # Dataset too small.
    'CDK4',
    'CHMP3',
    'DCTN6',
    # NOTE(review): this entry was also annotated "Dataset too small." but is
    # still active -- confirm whether it should be excluded like CCIN.
    'EPB41L5',
    'ETS2',
    'IDE',
    'ITPA',
    'TPGS2',
    'UBA2',
]


def prepare_inputs_to_R(counts_fname: str, comparisons=None, randoms=None) -> None:
    """Write per-comparison count tables and edgeR scripts next to a counts file.

    For every (a, b) label pair two sets of files are written to
    ``<dir of counts_fname>/for_R``:

    * ``{a}_vs_{b}_counts.txt`` / ``{a}_vs_{b}_R.txt`` -- columns containing
      ``a`` (but not ``b``) are group 1, columns containing ``b`` are group 2.
      The batch factor is taken from the text before the first ``_`` of each
      column name. The script ends by writing its result table to
      ``<dir>/tables/{a}_vs_{b}_DESeq2.csv``.
    * ``{a}_vs_randoms_counts.txt`` / ``{a}_vs_randoms_R.txt`` -- the control
      columns (group 1) against the ``a`` columns (group 2).

    The shell command that runs the pairwise scripts with ``R CMD BATCH`` is
    printed at the end.

    Args:
        counts_fname: Tab-separated counts table; the first column is used as
            the row index.
        comparisons: Iterable of ``(a, b)`` label pairs. Defaults to the
            module-level ``cf_lists``.
        randoms: Iterable of control-protein labels. Defaults to the
            module-level ``random_proteins``.
    """
    pairs = cf_lists if comparisons is None else comparisons
    controls = random_proteins if randoms is None else randoms

    # dirname() is '' for a bare filename; fall back to the current directory
    # so the outputs are not rooted at '/'.
    top_dir = os.path.dirname(counts_fname) or '.'
    out_dir = f"{top_dir}/for_R"
    tables_dir = f"{top_dir}/tables"
    os.makedirs(out_dir, exist_ok=True)
    # The generated scripts write.csv() into tables/, so it has to exist
    # before they are run.
    os.makedirs(tables_dir, exist_ok=True)

    common_R_command = """
y <- DGEList(counts=counts,group=group)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
design <- model.matrix(~batch+group)
y <- estimateDisp(y,design)
fit <- glmQLFit(y,design)
qlf <- glmQLFTest(fit)  # By default, looks at the last coeff, which will be group (after batches).

topTags(qlf)
cpm(y)[rownames(topTags(qlf)),]

# Write.
df = data.frame(qlf$table)
df = df[order(df$F, decreasing=1),]
"""

    def r_preamble(counts_path: str) -> str:
        # Shared head of every script: load the libraries and read the counts.
        return (
            "library(\"readxl\"); library('edgeR')\n"
            f"counts = read.csv(file='{counts_path}', sep='\\t', row.names='X')\n"
            "counts[is.na(counts)] <- 0\n"
        )

    df = pandas.read_csv(counts_fname, sep='\t', index_col=0)

    # The control columns do not depend on the pair, so resolve them once.
    rando_cols = [x for x in df.columns if any(r in x for r in controls)]
    # Second '_'-separated field of the column name identifies the control.
    randos_proteins = [_.split('_')[1] for _ in rando_cols]
    randos_protein_n = [randos_proteins.index(x) for x in randos_proteins]

    cmd = ""
    for (a, b) in pairs:

        a_cols = [x for x in df.columns if a in x and b not in x]
        b_cols = [x for x in df.columns if b in x]

        print(f"a_cols={a_cols}\n\nb_cols={b_cols}")

        a_batches = [x.split('_')[0] for x in a_cols]
        b_batches = [x.split('_')[0] for x in b_cols]

        # Batch label for `b` columns whose batch has no `a` counterpart.
        # '9' as before, unless that could collide with a real batch number.
        unmatched = str(max(9, len(a_batches) + 1))
        a_batch_n = [str(a_batches.index(_) + 1) for _ in a_batches]
        b_batch_n = [
            str(a_batches.index(_) + 1) if _ in a_batches else unmatched
            for _ in b_batches
        ]

        print(f"a_batch_n={a_batch_n}, b_batch_n={b_batch_n}")

        out_fname = f"{out_dir}/{a}_vs_{b}_counts.txt"
        df.loc[:, a_cols + b_cols].to_csv(out_fname, sep='\t')

        # Joining lists keeps the R vectors well-formed even if one side is empty.
        groups = ['1'] * len(a_cols) + ['2'] * len(b_cols)
        R_cmd = r_preamble(out_fname)
        R_cmd += f"group <- factor(c({','.join(groups)}))\n"
        R_cmd += f"batch <- factor(c({','.join(a_batch_n + b_batch_n)}))\n"
        R_cmd += common_R_command
        R_cmd += f"write.csv(x=df, file='{tables_dir}/{a}_vs_{b}_DESeq2.csv')"

        # Script and randoms file names use the label without trailing '_'.
        a = a.rstrip('_')
        with open(f"{out_dir}/{a}_vs_{b}_R.txt", 'w') as f:
            f.write(R_cmd)
        cmd += f"; R CMD BATCH {out_dir}/{a}_vs_{b}_R.txt"

        # Cf with randoms
        if not rando_cols:
            print(f"No control columns matched; skipping {a}_vs_randoms.")
            continue

        out_fname = f"{out_dir}/{a}_vs_randoms_counts.txt"
        df.loc[:, rando_cols + a_cols].to_csv(out_fname, sep='\t')

        groups = ['1'] * len(rando_cols) + ['2'] * len(a_cols)
        # All `a` columns share one batch number, one past the controls' largest.
        a_batch = str(max(randos_protein_n) + 1)
        batches = [str(n) for n in randos_protein_n] + [a_batch] * len(a_cols)

        R_cmd = r_preamble(out_fname)
        R_cmd += f"group <- factor(c({','.join(groups)}))\n"
        R_cmd += f"batch <- factor(c({','.join(batches)}))\n"
        R_cmd += common_R_command
        # NOTE(review): this script builds `df` but never writes it out, and it
        # is not added to the printed R CMD BATCH line -- confirm that is intended.

        # Collapse '__' in the file name only, never in the directory part.
        script_name = re.sub('__', '_', f"{a}_vs_randoms_R.txt")
        with open(f"{out_dir}/{script_name}", 'w') as f:
            f.write(R_cmd)
    print(cmd.lstrip(';'))
    
# Generate the for_R/ count tables and R scripts from the annotated counts table under top_dir.
prepare_inputs_to_R(f'{top_dir}/ann_counts.txt')



In [None]:

def tidy(df):
    """Return a cleaned copy of a result table read from CSV.

    The unnamed first column is renamed to ``Name``; ``Gene`` is the part of
    ``Name`` before the first ``'::'`` with any ``'>0_'`` removed. The result
    keeps the columns Gene, Name, logFC, logCPM, F and PValue and adds
    ``-log10(FDR)``.

    The input frame is left untouched, so the function can be re-run on the
    same object.

    NOTE(review): the ``-log10(FDR)`` column is computed from the raw
    ``PValue`` column, not from a multiple-testing-adjusted value. The column
    name is kept because other code reads it by that name.
    """
    named = df.assign(Name=df['Unnamed: 0']).drop(columns='Unnamed: 0')
    named['Gene'] = [re.sub('>0_', '', x.split('::')[0]) for x in named['Name']]
    # Copy the slice so adding a column does not trigger SettingWithCopyWarning.
    out = named.loc[:, ['Gene', 'Name', 'logFC', 'logCPM', 'F', 'PValue']].copy()
    out['-log10(FDR)'] = -np.log10(out['PValue'])
    return out

# Tidy every result CSV under tables/ (keyed by its path) ...
fnames = glob.glob(f'{top_dir}/tables/*DESeq2.csv')
dfs = dict((path, tidy(pandas.read_csv(path, sep=','))) for path in fnames)

# ... and save each tidied table as an .xlsx file next to its CSV.
for name, df in dfs.items():
    stem = os.path.splitext(name)[0]
    df.to_excel(stem + '.xlsx')

In [None]:
from matplotlib_venn import venn3
"""venn3(
    subsets,
    set_labels=('A', 'B', 'C'),
    set_colors=('r', 'g', 'b'),
    alpha=0.4,
    normalize_to=1.0,
    ax=None,
    subset_label_formatter=None,
)"""
def targets(df, cutoff=1E-3, fold_change=None, direction='up'):
    """Return the set of ``Gene`` values passing a p-value and optional logFC filter.

    Args:
        df: Table with ``PValue``, ``logFC`` and ``Gene`` columns.
        cutoff: Rows are kept when ``PValue < cutoff``.
        fold_change: Optional logFC threshold, in the same units as the
            ``logFC`` column. Only its magnitude is used. ``None`` disables
            the fold-change filter (``direction`` is then ignored).
        direction: ``'up'`` keeps ``logFC > |fold_change|``, ``'down'`` keeps
            ``logFC < -|fold_change|`` and ``'either'`` keeps
            ``|logFC| > |fold_change|``.

    Raises:
        ValueError: if ``fold_change`` is given with an unknown ``direction``.
    """
    hits = df.loc[df['PValue'] < cutoff]

    if fold_change is not None:
        magnitude = abs(fold_change)
        if direction == 'either':
            hits = hits.loc[hits['logFC'].abs() > magnitude]
        elif direction == 'up':
            hits = hits.loc[hits['logFC'] > magnitude]
        elif direction == 'down':
            # Down-regulated means below the *negative* threshold; comparing
            # against +fold_change would keep almost every row.
            hits = hits.loc[hits['logFC'] < -magnitude]
        else:
            raise ValueError(
                f"direction must be 'up', 'down' or 'either', got {direction!r}")

    return set(hits['Gene'])

# Comparison keys previously seen for the tables/ folder:
#['KHDRBS2_vs_KHDRBS2-R168C_DESeq2', 'FUBP1_vs_FUBP1-R429C_DESeq2',
# 'A1CF_vs_A1CF-E34K_DESeq2', 'PCBP1_hp_vs_PCBP1-100Q_DESeq2', 'PCBP1__vs_PCBP1-100Q_DESeq2']

fnames = glob.glob(f'{top_dir}/tables/*DESeq2.xlsx')
dfs = {os.path.basename(x).split('_DESeq2.xlsx')[0]:pandas.read_excel(x, index_col=0) for x in fnames}
print(dfs.keys())
targets_dict = {name:targets(df) for name,df in dfs.items()}


def plot_target_venn(keys):
    """Draw and show a three-set Venn diagram of the `targets_dict` sets named by `keys`."""
    venn3(
        subsets=[targets_dict[x] for x in keys],
        set_labels=keys, set_colors=('forestgreen', 'indigo', 'cyan'))
    plt.show(); plt.clf(); plt.close()


three = ['KHDRBS2_vs_KHDRBS2-R168C', 'FUBP1_vs_FUBP1-R429C', 'A1CF_vs_A1CF-E34K']
plot_target_venn(three)

three = ['KHDRBS2_vs_KHDRBS2-R168C', 'PCBP1_hp_vs_PCBP1-100Q', 'A1CF_vs_A1CF-E34K']
plot_target_venn(three)

# The logFC column comes from edgeR, which reports log2 fold changes, so a
# 1.5-fold cutoff is log2(1.5) (the natural log used before is ~1.33-fold).
min_log2_fc = np.log2(1.5)
targets_dict['a1cf_up'] = targets(dfs['A1CF_vs_A1CF-E34K'], fold_change=min_log2_fc, direction='up')
targets_dict['a1cf_down'] = targets(dfs['A1CF_vs_A1CF-E34K'], fold_change=min_log2_fc, direction='down')
targets_dict['a1cf'] = targets(dfs['A1CF_vs_A1CF-E34K'], fold_change=min_log2_fc, direction='either')
targets_dict['khdrbs2_up'] = targets(dfs['KHDRBS2_vs_KHDRBS2-R168C'], fold_change=min_log2_fc, direction='up')
targets_dict['khdrbs2_down'] = targets(dfs['KHDRBS2_vs_KHDRBS2-R168C'], fold_change=min_log2_fc, direction='down')
targets_dict['khdrbs2'] = targets(dfs['KHDRBS2_vs_KHDRBS2-R168C'], fold_change=min_log2_fc, direction='either')

labels = ['a1cf_up', 'a1cf_down', 'khdrbs2']
plot_target_venn(labels)

labels = ['a1cf_up', 'a1cf_down', 'khdrbs2_up']
plot_target_venn(labels)


In [None]:
# Leftover interactive source lookup (`??df.to_dict`) removed: it is IPython-only
# syntax used while developing and produces no analysis output.

In [None]:
# One logFC column per comparison, rows aligned on the feature Name.
# Built from fresh Series so the frames in `dfs` are not modified
# (no extra column, no index overwrite) and the cell can be re-run safely.
fcs = [
    table.set_index('Name')['logFC'].rename(f'{k} logFC')
    for k, table in dfs.items()
]

single = pandas.concat(fcs, axis=1, sort=True)

print(single.head())

In [None]:
# `import scipy` alone does not guarantee the stats submodule is loaded.
import scipy.stats

# Keep only features that have a logFC in every comparison.
df = single.dropna()

r = scipy.stats.pearsonr(df['KHDRBS2_vs_KHDRBS2-R168C logFC'], df['A1CF_vs_A1CF-E34K logFC'])
print(r, 'R^2=', r[0]**2)
r = scipy.stats.pearsonr(df['KHDRBS2_vs_KHDRBS2-R168C logFC'], df['FUBP1_vs_FUBP1-R429C logFC'])
print(r)
r = scipy.stats.pearsonr(df['A1CF_vs_A1CF-E34K logFC'], df['FUBP1_vs_FUBP1-R429C logFC'])
print(r)

sns.lmplot(x='KHDRBS2_vs_KHDRBS2-R168C logFC', y='A1CF_vs_A1CF-E34K logFC',
          data=df, scatter_kws={'alpha': 0.05})

In [None]:
import os, re, matplotlib, pandas, collections, importlib, sys, pickle, random, glob, dill, time
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from importlib import reload
from pathlib import Path

# Sorted so the tables are processed (and plotted) in the same order on every run;
# glob's own ordering depends on the file system.
fnames = sorted(glob.glob(f'{top_dir}/tables/*DESeq2.csv'))


def pass_cutoffs(df, line, title):
    """Decide whether the point in row position `line` gets a text label.

    The thresholds depend on which protein name appears in `title`:

    * ``KHDRBS2``: ``-log10(FDR)`` not below 15 and ``|logFC|`` not below
      ``log2(2)`` (i.e. 1).
    * ``PCBP1``: ``logFC > 5`` with ``-log10(FDR) > 20``, or ``logFC < -5``
      with ``-log10(FDR) > 10``.
    * ``A1CF``: ``-log10(FDR) > 8``.
    * anything else (FUBP1 goes here): ``-log10(FDR)`` not below 2 and
      ``|logFC|`` not below ``log2(2)``.

    Args:
        df: Table with ``logFC`` and ``-log10(FDR)`` columns.
        line: Zero-based row position (looked up with ``.iloc`` so it works
            whatever the frame's index is).
        title: Plot title used to pick the threshold set.

    Returns:
        True if the point should be labelled, False otherwise.
    """
    neg_log_p = df['-log10(FDR)'].iloc[line]
    log_fc = df['logFC'].iloc[line]
    min_abs_log_fc = np.log2(2)  # two-fold change

    if 'KHDRBS2' in title:
        return bool(not (neg_log_p < 15) and not (abs(log_fc) < min_abs_log_fc))

    if 'PCBP1' in title:
        strongly_up = (log_fc > 5) and (neg_log_p > 20)
        strongly_down = (log_fc < -5) and (neg_log_p > 10)
        return bool(strongly_up or strongly_down)

    if 'A1CF' in title:
        return bool(neg_log_p > 8)

    # FUBP1 goes here.
    return bool(not (neg_log_p < 2) and not (abs(log_fc) < min_abs_log_fc))

def volcano(df: pandas.DataFrame, title=''):
    """Draw, save and show a volcano plot (logFC vs. -log10(FDR)) for one table.

    Points selected by ``pass_cutoffs`` are labelled with their ``Gene`` value.
    The figure is saved to ``figname(f'DESeq2_volcanos_{title}')``, where
    ``title`` has had ``_DESeq2.csv`` removed and underscores turned into
    spaces.

    Args:
        df: Table with ``logFC``, ``-log10(FDR)`` and ``Gene`` columns.
        title: Typically the table's file name; it also selects the label
            thresholds and the A1CF-specific y-axis range.
    """
    fig = plt.figure()

    print(df.head(2))

    plt.scatter(
        df['logFC'], df['-log10(FDR)'],
        alpha=0.5, s=10,
        color='k',
        edgecolors=None, linewidths=0,
    )

    if 'A1CF' in title:
        plt.ylim(0, 25)

    # Escape the dot so only the literal file suffix is removed.
    title = re.sub(r'_DESeq2\.csv', '', title)
    title = re.sub('_', ' ', title)
    plt.title(title)
    print(f"title={title}")

    # Add text labels to outliers on the plot. Rows are addressed by position
    # (.iloc), matching the positions handed to pass_cutoffs.
    log_fc = df['logFC']
    neg_log_p = df['-log10(FDR)']
    genes = df['Gene']
    for line in range(df.shape[0]):
        if not pass_cutoffs(df, line, title):
            continue
        plt.text(log_fc.iloc[line] + 0.01, neg_log_p.iloc[line],
                 genes.iloc[line], horizontalalignment='left',
                 size='small', color='black')

    fig.set_figwidth(3)
    fig.set_figheight(3)
    fig.savefig(figname(f'DESeq2_volcanos_{title}'))
    plt.show()
    plt.clf()
    plt.close()

    
print(fnames)
dfs = {os.path.basename(x):tidy(pandas.read_csv(x, sep=',')) for x in fnames}
# volcano() is called for its side effects (figure + saved PDF), so use a plain
# loop instead of building and displaying a list of None.
for title, table in dfs.items():
    volcano(table, title=title)

