# Scripts using counts.txt.

Generally, the raw sequencing data are first processed with a script like this:
```python
import sameRiver
import sameRiver.mapping
import sameRiver.exp
import sameRiver.metaExp
import importlib
import sameRiver.rnaDataFileMaker
from used_seq_runs import used_sequencing_runs

def get_RNAs():
    """Build the RNA annotation object used to assign reads to genes.

    Parses 'repeats_and_genome.gtf' (relative to the working directory) with
    rnaDataFileMaker and returns whatever make_from_gtf_file returns.
    Per the original note, this step makes the data/rna.data file.
    """
    gtf_parser = sameRiver.rnaDataFileMaker.rnaDataFileMaker()
    return gtf_parser.make_from_gtf_file(gtf_filename='repeats_and_genome.gtf')

# Meta-experiment that collects every sequencing run under 'meta/'.
meta = sameRiver.metaExp.metaExp(file_paths={'top_dir': 'meta/'})

# Register one exp per entry in used_sequencing_runs; each entry's
# 'location' is expanded into a file_paths mapping by autogenerate_paths.
for name, paths in used_sequencing_runs.items():
    meta.add_exp(
        sameRiver.exp.exp(
            name=name,
            file_paths=sameRiver.exp.exp.autogenerate_paths(paths['location'])
        ))

# Built once at module level; counts() reads this global.
RNAs = get_RNAs()

def counts(ex):
    """Build the counts files for one experiment, in order.

    Gene assignment uses the module-level ``RNAs`` object.
    """
    # Step 1: data/bed_x.data -- signal only, no RNA information yet.
    ex.make_signal_data_file()
    # Step 2: assign that signal to genes; this writes counts.txt.
    ex.make_scheme_signal_RNA_data_files(rna_data_object=RNAs)
    # Step 3: ann_counts.txt -- simplified column names plus biotypes.
    ex.annotate_counts_file()

def process(exp):
    """Run one experiment from its scheme through mapping to counts files."""
    # Argument-free preparation steps, executed strictly in this order.
    preparation_steps = (
        'read_scheme',
        'split_by_barcode',
        'convert_split_r1r2_folder_to_long_filenames',
        'preprocess_split_barcodes_folder',
    )
    for step_name in preparation_steps:
        getattr(exp, step_name)()

    # clobber=True is passed through to exp.mapping unchanged.
    exp.mapping(clobber=True)
    exp.determine_statistics()
    counts(exp)

# Process every experiment registered on the meta-experiment. A plain loop is
# used because process() is called only for its side effects; the list that a
# comprehension would build was never used.
for exp_name in meta.exps:
    process(meta.exps[exp_name])

# Merge the per-experiment outputs into meta-level files.
meta.combine_scheme_files()
meta.combine_scheme_stat_files()
meta.combine_counts_files()
meta.combine_wig_files()
meta.annotate_counts_file()
```

This outputs counts.txt. The raw signal information is stored as bedgraph files in beds/read_start/.
This notebook (analysis.ipynb) covers the processing that needs only these two inputs; it writes further-processed
outputs that are used by more specific notebooks.

## This code demonstrates the re-creation of counts.txt files from bedgraphs.

```python
import os, sys, re, glob, pandas, importlib
sys.path.append('/Users/dp/pma/')
import sameRiver
import sameRiver.exp
import sameRiver.rnaDataFileMaker

importlib.reload(sameRiver.exp)
importlib.reload(sameRiver.rnaDataFileMaker)

# Directory of the experiment whose counts files are being rebuilt.
top_dir = '/Users/dp/pma/dataAndScripts/clip/old_mapping_meta/'

# A single exp object whose file paths are all derived from top_dir.
ex = sameRiver.exp.exp(
    name='00', file_paths=sameRiver.exp.exp.autogenerate_paths(top_dir))

ex.read_scheme()

# RNA annotation object built from the combined repeats + genome GTF.
maker = sameRiver.rnaDataFileMaker.rnaDataFileMaker()
RNAs = maker.make_from_gtf_file(gtf_filename='/Users/dp/pma/repeats_and_genome.gtf')


# clobber=True is passed so the signal data file is requested afresh
# (presumably overwriting an existing one -- confirm in sameRiver.exp).
ex.make_signal_data_file(clobber=True)

# Assigns signal to genes and writes counts.txt:
ex.make_scheme_signal_RNA_data_files(rna_data_object=RNAs)
ex.annotate_counts_file()
```

## Move blacklisted datasets (repeats, empty, contaminated)

In [None]:

import glob
import os
import subprocess

# Substrings of dataset names to exclude; any read-start file whose name
# contains one of these is moved out of beds/read_start/.
black_list = [
    'Exp61_HCT116_GTAGCC_TCA',  # <1,000 reads
    'Exp61_HCT116_GTAGCC_AGT',  # <10,000 reads
    'Exp15_AURKA_TCTGAG_TCA',  # <10,000 reads

    'Exp16_FBL_AGCTAG_CAG',  # Very small dataset.
    'Exp16_hnRNPC_TGAGTG_AGT',
    'Exp16_hnRNPC_TGAGTG_CAG',
    'Exp28_hnRNPC_CGATTA_AAC',
    'Exp31_UBA2_TGAGTG_AAC',  # Too correlated with CDK4 AAC Exp31.
    'Exp31_UBA2_TGAGTG_CAG',  # Empty.
    'Exp31_CDK4_GCCATG_AAC',  # Too correlated with UBA2 AAC Exp31.
    'Exp31_CDK4_GCCATG_CAG',  # Empty.
    'Exp33_CDK4_GCCATG_AGT',
    'Exp33_CAPNS6_CACTGT_TCA',  # Empty.
    'Exp61_PCBP1-100P_GCCATG_TCA',  # Miseq run replaced by Hiseq (hp).
    'Exp61_PCBP1-100P_GCCATG_AGT',  # Miseq run replaced by Hiseq (hp).
    'Exp61_PCBP1-100Q_AGCTAG_TCA',  # Miseq run replaced by Hiseq (hp).
    'Exp61_PCBP1-100Q_AGCTAG_AGT',  # Miseq run replaced by Hiseq (hp).
    'Exp61_PCBP1-dKH_ATCGTG_TCA',  # Miseq run replaced by Hiseq (hp).
    'Exp61_PCBP1-dKH_ATCGTG_AGT',  # Miseq run replaced by Hiseq (hp).
    'Exp61_PCBP1_CGATTA_AGT',  # Miseq run replaced by Hiseq (hp).
    'Exp61_PCBP1_CGATTA_TCA',  # Miseq run replaced by Hiseq (hp).
    'YBX',
    'AURKA',
]

# Move blacklisted read-start files (*wig) to another folder, then make a file
# of total read counts.
top_dir = top = '/Users/dp/pma/dataAndScripts/clip/meta//'

# The destination does not depend on the dataset name, so create it once.
blacklist_dir = f"{top}/beds/blacklisted_read_start/"
os.makedirs(blacklist_dir, exist_ok=True)

for name in black_list:
    for fname in glob.glob(f"{top}/beds/read_start/*{name}*wig"):
        # Argument list instead of a shell string: file names containing
        # spaces or shell metacharacters are passed to mv verbatim.
        cmd = ["mv", fname, f"{blacklist_dir}/"]
        print(' '.join(cmd))
        # As with the previous os.system call, a failing mv is not fatal.
        subprocess.run(cmd)



## Write the total read numbers.

In [None]:
from sameRiver.total_read_numbers import total_read_numbers
# Reads the files left in beds/read_start/ and writes data/total_read_numbers.txt.
# Intended to run after the blacklisted files have been moved out of that folder.
total_read_numbers(folder=f'{top}/beds/read_start/', outfile=f"{top}/data/total_read_numbers.txt")

# Read a counts.txt file and get biotypes, reads per million, and XLs per protein.

In [None]:
# Both names are used by later cells; they refer to the same directory.
top_dir = top = '/Users/dp/pma/dataAndScripts/clip/meta/'
# Later cells rely on absolute paths built from top/top_dir, but the working
# directory is switched here as well.
os.chdir(top_dir)

In [None]:
import os, sys, re, glob, pandas, importlib, dill
import numpy as np
sys.path.append('/Users/dp/pma/')
import sameRiver
import sameRiver.exp
import sameRiver.rnaDataFileMaker
import sameRiver.readsPerGene
importlib.reload(sameRiver.readsPerGene)
from sameRiver.readsPerGene import *


# Raw per-gene read counts, loaded from counts.txt together with the scheme.
rpg = rawReadsPerGene(f'{top_dir}/counts.txt', scheme_filename=f'{top_dir}/scheme.xlsx')
# Annotate with biotypes from the combined repeats + Ensembl release 94 GTF.
rpg.add_biotypes_column(
#    gtf_filename='/Users/dp/pma/repeats_and_genome.gtf'
    gtf_filename='/opt/genomes/repeats_and_ensembl_release94_GRCh38/combined_tsl1andNA.gtf'
)
# Persist the annotated table; later cells read ann_counts.txt instead of counts.txt.
rpg.df.to_csv(f'{top_dir}/ann_counts.txt', sep='\t')

# Normalised views built on top of the raw counts.
rpm = readsPerMillion(rpg, load_total_read_numbers=f"{top}/data/total_read_numbers.txt")
xpp = xlsPerProtein(rpm, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')


In [None]:
# Write File 3 (saved as 'Table 3 ...xlsx').

import xlsxwriter
# Workbook that receives one sheet per normalisation of the counts.
writer = pandas.ExcelWriter(f'{top}/tables/Table 3 Counts per RNA raw, per read or per protein.xlsx', engine='xlsxwriter')

def order_columns(df):
    """Return a reordered copy of *df* with 'Gene name' and 'Gene type' first.

    'Gene name' is filled from the index. The remaining columns are sorted
    by the second '_'-separated field of their name (or by the whole name
    when it has no '_'), in descending order. *df* must already contain a
    'Gene type' column. The shape of the result is printed.

    The input frame is not modified: the 'Gene name' column is added to a
    copy, so the caller's table does not silently gain a column.
    """
    df = df.copy()
    df['Gene name'] = df.index

    def sorting(_list):
        # sorted(...)[::-1] rather than reverse=True: keeps the original
        # ordering of columns that share a sort key.
        return sorted(_list, key=lambda x: x.split('_')[1] if ('_' in x) else x)[::-1]

    leading = ['Gene name', 'Gene type']
    cols = leading + sorting([x for x in df.columns if x not in leading])
    df = df.loc[:, cols]
    print(df.shape)
    return df

# One sheet per normalisation of the counts, all with the same column order.
for sheet_name, counts_obj in (
        ('Raw counts', rpg),
        ('Per million reads', rpm),
        ('Per 1E10 proteins', xpp)):
    df = order_columns(counts_obj.df)
    df.to_excel(writer, sheet_name=sheet_name, index=False)

# ExcelWriter.save() was removed in pandas 2.0; close() writes the workbook
# to disk and is available in older pandas as well.
writer.close()


In [None]:
# Write File 4 (saved as 'Table 4 Significance values.xlsx').

import xlsxwriter

# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() was removed in pandas 2.0.
with pandas.ExcelWriter(f'{top}/tables/Table 4 Significance values.xlsx', engine='xlsxwriter') as writer:
    # P-values computed per million reads.
    pvals = pandas.read_excel(f'{top}/tables/pvals_per_read.xlsx', index_col=0)
    pvals = order_columns(pvals)
    pvals.to_excel(writer, sheet_name='Per million reads', index=False)

    # P-values computed per protein.
    pvals = pandas.read_excel(f'{top}/tables/pvals_per_protein.xlsx', index_col=0)
    pvals = order_columns(pvals)
    pvals.to_excel(writer, sheet_name='Per protein', index=False)

In [None]:
# Write Table 5 (peak locations). The context manager saves and closes the
# workbook on exit; ExcelWriter.save() was removed in pandas 2.0.
with pandas.ExcelWriter(f'{top}/tables/Table 5 Peak locations.xlsx', engine='xlsxwriter') as writer:
    locs = pandas.read_excel(f"{top}/data/peak_locations.xlsx", index_col=0)
    locs = order_columns(locs)
    print(locs.head())

    locs.to_excel(writer, sheet_name='Peak locations', index=False)

In [None]:
# Column headers used in the published table, keyed by the headers found in
# the per-comparison input workbooks. Unlisted columns keep their name.
rename = {
    'Name': 'RNA',
    'logFC': 'DESeq2 log2 fold change MUT/WT',
    'logCPM': 'DESeq2 log2 counts per million over all libraries',
    'F': 'DEseq2 F-statistics',
    'PValue': 'Pvalue',
    '-log10(FDR)': '-log10(P value)'
}
# NOTE(review): '-log10(FDR)' is relabelled as '-log10(P value)' -- confirm
# that this relabelling is intended.

# Write Table 6: one sheet per mutant-vs-wild-type comparison. The context
# manager saves and closes the workbook on exit; ExcelWriter.save() was
# removed in pandas 2.0.
with pandas.ExcelWriter(f'{top}/tables/Table 6 DESeq2 of mutation effects in CLIP.xlsx', engine='xlsxwriter') as writer:
    for dataset in [
            'A1CF_vs_A1CF-E34K_DESeq2', 'FUBP1_vs_FUBP1-R429C_DESeq2',
            'KHDRBS2_vs_KHDRBS2-R168C_DESeq2', 'PCBP1_hp_vs_PCBP1-100Q_DESeq2']:
        fname = f"{top}/tables/{dataset}.xlsx"

        df = pandas.read_excel(fname, index_col=0)
        df.columns = [rename.get(x, x) for x in df.columns]
        print(df.head())

        # Sheet name: the dataset name without the '_hp' and '_DESeq2' tags.
        sheet_name = dataset.replace('_hp', '').replace('_DESeq2', '')

        df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Widen columns A-G so the long headers are readable.
        worksheet = writer.sheets[sheet_name]
        worksheet.set_column('A:G', 20)

In [None]:
import sameRiver.metadata.negative_metadata as negative_metadata

# Reload from the already-annotated counts file, so the biotype annotation
# step (commented out below) is not repeated here.
rpg = rawReadsPerGene(f'{top_dir}/ann_counts.txt', scheme_filename=f'{top_dir}/scheme.xlsx')
#rpg.add_biotypes_column(
#    gtf_filename='/Users/dp/pma/repeats_and_genome.gtf'
#    gtf_filename='/opt/genomes/repeats_and_ensembl_release94_GRCh38/combined_tsl1andNA.gtf'
#)
#rpg.df.to_csv(f'{top_dir}/ann_counts.txt', sep='\t')

# Unlike the earlier cell, no load_total_read_numbers file is passed here.
rpm = readsPerMillion(rpg)

# Column sums of the raw counts; not used again in this cell.
a = rpg.df.sum()

xpp = xlsPerProtein(rpm, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')

def as_fold_change(df):
    """Divide each numeric value by the row's mean over the control columns.

    Control columns are those whose name contains any entry of
    negative_metadata.random_proteins. Only columns with dtype kind in
    'bifc' are kept in the result.

    NOTE(review): only the first five rows (df.head()) are converted; this
    looks like a debugging leftover -- confirm before relying on the output.
    """
    control_names = negative_metadata.random_proteins
    control_cols = [col for col in df.columns
                    if any(name in col for name in control_names)]
    # Despite its name this is the per-row mean of the controls, not a sum.
    negative_sum = df.loc[:, control_cols].mean(axis=1)

    first_rows = df.head()
    numeric_cols = [col for col in first_rows.columns
                    if first_rows[col].dtype.kind in 'bifc']
    numeric = first_rows[numeric_cols].copy()

    def _relative_to_controls(row):
        # row.name is the row label, used to look up the matching control mean.
        return row / negative_sum[row.name]

    return numeric.apply(_relative_to_controls, axis=1)

import scipy.stats as stats
def ttest(readsPerObj):
    """Per-row t-test of hnRNPC columns against the control columns.

    Control columns are those whose name contains any entry of
    negative_metadata.random_proteins. For every row of the numeric part of
    readsPerObj.df, scipy's ttest_ind compares the control values with the
    values in the hnRNPC columns. The result is printed and returned as a
    Series, indexed like the table, whose values are {protein: p-value} dicts.
    Only 'hnRNPC' is tested.
    """
    full_table = readsPerObj.df
    control_cols = [col for col in full_table.columns
                    if any(name in col for name in negative_metadata.random_proteins)]
    control_table = full_table.loc[:, control_cols]

    numeric_table = full_table[readsPerObj.numeric_columns(full_table)].copy()

    # Result is unused; the call is kept so the sequence of calls on
    # readsPerObj is unchanged.
    readsPerObj.proteins()

    protein_columns = {'hnRNPC': readsPerObj.columns_for_a_protein('hnRNPC')}

    def _row_pvalues(row):
        control_values = control_table.loc[row.name].values
        return {
            protein: stats.ttest_ind(control_values, row[cols].values).pvalue
            for protein, cols in protein_columns.items()
        }

    per_row = numeric_table.apply(_row_pvalues, axis=1)
    print(per_row)
    return per_row

print('ttest:')
# Despite the name, 'fold' holds the per-row p-value dicts returned by ttest().
fold = ttest(rpm)
print(fold)

In [None]:
# Spot check of one entry: the hnRNPC p-value for the '_ambiguous' row.
fold['_ambiguous']['hnRNPC']
#df = original['user'].apply(pd.Series)
# Expand the Series of {protein: p-value} dicts into one column per protein.
q = pandas.DataFrame(fold.values.tolist(), index=fold.index)
q.to_excel(f'{top}/tables/pvals_by_ttest.xlsx')

# Perform statistics on the counts file.

In [None]:
import os, sys, re, glob, pandas, importlib, dill
sys.path.append('/Users/dp/pma/')

import sameRiver.metadata.negative_metadata as negative_metadata
import sameRiver.metadata.positive_metadata_all# as positive_metadata
importlib.reload(sameRiver.metadata.positive_metadata_all)
import sameRiver.negativeCounts
import sameRiver.positiveCounts
import sameRiver.scheme
import sameRiver.statsForCountsNB
importlib.reload(sameRiver.positiveCounts)
importlib.reload(sameRiver.negativeCounts)
importlib.reload(sameRiver.statsForCountsNB)
importlib.reload(sameRiver.scheme)

positive_metadata = sameRiver.metadata.positive_metadata_all

# Reset these paths.
# The metadata modules are pointed at this notebook's top_dir by overwriting
# their module-level attributes.
positive_metadata.top_dir = top_dir
positive_metadata.scheme_file = top_dir + '/scheme.xlsx'
positive_metadata.ann_counts_file = top_dir + '/ann_counts.txt'
positive_metadata.bed_file_dir = top_dir + '/beds/'
#positive_metadata.positive_proteins = ['hnRNPC']
print('positives;', positive_metadata.positive_proteins)
negative_metadata.top_dir = top_dir
negative_metadata.scheme_file_with_random_proteins = top_dir + '/scheme.xlsx'
negative_metadata.ann_counts_file = top_dir + '/ann_counts.txt'
negative_metadata.bed_file_dir = top_dir + '/beds/'

print('....', negative_metadata.top_dir)
# If never run before:
negatives = sameRiver.negativeCounts.negativeCounts(negative_metadata, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')
print('>>>>', negatives.metadata.data_folder)
# Optional: write_txt=True to write some txt's of the data.
negatives.save(write_object=True, write_txt=True)
#print(negatives.lowest_positive_vals)

# If never run before:
positives = sameRiver.positiveCounts.positiveCounts(positive_metadata, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')
positives.save(write_object=True, write_txt=True)

# If loading:
#positives = sameRiver.positiveCounts.positiveCounts.load(fname='/Users/dp/pma/dataAndScripts/clip/meta/data/positives_countsO.dill')
#str(negatives.lowest_positive_vals['per_read'])[:2000]#['PLEKHN1::exon']
# Statistics object built from the negative and positive count objects.
nb = sameRiver.statsForCountsNB.statsForCountsNB(
    negatives=negatives, positives=positives, data_dir=positive_metadata.top_dir + '/data/')

# Write per read pvalues. Masked: if absolute read #/gene below a cutoff, set p value to 1.
# The unmasked table is written first, then the masked one.
nb.calculate_pvalues(which='per_read', test_mode=False)
nb.write_pvals_single_file(which='per_read', outfname=f'{top}/tables/pvals_per_read_unmasked.xlsx')
nb.mask_low_absolute_counts(which='per_read')
nb.write_pvals_single_file(which='per_read', outfname=f'{top}/tables/pvals_per_read.xlsx')

# Write per protein pvalues. Masked: if absolute read #/gene below a cutoff, set p value to 1.
nb.calculate_pvalues(which='per_protein')
nb.write_pvals_single_file(which='per_protein', outfname=f'{top}/tables/pvals_per_protein_unmasked.xlsx')
# NOTE(review): this masks with which='per_read' although the per_protein
# table is written next; by analogy with the block above, which='per_protein'
# may have been intended -- confirm against statsForCountsNB.
nb.mask_low_absolute_counts(which='per_read')
# NOTE(review): no outfname is given here, unlike the per_read call above.
# The Table 4 cell reads tables/pvals_per_protein.xlsx -- confirm that the
# default output path of write_pvals_single_file is that file.
nb.write_pvals_single_file(which='per_protein')

In [None]:
import sys, importlib
sys.path.append('/Users/dp/pma/')  # Wherever sameRiver is put.
import sameRiver
import sameRiver.mapping
importlib.reload(sameRiver.mapping)

# Get some utilities.
g = sameRiver.mapping.repeatsGenome()

# If needed:
import sameRiver.gtf
#sameRiver.gtf.subset_to_only_tsl1_and_NA('gencode.v29.primary_assembly.annotation.gtf')
# Outputs gencode.v29.primary_assembly.annotation.gtf.exons_only_tsl1andNA

# Derive an IGV-oriented copy of the combined GTF (output name: igv_labelling_*).
g.igv_version_of_gtf(
    gtf_filename='/opt/genomes/repeats_and_ensembl_release94_GRCh38/combined_tsl1andNA.gtf',
    out_gtf_fname='/opt/genomes/repeats_and_ensembl_release94_GRCh38/igv_labelling_combined_tsl1andNA.gtf')

# Set up the repeats + genome references from the RepEnrich2 repeat FASTAs and
# the filtered gencode v29 GTF named in the comment above.
g.setup_genomes(
    repeats_fasta_directory='/Users/dp/pma/dataAndScripts/clip/RepEnrich2/hg38re/',
    genomic_gtf='/opt/genomes/gencode.v29/gencode.v29.primary_assembly.annotation.gtf.exons_only_tsl1andNA',
    igv_output_directory='/Users/dp/pma/dataAndScripts/clip/RepEnrich2/for_igv/',
    repeats_gtf='/opt/genomes/temp/repeats_as_separate_chroms.gtf',
    combined_gtf='/opt/genomes/temp/repeats_and_genome.gtf',
    )

In [None]:
# plt is used at the end of this cell; imported here so the cell does not
# depend on a later cell having been run first.
import matplotlib.pyplot as plt

# Column-name substrings identifying the random-protein datasets.
random_proteins = [
    'CAPNS2',
    #'CCIN', # Dataset too small.
    'CDK4', 'CHMP3',
    'DCTN6',
    'EPB41L5',  # Dataset too small.
    'ETS2', 'IDE',
    'ITPA', 'TPGS2',
    'UBA2',
    #'HCT116',
    ]

# NOTE(review): 'negatives' is rebuilt here but not used in this cell.
negatives = sameRiver.negativeCounts.negativeCounts(negative_metadata, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')
positives = sameRiver.positiveCounts.positiveCounts(positive_metadata, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')
df = positives.reads_per_million.df.copy()

# Keep numeric columns that do not belong to a random protein.
df = df.loc[:, [not any([x in col for x in random_proteins]) for col in df]]
df = df.loc[:, [df[col].dtype.kind in 'bifc' for col in df.columns]]
# Drop the catch-all rows and the rRNA rows.
df = df.loc[[x for x in df.index if x not in ['_no_feature', '_ambiguous', 'LSU-rRNA_Hsa::exon', 'SSU-rRNA_Hsa::exon']], :]

# Per-RNA mean across the remaining datasets. Printing the minimum shows
# whether any row mean is zero, which would make the division below blow up.
_mean = df.mean(axis=1).to_numpy()
print(np.min(_mean))
# Relative deviation of every dataset from the row mean.
for col in df:
    df[col] = [(x-m)/m for x,m in zip(df[col], _mean)]
# (x - m)/m is negative wherever a dataset lies below the row mean, so log2
# yields NaN for those points and only above-mean deviations are plotted.
df = df.apply(np.log2)
for col in df:
    plt.scatter(x=np.log10(_mean), y=df[col], alpha=0.01)
plt.xlim(0, 4)
plt.ylim(-7, 7)
plt.show()
plt.clf(); plt.close()

In [None]:
# Column-name substrings that mark a dataset as a positive (histy() colours
# these red). Matching is by substring, so 'PCBP1' also matches its mutants.
positive_proteins = [
    'FBL', 
    'hnRNPC',
    'SF3B1',
    'PCBP1', #'PCBP1-100P', 'PCBP1-100Q', 'PCBP1-dKH',
    'CELF1', 'Rbfox1', 'Rbfox2', 'hnRNPD',
    'A1CF', #'A1CF-E34K',
    #'BARD1', 'BRCA1',
    'CRNKL1', #'CRNKL1-S128F',
    #'DDX3X', 'DDX3X-R528C',
    'FUBP1', #'FUBP1-R429C',
    'KHDRBS2', #'KHDRBS2-R168C',
    #'RARS2', 'RARS2-R6C',
    #'RPL5', 'RPL5-E82K',
    #'SMAD3',
    #'SMAD4', 'SMAD4-R361H',
    #'PCBP1', 'PCBP1:100P', 'PCBP1:100Q',
    #'PCBP1:dKH',
]
# Column-name substrings that mark a dataset as a random-protein control
# (histy() colours these black, or skips them when no_neg is set).
random_proteins = [
    'CAPNS2',
    #'CCIN', # Dataset too small.
    'CDK4', 'CHMP3',
    'DCTN6',
    'EPB41L5',  # Dataset too small.
    'ETS2', 'IDE',
    'ITPA', 'TPGS2', 'UBA2',
    'HCT116',
    ]
def pos(_df):
    """Return the columns of _df whose name contains a positive protein."""
    selected = []
    for column in _df.columns:
        if any(name in column for name in positive_proteins):
            selected.append(column)
    return _df.loc[:, selected]

# Rebuild both count objects; the positive tables gain extra columns below.
negatives = sameRiver.negativeCounts.negativeCounts(negative_metadata, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')
positives = sameRiver.positiveCounts.positiveCounts(positive_metadata, xl_rate_fname='/Users/dp/pma/percentCrosslinked.xlsx')

def CV_low(arr):
    """Return the smallest value in arr.

    Despite the name, no confidence interval is computed; the interval-based
    variant is not in use.
    """
    values = np.array(arr)
    return values.min()
df = negatives.reads_per_million.df.copy()

# Compute every row-wise statistic over the numeric columns only. The table
# also holds non-numeric columns, and DataFrame.mean(axis=1) raises on those
# in pandas >= 2.0 instead of silently skipping them.
numeric = df.loc[:, [df[col].dtype.kind in 'bifc' for col in df.columns]]
neg_min = numeric.min(axis=1)
neg_max = numeric.max(axis=1)

# Despite the '_sum' name this is the per-row mean across control datasets.
df['_sum'] = numeric.mean(axis=1).tolist()
df = df.loc[:,['_sum']]

# Row label -> control summary, used to annotate the positives table.
to_negative_sum = dict(zip(df.index, df._sum))
to_neg_min = dict(zip(df.index, neg_min))
to_neg_max = dict(zip(df.index, neg_max))

# Rows absent from the negatives table get 0 for all three summaries.
positive_index = positives.reads_per_million.df.index
positives.reads_per_million.df['negatives'] = [to_negative_sum.get(x, 0) for x in positive_index]
positives.reads_per_million.df['neg_min'] = [to_neg_min.get(x, 0) for x in positive_index]
positives.reads_per_million.df['neg_max'] = [to_neg_max.get(x, 0) for x in positive_index]

import matplotlib.pyplot as plt
from scipy import stats

def formed(series):
    """Return log10 of the positive values of series, or False if unusable.

    False is returned when the series is not numeric (dtype kind outside
    'bifc') or when fewer than 10 values are greater than zero.
    """
    is_numeric = series.dtype.kind in 'bifc'
    if not is_numeric:
        return False
    positive_values = series[series > 0]
    log_values = np.log10(positive_values)
    return log_values if len(log_values) >= 10 else False

def histy(df, no_unknown=False, no_neg=False):
    """Plot a Gaussian KDE of log10 values for each usable column of df.

    Colours: red for columns matching positive_proteins, black for columns
    matching random_proteins, thick opaque green for the 'negatives' column,
    blue for anything else.

    no_unknown: skip columns that fall into the 'anything else' group.
    no_neg: skip columns matching random_proteins.

    Columns for which formed() returns False are not plotted. The names of
    the plotted columns are printed after the figure is shown.
    """
    plt.clf()
    plt.close()

    # Every curve is evaluated on the same grid.
    grid = np.linspace(-0.1, 4)
    plotted = []

    for col in df.columns:
        is_pos = any(name in col for name in positive_proteins)
        is_neg = any(name in col for name in random_proteins)

        if is_neg and no_neg:
            continue

        # Default line style; only the summary column overrides alpha/lw.
        line_style = {'color': 'k', 'alpha': 0.2, 'lw': 1.}
        if is_pos:
            line_style['color'] = 'r'
        elif is_neg:
            line_style['color'] = 'k'
        elif col == 'negatives':
            line_style = {'color': 'g', 'alpha': 1., 'lw': 2}
        elif no_unknown:
            continue
        else:
            line_style['color'] = 'b'

        log_values = formed(df[col])
        if isinstance(log_values, bool):
            # formed() signals an unusable column by returning False.
            continue

        density = stats.gaussian_kde(log_values)
        plt.plot(grid, density(grid), **line_style)
        plotted.append(col)

    plt.show()
    plt.clf()
    plt.close()
    print(f"Plotted: {plotted}")
# df here is the single-column ('_sum') summary of the negatives.
histy(df)
# Same plot for the raw, per-million and per-protein tables of the positives.
histy(positives.raw_reads_per_gene.df, no_unknown=True)
histy(positives.reads_per_million.df, no_unknown=True, no_neg=True)
histy(positives.reads_per_protein.df, no_unknown=True)

## Edit a stats_in_scheme.xlsx file to clean up some columns.

In [None]:
import os, sys, re, glob, pandas, importlib, dill
import numpy as np
sys.path.append('/Users/dp/pma/')
sys.path.append('/Users/dp/pma/RBP missense mutations/')
#import sameRiver
import xlLoader
import matplotlib.pyplot as plt
import seaborn as sns

# Protein names that categorize() labels 'RBP'. Mutants are listed in both
# the dash form and the space form.
known_rbps = [
    'U2AF1', 
    #'DDX3X', 
    'NOVA1', 'DDX50', 'RBM39', 
    'YTHDC2', 
    'SF3B1',  'NUFIP', 'PCBP1', 
    'SRSF2', 'RBM11', 'DICER1', 'RBFOX1',
    'hnRNP C', 'FBL', 'eIF4H', 'CELF1', 'hnRNP D', 'STAU1',
    'DHX21', 'NSUN2', 'RBFOX2',
    
    'RPL5', 'RPL5-E82K', 'RPL5 E82K',
    'KHDRBS2', 'KHDRBS2-R168C', 'KHDRBS2 R168C',
    'A1CF', 'A1CF-E34K', 'A1CF E34K',
#    'RARS2', 'RARS2-R6C',
    'FUBP1', 'FUBP1-R429C', 'FUBP1 R429C',
]
# Names are upper-cased by correct() before categorize() sees them, so add
# the upper-case variants, then the upper-case variants without spaces
# (e.g. 'HNRNPC'). The list ends up with duplicates; only membership matters.
known_rbps.extend([x.upper() for x in known_rbps])
known_rbps.extend([x.replace(' ', '').upper() for x in known_rbps])

# Protein names that categorize() labels 'Putative'.
putative = [
    'SMAD3', 'SMAD4', 'BRCA1', 'BARD1',
    'CNOT9', 'HNRNPCL1', 'CNOT1', 'EIF1AX',
    'PABPC4L',    
    'DCP1B', 'BCLAF1',
    'CRNKL1', 'CRNKL1-S128F',
    'SMAD4', 'SMAD4-R361H',
    
    #'RPL5',
]

# Protein names that categorize() labels 'non-RBP'.
non_rbp = [
    'CAPNS2','CCIN',
    'CDK4','CHMP3','DCTN6','EGFP',
    'EPB41L5',
    'ETS2','IDE','ITPA','TPGS2','UBA2'
]

# Protein names that categorize() labels 'Incertae sedis'.
incertae_sedis = [
    'TDRKH', 'EEF1B2', 
    'RARS2', 'RARS2-R6C', 'RARS2 R6C']

def categorize(protein):
    """Return the category label for a protein name.

    The name is checked, in order, against known_rbps, putative, non_rbp and
    incertae_sedis; the first list containing it decides the label. A name
    found in none of them is labelled 'Mutant?'.
    """
    labelled_groups = (
        ('RBP', known_rbps),
        ('Putative', putative),
        ('non-RBP', non_rbp),
        ('Incertae sedis', incertae_sedis),
    )
    for label, members in labelled_groups:
        if protein in members:
            return label
    return 'Mutant?'
    
def correct(protein):
    """Normalise a protein name and return it in upper case.

    'PCBP1 100...' becomes 'PCBP1 L100...', and the space is removed from
    'hnRNP C' and 'hnRNP D', before the whole name is upper-cased.
    """
    substitutions = (
        ('PCBP1 100', 'PCBP1 L100'),
        ('hnRNP C', 'hnRNPC'),
        ('hnRNP D', 'hnRNPD'),
    )
    for old, new in substitutions:
        protein = protein.replace(old, new)
    return protein.upper()

pma_dir = '/Users/dp/pma/'
# Load the cross-linking measurements; load() returns a pair, of which only
# xl_rate is used below.
xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
(xl_rate, recurrent) = xlLoad.load()
# Normalise protein names, then attach the category label to every row.
xl_rate['Protein'] = [correct(p) for p in xl_rate['Protein']]
xl_rate['Category'] = [categorize(p) for p in xl_rate.Protein]
#xl_rate = xl_rate.loc[[x!='Mutant?' for x in xl_rate.Category], :]
#print(xl_rate)

def mean_of_label(label, xl_df):
    """Return {protein: mean Value} over the rows of xl_df with this Label.

    xl_df must have 'Label', 'Protein' and 'Value' columns.
    """
    has_label = xl_df['Label'] == label
    values_by_protein = xl_df.loc[has_label, :].groupby(by=['Protein'])['Value']
    return dict(values_by_protein.apply(np.mean))

# Label -> {protein: mean value}, for every label that can be averaged.
xl_means = {}
for col in set(xl_rate.Label):
    try:
        xl_means[col] = mean_of_label(col, xl_rate)
    except Exception:
        # Not numeric: this label cannot be averaged, so it is skipped.
        # 'Exception' rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        continue
xl_rate_mean = xl_means['% XL (minimal region)']

# Derived quantity, defined for every protein in xl_rate. A protein missing
# from either input gets NaN. The previous -1E6 placeholders multiplied to a
# large positive number when both inputs were missing, which then looked
# like a real measurement.
# NOTE(review): the unit conversion (x100, x1000) is kept exactly as it was
# -- confirm it matches the units stored in percentCrosslinked.xlsx.
whole_lane_xl = xl_means['% XL (whole lane)']
pmol_protein = xl_means['pmol protein']
xl_means['fmol RNA (whole lane)'] = {
    protein: whole_lane_xl.get(protein, np.nan) * 100 * pmol_protein.get(protein, np.nan) * 1000
    for protein in set(xl_rate.Protein)}

def _fl(x):
    if type(x) != type(''):
        return float(x)
    return float(x.replace(',', ''))

# Earlier inputs, kept for reference. They were assigned and immediately
# overwritten, so only the path below was ever read:
#   '/Users/dp/pma/dataAndScripts/clip/miseq/Runs/200327/stats_in_scheme.xlsx'
#   '/Users/dp/pma/stats_in_scheme.xlsx'
fname = '/Users/dp/pma/dataAndScripts/clip/RepEnrich2/meta/stats_in_scheme.xlsx'

stats_df = pandas.read_excel(fname)
# Scheme names use ':' where the cross-linking table uses a space.
stats_df['Gene'] = [x.replace(':', ' ') for x in stats_df.Gene]
stats_df['Protein'] = [correct(p) for p in stats_df['Gene']]
stats_df['Category'] = [categorize(p) for p in stats_df.Protein]
stats_df.reset_index(inplace=True)


def _xl_lookup(_dict, protein):
    """Look a protein up by name, then by its dash-to-space form; NaN if absent."""
    if protein in _dict:
        return _dict[protein]
    return _dict.get(protein.replace('-', ' '), np.nan)


# One column per cross-linking label. The dash-to-space fallback used to be
# applied to '% XL (minimal region)' only, and that column was then
# overwritten by this loop with a plain lookup, so the fallback had no effect.
# It is now part of the lookup for every label.
for col, _dict in xl_means.items():
    stats_df[col] = [_xl_lookup(_dict, protein) for protein in stats_df.Protein]

print(stats_df.head())
print('------')

# Derived percentages. 'Mapped reads (bedgraph)' may be stored as text with
# thousands separators, hence _fl().
stats_df['% left split reads after inital removal of empty adapters'] = [
    100*x/y for x,y in zip(stats_df['r1r2_clipped Reads'], stats_df['r1r2_split Reads'])]
stats_df['% mapped of split reads'] = [
    100*_fl(x)/y for x,y in zip(stats_df['Mapped reads (bedgraph)'], stats_df['r1r2_split Reads'])]
stats_df['% mapped of non-empty reads'] = [
    100*_fl(x)/y for x,y in zip(stats_df['Mapped reads (bedgraph)'], stats_df['r1r2_clipped Reads'])]

# Raw string: '\(' and '\)' are regex escapes, not Python string escapes.
spat = re.compile(r'(.+) \((.+)%\)')

def separate_percents(_str):
    """Split a 'count (percent%)' cell into its two parts.

    Expect _str is of the format '2,122,888 (40.7%)'.

    Returns (count, perc): count as an int with thousands separators removed,
    perc as the matched text (e.g. '40.7'). Returns (0, 0) when _str is not a
    string (for example an empty cell read as NaN) or does not match.
    """
    if not isinstance(_str, str):
        return 0, 0
    pat = spat.search(_str)
    if pat is None:
        return 0, 0
    count = int(pat.group(1).replace(',', ''))
    perc = pat.group(2)
    return count, perc


# Each of these columns holds 'count (percent%)' text; split it into a
# numeric ' (N)' column and a numeric ' (%)' column.
for col in (
        'Mapped reads at autosomes',
        'Mapped reads at chrX',
        'Mapped reads at chrY',
        'Mapped reads at rRNA',
        'Mapped reads at Other genomic sequence',
        'Mapped reads at repeats'):
    count_perc = [separate_percents(cell) for cell in stats_df[col]]
    stats_df[col + ' (N)'] = [float(count) for count, _ in count_perc]
    stats_df[col + ' (%)'] = [float(perc) for _, perc in count_perc]
    
    
def scatter_text(x, y, text_column, data, title, xlabel, ylabel):
    """Draw a titled seaborn scatter plot of column x against column y of data.

    Based on this answer: https://stackoverflow.com/a/54789170/2641825

    text_column is accepted for interface compatibility but is unused: no
    text labels are drawn next to the points. title, xlabel and ylabel are
    applied to the current matplotlib axes. Returns the object returned by
    sns.scatterplot.
    """
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    p1 = sns.scatterplot(x=x, y=y, data=data, size=8, legend=False)

    # Set title and axis labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    return p1

def make_scatterplot_and_excel_file(stats_df, x='% XL (minimal region)', y='% XL (minimal region)'):
    """Show a scatter plot of log10(x) against y, coloured by 'Category'.

    stats_df: table with the columns named by x and y and a 'Category' column.
    x: column plotted on the horizontal axis after a log10 transform.
    y: column plotted on the vertical axis as is.

    Side effect: a 'log10 <x>' column is added to (or overwritten in)
    stats_df. Despite the name, no Excel file is written here.
    """
    log_x = 'log10 ' + x
    stats_df[log_x] = np.log10(stats_df[x])

    sns.lmplot(x=log_x, y=y, hue='Category', data=stats_df, fit_reg=False)

    # Per-point protein labels are not drawn; the loop that used to sit here
    # iterated over every row without doing anything.
    plt.show(); plt.clf(); plt.close()

    #print(stats_df.head())


def write_excel(fname, stats_df):
    """Write stats_df to 'edited_stats_in_scheme.xlsx' with colour scales.

    fname: path whose directory part (os.path.dirname) receives the output
        file; a directory path must therefore end with '/'.
    stats_df: table to write to sheet 'Sheet1', index included.

    Every data column gets a green-white-red 3-colour scale.
    """
    import xlsxwriter

    out_path = os.path.dirname(fname) + '/edited_stats_in_scheme.xlsx'

    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() was removed in pandas 2.0.
    with pandas.ExcelWriter(out_path, engine='xlsxwriter') as writer:
        stats_df.to_excel(writer, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        # Row 0 is the header and column 0 is the index, so the data cells
        # start at row 1 / column 1.
        first_row = 1
        last_row = len(stats_df.index)

        # Ranges are addressed by position, so columns that share a name are
        # each formatted rather than collapsing onto one range.
        for col_number, _colname in enumerate(stats_df.columns, start=1):
            worksheet.conditional_format(
                xlsxwriter.utility.xl_range(first_row, col_number, last_row, col_number),
                {'type': '3_color_scale', 'min_color': 'green', 'mid_color': 'white', 'max_color': 'red'})

# The Excel file is written first, so it does not contain the 'log10 ...'
# columns that the plotting calls add to stats_df.
write_excel('/Users/dp/pma/dataAndScripts/clip/RepEnrich2/meta/', stats_df)
make_scatterplot_and_excel_file(stats_df, x='fmol RNA (minimal region)', y='pmol protein')
make_scatterplot_and_excel_file(stats_df, x='fmol RNA (whole lane)',  y='% left split reads after inital removal of empty adapters')
make_scatterplot_and_excel_file(stats_df, x='fmol RNA (minimal region)', y='% mapped of split reads')
make_scatterplot_and_excel_file(stats_df, x='% XL (minimal region)', y='% mapped of split reads')
make_scatterplot_and_excel_file(stats_df, x='% XL (minimal region)', y='% left split reads after inital removal of empty adapters')
make_scatterplot_and_excel_file(stats_df, x='% left split reads after inital removal of empty adapters', y='% mapped of split reads')
make_scatterplot_and_excel_file(stats_df, x='fmol RNA (minimal region)', y='% left split reads after inital removal of empty adapters')
make_scatterplot_and_excel_file(stats_df, x='% XL (minimal region)', y='Mapped reads at autosomes (%)')
# The column is named 'fmol RNA (whole lane)' with single spaces; the previous
# spelling with a double space raised KeyError. This repeats the second plot.
make_scatterplot_and_excel_file(stats_df, x='fmol RNA (whole lane)',  y='% left split reads after inital removal of empty adapters')