In [None]:
import sys
import os
import pandas as pd
import numpy as np

In [None]:
# Template files (plain file names, resolved against the working directory).
gencode_config_template = "gencode_config_TEMPLATE.txt"
reg_config_template = "regulatory_config_TEMPLATE.txt"
submit_template_file = "TEMPLATE_characterize.txt"

# Run label: used to locate the characterization folder and to name outputs.
suffix = 'ALL_V4'
out_folder = os.path.join("..", "data", "quantification")
characterization_folder = os.path.join(
    out_folder,
    'characterization_{}'.format(suffix),
)

In [None]:
# File-name patterns of the per-batch characterization tables; the single
# `{}` placeholder is filled with the batch id via str.format.
gencode_characterization_template = 'gencode_characterize_{}_finalhits.txt'
reg_characterization_template = 'reg_characterize_{}_finalhits.txt'

In [None]:
def get_characterization(batch_id):
    """Load and merge the GENCODE and regulatory characterization of one batch.

    Reads the two tab-separated ``*_finalhits.txt`` tables for ``batch_id``
    from ``characterization_folder`` (file names come from
    ``gencode_characterization_template`` and
    ``reg_characterization_template``) and left-joins the regulatory table
    onto the GENCODE table by ``peak_id``.

    Parameters
    ----------
    batch_id : str
        Value substituted into both file-name templates.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``peak_id`` with the columns ``chr``, ``start``, ``end``,
        ``length`` (``end - start``), ``feat_anchor``, ``distance``,
        ``location``, ``feat_type``, ``gene_id``, ``gene_name``,
        ``characterization``, ``reg_feature`` and ``reg_feature_id``.
        ``feat_type`` is ``"transcript:<transcript_type>"`` or
        ``"gene:<gene_type>"`` and stays missing for any other feature.
        Missing ``characterization`` values become ``"NONE"`` and missing
        ``reg_feature`` values become ``"reg_NONE"``.
    """
    gencode_file = os.path.join(
        characterization_folder,
        gencode_characterization_template.format(batch_id),
    )
    gencode = pd.read_csv(gencode_file, sep='\t').set_index('peak_id')

    # Label transcript/gene hits with their biotype; other features keep NaN.
    for feature, type_column in (('transcript', 'transcript_type'),
                                 ('gene', 'gene_type')):
        is_feature = gencode['feature'] == feature
        gencode.loc[is_feature, 'feat_type'] = (
            feature + ':' + gencode.loc[is_feature, type_column]
        )
    gencode['length'] = gencode['peak_end'] - gencode['peak_start']

    # Source column -> output column, in output order. rename() returns a new
    # frame, so the fillna below never writes into a slice of the raw table
    # (the original subset-then-.loc pattern raised SettingWithCopyWarning).
    gencode_columns = {
        'peak_chr': 'chr',
        'peak_start': 'start',
        'peak_end': 'end',
        'length': 'length',
        'feat_anchor': 'feat_anchor',
        'distance': 'distance',
        'relative_location': 'location',
        'feat_type': 'feat_type',
        'gene_id': 'gene_id',
        'gene_name': 'gene_name',
        'name': 'characterization',
    }
    gencode = gencode[list(gencode_columns)].rename(columns=gencode_columns)
    gencode['characterization'] = gencode['characterization'].fillna('NONE')

    reg_file = os.path.join(
        characterization_folder,
        reg_characterization_template.format(batch_id),
    )
    reg = (
        pd.read_csv(reg_file, sep='\t')
        .set_index('peak_id')[['feature', 'ID']]
        .rename(columns={'feature': 'reg_feature', 'ID': 'reg_feature_id'})
    )

    characterization = gencode.join(reg)
    # Fill AFTER the join: this covers both empty values in the regulatory
    # table and peaks that have no row in it at all, so no peak is dropped by
    # a later groupby('reg_feature').
    characterization['reg_feature'] = characterization['reg_feature'].fillna('reg_NONE')
    return characterization


In [None]:
# Characterize the "base" batch and export it as CSV, keyed by peak_id.
base_character = get_characterization("base")
base_character_file = os.path.join(
    out_folder,
    "peaks_characterization_{}.csv".format(suffix),
)
base_character.to_csv(base_character_file, index_label='peak_id')

In [None]:
# For each numbered batch "0".."99", count peaks per `characterization`
# label and per `reg_feature` label, stacked into a single Series.
results = []
for i in range(0, 100):
    if i % 10 == 0:
        print(i)  # coarse progress indicator
    character = get_characterization(str(i))
    # pd.concat replaces Series.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0.
    results.append(pd.concat([
        character.groupby('characterization').size(),
        character.groupby('reg_feature').size(),
    ]))

In [None]:
from scipy.stats import ttest_1samp

In [None]:
# One column per entry of `results`, one row per label; a label that a batch
# never produced gets a count of 0.
per_batch_counts = pd.concat(results, axis=1)
randomized_counts = per_batch_counts.fillna(0)

In [None]:
# Peak counts of the base set per `characterization` label followed by the
# counts per `reg_feature` label. pd.concat replaces Series.append, which was
# removed in pandas 2.0.
base_counts = pd.concat([
    base_character.groupby('characterization').size(),
    base_character.groupby('reg_feature').size(),
])

In [None]:
# base_counts stacks two groupings of the same table, so its sum is halved.
# NOTE(review): this equals the number of peaks only if every peak has a
# non-missing label in both groupings - confirm.
# Must run before the reindex below, which can drop labels.
tot_peaks = base_counts.sum() / 2

# Align the base counts with the rows of randomized_counts; labels the base
# set never produced count as 0.
aligned_base_counts = base_counts.reindex(randomized_counts.index)
base_counts = aligned_base_counts.fillna(0)

In [None]:
# One-sample t-test per label: do the randomized counts (rows = batches after
# the transpose) have a mean equal to the base count of that label?
ttest_result = ttest_1samp(randomized_counts.T, base_counts)
pvalues = ttest_result.pvalue

# Log2 fold change of the base count over the mean randomized count.
random_mean = randomized_counts.mean(axis=1)
lfc = np.log2(base_counts / random_mean)

# Express both as a percentage of tot_peaks.
# NOTE: base_counts and random_mean are rebound here, so re-running this cell
# without re-running the cells that define them rescales already-scaled values.
random_mean = random_mean / tot_peaks * 100.0
base_counts = base_counts / tot_peaks * 100.0

In [None]:
# Assemble the enrichment table: one row per label, columns side by side.
peaks_pct = base_counts.to_frame('peaks(%)')
random_pct = random_mean.to_frame('random(%)')
fold_change = lfc.to_frame('LFC')

result = peaks_pct.join(random_pct).join(fold_change)
result['p'] = pvalues

In [None]:
# Display the enrichment table: peaks(%), random(%), LFC and p per label.
result

In [None]:
# Persist the enrichment table; the index column is written as 'element_type'.
enrichment_file = os.path.join(
    out_folder,
    "QC_characterization_{}_enrichment.csv".format(suffix),
)
result.to_csv(enrichment_file, index_label='element_type')

### Inspect feature length distribution

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Lengths of the base set, with the upper quantile `q` and the median used
# as reference lines in the plot below.
length = base_character['length']
q = 0.99
quantile, median = np.quantile(length, q=[q, 0.5])

In [None]:
# Cumulative KDE of the element lengths with dashed guides at the median and
# at the `q` quantile.
fig, ax = plt.subplots(figsize=(6, 4))
sns.kdeplot(length, cumulative=True, legend=False, ax=ax)
ax.set_xlim(450, 3000)

guide_style = dict(color='#949596', linestyle='--')
# The horizontal guide uses `q` (not a hard-coded 0.99) so that it always
# agrees with the vertical guide and the text label.
ax.axhline(q, **guide_style)
ax.axvline(quantile, **guide_style)
ax.axhline(0.5, **guide_style)
ax.axvline(median, **guide_style)

ax.set_xlabel('Element length (bp)')
ax.set_ylabel('Cumulative Fraction')
ax.text(quantile + 10, 1, "{:.1f}% - {}".format(q * 100, quantile))
ax.text(median + 10, 0.45, "{:.1f}% - {}".format(50, median))

# NOTE: 'lenghts' is kept as-is; changing it would rename the output file.
fig.savefig(os.path.join(out_folder, 'QC_peak_{}_lenghts.svg'.format(suffix)))