# In [None]:
"""
A. For each tissue, test the enrichment of each motif among "eSTRs", using all analyzed STRs as the background. A simple Fisher's exact test is used for this.
"""

# Libraries
import matplotlib.pyplot as plt
from matplotlib import gridspec
import numpy as np
import os
import pandas as pd
import scipy.stats as ss
import seaborn.apionly as sns

# Plot colour (matplotlib colour name) for each tissue label.
# Insertion order matters: TISSUES is derived from these keys, and the final
# "permuted" entry is excluded when TISSUES is built.
COLORS = {
    "Artery-Aorta": "salmon",
    "Artery-Tibial": "red",
    "Adipose-Subcutaneous": "darkorange",
    "Adipose-Visceral(Omentum)": "orange",
    "Brain-Caudate(basalganglia)": "lemonchiffon",
    "Brain-Cerebellum": "yellow",
    "Cells-Transformedfibroblasts": "skyblue",
    "Esophagus-Mucosa": "sienna",
    "Esophagus-Muscularis": "burlywood",
    "Heart-LeftVentricle": "darkviolet",
    "Lung": "greenyellow",
    "Muscle-Skeletal": "mediumslateblue",
    "Nerve-Tibial": "gold",
    "Skin-NotSunExposed(Suprapubic)": "blue",
    "Skin-SunExposed(Lowerleg)": "cornflowerblue",
    "Thyroid": "green",
    "WholeBlood": "m",
    "permuted": "gray",
}

#    "Thyroid": "green",
# Short display label for each tissue label.
# "LCL" has a label here but no corresponding entry in COLORS.
# NOTE(review): "Ventricule" looks like a misspelling of "Ventricle"; it is a
# runtime label, so it is left unchanged here.
SHORTEN = {
    "Artery-Aorta": "Artery.A",
    "Artery-Tibial": "Artery.T",
    "Adipose-Subcutaneous": "Adipose.S",
    "Adipose-Visceral(Omentum)": "Adipose.V",
    "Brain-Caudate(basalganglia)": "Caudate",
    "Brain-Cerebellum": "Cerebellum",
    "Cells-Transformedfibroblasts": "Fibroblast",
    "Esophagus-Mucosa": "Mucosa",
    "Esophagus-Muscularis": "Muscularis",
    "Heart-LeftVentricle": "Ventricule",
    "Lung": "Lung",
    "Muscle-Skeletal": "Muscle",
    "Nerve-Tibial": "Nerve",
    "Skin-NotSunExposed(Suprapubic)": "SkinUnexposed",
    "Skin-SunExposed(Lowerleg)": "SkinLeg",
    "Thyroid": "Thyroid",
    "WholeBlood": "Blood",
    "permuted": "Permuted",
    "LCL": "LCL",
}

def Findoverlap(Frame, feat):
    """Return the rows of ``Frame`` whose STR start lies inside a feature.

    A row is kept when at least one row of ``feat`` on the same chromosome
    satisfies ``start <= str.start <= stop`` (both bounds inclusive).

    Parameters
    ----------
    Frame : pandas.DataFrame
        Must provide the columns ``chrom`` and ``str.start``. It is not
        modified.
    feat : pandas.DataFrame
        Must provide the columns ``chrom``, ``start`` and ``stop``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, grouped by chromosome in sorted order, with
        ``str.start`` cast to ``int``. An empty frame with the columns of
        ``Frame`` is returned when ``Frame`` has no rows.

    The shapes of the input and of the result are printed as a side effect.
    """
    fragments = []
    for chrom in sorted(set(Frame['chrom'])):
        # Work on an explicit copy so the dtype cast below neither touches the
        # caller's frame nor triggers pandas' SettingWithCopyWarning.
        strs = Frame.loc[Frame['chrom'] == chrom].copy()
        strs['str.start'] = strs['str.start'].astype(int)

        # Pull the interval bounds out once per chromosome instead of
        # re-filtering the feature table for every STR.
        on_chrom = feat.loc[feat['chrom'] == chrom]
        feat_starts = on_chrom['start'].values
        feat_stops = on_chrom['stop'].values

        covered = strs['str.start'].apply(
            lambda pos: bool(((feat_starts <= pos) & (feat_stops >= pos)).any())
        )
        fragments.append(strs.loc[covered])

    if fragments:
        result = pd.concat(fragments)
    else:
        # pd.concat raises ValueError on an empty list; an empty input simply
        # has no overlapping rows.
        result = Frame.iloc[0:0].copy()
    print(Frame.shape, '\t', result.shape)
    return result

# Tissue labels: every key of COLORS except the "permuted" entry.
TISSUES = [tissue for tissue in COLORS if tissue != "permuted"]

# Input tables (absolute paths).
# Regression summary table, read with tab separators further down.
regr = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/LR_SummaryTest_Table.tsv'
# Feature annotation table; rows with feature.type == 'gene' are used below.
feat = '/storage/szfeupe/Runs/GTEx_estr/FEATURES/Allgencodefeatures_table'
# Reference BED path; not read anywhere in the visible part of this script.
motif = '/storage/resources/dbase/human/hg19/hg19.hipstr_reference_withmotif.bed'
# Causality annotation table that is merged with the regression table.
Reg1 = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Merged_Best_causality.Table'





# Placeholders from the original script; G and R are overwritten below and F
# is kept only in case later code refers to it.
F = []
G = []
R = []

# Regression summary table, reduced to the columns used downstream.
R = pd.read_csv(regr, sep='\t')
# .copy() makes X an independent frame, so adding a column to it does not
# raise pandas' SettingWithCopyWarning.
X = R[["gene", "chrom", "str.start", "E.tissues", "motif"]].copy()
# Duplicate the STR start under the column name used as a merge key below.
X['best.str.start'] = X['str.start']

# Causality annotation table.
C = pd.read_csv(Reg1, sep='\t')
# pd.merge defaults to an inner join: only (gene, chrom, best.str.start)
# combinations present in both tables are kept.
G = pd.merge(C, X, on=['gene', 'chrom', 'best.str.start'])

# Feature annotation table, restricted to rows of type 'gene'.
FT = pd.read_csv(feat, sep='\t')
FTGene = FT.loc[FT['feature.type'] == 'gene']
# Keep only the STRs whose start falls inside one of those gene intervals.
G = Findoverlap(G, FTGene)

##@ Causal STR
# Background set: every row that survived the merge and the gene-overlap
# filter.
data = G
# Rows whose top variant identifier contains "STR_"; na=False counts a missing
# identifier as a non-match instead of producing NaN in the mask.
data1 = data[data['top.variant'].str.contains("STR_", na=False)]
# Foreground set ("eSTRs"): of those, the rows with E.tissues > 0.
data_e = data1.loc[data1["E.tissues"] > 0]

# Enrichment
Estr = data_e.shape[0]
Str = data.shape[0]
# Motifs in order of first appearance, so the order of All and sig_motifs is
# reproducible between runs (iterating a set of strings is not). Rows with a
# missing motif are skipped.
Motifs = list(data_e['motif'].dropna().unique())
Mo_counts = [int((data_e['motif'] == x).sum()) for x in Motifs]
All = []
sig_motifs = []
for M in Motifs:
    data_eM = data_e.loc[data_e['motif'] == M]
    data_M = data.loc[data['motif'] == M]
    Mestrs = data_eM.shape[0]
    Mstrs = data_M.shape[0]
    # 2x2 contingency table.
    #   rows:    foreground (eSTR) rows / remaining background rows
    #   columns: motif == M / any other motif
    CTT = [[Mestrs, Estr - Mestrs],
           [Mstrs - Mestrs, Str - Estr - (Mstrs - Mestrs)]]
    oddsratio, pv = ss.fisher_exact(CTT)
    # Each entry: [motif, number of eSTRs with that motif, -log10(p), odds ratio]
    All.append([M, Mestrs, -np.log10(pv), oddsratio])
    if pv <= 0.05:
        # The p-value is stored as a percentage here.
        sig_motifs.append([M, pv * 100, oddsratio])
# Fraction of background rows that are in the foreground set.
p = Estr / data.shape[0]

#print("Likelihood ratio of an'AC-rich' eSTR is ",(data_eac.shape[0]/data_e.shape[0])/((data_ac.shape[0]-data_eac.shape[0])/(data_ac.shape[0])) )

# Distinct -log10(p-value) scores in ascending order (the name is historical;
# these are not counts).
count_uniq_mo = sorted({entry[2] for entry in All})
# Entries of All regrouped by ascending score; entries sharing a score keep
# their relative order from All.
ordered_All = [
    entry
    for u in count_uniq_mo
    for entry in All
    if entry[2] == u
]


# Bar chart of -log10(p-value) per motif, in the order given by ordered_All.
plt.figure(figsize=(15, 5))
Y = [x[2] for x in ordered_All]
# One bar position per motif. Note that this rebinds X, which previously held
# the regression table.
X = list(range(len(ordered_All)))
col = 'b'
plt.bar(X, Y, color=col)
plt.xlabel('Motifs')
plt.ylabel('-log10(pvalues) motif enrichment')
plt.title('Causal eSTRs Motif enrichment (#eSTRs='+str(data_e.shape[0])+') vs all gene STRs')
plt.xticks(X, [x[0] for x in ordered_All], rotation='vertical')
# Significance threshold at p = 0.05. The former `hold=None` argument is
# dropped: matplotlib >= 3.0 no longer accepts `hold`.
plt.axhline(y=-np.log10(0.05), xmin=0, xmax=1, alpha=0.5, ls='--')
# Label for the threshold line, placed at a fixed position just above it.
plt.text(1, 1.5, 'pvalue=0.05', color='grey')
plt.show()
    
#plt.savefig("motifsE.png")   

"""
Significantly enriched motifs
'AGATAT'*, 'AAAG', 'AAATG', 'AGGGGG', 'ACGCC', 'AGAG', 'AACC', 'AAAAG', 'AATG', 'AAAT', 'AT', 'AC'
"""
