## Analysis of cross-regulation predictions of SRSF7
### Different parameters

In [1]:
# packages
import pandas as pd
import seaborn as sns

In [3]:
# data
# SRSF7 target table; the file has no header row, column names are assigned
# in a later cell.
srsf7_targets_500_11 = pd.read_table('./data/outputs/cross_peaks_nmd_SRSF7_500_11.tsv', header=None)
#srsf7_targets_500_11p3 = pd.read_table('./data/outputs/cross_peaks_nmd_SRSF7_500_11p3.tsv', header=None)
# deltaPSI table (UPF1/SMG6 columns are used downstream); this file carries
# its own header row.
NMD_KD = pd.read_table('./data/input_data/upf1xrn1_deltaPSI.tsv')
# Headerless BED file; the columns needed for the eCLIP table are picked by
# position in the eCLIPs parsing cell.
for_eCLIPs = pd.read_table('./data/outputs/exon_peaks.bed', header=None)
# gcn_data
srsf7_gcn = pd.read_table('./srsf7.tsv')

In [4]:
# nmd_kd parsing
# Keep the identifier, gene and the UPF1/SMG6 statistics, and rename 'id' to
# 'exon' so the column matches the merge key used by the other tables.
nmd_kd_columns = [
    'id', 'gene',
    'deltaPSI.UPF1', 'p.UPF1', 'q.UPF1',
    'deltaPSI.SMG6', 'p.SMG6', 'q.SMG6',
]
nmd_kd = NMD_KD[nmd_kd_columns].rename(columns={'id': 'exon'})
#nmd_kd[nmd_kd.gene == 'SRSF7']

In [5]:
# eCLIPs parsing
# The BED file was read without a header, so the needed columns are selected
# by position and then given readable names (one name per selected position).
eclip_column_positions = [3, 6, 10, 12, 13, 14, 15, 16]
eclip_column_names = [
    'exon', 'target', 'factor', 'strand',
    'eCLIPs cell line', 'rep', 'eCLIP_p-value', 'eCLIP_logFC',
]
eCLIPs = for_eCLIPs.iloc[:, eclip_column_positions]
eCLIPs.columns = eclip_column_names
eCLIPs.head()

Unnamed: 0,exon,target,factor,strand,eCLIPs cell line,rep,eCLIP_p-value,eCLIP_logFC
0,chr1_137621_139379_-,AL627309.1,ILF3,-,K562,rep02,2.421904,5.174009
1,chr1_137621_139379_-,AL627309.1,HNRNPK,-,K562,rep02,3.064845,5.305207
2,chr1_137621_139379_-,AL627309.1,BUD13,-,K562,rep02,2.653889,3.067285
3,chr1_137621_139379_-,AL627309.1,HNRNPK,-,K562,rep01,1.620957,1.052859
4,chr1_137621_139379_-,AL627309.1,PUM1,-,K562,rep02,3.858528,5.736128


In [7]:
# Name the columns of the headerless SRSF7 target table.
srsf7_target_columns = [
    'factor', 'cell line', 'exon', 'target', 'deltaPSI',
    'deltaPSIc', 'z', 'p', 'q', 'KDFC',
]
srsf7_targets_500_11.columns = srsf7_target_columns
#srsf7_targets_500_11p3.columns = srsf7_target_columns
srsf7_targets_500_11.head()

Unnamed: 0,factor,cell line,exon,target,deltaPSI,deltaPSIc,z,p,q,KDFC
0,SRSF7,HepG2,chr15_37186953_37187013_-,MEIS2,0.29,0.29,1.34,1.05,0.08,0.54
1,SRSF7,HepG2,chr3_160118610_160118753_+,SMC4,-0.61,-0.6,-2.67,2.42,0.52,0.54
2,SRSF7,HepG2,chr10_70276841_70277022_-,SLC25A16,0.22,0.22,1.72,1.37,0.08,0.54
3,SRSF7,HepG2,chr12_57999354_57999514_+,DTX3,-0.29,-0.29,-2.13,1.78,0.22,0.54
4,SRSF7,HepG2,chr17_28490085_28490180_+,NSRP1,0.16,0.17,1.34,1.05,0.08,0.54


In [8]:
# merging two datasets for q-values and eCLIP thresholds = 1
# Inner join of the NMD table with the SRSF7 target table on the shared exon
# id, followed by a selection of the columns used downstream.
two_kds_500_11 = nmd_kd.merge(srsf7_targets_500_11, on='exon')
two_kds_short_columns = [
    'exon', 'target', 'factor',
    'deltaPSI.UPF1', 'p.UPF1', 'q.UPF1',
    'deltaPSI.SMG6', 'p.SMG6', 'q.SMG6',
    'cell line', 'deltaPSI', 'p', 'q',
]
two_kds_500_11short = two_kds_500_11[two_kds_short_columns]

In [11]:
# merging two datasets for q-values for KDs and eCLIP thresholds = 1.3 and 1
# NOTE: disabled variant. It needs srsf7_targets_500_11p3, whose load in the
# data cell is commented out as well; re-enable both together.
#two_kds_500_11p3 = pd.merge(nmd_kd, srsf7_targets_500_11p3, on='exon')
#two_kds_500_11p3short = two_kds_500_11p3[['exon', 'target', 'factor', 'deltaPSI.UPF1', 'p.UPF1', 'q.UPF1', \
                #'deltaPSI.SMG6', 'p.SMG6','q.SMG6', 'cell line', 'deltaPSI', 'p', 'q']]

In [12]:
# merging two_kds with eCLIPs
# Inner join: a row is kept only when exon, target and factor all match
# between the knockdown table and the eCLIP table.
eclip_join_keys = ['exon', 'target', 'factor']
kds_eCLIPS_500_11 = two_kds_500_11short.merge(eCLIPs, on=eclip_join_keys)
#kds_eCLIPS_500_11p3 = two_kds_500_11p3short.merge(eCLIPs, on=eclip_join_keys)

In [None]:
# semi-final
# Columns kept for the final table: exon/target/factor identifiers, the
# UPF1 and SMG6 statistics, the SRSF7 target-table statistics and the eCLIP
# peak statistics. 'cell line', 'strand', 'eCLIPs cell line' and 'rep' are
# dropped here.
cross_peak_columns = [
    'exon', 'target', 'factor',
    'deltaPSI.UPF1', 'p.UPF1', 'q.UPF1',
    'deltaPSI.SMG6', 'p.SMG6', 'q.SMG6',
    'deltaPSI', 'p', 'q',
    'eCLIP_p-value', 'eCLIP_logFC',
]
cross_peaks_500_11 = kds_eCLIPS_500_11[cross_peak_columns]
# The 11p3 variant stays disabled. It is kept on a single commented line so
# that no uncommented continuation line is left behind (the previous
# two-line form raised IndentationError).
#cross_peaks_500_11p3 = kds_eCLIPS_500_11p3[cross_peak_columns]

In [15]:
# merging with gcn data
# Left join: every row of the cross-peak table is kept, and gcn columns are
# attached where a (factor, target) pair is present in srsf7_gcn.
gcn_join_keys = ['factor', 'target']
cross_peaks_500_11_gcn = cross_peaks_500_11.merge(srsf7_gcn, how='left', on=gcn_join_keys)
#cross_peaks_500_11p3_gcn = cross_peaks_500_11p3.merge(srsf7_gcn, how='left', on=gcn_join_keys)



In [29]:
# First 25 distinct targets encountered when rows are ordered by 'p',
# largest first. (Despite its name, factor_list holds values of 'target'.)
# NOTE(review): 'p' is presumably on a -log10 scale, since larger is ranked
# first -- confirm.
targets_ranked_by_p = cross_peaks_500_11_gcn.sort_values('p', ascending=False)['target']
factor_list = targets_ranked_by_p.unique()[:25]

# All rows belonging to those targets, ordered the same way, written as TSV.
is_selected_target = cross_peaks_500_11_gcn['target'].isin(factor_list)
SRSF_relaxed_25_factors = cross_peaks_500_11_gcn[is_selected_target].sort_values('p', ascending=False)
SRSF_relaxed_25_factors.to_csv('data/outputs/SRSF7_25factors_p-value.tsv', sep='\t')

In [84]:
# saving output as tsv
# The DataFrame index is written as the first (unnamed) column.
cross_peaks_500_11_gcn.to_csv('data/outputs/final_table_500_11.tsv', sep = '\t')
# Disabled: cross_peaks_500_11p3_gcn is never created in the visible cells
# (all 11p3 steps are commented out), so this call raised NameError on a
# fresh kernel. Re-enable it together with the 11p3 pipeline.
#cross_peaks_500_11p3_gcn.to_csv('data/outputs/final_table_500_11p3.tsv', sep = '\t')
