In [2]:
import pybedtools
import pandas as pd
import os
import sys

sys.path.append("/ifs3/clliu/tools/")
from utils import ensure_directory_exists,aggregate_dataframe

In [96]:
# Build, for every histone mark, a gene x sample table of ChIP-seq peak widths.
#
# Legend:
#   T: Tumor; N: Normal
#   A: H3K4me3
#   B: H3K9me3
#   C: H3K27me3
#   D: H3K27ac
#
# For each mark the loop:
#   1. reads the list of RNA-seq sample directories and the list of ChIP-seq
#      sample directories (one path per line, paired by row order);
#   2. for each sample, converts the peak table (.regions.xls) to a BED file,
#      intersects it with the sample's transcript annotation BED, and derives
#      per-transcript and per-gene peak widths;
#   3. outer-merges all samples on gene_name, fills missing values with 0,
#      adds the per-gene mean and writes the table sorted by that mean.

for var in ['H3K4me3', 'H3K9me3', 'H3K27me3', 'H3K27ac']:
    prefix = var
    sample_type = 'N'

    # H3K9me3 and H3K27me3 are intersected with a different annotation file
    # than the remaining marks.
    if prefix in ('H3K9me3', 'H3K27me3'):
        calculate_type = 'transcript_anno_duplicates.bed'
    else:
        calculate_type = 'transcript_tsses_-10-4kb.bed'

    rna = "/ifs3/scdata/RNA-Seq_counts/sample_" + sample_type
    rna_path = pd.read_csv(rna, header=None)

    chip = "/ifs3/scdata/chipseqData/" + prefix + "_" + sample_type
    chip_path = pd.read_csv(chip, header=None)

    # Samples are paired by row; fail early with a clear message instead of a
    # KeyError halfway through the run when the ChIP list is too short.
    if len(chip_path) < len(rna_path):
        raise ValueError(
            "{} lists {} samples but {} lists {}; every RNA-seq sample needs "
            "a matching ChIP-seq sample".format(
                chip, len(chip_path), rna, len(rna_path)))

    outdir = '/ifs3/scdata/4.1ChIPseq/1.width/' + prefix
    ensure_directory_exists(outdir)

    data = pd.DataFrame(columns=['gene_name'])
    # zip stops after the last RNA-seq row, so surplus ChIP rows are ignored.
    for rna_dir, chip_dir in zip(rna_path[0], chip_path[0]):
        sample = os.path.basename(chip_dir)
        regions_suffix = sample + ".sorted.clean.dedup.bgsub.smooth.regions.xls"

        # Read the peak widths from the danpos result (.xls) and save them as
        # a BED file so they can be overlapped with the annotation below.
        # Two naming schemes exist for the result file; try the short one first.
        regions_file = chip_dir + "/pooled/bam_" + regions_suffix
        try:
            tmpdf = pd.read_csv(regions_file, sep='\t')
        except FileNotFoundError:
            regions_file = (chip_dir + "/pooled/" + sample
                            + "_out_alignment_" + regions_suffix)
            tmpdf = pd.read_csv(regions_file, sep='\t')

        outdir1 = outdir + '/' + sample
        ensure_directory_exists(outdir1)

        peak_file = os.path.join(outdir1, 'peak_width.bed')
        overlap_file = os.path.join(outdir1, 'overlap.bed')

        tmpdf[['chr', 'start', 'end', 'width_above_cutoff']].to_csv(
            peak_file, index=False, header=False, sep='\t')

        # Overlap peaks with the annotation; -wa/-wb keep both records, so the
        # first four output columns are the peak (column 3 is the width).
        annotation_bed = pybedtools.BedTool(rna_dir + '/' + calculate_type)
        peak_bed = pybedtools.BedTool(peak_file)
        intersected = peak_bed.intersect(annotation_bed, wa=True, wb=True)
        intersected.saveas(overlap_file)

        # Locate the attribute column of the overlap (string values that all
        # start with 'gene_id'). If several columns qualify the last one wins.
        tmp = pd.read_csv(overlap_file, sep='\t', header=None)
        attr_column = None
        for column in tmp.columns:
            if (isinstance(tmp[column][0], str)
                    and tmp[column].str.startswith('gene_id').all()):
                attr_column = column
        if attr_column is None:
            # Without this guard the ids extracted for the previous sample
            # would silently be reused (or a NameError raised on the first).
            raise ValueError(
                "No column of 'gene_id ...' attributes found in " + overlap_file)

        # Extract the values following "transcript_id" and "gene_name".
        transcript_id_values = tmp[attr_column].str.extract(
            r'transcript_id ([^\s;]+)', expand=False)
        gene_name_values = tmp[attr_column].str.extract(
            r'gene_name ([^\s;]+)', expand=False)

        gene_transcript_index = pd.DataFrame()
        gene_transcript_index['transcript_id'] = transcript_id_values
        gene_transcript_index['gene_name'] = gene_name_values
        gene_transcript_index.to_csv(
            os.path.join(outdir1, 'gene_transcript_index.csv'), index=False)

        # Width of the overlapping peak for every transcript hit.
        df = pd.DataFrame()
        df['transcript_id'] = transcript_id_values
        df[sample] = tmp[3]
        # aggregate_dataframe is a project helper called with its defaults;
        # presumably it collapses duplicate transcript_id rows - TODO confirm.
        df = aggregate_dataframe(df)
        df.to_csv(os.path.join(outdir1, 'transcript_width.csv'), index=False)

        # Map transcripts to genes and aggregate per gene with 'max'.
        df = pd.merge(df, gene_transcript_index, on='transcript_id', how='outer')
        df = df.drop('transcript_id', axis=1).dropna()
        df = aggregate_dataframe(df, groupby_col='gene_name', agg_funcs='max')
        df.to_csv(os.path.join(outdir1, 'gene_width.csv'), index=False)

        # Merge this sample's per-gene widths into the cross-sample table.
        data = data.merge(df, on='gene_name', how='outer')

    # Genes without a peak in a sample get width 0 before averaging.
    data = data.fillna(0)
    data['mean'] = data.drop('gene_name', axis=1).mean(axis=1)
    data = data.sort_values(by='mean', ascending=False).reset_index(drop=True)
    # Save the combined table.
    data.to_csv(outdir + '/' + sample_type + '_' + 'gene_sample_width.csv',
                index=False)

INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-016-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-018-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-019-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-020-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-021-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-023-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-024-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-025-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-026-N-A' already exists.
INFO:root:Directory '/ifs3/scdata/4.1ChIPseq/1.width/H3K4me3/CS1807-027-N-A' already 