In [1]:
import pandas as pd
import numpy as np
import pyBigWig

# Input file locations used by the rest of the notebook.

# Gene-expression table (TSV).
# NOTE(review): "gene_expressiom" looks like a misspelling; confirm it matches
# the directory name on disk before changing it.
expression_file = "/data/haocheng/data/gene_expressiom/ENCFF345SHY.tsv"

# Signal track (BigWig).
bigwig_file = "/data/haocheng/data/bam/result/GM12878.bigwig"

# Gene annotation (GTF).
gtf_file = "/data/haocheng/data/DNA/Homo_sapiens.GRCh38.104.gtf"

In [2]:

def _parse_gtf_attributes(attribute_field):
    """Parse the 9th GTF column (`key "value"; key "value";`) into a dict.

    Items that are empty or have no key are skipped instead of raising, and
    the value is everything after the first space with surrounding quotes and
    the trailing semicolon removed. If a key repeats, the last value wins.
    """
    attributes = {}
    for item in attribute_field.split('; '):
        key, _, value = item.strip().partition(' ')
        if not key:
            continue
        attributes[key] = value.strip(' ";')
    return attributes


genes = []

# Accept chromosome names both with and without the 'chr' prefix.
valid_chromosomes = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY'] + [str(i) for i in range(1, 23)] + ['X', 'Y']
# Set copy for O(1) membership tests; the list above keeps its original type.
_valid_chromosome_set = frozenset(valid_chromosomes)

with open(gtf_file, 'r') as f:
    for line in f:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        # Blank or truncated lines have fewer than 9 columns; skip them rather
        # than failing on fields[2] / fields[8].
        if len(fields) < 9 or fields[2] != 'gene':
            continue
        info = _parse_gtf_attributes(fields[8])
        if info.get('gene_biotype') != 'protein_coding':
            continue
        chrom = fields[0]
        if chrom in _valid_chromosome_set:
            start = int(fields[3]) - 1  # GTF starts are 1-based; convert to 0-based
            end = int(fields[4])
            gene_id = info['gene_id']
            genes.append([chrom, start, end, gene_id])
        else:
            # Report only the gene ID and contig of genes outside chr1-22/X/Y.
            print(f"Skipped non-chromosomal gene: {info['gene_id']} in {chrom}")

genes_df = pd.DataFrame(genes, columns=['chrom', 'start', 'end', 'gene_id'])

# Summarise how many protein-coding genes were kept.
if genes_df.empty:
    print("No genes found.")
else:
    print(f"Total genes found: {len(genes_df)}")

Skipped non-chromosomal gene: ENSG00000198888 in MT
Skipped non-chromosomal gene: ENSG00000198763 in MT
Skipped non-chromosomal gene: ENSG00000198804 in MT
Skipped non-chromosomal gene: ENSG00000198712 in MT
Skipped non-chromosomal gene: ENSG00000228253 in MT
Skipped non-chromosomal gene: ENSG00000198899 in MT
Skipped non-chromosomal gene: ENSG00000198938 in MT
Skipped non-chromosomal gene: ENSG00000198840 in MT
Skipped non-chromosomal gene: ENSG00000212907 in MT
Skipped non-chromosomal gene: ENSG00000198886 in MT
Skipped non-chromosomal gene: ENSG00000198786 in MT
Skipped non-chromosomal gene: ENSG00000198695 in MT
Skipped non-chromosomal gene: ENSG00000198727 in MT
Skipped non-chromosomal gene: ENSG00000274175 in KI270728.1
Skipped non-chromosomal gene: ENSG00000273554 in KI270728.1
Skipped non-chromosomal gene: ENSG00000278782 in KI270728.1
Skipped non-chromosomal gene: ENSG00000277761 in KI270728.1
Skipped non-chromosomal gene: ENSG00000275869 in KI270728.1
Skipped non-chromosomal 

In [3]:
# Peek at the first 5 lines of the GTF file, then show the parsed gene table.
with open(gtf_file, 'r') as gtf_handle:
    lines_shown = 0
    while lines_shown < 5:
        # readline() returns '' at end of file, so a short file prints blank lines.
        header_line = gtf_handle.readline()
        print(header_line.strip())
        lines_shown += 1
print(genes_df)


#!genome-build GRCh38.p13
#!genome-version GRCh38
#!genome-date 2013-12
#!genome-build-accession GCA_000001405.28
#!genebuild-last-updated 2021-03
      chrom     start       end          gene_id
0         1    685678    686673  ENSG00000284662
1         1   1211339   1214153  ENSG00000186827
2         1   1203507   1206592  ENSG00000186891
3         1   1471764   1497848  ENSG00000160072
4         1   6624865   6635586  ENSG00000041988
...     ...       ...       ...              ...
19919    21  34513141  34615113  ENSG00000159200
19920    21  36156781  36294274  ENSG00000142197
19921    21  15729981  15880064  ENSG00000155313
19922    21   6499202   6564489  ENSG00000276076
19923    21  31118415  31559977  ENSG00000156299

[19924 rows x 4 columns]


In [3]:
# Step 2: extract signal values from the BigWig file.
# Open the BigWig file; the handle is bound to `bw` and closed in the final cell.
bw = pyBigWig.open(bigwig_file)

# Fetch every per-base signal value for each gene (plus flanking regions).
def extract_all_signals(bw, genes_df, flanking=500):
    """Return per-base signal records for every gene in ``genes_df``.

    Parameters
    ----------
    bw : object
        Anything exposing ``values(chrom, start, end, numpy=True)``; in this
        notebook it is the handle returned by ``pyBigWig.open``.
    genes_df : pandas.DataFrame
        Must have ``chrom``, ``start`` and ``end`` columns. It is NOT modified:
        the 'chr' prefix is applied to a local copy of the names, and only to
        names that do not already start with 'chr', so re-running the cell
        cannot produce 'chrchr1'.
    flanking : int, default 500
        Number of bases added on each side of the gene; the start is clamped
        at 0.

    Returns
    -------
    list[list[dict]]
        One inner list per row of ``genes_df``, in row order. Each dict is
        ``{'start': p, 'end': p + 1, 'signal': value}`` for one base.
        One dict is built per base, so memory grows with the total number of
        bases queried.
    """
    # Normalise chromosome names locally instead of overwriting the caller's column.
    chrom_names = [
        name if name.startswith('chr') else 'chr' + name
        for name in genes_df['chrom'].astype(str)
    ]

    # Query window: gene body extended by `flanking` on both sides.
    flanking_starts = np.maximum(0, genes_df['start'] - flanking)
    flanking_ends = genes_df['end'] + flanking

    intervals_with_signals = []
    for chrom, region_start, region_end in zip(chrom_names, flanking_starts, flanking_ends):
        # Plain Python ints keep the query arguments and the stored coordinates simple.
        region_start, region_end = int(region_start), int(region_end)
        signals = bw.values(chrom, region_start, region_end, numpy=True)
        intervals_with_signals.append([
            {'start': region_start + j, 'end': region_start + j + 1, 'signal': signals[j]}
            for j in range(len(signals))
        ])

    return intervals_with_signals

# Extract the signal values and store the returned list (one entry per row)
# in a new 'signal_intervals' column of genes_df.
genes_df['signal_intervals'] = extract_all_signals(bw, genes_df)


: 

In [3]:
# Step 2: extract signal values from the BigWig file.
# NOTE(review): an earlier cell also binds `bw = pyBigWig.open(bigwig_file)`;
# running both rebinds the name without closing the previous handle.
bw = pyBigWig.open(bigwig_file)

def extract_signals_in_chunks(bw, genes_df, chunk_size=1000, flanking=500):
    """Return per-base signal records for every gene, reporting progress per chunk.

    Each gene is queried separately and its records are built from its own
    signal array, so every gene gets the values of its own window.

    Parameters
    ----------
    bw : object
        Anything exposing ``values(chrom, start, end, numpy=True)``; in this
        notebook it is the handle returned by ``pyBigWig.open``.
    genes_df : pandas.DataFrame
        Must have ``chrom``, ``start`` and ``end`` columns. It is NOT modified:
        the 'chr' prefix is applied to a local copy of the names, and only to
        names that do not already start with 'chr'.
    chunk_size : int, default 1000
        Number of genes processed between progress messages. Must be >= 1.
    flanking : int, default 500
        Number of bases added on each side of the gene; the start is clamped
        at 0.

    Returns
    -------
    list[list[dict]]
        One inner list per row of ``genes_df``, in row order. Each dict is
        ``{'start': p, 'end': p + 1, 'signal': value}`` for one base.
        One dict is built per base, so memory grows with the total number of
        bases queried.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Normalise chromosome names locally instead of overwriting the caller's column.
    chrom_names = [
        name if name.startswith('chr') else 'chr' + name
        for name in genes_df['chrom'].astype(str)
    ]

    # Query window: gene body extended by `flanking` on both sides.
    # Plain arrays make the slices below purely positional.
    flanking_starts = np.asarray(np.maximum(0, genes_df['start'] - flanking))
    flanking_ends = np.asarray(genes_df['end'] + flanking)

    intervals_with_signals = []
    n_genes = len(genes_df)

    # Walk through the genes chunk by chunk.
    for chunk_start in range(0, n_genes, chunk_size):
        chunk_end = min(chunk_start + chunk_size, n_genes)

        for chrom, region_start, region_end in zip(
            chrom_names[chunk_start:chunk_end],
            flanking_starts[chunk_start:chunk_end],
            flanking_ends[chunk_start:chunk_end],
        ):
            region_start, region_end = int(region_start), int(region_end)
            # Keep each gene's values in their own array; indexing a
            # chunk-wide concatenation from 0 would hand every gene the
            # first gene's values.
            signals = bw.values(chrom, region_start, region_end, numpy=True)
            intervals_with_signals.append([
                {'start': region_start + j, 'end': region_start + j + 1, 'signal': signals[j]}
                for j in range(len(signals))
            ])

        # Report the chunk that has just been completed.
        print(f"处理完块: {chunk_start // chunk_size + 1}")

    return intervals_with_signals
# Extract the signal values and store the returned list (one entry per row)
# in a new 'signal_intervals' column of genes_df.
genes_df['signal_intervals'] = extract_signals_in_chunks(bw, genes_df)

# Print the results (no print statement follows in this cell).


处理完块: 1
处理完块: 2
处理完块: 3
处理完块: 4
处理完块: 5
处理完块: 6
处理完块: 7
处理完块: 8


In [None]:

# Step 3: load the gene-expression table.
expression_df = pd.read_csv(expression_file, sep='\t')
expression_df['log_tpm'] = np.log1p(expression_df['TPM'])  # log(1 + TPM)

# Step 4: combine gene information with expression values.
# NOTE(review): this is an inner join on exact gene_id strings. If the
# expression table uses versioned IDs (e.g. "ENSG00000186827.11") while the GTF
# IDs are unversioned, matching rows are silently dropped — verify the row count.
input_df = pd.merge(genes_df, expression_df[['gene_id', 'log_tpm']], on='gene_id')

# Step 5: build the feature vectors.
def create_feature_vector(signal, log_tpm):
    """Return ``signal`` with ``log_tpm`` appended as the last element."""
    return np.concatenate([signal, [log_tpm]])


def _signals_from_intervals(intervals):
    """Collect the 'signal' value of each per-base record into a float array."""
    return np.array([interval['signal'] for interval in intervals], dtype=float)


# genes_df has no 'signal' column; the per-base values live inside the dicts
# stored in 'signal_intervals', so pull them out before building the vector.
# NOTE(review): log_tpm is appended to every feature vector AND used as the
# target y below, which leaks the label into X — confirm this is intended.
feature_list = [
    create_feature_vector(_signals_from_intervals(intervals), log_tpm)
    for intervals, log_tpm in zip(input_df['signal_intervals'], input_df['log_tpm'])
]
input_df['feature_vector'] = pd.Series(feature_list, index=input_df.index, dtype=object)

# Gene windows differ in length, so the vectors are generally ragged. Stack
# into a regular 2-D matrix only when every vector has the same length;
# otherwise keep a 1-D object array holding one vector per gene.
if len({len(vec) for vec in feature_list}) <= 1:
    feature_vectors = np.array(feature_list)
else:
    feature_vectors = np.empty(len(feature_list), dtype=object)
    for row_idx, vec in enumerate(feature_list):
        feature_vectors[row_idx] = vec

# Step 6: prepare model input and output.
X = feature_vectors  # feature matrix (or object array of per-gene vectors)
y = input_df['log_tpm'].values  # target variable

# Check the shapes: X is (n_samples, n_features) when all vectors share one
# length, and (n_samples,) when they are ragged; y is (n_samples,).
print("Feature vector shape:", X.shape)
print("Target vector shape:", y.shape)

# Step 7: save the arrays. An object-dtype X is pickled by np.save and needs
# np.load(..., allow_pickle=True) to read back.
np.save("X.npy", X)  # feature vectors
np.save("y.npy", y)  # target variable

# Close the BigWig file.
bw.close()
