In [6]:
import pysam
import pandas as pd
import numpy as np
import polars
from gtfparse import read_gtf
from torch.utils.data import Dataset, DataLoader
import torch
import pyBigWig


In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import pysam
import pyBigWig

class BigWigDataset(Dataset):
    def __init__(self, gtf_path, fasta_path, bigwig_path, tsv_path, gene_type='protein_coding', feature='gene', include_tpm_fpkm=True, transform=None):
        self.gtf_path = gtf_path
        self.fasta_path = fasta_path
        self.bigwig_path = bigwig_path
        self.tsv_path = tsv_path
        self.gene_type = gene_type
        self.feature = feature
        self.include_tpm_fpkm = include_tpm_fpkm
        self.transform = transform
        
        self.data = self.load_data()  # 加载并处理 GTF 文件数据
        self.tsv_data = self.load_tsv_data(tsv_path)  # 加载 TSV 文件数据
        self.fasta_file = pysam.FastaFile(self.fasta_path)  # 打开 FASTA 文件
        self.bigwig_file = pyBigWig.open(self.bigwig_path)  # 打开 BigWig 文件

    def load_data(self):
        column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
        df = pd.read_csv(self.gtf_path, sep='\t', comment='#', names=column_names)
        df = df[df['feature'] == self.feature]
        df['gene_type'] = df['attribute'].str.extract('gene_type "([^"]+)"')
        df['gene_id'] = df['attribute'].str.extract('gene_id "([^"]+)"')
        df['gene_name'] = df['attribute'].str.extract('gene_name "([^"]+)"')
        df = df[df['gene_type'] == self.gene_type]
        return df
    
    def load_tsv_data(self, tsv_path):
        tsv_data = pd.read_csv(tsv_path, sep='\t')  # 读取 TSV 文件
        return tsv_data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = self.data.iloc[idx]
        if self.include_tpm_fpkm:
            gene_id = sample['gene_id']
            tpm_fpkm_values = self.tsv_data[self.tsv_data['gene_id'] == gene_id]
            sample = sample.append(tpm_fpkm_values.iloc[0])
        if self.transform:
            sample = self.transform(sample)
        return sample
    
    def __del__(self):
        self.fasta_file.close()
        self.bigwig_file.close()

if __name__ == "__main__":
    # 主程序入口，当脚本作为主程序运行时执行以下代码

    # 设置 GTF 文件路径（需要替换为实际路径）
    gtf_path = '/data/haocheng/data/DNA/Homo_sapiens.GRCh38.104.gtf'

    # 设置 FASTA 文件路径（需要替换为实际路径）
    fasta_path = '/data/haocheng/data/DNA/Homo_sapiens.GRCh38.dna.primary_assembly.fa'

    # 设置 BigWig 文件路径（需要替换为实际路径）
    bigwig_path = '/data/haocheng/data/bam/result/GM12878.bigwig'

    # 设置 TSV 文件路径（需要替换为实际路径）
    tsv_path = '/data/haocheng/data/gene_expressiom/ENCFF345SHY.tsv'

    # 自定义基因类型，过滤特定类型的基因（例如 'protein_coding'）
    gene_type = 'protein_coding'  # 自定义基因类型

    # 自定义特征类型，过滤特定特征（例如 'gene'）
    feature = 'gene'  # 自定义特征

    # 指定是否包含 TPM 值
    include_tpm_fpkm = True  # 包含 TPM 值

    # 创建 BigWigDataset 数据集实例，传入各项参数
    dataset = BigWigDataset(gtf_path=gtf_path, fasta_path=fasta_path, bigwig_path=bigwig_path,
                            tsv_path=tsv_path, gene_type=gene_type, feature=feature,
                            include_tpm_fpkm=include_tpm_fpkm)

    # 创建 DataLoader 实例，用于批量加载数据
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True)  # 设置批量大小为4，并且随机打乱数据

    # 初始化空列表，用于存储不同批次的数据
    signals = []  # 存储 BigWig 信号的列表
    tpm_values = []  # 存储 transformed TPM 值的列表

    # 遍历 DataLoader 加载的数据批次
    for batch in dataloader:
        signals.append(batch['signal'])  # 存储信号的张量
        tpm_values.append(batch['tpm'])  # 存储 TPM 的张量


  df = pd.read_csv(self.gtf_path, sep='\t', comment='#', names=column_names)


ValueError: num_samples should be a positive integer value, but got num_samples=0