In [1]:
import pandas as pd
import numpy as np
import pyBigWig

# 文件路径
gtf_file = "/data/haocheng/data/DNA/Homo_sapiens.GRCh38.104.gtf"
bigwig_file = "/data/haocheng/data/bam/result/GM12878.bigwig"
expression_file = "/data/haocheng/data/gene_expressiom/ENCFF345SHY.tsv"

In [2]:
genes = []

# 允许使用不带 'chr' 前缀的染色体名称
valid_chromosomes = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY'] + [str(i) for i in range(1, 23)] + ['X', 'Y']

with open(gtf_file, 'r') as f:
    for line in f:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if fields[2] == 'gene':
            info = {x.split(' ')[0]: x.split(' ')[1].strip('";') for x in fields[8].split('; ')}
            if info.get('gene_biotype') == 'protein_coding':
                chrom = fields[0]
                if chrom in valid_chromosomes:
                    tss = int(fields[3])  # TSS是起始位置
                    start = max(0, tss - 10000)  # 向前500个碱基
                    end = tss + 10000  # 向后500个碱基
                    gene_id = info['gene_id']
                    genes.append([chrom, start, end, gene_id])
                #else:
                  # print(f"Skipped non-chromosomal gene: {info['gene_id']} in {chrom}")  # 仅打印非染色体基因的基因ID

genes_df = pd.DataFrame(genes, columns=['chrom', 'start', 'end', 'gene_id'])

# 仅在没有找到基因时打印
if genes_df.empty:
    print("No genes found.")
else:
    print(f"Total genes found: {len(genes_df)}")

Total genes found: 19924


In [3]:
# 查看 GTF 文件的前 5 行
with open(gtf_file, 'r') as f:
    for i in range(5):  # 打印前 5 行
        print(f.readline().strip())
print(genes_df)


#!genome-build GRCh38.p13
#!genome-version GRCh38
#!genome-date 2013-12
#!genome-build-accession GCA_000001405.28
#!genebuild-last-updated 2021-03
      chrom     start       end          gene_id
0         1    675679    695679  ENSG00000284662
1         1   1201340   1221340  ENSG00000186827
2         1   1193508   1213508  ENSG00000186891
3         1   1461765   1481765  ENSG00000160072
4         1   6614866   6634866  ENSG00000041988
...     ...       ...       ...              ...
19919    21  34503142  34523142  ENSG00000159200
19920    21  36146782  36166782  ENSG00000142197
19921    21  15719982  15739982  ENSG00000155313
19922    21   6489203   6509203  ENSG00000276076
19923    21  31108416  31128416  ENSG00000156299

[19924 rows x 4 columns]


In [4]:
# 打开 BigWig 文件
bw = pyBigWig.open(bigwig_file)

# 读取 BigWig 数据并添加到 DataFrame
def get_bigwig_values(row):
    chrom = f'chr{row["chrom"]}'  # 将数字染色体转换为带 "chr" 前缀的格式
    start = row['start']
    end = row['end']
    
    # 获取 BigWig 信号值，返回长度为 1000 的值
    values = bw.values(chrom, start, end)
    
    # 如果 BigWig 返回的值数量不足 1000，填充 0
    if len(values) < 1000:
        values.extend([0] * (1000 - len(values)))
    
    return values # 确保返回的是 1000 个值

# 添加新列到 DataFrame
genes_df['bigwig_values'] = genes_df.apply(get_bigwig_values, axis=1)

# 关闭 BigWig 文件
bw.close()


In [1]:
print(genes_df)

NameError: name 'genes_df' is not defined

In [6]:
# 读取 TSV 文件
expression_file = "/data/haocheng/data/gene_expressiom/ENCFF345SHY.tsv"
expression_df = pd.read_csv(expression_file, sep='\t')

# 去掉 gene_id 中的版本号后缀
expression_df['gene_id'] = expression_df['gene_id'].str.split('.').str[0]

# 计算 log(TPM + 1)
expression_df['log_TPM'] = np.log1p(expression_df['TPM'])

# 进行合并
merged_df = genes_df.merge(expression_df[['gene_id', 'log_TPM']], on='gene_id', how='inner')


In [7]:
# 去掉 expression_df 中 gene_id 的小数点及其后面的部分
expression_df['gene_id'] = expression_df['gene_id'].str.split('.').str[0]

# 找到两个 DataFrame 中都存在的 gene_id
common_ids = set(genes_df['gene_id']).intersection(set(expression_df['gene_id']))

# 打印这些共同的 gene_id 及其数量
print(f"共同存在的 gene_id 数量: {len(common_ids)}")


共同存在的 gene_id 数量: 19807


In [8]:
print(merged_df)

      chrom     start       end          gene_id  \
0         1    675679    695679  ENSG00000284662   
1         1   1201340   1221340  ENSG00000186827   
2         1   1193508   1213508  ENSG00000186891   
3         1   1461765   1481765  ENSG00000160072   
4         1   6614866   6634866  ENSG00000041988   
...     ...       ...       ...              ...   
19821    21  34503142  34523142  ENSG00000159200   
19822    21  36146782  36166782  ENSG00000142197   
19823    21  15719982  15739982  ENSG00000155313   
19824    21   6489203   6509203  ENSG00000276076   
19825    21  31108416  31128416  ENSG00000156299   

                                           bigwig_values   log_TPM  
0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  0.058269  
1      [1349.0, 1349.0, 1349.0, 1349.0, 1349.0, 1349....  1.261298  
2      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  1.214913  
3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  2.730464  
4      [160.0, 160.0, 160.0, 1

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr
from keras.callbacks import Callback
import tensorflow as tf


In [9]:
# 假设 merged_df 是你之前合并后的数据
# 选择要作为测试集的染色体
test_chromosomes = ['13', '14', '15']  # 你可以根据需要修改这个列表

# 划分训练集和测试集
test_df = merged_df[merged_df['chrom'].isin(test_chromosomes)]
train_df = merged_df[~merged_df['chrom'].isin(test_chromosomes)]

# 打印结果
print("训练集的大小:", train_df.shape)
print("测试集的大小:", test_df.shape)
# 检查训练集和测试集中的非数值（non）值数量
non_train = train_df.isnull().sum().sum()  # 训练集中的非数值值总数
non_test = test_df.isnull().sum().sum()  # 测试集中的非数值值总数

print(f"Training set non values: {non_train}")
print(f"Testing set non values: {non_test}")
# 检查是否有无穷大或负值
#print(np.isinf(X_train_scaled).sum())  # 检查无穷大值
#print(np.isfinite(X_train_scaled).sum())  # 检查有限值


训练集的大小: (18301, 6)
测试集的大小: (1525, 6)
Training set non values: 0
Testing set non values: 0


In [None]:
import matplotlib.pyplot as plt

plt.hist(train_df['bigwig_values'], bins=50)
plt.title("Distribution of BigWig Values")
plt.xlabel("BigWig Values")
plt.ylabel("Frequency")
plt.show()

plt.hist(train_df['log_TPM'], bins=50)
plt.title("Distribution of log(TPM) Values")
plt.xlabel("log(TPM) Values")
plt.ylabel("Frequency")
plt.show()


In [None]:

# 自定义回调类，用于在每个周期结束时打印评估指标
class CustomMetrics(Callback):
    def __init__(self, X_test, y_test):
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test

    def on_epoch_end(self, epoch, logs=None):
        y_pred = self.model.predict(self.X_test)
        mse = mean_squared_error(self.y_test, y_pred)
        mae = mean_absolute_error(self.y_test, y_pred)
        r2 = r2_score(self.y_test, y_pred)
        pearson_corr, _ = pearsonr(self.y_test, y_pred.flatten())
        
        print(f"Epoch {epoch + 1}: MSE = {mse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}, Pearson Correlation = {pearson_corr:.4f}")

# 假设 train_df 和 test_df 是你已经划分好的训练集和测试集

# 数据预处理
X_train = np.array(train_df['bigwig_values'].tolist())  # 特征：BigWig 信号值
y_train = train_df['log_TPM'].values  # 目标：log(TPM)
X_test = np.array(test_df['bigwig_values'].tolist())  # 测试集特征
y_test = test_df['log_TPM'].values  # 测试集目标

# 标准化输入特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # 对训练集进行标准化
X_test_scaled = scaler.transform(X_test)  # 对测试集进行同样的标准化

# Reshape 输入数据以符合 LSTM 的要求
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# 构建 LSTM 模型
model = Sequential()
model.add(LSTM(100, activation='tanh', input_shape=(X_train_scaled.shape[1], 1), return_sequences=True))  # 第一层 LSTM
model.add(Dropout(0.2))
model.add(LSTM(50, activation='tanh'))  # 第二层 LSTM
model.add(Dropout(0.2))
model.add(Dense(1))  # 输出层

from tensorflow.keras.optimizers import Adam

# 设置更小的学习率
optimizer = Adam(learning_rate=0.0001)  # 调整学习率
model.compile(optimizer = Adam(learning_rate=0.001, clipnorm=1.0), loss='mse')  # 使用 Adam 优化器和均方误差作为损失函数

# 训练模型，使用自定义回调
custom_metrics = CustomMetrics(X_test_scaled, y_test)
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[custom_metrics])  

In [31]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Dense, Add, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 自定义回调类，用于在每个周期结束时打印评估指标
class CustomMetrics(Callback):
    def __init__(self, X_test, y_test, threshold=0.1):
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        y_pred = self.model.predict(self.X_test)
        mse = mean_squared_error(self.y_test, y_pred)
        mae = mean_absolute_error(self.y_test, y_pred)
        r2 = r2_score(self.y_test, y_pred)
        pearson_corr, _ = pearsonr(self.y_test, y_pred.flatten())
        
        accuracy = np.mean(np.abs(y_pred.flatten() - self.y_test) < self.threshold)
        
        print(f"Epoch {epoch + 1}: MSE = {mse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}, Pearson Correlation = {pearson_corr:.4f}, Accuracy = {accuracy:.4f}")

# 数据预处理
X_train = np.array(train_df['bigwig_values'].tolist())
y_train = train_df['log_TPM'].values
X_test = np.array(test_df['bigwig_values'].tolist())
y_test = test_df['log_TPM'].values

# 标准化输入特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape 输入数据
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# 划分训练集和验证集
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# 构建 CNN 模型
input_layer = Input(shape=(X_train_final.shape[1], 1))

# 第一层卷积
conv1 = Conv1D(64, kernel_size=3, activation='relu', padding='same')(input_layer)
conv1 = Dropout(0.3)(conv1)

# 第二层卷积
conv2 = Conv1D(128, kernel_size=3, activation='relu', padding='same')(conv1)
conv2 = Dropout(0.3)(conv2)

# 第三层卷积
conv3 = Conv1D(256, kernel_size=3, activation='relu', padding='same')(conv2)
conv3 = Dropout(0.3)(conv3)

# 平铺并添加全连接层
flatten = Flatten()(conv3)
dense = Dense(50, activation='relu')(flatten)
output_layer = Dense(1)(dense)  # 输出层

# 构建模型
model = Model(inputs=input_layer, outputs=output_layer)

# 设置优化器
optimizer = Adam(learning_rate=0.00001)  # 更低的学习率
model.compile(optimizer=optimizer, loss='mse')  # 使用均方误差作为损失函数

# 训练模型，使用自定义回调
custom_metrics = CustomMetrics(X_test_scaled, y_test)
model.fit(X_train_final, y_train_final, epochs=100, batch_size=32, validation_data=(X_val, y_val), callbacks=[custom_metrics])


Epoch 1/100
Epoch 1: MSE = 2.4315, MAE = 1.2146, R² = -0.1177, Pearson Correlation = 0.2793, Accuracy = 0.0249
Epoch 2/100
Epoch 2: MSE = 2.2365, MAE = 1.1710, R² = -0.0280, Pearson Correlation = 0.3015, Accuracy = 0.0249
Epoch 3/100
Epoch 3: MSE = 2.2724, MAE = 1.1843, R² = -0.0446, Pearson Correlation = 0.2831, Accuracy = 0.0308
Epoch 4/100
Epoch 4: MSE = 2.2159, MAE = 1.1764, R² = -0.0186, Pearson Correlation = 0.2912, Accuracy = 0.0302
Epoch 5/100
Epoch 5: MSE = 2.3001, MAE = 1.1771, R² = -0.0573, Pearson Correlation = 0.2914, Accuracy = 0.0308
Epoch 6/100
Epoch 6: MSE = 2.3730, MAE = 1.1794, R² = -0.0908, Pearson Correlation = 0.2991, Accuracy = 0.0374
Epoch 7/100

KeyboardInterrupt: 