In [20]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(threshold=np.inf)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

查看数据是否正确并保存标签文件

In [21]:
# Load the sequence/label table and discard rows in which every column is missing
data = pd.read_csv(r'.\data\Rolling ring cutting40-70_predication.csv').dropna(how='all')
# Report the number of remaining rows, then show the frame as a sanity check
print("DataSet Size: ", data.shape[0])
print(data)

DataSet Size:  386226
                                                 sequence  label
0                LGLGLGLGLGLGLGLRLRLRLRLRLRLRLRLRLRLRQWPR      0
1                LLRLLIGRSRRARLDAALRRGAAGLRWRAWRLWHARRLRR      0
2                WWLSEPWRLRRRRARIVRTPFPPAWRAILRRRVPVYRRLP      0
3                LLLGLALFAVAGGLLVYAAISLGWRVAVRLRRRRRLRRRM      0
4                LAWWRATQARWHAWVAAWPPWRALRARVAAWRRVWRRWRR      0
...                                                   ...    ...
386221           GDIDFQCEQFDRLYLGKLVVRPTPNPNFRRIDAQVRDVDG      0
386222  GFTLVEVLVALAVVAVTLGAGLQAAAALTRNAQRLGEVTEAQWCAD...      0
386223  ALAVVAVTLGAGLQAAAALTRNAQRLGEVTEAQWCADNQITNLRLP...      0
386224  DNQITNLRLPEPPAFPGVGDIDFQCEQFDRLYLGKLVVRPTPNPNF...      0
386225  PEPPAFPGVGDIDFQCEQFDRLYLGKLVVRPTPNPNFRRIDAQVRD...      0

[386226 rows x 2 columns]


In [None]:
# The labels are the values of the second column of `data`
labels = data.iloc[:, 1].values
# Destination text file: one label per line, in row order
label_file_path = 'save_labels.txt'
with open(label_file_path, 'w') as label_file:
    label_file.writelines(f"{label}\n" for label in labels)
print(f"Labels have been saved to {label_file_path}")

25维特征

In [22]:
# Physicochemical properties: five numeric descriptors per residue, keyed by
# one-letter amino-acid code.
# NOTE(review): the meaning/units of the five columns are not documented in
# this notebook — TODO confirm against the source of these values.
physicochemical_properties = {
    'A': [5, 0, 2, 7.00, 0.52],
    'C': [6, 0, 2, 7.00, 0.29],
    'D': [8, -1, 4, 3.65, -0.9],
    'E': [9, -1, 4, 3.22, -0.74],
    'F': [11, 0, 2, 7.00, 1.19],
    'G': [4, 0, 2, 7.00, 0.48],
    'H': [10, 1, 4, 6.00, -0.4],
    'I': [8, 0, 2, 7.00, 1.38],
    'K': [9, 1, 2, 10.53, -1.5],
    'L': [8, 0, 2, 7.00, 1.06],
    'M': [8, 0, 2, 7.00, 0.64],
    'N': [8, 0, 4, 8.18, -0.78],
    'P': [7, 0, 2, 7.00, 0.12],
    'Q': [9, 0, 4, 7.00, -0.85],
    'R': [11, 1, 4, 12.48, -2.53],
    'S': [6, 0, 4, 7.00, -0.18],
    'T': [7, 0, 4, 7.00, -0.05],
    'V': [7, 0, 2, 7.00, 1.08],
    'W': [14, 0, 3, 7.00, 0.81],
    'Y': [12, 0, 3, 10.07, 0.26],
}
# Mapping dictionary: one-letter code -> 1-based index (alphabetical order of
# the 20 codes), used to pick the one-hot position of each residue.
amino_acid_map = {
    code: position
    for position, code in enumerate('ACDEFGHIKLMNPQRSTVWY', start=1)
}

独热编码函数

In [23]:
# Convert an amino-acid sequence into a one-hot + physicochemical feature matrix
def one_hot_encode_sequence(sequence, mapping):
    """Encode a sequence as a ``(len(sequence), 25)`` float matrix.

    Each row describes one residue: columns 0-19 are a one-hot vector whose
    hot position is ``mapping[residue] - 1``, and columns 20-24 are that
    residue's five values from the module-level ``physicochemical_properties``
    table after standardisation.

    Args:
        sequence: Iterable of one-letter residue codes (typically a ``str``).
        mapping: Dict from residue code to its 1-based one-hot index.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sequence), 25)``. An empty sequence
        yields an empty ``(0, 25)`` array instead of raising.

    Raises:
        KeyError: If a residue is missing from ``mapping`` or from
            ``physicochemical_properties``.
    """
    n_one_hot = 20
    n_properties = 5
    n_residues = len(sequence)

    # Nothing to encode (and StandardScaler cannot be fit on zero samples),
    # so return an empty matrix that still has the expected column count.
    if n_residues == 0:
        return np.zeros((0, n_one_hot + n_properties))

    # Allocate the whole matrix once instead of building one vector per residue.
    encoded_array = np.zeros((n_residues, n_one_hot + n_properties))
    one_hot_block = encoded_array[:, :n_one_hot]
    property_block = encoded_array[:, n_one_hot:]
    for position, amino_acid in enumerate(sequence):
        # Mapping values are 1-based; shift to a 0-based column index.
        one_hot_block[position, mapping[amino_acid] - 1] = 1
        property_block[position] = physicochemical_properties[amino_acid]

    # Standardise only the five physicochemical columns; the one-hot part is
    # left untouched.
    # NOTE(review): the scaler is fit on this single sequence, so the same
    # residue receives different scaled values in different sequences —
    # confirm this per-sequence normalisation is intended.
    scaler = StandardScaler()
    encoded_array[:, -n_properties:] = scaler.fit_transform(encoded_array[:, -n_properties:])
    return encoded_array


In [27]:
# Root directory under which the per-sequence feature files are written
Base_Save_Path = r'.\data\data38w'
# Path of the CSV file whose first column holds the amino-acid sequences
excel_file_path = r'.\data\Rolling ring cutting40-70_predication.csv'

# Read the CSV (rows are NOT filtered here, so file names follow the raw row order)
excel_data = pd.read_csv(excel_file_path)
# Residue codes that are filtered out explicitly, and how many sequences were
# skipped for each of them: N[0] -> 'X', N[1] -> 'B', N[2] -> 'U'
unknown_codes = ('X', 'B', 'U')
N = [0]*3
# Sequences skipped for any other reason (missing value, or a character that
# has no entry in amino_acid_map and would otherwise abort the run with KeyError)
skipped_other = 0
# Maximum number of files written into one sub-folder
files_per_folder = 10000
# Index of the sub-folder currently being filled (folder_1, folder_2, ...)
current_folder_index = 0
# Number of files written so far; drives the switch to a new sub-folder
file_counter = 0
# Iterate the first column directly instead of scalar .iloc lookups per row
for i, Chain in enumerate(excel_data.iloc[:, 0]):
    # An empty CSV cell is read as NaN (a float), which cannot be scanned for residues
    if not isinstance(Chain, str):
        skipped_other += 1
        print(f"Skipping missing or non-text sequence at position {i+1}.")
        continue
    # First of 'X', 'B', 'U' (checked in that order) found in the sequence, if any;
    # only that first hit is counted, as a sequence is skipped once
    flagged_code = next((code for code in unknown_codes if code in Chain), None)
    if flagged_code is not None:
        N[unknown_codes.index(flagged_code)] += 1
        print(f"Skipping sequence with unknown amino acid '{flagged_code}' at position {i+1}.")
        continue
    # Any remaining character without a mapping entry cannot be encoded
    unsupported = set(Chain).difference(amino_acid_map)
    if unsupported:
        skipped_other += 1
        print(f"Skipping sequence with unsupported residue(s) {sorted(unsupported)} at position {i+1}.")
        continue
    # Start a new sub-folder each time the previous one has received files_per_folder files
    if file_counter % files_per_folder == 0:
        current_folder_index += 1
        current_folder_path = os.path.join(Base_Save_Path, f'folder_{current_folder_index}')
        os.makedirs(current_folder_path, exist_ok=True)
    # The file is named after the 1-based row position in the CSV
    file_name = str(i + 1)
    Save_Path = os.path.join(current_folder_path, f'{file_name}.txt')
    # Build the one-hot + physicochemical feature matrix for this sequence
    Matrix = one_hot_encode_sequence(Chain, amino_acid_map)
    # One matrix row per line, values separated by single spaces
    with open(Save_Path, 'w') as f:
        for row in Matrix:
            f.write(' '.join(map(str, row)) + '\n')
    file_counter += 1
# All sequences processed
print("Over!")

Skipping sequence with unknown amino acid 'U' at position 330524.
Skipping sequence with unknown amino acid 'U' at position 330525.
Skipping sequence with unknown amino acid 'U' at position 330526.
Skipping sequence with unknown amino acid 'U' at position 330527.
Skipping sequence with unknown amino acid 'U' at position 330528.
Skipping sequence with unknown amino acid 'U' at position 330616.
Skipping sequence with unknown amino acid 'U' at position 330617.
Skipping sequence with unknown amino acid 'U' at position 330618.
Skipping sequence with unknown amino acid 'U' at position 330619.
Skipping sequence with unknown amino acid 'U' at position 330620.
Skipping sequence with unknown amino acid 'U' at position 330694.
Skipping sequence with unknown amino acid 'U' at position 330695.
Skipping sequence with unknown amino acid 'U' at position 330696.
Skipping sequence with unknown amino acid 'U' at position 330697.
Skipping sequence with unknown amino acid 'U' at position 330750.
Skipping s