In [2]:
# 导入需要的包
# Import the required packages.
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
# 数据读取 input data
# Location of the training CSV on the competition platform.
# For a local run, point train_dir at "train.csv" instead.
train_dir = "/bohr/dataset-sxb8/v1/train.csv"

# Load the whole training table into memory.
df_train = pd.read_csv(filepath_or_buffer=train_dir)

In [5]:
# 数据预处理与数据嵌入；Data Preprocessing and Data Embedding
# 由于该过程在预测集上也需要进行，此处将其整理为函数；Since this process also needs to be performed on the prediction set, it is organized into a function here.
# 选手可充分考虑不同数据嵌入技术，以提高预测效果；Participants are encouraged to fully consider different data embedding techniques to improve prediction performance.

def prepare_data(df):
    """Encode the ``DNA`` column of ``df`` as a float32 tensor of nucleotide codes.

    Every character of every sequence is replaced by an integer code:
    A=1, C=2, G=3, T=4. Code 0 is left unused, which matches the lookup
    table used when the training data is encoded.

    Args:
        df: pandas DataFrame with a ``DNA`` column of strings over A/C/G/T.
            All sequences must have the same length, otherwise they cannot
            be stacked into one rectangular tensor.

    Returns:
        A ``torch.float32`` tensor with one row per row of ``df`` (in
        positional order) and one column per nucleotide position.

    Raises:
        ValueError: if a sequence contains a character other than A, C, G, T.
    """
    code_of = {'A': 1, 'C': 2, 'G': 3, 'T': 4}

    # Build a fresh list on every call. Appending to a module-level ``dna``
    # list would mix these rows with whatever was encoded before (e.g. the
    # training set), and would fail outright if that list did not exist yet.
    dna = []
    # Iterate the column positionally so frames with a non-default index
    # (filtered / sliced / shuffled) are handled as well.
    for seg in df['DNA']:
        try:
            dna.append([code_of[n] for n in seg])
        except KeyError as err:
            raise ValueError(
                f"unexpected nucleotide {err.args[0]!r} in sequence {seg!r}"
            ) from None

    return torch.tensor(dna, dtype=torch.float32)

In [6]:
# Model training.
# A small fully connected neural network (multilayer perceptron) is used here as the regression model.
# Participants are encouraged to fully consider different machine learning/deep learning models to improve prediction performance.
class MyDataset(torch.utils.data.Dataset):
    """Pairs each encoded DNA sequence with its E-score for use in a DataLoader.

    Args:
        dna: indexable collection of encoded sequences (one entry per sample).
        e_score: indexable collection of targets, aligned with ``dna``.
    """

    def __init__(self, dna, e_score):
        super().__init__()
        self.dna, self.e_score = dna, e_score

    def __len__(self):
        # The number of samples is defined by the feature collection.
        return len(self.dna)

    def __getitem__(self, idx):
        features = self.dna[idx]
        target = self.e_score[idx]
        return features, target


class Net(nn.Module):
    """Fully connected regressor: 8 inputs -> 64 -> 64 -> 16 -> 1 output.

    Every hidden layer is followed by a Tanh activation; the final layer is
    a plain linear projection to a single value per sample.
    """

    def __init__(self):
        super().__init__()
        widths = (8, 64, 64, 16)
        layers = []
        # One Linear + Tanh pair per consecutive pair of widths.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.Tanh())
        # Output head: no activation after the last projection.
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.net(x)


# Encode the training set: each nucleotide becomes its position in the lookup
# table (A=1, C=2, G=3, T=4; slot 0 is never matched by a character).
idx_l = ['', 'A', 'C', 'G', 'T']
row_count = df_train.shape[0]

# One list of integer codes per training sequence.
dna = [
    [idx_l.index(base) for base in df_train['DNA'][row]]
    for row in range(row_count)
]
# Regression targets, in the same row order as the sequences.
e_scores = [df_train['E-score'][row] for row in range(row_count)]

# Float tensors so both can be fed straight to the network and the loss.
dna_t = torch.tensor(dna, dtype=torch.float32)
e_scores_t = torch.tensor(e_scores, dtype=torch.float32)

# Mini-batch iterator over the encoded training set, reshuffled every epoch.
dataset = MyDataset(dna_t, e_scores_t)
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=32)

# Model, loss and optimizer.
NUM_EPOCHS = 500
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(NUM_EPOCHS):
    # One pass over the shuffled training data.
    for batch_dna, batch_scores in dataloader:
        optimizer.zero_grad()
        pred = net(batch_dna)
        # The network emits one column per sample; take it as a 1-D vector
        # so it lines up with the 1-D target batch.
        loss = criterion(pred[:, 0], batch_scores)
        loss.backward()
        optimizer.step()

    # Report the loss on the full training set every 10th epoch only.
    if epoch % 10 != 0:
        continue
    with torch.no_grad():
        pred = net(dna_t)
        loss = criterion(pred[:, 0], e_scores_t)
    print(f"Epoch {epoch}, loss = {loss}")

Epoch 0, loss = 0.01857765018939972
Epoch 10, loss = 0.013916683383286
Epoch 20, loss = 0.012102792039513588
Epoch 30, loss = 0.010693647898733616
Epoch 40, loss = 0.00965876504778862
Epoch 50, loss = 0.009177572093904018
Epoch 60, loss = 0.008699782192707062
Epoch 70, loss = 0.008557253517210484
Epoch 80, loss = 0.008097860030829906
Epoch 90, loss = 0.007916121743619442
Epoch 100, loss = 0.00794250052422285
Epoch 110, loss = 0.0075547583401203156
Epoch 120, loss = 0.0075728450901806355
Epoch 130, loss = 0.007366225589066744
Epoch 140, loss = 0.007325200829654932
Epoch 150, loss = 0.007075557019561529
Epoch 160, loss = 0.007599414326250553
Epoch 170, loss = 0.007013965398073196
Epoch 180, loss = 0.007068799342960119
Epoch 190, loss = 0.007168845273554325
Epoch 200, loss = 0.006720400881022215
Epoch 210, loss = 0.006702834740281105
Epoch 220, loss = 0.006819236557930708
Epoch 230, loss = 0.006791173480451107
Epoch 240, loss = 0.006716417148709297
Epoch 250, loss = 0.0066254050470888615


In [None]:
import os
import zipfile
# 模型预测, Model Prediction
# 将连续值转化为01标签，Convert continuous values into 0-1 labels.

def make_label(y, per=95):
    """Binarize continuous predictions at a percentile cut-off.

    Args:
        y: torch tensor of predicted scores.
        per: percentile (0-100) of ``y`` used as the threshold; values at or
            above it are labelled 1, the rest 0.

    Returns:
        NumPy integer array with the same shape as ``y`` containing 0/1 labels.
    """
    scores = y.detach().numpy()
    # The cut-off is computed over all predictions passed in.
    is_positive = scores >= np.percentile(scores, per)
    return np.where(is_positive, 1, 0)
# 读取测试集数据，Read test set data.
# The evaluation platform supplies the test-set directory via DATA_PATH.
data_root = os.environ.get('DATA_PATH')
if data_root:
    DATA_PATH = data_root + "/"
else:
    # No test set is available in a plain baseline run. DATA_PATH is not
    # assigned in this branch, so the first use below raises; the messages
    # tell the user that this error is expected.
    print("Baseline运行时，因为无法读取测试集，所以会有此条报错，属于正常现象")
    print("When baseline is running, this error message will appear because the test set cannot be read, which is a normal phenomenon.")

# Read test set A.
testA_path = f"{DATA_PATH}testA.csv"
df_testA = pd.read_csv(testA_path)
# Read test set B.
testB_path = f"{DATA_PATH}testB.csv"
df_testB = pd.read_csv(testB_path)

# Leaderboard A: encode, predict, binarize, write one label per row.
x_testA = prepare_data(df_testA)
y_predA = make_label(net(x_testA))
pd.DataFrame(y_predA).to_csv("submissionA.csv", header=False, index=False)

# Leaderboard B: same pipeline on the second test set.
x_testB = prepare_data(df_testB)
y_predB = make_label(net(x_testB))
pd.DataFrame(y_predB).to_csv("submissionB.csv", header=False, index=False)

In [None]:
# 定义要打包的文件和压缩文件名，Define the files to be packaged and the compressed file name.
# Submission files to bundle, and the name of the archive to produce.
files_to_zip = ['submissionA.csv', 'submissionB.csv']
zip_filename = 'submission.zip'

# Write a new archive (overwriting any existing one) and add each file under
# its bare file name, without any directory prefix.
with zipfile.ZipFile(zip_filename, mode='w') as archive:
    for csv_path in files_to_zip:
        archive.write(csv_path, arcname=os.path.basename(csv_path))

print(f'{zip_filename} is created succefully!')