In [None]:
# 导入需要的包
# Import the required packages.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression

In [None]:
# Load the training set from its CSV file.
train_dir = "/bohr/dataset-sxb8/v1/train.csv"
df_train = pd.read_csv(filepath_or_buffer=train_dir)

In [None]:
# 数据预处理与数据嵌入；Data Preprocessing and Data Embedding
# 由于该过程在预测集上也需要进行，此处将其整理为函数；Since this process also needs to be performed on the prediction set, it is organized into a function here.
# 选手可充分考虑不同数据嵌入技术，以提高预测效果；Participants are encouraged to fully consider different data embedding techniques to improve prediction performance.

def prepare_data(df, categories=None):
    """Ordinal-encode the sequences stored in the first column of ``df``.

    Every sequence is lower-cased and split into single characters; each
    character is then replaced by its index in a sorted vocabulary, giving
    one int32 row per sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the sequence strings. All sequences
        must have the same length.
    categories : iterable of str, optional
        Fixed vocabulary to encode against (lower-cased, de-duplicated and
        sorted internally). When omitted, the vocabulary is inferred from
        ``df`` itself. Because an inferred vocabulary depends on the data,
        pass the training vocabulary when encoding a prediction set so the
        integer codes of both sets agree.

    Returns
    -------
    numpy.ndarray
        int32 array of shape ``(n_sequences, sequence_length)``.

    Raises
    ------
    ValueError
        If the sequences differ in length, if ``categories`` is empty, or if
        a sequence contains a character that is not in ``categories``.
    """
    rows = [list(seq.lower()) for seq in df.iloc[:, 0].tolist()]
    # A ragged list cannot form a rectangular array; fail with a clear message.
    if len({len(row) for row in rows}) > 1:
        raise ValueError("all sequences must have the same length")

    seq0 = np.array(rows)
    if seq0.size == 0:
        # Nothing to encode: return an empty result of the matching shape.
        return np.zeros(seq0.shape, dtype=np.int32)

    flat = seq0.ravel()
    if categories is None:
        # Sorted unique characters define the codes.
        _, codes = np.unique(flat, return_inverse=True)
    else:
        vocab = np.unique([str(c).lower() for c in categories])
        if vocab.size == 0:
            raise ValueError("categories must not be empty")
        codes = np.searchsorted(vocab, flat)
        # searchsorted returns an insertion point even for absent values, so
        # verify that each character really sits at the position found.
        known = vocab[np.minimum(codes, vocab.size - 1)] == flat
        if not known.all():
            unknown = sorted(set(flat[~known].tolist()))
            raise ValueError(f"characters not in categories: {unknown}")

    return codes.astype(np.int32).reshape(seq0.shape)

# y_train is the 'E-score' column; x_train is the encoded sequence matrix.
y_train = df_train['E-score']
x_train = prepare_data(df_train)

In [None]:
# Model training: a linear regression model is used here.
# Participants are encouraged to try other machine learning / deep learning
# models to improve prediction performance.
linear_model = LinearRegression()
linear_model.fit(X=x_train, y=y_train)

In [None]:
import os
import zipfile
# 模型预测, Model Prediction
# 将连续值转化为01标签，Convert continuous values into 0-1 labels.

def make_label(y, per=99):
    """Turn continuous values into 0/1 labels using a percentile cutoff.

    Values greater than or equal to the ``per``-th percentile of ``y`` become
    1; all other values become 0. The result is a NumPy integer array.
    """
    cutoff = np.percentile(y, per)
    return np.asarray(y >= cutoff).astype(int)
# Read the test sets. Their directory is taken from the DATA_PATH
# environment variable.
if os.environ.get('DATA_PATH'):
    DATA_PATH = os.environ.get("DATA_PATH") + "/"
else:
    print("Baseline运行时，因为无法读取测试集，所以会有此条报错，属于正常现象")
    print("When baseline is running, this error message will appear because the test set cannot be read, which is a normal phenomenon.")
    # Without DATA_PATH the test sets cannot be located. Stop here with an
    # explicit error instead of failing below with an unrelated NameError
    # on the undefined DATA_PATH variable.
    raise RuntimeError(
        "DATA_PATH environment variable is not set; testA.csv and testB.csv cannot be read."
    )
testA_path = DATA_PATH + "testA.csv"  # path of test set A
df_testA = pd.read_csv(testA_path)
testB_path = DATA_PATH + "testB.csv"  # path of test set B
df_testB = pd.read_csv(testB_path)
# Leaderboard A: encode, predict, binarize, and write one label per line
# (no header, no index).
x_testA = prepare_data(df_testA)
y_predA = make_label(linear_model.predict(x_testA))
pd.DataFrame(y_predA).to_csv("submissionA.csv", header = False, index = False)
# Leaderboard B: same steps for test set B.
x_testB = prepare_data(df_testB)
y_predB = make_label(linear_model.predict(x_testB))
pd.DataFrame(y_predB).to_csv("submissionB.csv", header = False, index = False)

In [None]:
# Files to package and the name of the resulting archive.
files_to_zip = ['submissionA.csv', 'submissionB.csv']
zip_filename = 'submission.zip'

# Create the archive; each file is stored under its base name only.
with zipfile.ZipFile(zip_filename, mode='w') as archive:
    for member_path in files_to_zip:
        archive.write(member_path, arcname=os.path.basename(member_path))

print(f'{zip_filename} is created succefully!')