# 数据处理

In [20]:
import pandas as pd

df_train = pd.read_csv('train_10000.csv')
df_test = pd.read_csv('validate_1000.csv')

## 清理数据

使用fillna填充NaN为平均值

In [21]:
df_train = df_train.groupby('label').apply(lambda x: x.fillna(x.mean()))

## 分割数据

In [22]:
from sklearn.model_selection import train_test_split

feature = df_train.drop(['sample_id', 'label'], axis=1)
label = df_train['label']
X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0)

ValueError: test_size=0 should be either positive and smaller than the number of samples 10000 or a float in the (0, 1) range

## 平衡数据集

合成少数类过采样技术（SMOTE）：在少数类中创建新的合成样本，以使其与多数类具有相同数量的样本。这种方法可以避免信息丢失和过拟合问题。使用imblearn库中的SMOTE类来执行此操作。

In [None]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

# 训练并验证

### macro_F1
Linear SVC:<0.2
KNN classifier:0.88
SVC:0.21
RFST:0.97
ADA:0.47

In [None]:
# from sklearn.svm import LinearSVC, SVC
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier

# C = 1
# 创建不同的分类器
# classifiers = {
#      'Linear SVC': LinearSVC(C=C),
#      'KNN classifier': KNeighborsClassifier(C, n_jobs=-1),
#      'SVC': SVC(),
#      'RFST': RandomForestClassifier(n_jobs=-1),
#      'ADA': AdaBoostClassifier()
# }
clf = RandomForestClassifier(n_jobs=-1)

使用不同算法训练模型并输出报告

In [None]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# 训练
clf.fit(X_train, np.ravel(y_train))

# 推理验证
y_pred = clf.predict(X_test)
macro_F1 = f1_score(y_test, y_pred, average='macro')
print("macro_F1 is {}".format(macro_F1))
print(classification_report(y_test, y_pred))

# 保存模型

In [None]:
from skops.io import dump

dump(clf, "preproccess.skops")