In [1]:
import numpy as np
import pandas as pd
import catboost as cb
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import KMeansSMOTE
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.ensemble import VotingClassifier

In [2]:
# 创建一个包装器类，对CatBoostClassifier进行包装
class CatBoostClassifierWrapper(cb.CatBoostClassifier):
    def predict(self, X):
        return super().predict(X).ravel()

In [3]:
# 读取数据集
df_train = pd.read_csv("train_10000.csv")
df_val = pd.read_csv("validate_1000.csv")

In [4]:
# 缺失值处理
df_train = df_train.fillna(df_train.mean())
df_val = df_val.fillna(df_val.mean())

In [5]:
# 切分数据集
X_train = np.array(df_train.drop(["label", "sample_id"], axis=1))
y_train = np.array(df_train["label"])

X_val = np.array(df_val.drop(["label", "sample_id"], axis=1))
y_val = np.array(df_val["label"])

In [6]:
# 标准化数据
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)

In [7]:
# 过采样数据
kmeans_smote = KMeansSMOTE(cluster_balance_threshold=0.064, random_state=42)
X_train_resampled, y_train_resampled = kmeans_smote.fit_resample(X_train_scaled, y_train)



In [8]:
# 创建模型
catboost_model = CatBoostClassifierWrapper(random_seed=42, verbose=False)
gradientboost_model = GradientBoostingClassifier(random_state=42)
svm_model = svm.SVC(random_state=42)

voting_model = VotingClassifier(estimators=[
    ('cb', catboost_model),
    ('gb', gradientboost_model),
    ('svm', svm_model)],
    voting='hard')

In [9]:
voting_model = voting_model.fit(X_train_resampled, y_train_resampled)

In [10]:
y_val_pred = voting_model.predict(X_val_scaled)
macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(macro_f1)
print(classification_report(y_val, y_val_pred))

0.7907462426002828
              precision    recall  f1-score   support

           0       0.51      0.84      0.63       176
           1       0.72      0.50      0.59       166
           2       0.75      0.77      0.76       171
           3       1.00      0.78      0.88       169
           4       0.99      0.99      0.99       156
           5       1.00      0.81      0.90       162

    accuracy                           0.78      1000
   macro avg       0.83      0.78      0.79      1000
weighted avg       0.82      0.78      0.79      1000



In [11]:
y_train_pred = voting_model.predict(X_train_scaled)
macro_f1 = f1_score(y_train, y_train_pred, average='macro')
print(macro_f1)
print(classification_report(y_train, y_train_pred))

0.9546642503442776
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5144
           1       1.00      0.81      0.90      1062
           2       0.87      0.95      0.91      1613
           3       0.98      0.98      0.98       884
           4       1.00      0.99      1.00       554
           5       1.00      0.94      0.97       743

    accuracy                           0.96     10000
   macro avg       0.97      0.94      0.95     10000
weighted avg       0.96      0.96      0.96     10000

