In [261]:
import numpy as np
import pandas as pd
import catboost as cb
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import KMeansSMOTE
from sklearn.metrics import f1_score, classification_report
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier


In [262]:
def train_preprocess(data):
    # 去掉重复值
    data.drop_duplicates(inplace=True)
    # 箱线图去掉异常值
    # threshold = 1.5
    # Q1 = data.quantile(0.25)
    # Q3 = data.quantile(0.75)
    # IQR = Q3 - Q1
    # data_cleaned = data[~((data < (Q1 - threshold * IQR)) | (data > (Q3 + threshold * IQR))).any(axis=1)]
    # 填充缺失值
    # imputer = IterativeImputer(random_state=0)
    imputer = KNNImputer(n_neighbors=3)

    # data_imputed = imputer.fit_transform(data_cleaned)
    data_imputed = imputer.fit_transform(data)
    data_imputed = pd.DataFrame(data_imputed, columns=data.columns)
    data_imputed = data_imputed.dropna()
    y = data_imputed['label']
    columns_to_drop = ['label']
    data_imputed = data_imputed.drop(columns_to_drop, axis=1)

    # 标准化
    scaler = StandardScaler()
    data_standerd = scaler.fit_transform(data_imputed)

    return data_standerd, y

In [263]:
def val_preprocess(data):
    # 去掉重复值
    data.drop_duplicates(inplace=True)

    # 填充缺失值
    # imputer = IterativeImputer(random_state=0)
    imputer = KNNImputer(n_neighbors=3)
    data_imputed = imputer.fit_transform(data)
    data_imputed = pd.DataFrame(data_imputed, columns=data.columns)
    data_imputed = data_imputed.dropna()
    y = data_imputed['label']
    columns_to_drop = ['label']
    data_imputed = data_imputed.drop(columns_to_drop, axis=1)

    # 标准化
    scaler = StandardScaler()
    data_standerd = scaler.fit_transform(data_imputed)

    return data_standerd, y

In [264]:
# 读取数据集
df_train = pd.read_csv("./train_10000.csv")
df_val = pd.read_csv("./validate_1000.csv")

In [265]:
mean1 = df_train.mean()
mean2 = df_val.mean()
columns_to_drop = mean1[abs(mean1 - mean2) > 1000].index
df_train = df_train.drop(columns_to_drop, axis=1)
df_val = df_val.drop(columns_to_drop, axis=1)

In [266]:
X_train, y_train = train_preprocess(df_train)
X_val, y_val = val_preprocess(df_val)

In [267]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(6296, 57)
(6296,)
(1000, 57)
(1000,)


In [250]:
# 过采样数据
kmeans_smote = KMeansSMOTE(cluster_balance_threshold=0.064, random_state=42)
X_train, y_train = kmeans_smote.fit_resample(X_train, y_train)

In [268]:
catboost_clf = CatBoostClassifier()
gradientboost_clf = GradientBoostingClassifier()
svm_clf = SVC()

In [269]:
# 构建集成模型
ensemble_model = VotingClassifier(estimators=[
    ('catboost', catboost_clf),
    ('gradientboost', gradientboost_clf),
    ('svm', svm_clf)
], voting='hard')

# 使用集成模型进行训练和预测
ensemble_model.fit(X_train, y_train)

Learning rate set to 0.086911
0:	learn: 1.5735478	total: 67ms	remaining: 1m 6s
1:	learn: 1.4360685	total: 98.5ms	remaining: 49.2s
2:	learn: 1.3091317	total: 126ms	remaining: 41.8s
3:	learn: 1.2090878	total: 151ms	remaining: 37.7s
4:	learn: 1.1258869	total: 176ms	remaining: 35s
5:	learn: 1.0510529	total: 200ms	remaining: 33.2s
6:	learn: 0.9946231	total: 224ms	remaining: 31.8s
7:	learn: 0.9430902	total: 249ms	remaining: 30.9s
8:	learn: 0.8962815	total: 273ms	remaining: 30.1s
9:	learn: 0.8544789	total: 297ms	remaining: 29.4s
10:	learn: 0.8181970	total: 320ms	remaining: 28.8s
11:	learn: 0.7851952	total: 346ms	remaining: 28.5s
12:	learn: 0.7546485	total: 370ms	remaining: 28.1s
13:	learn: 0.7310524	total: 396ms	remaining: 27.9s
14:	learn: 0.7080409	total: 421ms	remaining: 27.6s
15:	learn: 0.6839399	total: 445ms	remaining: 27.3s
16:	learn: 0.6635938	total: 471ms	remaining: 27.2s
17:	learn: 0.6453068	total: 498ms	remaining: 27.2s
18:	learn: 0.6281264	total: 532ms	remaining: 27.5s
19:	learn: 0.

In [274]:
y_val_pred = ensemble_model.predict(X_val)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (3, 1000) + inhomogeneous part.

In [None]:
macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(macro_f1)
print(classification_report(y_val, y_val_pred))