In [7]:
import numpy as np
import pandas as pd
import catboost as cb
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE  # Use SMOTE for handling imbalance
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV  # Use GridSearch for hyperparameter tuning
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline  # Use Pipeline for better code structure

In [8]:
# Load the training and validation splits from CSV files (paths are relative
# to the notebook's working directory).
TRAIN_CSV = "train_10000.csv"
VAL_CSV = "validate_1000.csv"

df_train = pd.read_csv(TRAIN_CSV)
df_val = pd.read_csv(VAL_CSV)

In [9]:
# Missing-value handling: replace NaNs with per-column means.
# The means are computed on the training frame only and reused for the
# validation frame, so both splits are imputed with the same values and no
# validation-set statistics enter preprocessing.
# numeric_only=True restricts the mean to numeric columns; without it, recent
# pandas versions raise a TypeError if a non-numeric column is present.
# NOTE(review): as before, every numeric column is filled, including "label"
# and "sample_id" if they contain NaNs — confirm that is intended.
train_means = df_train.mean(numeric_only=True)

df_train = df_train.fillna(train_means)
df_val = df_val.fillna(train_means)

In [10]:
# Split each frame into a feature matrix (X) and a target vector (y).
# "label" is the target; "sample_id" is dropped together with it so that it is
# not used as a feature.
NON_FEATURE_COLS = ["label", "sample_id"]

X_train = np.array(df_train.drop(columns=NON_FEATURE_COLS))
y_train = np.array(df_train["label"])

X_val = np.array(df_val.drop(columns=NON_FEATURE_COLS))
y_val = np.array(df_val["label"])

In [11]:
# Min-max scale the features.
# The scaler is fitted on the training features only; the validation features
# are transformed with those same training min/max values. Re-fitting on the
# validation set would map it onto a different scale than the one the model is
# trained on (and would use validation statistics during preprocessing).
# Validation values outside the training range will therefore fall outside
# [0, 1], which is expected.
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [12]:
# Oversample the scaled training set with SMOTE; the fixed random_state makes
# the generated samples repeatable across runs.
# NOTE(review): resampling happens before cross-validation, so synthetic
# samples can end up in both the fit and held-out folds of the grid search.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_res, y_train_res = resampled

In [13]:
# Candidate hyperparameter values explored by the grid search
# (3 learning rates x 2 ensemble sizes x 3 tree depths = 18 combinations).
param_grid = dict(
    learning_rate=[0.01, 0.1, 1],
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
)

In [14]:
# Exhaustive grid search over param_grid with 5-fold cross-validation.
# random_state pins the gradient-boosting estimator's randomness so repeated
# runs select the same hyperparameters; n_jobs=-1 runs the candidate fits in
# parallel on all available cores.
# NOTE(review): scoring is left at the GridSearchCV default (the estimator's
# own score method, i.e. accuracy for classifiers) — consider an F1-based
# scorer if the classes are imbalanced.
clf = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)

In [15]:
# Wrap the tuned classifier in a Pipeline.
# No scaler step is included here: the arrays fed to this pipeline
# (X_train_res for fitting, X_val_scaled for prediction) were already min-max
# scaled in an earlier cell, and a second MinMaxScaler fitted on that
# already-scaled data would rescale the inputs again.
pipeline = Pipeline([
    ('clf', clf)
])

In [None]:
# Fit the pipeline on the oversampled training data; this runs the grid search
# wrapped inside it.
pipeline.fit(X=X_train_res, y=y_train_res)

In [None]:
# Predict on the scaled validation features and print the classification
# report against the true validation labels.
y_val_pred = pipeline.predict(X_val_scaled)
val_report = classification_report(y_val, y_val_pred)
print(val_report)