支持向量机和决策树模型相对较优

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

# Train and evaluate an RBF-kernel SVM on the fraud dataset.
#
# Imputation and scaling live inside a Pipeline that is tuned by GridSearchCV,
# so their statistics (column modes, means, variances) are learned only from
# the rows each model is trained on. Fitting them on the full dataframe before
# splitting would leak test-set / validation-fold information into training.
#
# NOTE: scores recorded from earlier runs were produced with imputation fitted
# on the full dataset; re-run this cell, the numbers may shift slightly.

# Imported here so that this cell stays runnable on its own.
from sklearn.pipeline import Pipeline

# Load the data.
data_path = 'E:\\C\\数据工程\\fraudulent.csv'
df = pd.read_csv(data_path)

# A row without a label cannot be used for supervised training or scoring;
# drop it instead of inventing a label by mode-filling the target column.
df_labeled = df.dropna(subset=['y'])

# Separate features and target.
X = df_labeled.drop('y', axis=1)
y = df_labeled['y']

# Hold out the test set (random seed 1) BEFORE any preprocessing is fitted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Preprocessing + model as a single estimator.
# NOTE(review): probability=True is only needed for predict_proba and makes
# every fit noticeably slower; drop it if no later cell needs probabilities.
svm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # mode imputation
    ('scaler', StandardScaler()),
    ('svm', SVC(probability=True)),
])

# Hyper-parameter grid; the 'svm__' prefix routes each entry to the SVC step.
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [1, 0.1, 0.01, 0.001],
    'svm__kernel': ['rbf'],
}

# Stratified 5-fold cross-validation, selecting on F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best parameters without the pipeline step prefix.
best_params = {name.split('__', 1)[1]: value
               for name, value in grid_search.best_params_.items()}
print("最优参数：", best_params)

# GridSearchCV has already refit the best pipeline on the full training set.
best_pipeline = grid_search.best_estimator_
best_svm = best_pipeline.named_steps['svm']

# Expose the preprocessed matrices (imputed + scaled with training-set
# statistics) so that best_svm can be applied to them directly.
X_train_scaled = best_pipeline[:-1].transform(X_train)
X_test_scaled = best_pipeline[:-1].transform(X_test)

# Evaluate on the held-out test set.
y_pred = best_svm.predict(X_test_scaled)
f1 = f1_score(y_test, y_pred)
print("测试集上的F1值：", f1)

最优参数： {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
测试集上的F1值： 0.8632411067193676


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer

# Train and evaluate a decision tree on the fraud dataset.
#
# Mode imputation is a Pipeline step, so the per-column modes are learned only
# from the rows each model is trained on (never from the held-out test set or
# from a validation fold).
#
# NOTE: scores recorded from earlier runs were produced with imputation fitted
# on the full dataset; re-run this cell, the numbers may shift slightly.

# Imported here so that this cell stays runnable on its own.
from sklearn.pipeline import Pipeline

# 1. Load the data.
data = pd.read_csv('E:\\C\\数据工程\\fraudulent.csv')

# 2. Handle missing labels.
# A row without a label cannot be used for supervised training or scoring;
# drop it instead of inventing a label by mode-filling the target column.
data_labeled = data.dropna(subset=['y'])

# 3. Split the dataset BEFORE any preprocessing is fitted.
X = data_labeled.drop('y', axis=1)
y = data_labeled['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4. Build the model and search for the best parameters with cross-validation.
# strategy='most_frequent' computes the column modes itself; `fill_value` is
# only consulted for strategy='constant', so it is not passed here.
tree_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('tree', DecisionTreeClassifier(random_state=1)),
])

# The 'tree__' prefix routes each entry to the DecisionTreeClassifier step.
param_grid = {
    'tree__criterion': ['gini', 'entropy'],
    'tree__max_depth': [None, 10, 20, 30, 40, 50],
    'tree__min_samples_split': [2, 5, 10],
    'tree__min_samples_leaf': [1, 2, 4],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(tree_pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best parameters without the pipeline step prefix.
best_params = {name.split('__', 1)[1]: value
               for name, value in grid_search.best_params_.items()}
print("最优参数:", best_params)

# 5. Final model: GridSearchCV has already refit the best pipeline
# (imputer + tree) on the full training set.
best_model = grid_search.best_estimator_

# 6. Evaluate on the held-out test set; the pipeline imputes X_test using the
# modes learned from the training data.
y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("测试集F1值:", f1)

最优参数: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
测试集F1值: 0.8660287081339713
