In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import sklearn.svm as svm
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import itertools
import xgboost as xgb
import shap

In [None]:
# Load the training data
df = pd.read_csv('./process_data/train.csv')
keys = np.array(df.columns)
# Map every column name to its values as a NumPy array
dic = {colname: df.loc[:, colname].values for colname in df}

# Build the feature matrix and the label vector.
# The first column is skipped (presumably an id/index column -- TODO confirm),
# the last column is the label, and every column in between is a feature.
feature_keys = keys[1:-1]
if len(feature_keys) == 0:
    # Without this guard a two-column file would silently use the label column as a feature.
    raise ValueError("train.csv needs at least one feature column between the first and the last column")
# Stack all feature columns in one call instead of re-copying the matrix once per column.
data_set = np.column_stack([dic[name] for name in feature_keys])   # feature matrix, shape (n_samples, n_features)
label_set = dic[keys[-1]]               # labels

In [None]:
# Sort the samples by the first feature in descending order.
# The same permutation has to be applied to label_set: sorting only data_set
# would leave every row paired with the label of a different sample.
sort_order = np.lexsort(-data_set.T[0, None])
data_set = data_set[sort_order]
label_set = label_set[sort_order]

# Split the row indices at the median of the first feature:
# A_idx -> rows strictly above the median, B_idx -> rows at or below it.
# np.argwhere returns column vectors of shape (k, 1).
first_feature_median = np.median(data_set[:, 0])
A_idx = np.argwhere(data_set[:, 0] > first_feature_median)
B_idx = np.argwhere(data_set[:, 0] <= first_feature_median)

In [None]:
# Use the A_idx rows as the training set; split the B_idx rows 7:3 and keep
# the 30% part as the test set (the 70% part stays available as x_, y_).
x_train = data_set[A_idx]
y_train = label_set[A_idx]
# A fixed random_state keeps the split identical across "Restart & Run All".
x_, x_test, y_, y_test = train_test_split(
    data_set[B_idx], label_set[B_idx], test_size=0.3, random_state=42
)

# Indexing with the (k, 1)-shaped index arrays yields (k, 1, n_features) features
# and (k, 1) labels. Reshape explicitly instead of squeeze(), which would also
# collapse the sample or feature axis whenever that axis has length 1.
n_features = data_set.shape[1]
x_train = x_train.reshape(-1, n_features)
x_test = x_test.reshape(-1, n_features)
y_train = y_train.ravel()
y_test = y_test.ravel()

# Preprocessing: standardise the features.
# Fit the scaler on the training set only and reuse its mean/std for the test set,
# so both sets live in the same feature space and no test statistics leak in.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)     # full training set
x_test = scaler.transform(x_test)           # test set

In [None]:
# Train the model: gradient-boosted trees with a binary logistic objective
xgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 160,
    'objective': 'binary:logistic',
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Plot the prediction results as a confusion matrix
# NOTE(review): confusion_matrix orders rows/columns by sorted label value;
# confirm that the smallest label really means 'popular'.
labels_name = ['popular', 'unpopular']
cm = confusion_matrix(y_test, y_pred)
# NOTE(review): plot_confusion_matrix is not defined or imported in the cells
# visible here -- confirm it is defined before this cell runs on a fresh kernel.
plot_confusion_matrix(cm, labels_name, "Confusion Matrix Model 1")
plt.show()