In [7]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import shap
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder,train_test_split
from scipy.stats import uniform, randint
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

ImportError: cannot import name 'train_test_split' from 'sklearn.preprocessing' (/home/flyingbucket/anaconda3/envs/xgb_env/lib/python3.10/site-packages/sklearn/preprocessing/__init__.py)

In [41]:

def xgb_clf_search(X_train, y_train, n_iter=100):

    param_dist = {
        "n_estimators": randint(100, 300),
        "learning_rate": uniform(0.01, 0.3),
        "max_depth": randint(3, 10),
        "min_child_weight": randint(1, 10),
        "gamma": uniform(0, 0.5),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.7, 0.3),
        "reg_alpha": uniform(0, 1),  # 注意参数名不同
        "reg_lambda": uniform(0, 1),
    }

    model = XGBClassifier(
        objective="binary:logistic",
        n_jobs=-1,
        eval_metric="mlogloss",
        random_state=42,
    )

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring="accuracy",
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42,
        return_train_score=True,
    )

    random_search.fit(X_train, y_train)

    best_params = random_search.best_params_
    best_score = random_search.best_score_

    print("Best Parameters:")
    for key, value in best_params.items():
        print(f"{key}: {value}")
    print("Best Accuracy:", best_score)

    return best_params, best_score 

In [42]:
total_labeled=pd.read_csv("data/total_labeled.csv")
total_labeled.head()
total_labeled["income"].value_counts()

income
0    7588
1    2412
Name: count, dtype: int64

In [43]:
X_train,X_test,y_train,y_test=train_test_split(total_labeled.copy().drop(columns=["income"]),total_labeled["income"],test_size=0.2,random_state=42)
print(type(y_train))  # 应该是 pandas Series
print(y_train.shape)  # 应该是 (8000,)
print(y_train.unique())  # 应该只有两个值，比如 [0, 1] 或 ['<=50K', '>50K']


<class 'pandas.core.series.Series'>
(8000,)
[1 0]


In [53]:
print(X_train.shape)  # 应该是 (8000, 14)

(8000, 14)


In [44]:
best_params, best_score= xgb_clf_search(X_train, y_train, n_iter=100)
model = XGBClassifier(
    objective="binary:logistic",
    n_jobs=-1,
    eval_metric="mlogloss",
    random_state=42,
    **best_params
)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


Best Parameters:
colsample_bytree: 0.9192602425610932
gamma: 0.4691702284105189
learning_rate: 0.06436991984969805
max_depth: 5
min_child_weight: 1
n_estimators: 139
reg_alpha: 0.7215965507512772
reg_lambda: 0.04809464396408769
subsample: 0.9344543445254244
Best Accuracy: 0.8663755271401277


In [45]:
model = XGBClassifier(
    objective="binary:logistic",
    n_jobs=-1,
    eval_metric="mlogloss",
    random_state=42,
    **best_params
)
model.fit(X_train, y_train)
# predict
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
# evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("F1 Score:", f1_score(y_test, y_pred, average="weighted"))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# save model
model.save_model("xgb_model.json")

Accuracy: 0.8625
Precision: 0.8579739054644914
Recall: 0.8625
F1 Score: 0.8557751059293445
Confusion Matrix:
 [[1415   76]
 [ 199  310]]


In [50]:
total_unlabeled=pd.read_csv("data/total_unlabeled.csv")
total_unlabeled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39
1,38,4,215646,11,9,0,6,1,4,1,0,0,40,39
2,53,4,234721,1,7,2,6,0,2,1,0,0,40,39
3,37,4,284582,12,14,2,4,5,4,0,0,0,40,39
4,31,4,45781,12,14,4,10,1,4,0,14084,0,50,39


In [51]:
len(total_labeled.columns)

15

In [52]:
len(total_unlabeled.columns)

14

In [54]:
total_unlabeled.shape

(38842, 14)

In [55]:
income_pred=model.predict(total_unlabeled)
income_pred_proba=model.predict_proba(total_unlabeled)
total_unlabeled["income"]=income_pred
total_unlabeled["income_proba"]=income_pred_proba[:,1]
total_unlabeled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,income_proba
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0,0.049414
1,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0,0.034008
2,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0,0.122203
3,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,1,0.761093
4,31,4,45781,12,14,4,10,1,4,0,14084,0,50,39,1,0.958356


In [56]:
total_unlabeled.to_csv("data/total_unlabeled_pred.csv",index=False)