# In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
from sklearn import metrics

# Load the raw dataset; 'dataset.csv' is resolved relative to the working directory.
data = pd.read_csv('dataset.csv')

# Show the column dtypes before encoding.
data.info()

# Label-encode every text column in place. The loop walks all columns, so a
# text-valued target column is encoded as well. A fresh encoder is fitted per
# column and kept in `encoders`, so the integer codes can be mapped back to
# the original categories with encoders[col].inverse_transform(...).
encoders = {}
for x in data.columns:
    column = data[x]
    # Comparing only against 'object' misses pandas' dedicated string dtype,
    # which would leave such columns unencoded; accept both kinds of text column.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        enc = LabelEncoder()
        data[x] = enc.fit_transform(column)
        encoders[x] = enc

# Show the dtypes again so the effect of the encoding is visible.
data.info()

# split data into X (features) and y (target)
y = data['HeartDisease']
X = data.drop(['HeartDisease'], axis=1)

# Partition the data into train and test sets. The split is shuffled with a
# fixed seed and stratified on y; test_size is the share of rows held out.
seed = 7
test_size = 0.1

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    shuffle=True,
    stratify=y,
)

# fit model to training data

# For an imbalanced binary classification dataset, the negative class refers to
# the majority class (class 0) and the positive class refers to the minority
# class (class 1).

# Selected model.
# NOTE(review): scale_pos_weight=10.68 is presumably the negative/positive
# ratio of this dataset -- confirm against the class counts.
# eval_metric is configured on the estimator rather than passed to fit():
# the fit() keyword was deprecated in XGBoost 1.6 and removed in 2.0.
model = XGBClassifier(
    scale_pos_weight=10.68,
    n_estimators=400,
    learning_rate=0.15,
    max_depth=10,
    objective='binary:logistic',
    booster='gbtree',
    eval_metric=["error", "logloss"],
)

# printing model parameters
print(model)

print("Fitting:")
# Both splits are passed as evaluation sets so that the metrics above are
# recorded for each of them during fitting; the train split comes first and
# the test split second.
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

print(model)

# Run the fitted model on the test features.
print("Predicting:")
y_pred = model.predict(X_test)
# Round each predicted value to the nearest integer before scoring.
predictions = list(map(round, y_pred))

# Compare the rounded predictions with the true labels and print the
# accuracy as a percentage with two decimals.
accuracy = accuracy_score(y_test, predictions)
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f%%" % (accuracy_percent,))

# Fetch the metric history recorded for the evaluation sets during fitting.
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

# Draw one figure per metric: log loss first, then classification error.
# Each figure overlays the 'validation_0' curve (labelled Train) and the
# 'validation_1' curve (labelled Test).
for metric, y_label, title in (
    ('logloss', 'Log Loss', 'XGBoost Log Loss'),
    ('error', 'Classification Error', 'XGBoost Classification Error'),
):
    fig, ax = plt.subplots()
    for split, legend_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
        ax.plot(x_axis, results[split][metric], label=legend_label)
    ax.legend()
    ax.set_ylabel(y_label)
    ax.set_title(title)
    plt.show()

# Summarise test-set quality: build the classification report and the
# confusion matrix from the true labels and the model's predictions,
# then print each under its own heading.
report = metrics.classification_report(y_test, y_pred)
matrix = metrics.confusion_matrix(y_test, y_pred)

print("\nClassification report for prediction:")
print(report)
print("\nConfusion matrix for prediction:")
print(matrix)

# Take the feature names from the feature frame itself so the labels always
# match the number and order of the columns the model was trained on, instead
# of relying on a hand-maintained list that can drift from the data.
index = list(X.columns)

feature_importances = pd.Series(model.feature_importances_, index=index)

# Bar chart of the importances, with each bar labelled by its feature name.
positions = range(len(feature_importances))
plt.bar(positions, feature_importances)
plt.xticks(positions, feature_importances.index, rotation=90)
plt.xlabel('Features')
# NOTE(review): the values are plotted exactly as returned by the model;
# confirm that the '%' in the axis label matches their scale.
plt.ylabel('Significance, %')
plt.title('XGBoost Feature Importance')
plt.tight_layout()  # keep the rotated tick labels inside the figure
plt.show()

print("\nFeature significance")
# Iterate (name, value) pairs rather than indexing the Series with integers:
# integer keys on a string-labelled Series are deprecated as positional access.
for name, importance in feature_importances.items():
    print(name, ":", importance)