## 1. Import Libraries

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

### 1a. Load data

In [2]:
df = pd.read_csv('modelingdf.csv', index_col = 0)

In [3]:
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckup,ExerciseLast30days,SleepTime,HadAngina,HadStroke,...,EcigUsage,HadCovid,HeartDisease,RaceEthnicityGroup,AgeGroup5yrs,Height,Weight,BMI,DrinkOccasionsPerDay,Smoked
1,1,0,5,0.0,0.0,0,0.0,6.0,0.0,0.0,...,0.0,0.0,0,0,12,1.6,68.04,26.57,0.0,0.0
2,1,0,4,2.0,3.0,1,1.0,5.0,0.0,0.0,...,0.0,1.0,0,0,7,1.57,63.5,25.61,0.0,0.0
3,1,0,5,0.0,0.0,1,1.0,7.0,0.0,0.0,...,0.0,0.0,0,0,9,1.65,63.5,23.3,0.0,1.0
4,1,0,2,2.0,0.0,1,1.0,9.0,0.0,0.0,...,0.0,0.0,0,0,4,1.57,53.98,21.77,10.0,0.0
5,1,1,1,1.0,0.0,1,0.0,7.0,0.0,1.0,...,0.0,0.0,1,0,12,1.8,84.82,26.08,0.0,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356169 entries, 1 to 445131
Data columns (total 26 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   State                 356169 non-null  int64  
 1   Sex                   356169 non-null  int64  
 2   GeneralHealth         356169 non-null  int64  
 3   PhysicalHealthDays    356169 non-null  float64
 4   MentalHealthDays      356169 non-null  float64
 5   LastCheckup           356169 non-null  int64  
 6   ExerciseLast30days    355543 non-null  float64
 7   SleepTime             356169 non-null  float64
 8   HadAngina             355160 non-null  float64
 9   HadStroke             355376 non-null  float64
 10  HadCOPD               354858 non-null  float64
 11  HadKidneyDisease      355028 non-null  float64
 12  HadArthritis          354457 non-null  float64
 13  HadDiabetes           355656 non-null  float64
 14  Deaf                  355111 non-null  float64
 15  

## 2. Prep Data

In [5]:
#Features + Target datasets
X = df.drop(columns =['HeartDisease'], axis = 1)
y = df['HeartDisease']

In [6]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Define

In [7]:
def beta_2_score(y_true, y_pred):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    beta = 2
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

In [8]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    beta_2 = beta_2_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    cm = confusion_matrix(y_test, y_pred)
    
    return {
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
        'beta_2' :beta_2,
        'auc': auc,
        'cm': cm
    }


# Testing for scaler

## RANDOM FOREST

In [9]:
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
    'MaxAbsScaler': MaxAbsScaler()
}

# Dictionary to store evaluation results
results_rf = {}

for scaler_name, scaler in scalers.items():
    # Define the pipeline with the current scaler
    pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
        ('scaler', scaler),  # Apply the current scaler
        ('smote', SMOTE(random_state=42)),  # Apply SMOTE for class balancing
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))  # KNN classifier
    ])
    
    # Fit the pipeline on training data
    pipeline.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    
    # Compute evaluation metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = report['accuracy']
    recall = report['macro avg']['recall']  # Use 'macro avg' to get the average recall
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    beta_2 = beta_2_score(y_test, y_pred)  # Calculate Beta=2 score
    cm = confusion_matrix(y_test, y_pred)
    ROCAUC = roc_auc_score(y_test, y_pred_proba)
    
    # Store results
    results_rf[scaler_name] = {
        'accuracy': accuracy,
        'recall': recall,
        'f1_score': f1,
        'beta_2_score': beta_2,
        'precision': precision,
        'confusion_matrix': cm,
        'ROC AUC Score' :  ROCAUC
    }

# Print the results
for scaler_name, metrics in results_rf.items():
    print(f"{scaler_name}: Accuracy = {metrics['accuracy']:.4f}, Recall = {metrics['recall']:.4f}, F1 Score = {metrics['f1_score']:.4f}, Beta=2 Score = {metrics['beta_2_score']:.4f}, Precision = {metrics['precision']:.4f}, ROC AUC Score = {metrics['ROC AUC Score']:.4f}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")

StandardScaler: Accuracy = 0.9686, Recall = 0.8364, F1 Score = 0.8014, Beta=2 Score = 0.7195, Precision = 0.9891, ROC AUC Score = 0.9285
Confusion Matrix:
[[64474    50]
 [ 2190  4520]]

MinMaxScaler: Accuracy = 0.9684, Recall = 0.8364, F1 Score = 0.8006, Beta=2 Score = 0.7194, Precision = 0.9863, ROC AUC Score = 0.9287
Confusion Matrix:
[[64461    63]
 [ 2189  4521]]

RobustScaler: Accuracy = 0.9689, Recall = 0.8362, F1 Score = 0.8030, Beta=2 Score = 0.7193, Precision = 0.9962, ROC AUC Score = 0.9271
Confusion Matrix:
[[64507    17]
 [ 2197  4513]]

MaxAbsScaler: Accuracy = 0.9684, Recall = 0.8367, F1 Score = 0.8009, Beta=2 Score = 0.7200, Precision = 0.9856, ROC AUC Score = 0.9292
Confusion Matrix:
[[64458    66]
 [ 2184  4526]]



best scaler for random forest is MaxAbsScaler

## LOGISTIC REGRESSION

In [10]:
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
    'MaxAbsScaler': MaxAbsScaler()
}

# Dictionary to store evaluation results
results_log = {}

for scaler_name, scaler in scalers.items():
    # Define the pipeline with the current scaler
    pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
        ('scaler', scaler),  # Apply the current scaler
        ('smote', SMOTE(random_state=42)),  # Apply SMOTE for class balancing
        ('classifier', LogisticRegression(random_state=42))  # KNN classifier
    ])
    
    # Fit the pipeline on training data
    pipeline.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    
    # Compute evaluation metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = report['accuracy']
    recall = report['macro avg']['recall']  # Use 'macro avg' to get the average recall
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    beta_2 = beta_2_score(y_test, y_pred)  # Calculate Beta=2 score
    cm = confusion_matrix(y_test, y_pred)
    ROCAUC = roc_auc_score(y_test, y_pred_proba)
    
    # Store results
    results_log[scaler_name] = {
        'accuracy': accuracy,
        'recall': recall,
        'f1_score': f1,
        'beta_2_score': beta_2,
        'precision': precision,
        'confusion_matrix': cm,
        'ROC AUC Score' :  ROCAUC
    }

# Print the results
for scaler_name, metrics in results_log.items():
    print(f"{scaler_name}: Accuracy = {metrics['accuracy']:.4f}, Recall = {metrics['recall']:.4f}, F1 Score = {metrics['f1_score']:.4f}, Beta=2 Score = {metrics['beta_2_score']:.4f}, Precision = {metrics['precision']:.4f}, ROC AUC Score = {metrics['ROC AUC Score']:.4f}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")

StandardScaler: Accuracy = 0.9125, Recall = 0.8594, F1 Score = 0.6309, Beta=2 Score = 0.7196, Precision = 0.5234, ROC AUC Score = 0.9371
Confusion Matrix:
[[59672  4852]
 [ 1382  5328]]

MinMaxScaler: Accuracy = 0.9146, Recall = 0.8596, F1 Score = 0.6358, Beta=2 Score = 0.7211, Precision = 0.5312, ROC AUC Score = 0.9369
Confusion Matrix:
[[59835  4689]
 [ 1397  5313]]

RobustScaler: Accuracy = 0.9351, Recall = 0.8559, F1 Score = 0.6877, Beta=2 Score = 0.7284, Precision = 0.6292, ROC AUC Score = 0.9355
Confusion Matrix:
[[61525  2999]
 [ 1622  5088]]

MaxAbsScaler: Accuracy = 0.9141, Recall = 0.8591, F1 Score = 0.6345, Beta=2 Score = 0.7201, Precision = 0.5296, ROC AUC Score = 0.9369
Confusion Matrix:
[[59808  4716]
 [ 1401  5309]]



LOGISTIC REGRESSION BEST SCALER IS ROBUST SCALER

## XGBOOST

In [11]:

scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
    'MaxAbsScaler': MaxAbsScaler()
}

# Dictionary to store evaluation results
results_xg = {}

for scaler_name, scaler in scalers.items():
    # Define the pipeline with the current scaler
    pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
        ('scaler', scaler),  # Apply the current scaler
        ('smote', SMOTE(random_state=42)),  # Apply SMOTE for class balancing
        ('classifier', XGBClassifier(eval_metric='logloss', random_state=42))  # XGBoost classifier
    ])
    
    # Fit the pipeline on training data
    pipeline.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    
    # Compute evaluation metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = report['accuracy']
    recall = report['macro avg']['recall']  # Use 'macro avg' to get the average recall
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    beta_2 = beta_2_score(y_test, y_pred)  # Calculate Beta=2 score
    cm = confusion_matrix(y_test, y_pred)
    ROCAUC = roc_auc_score(y_test, y_pred_proba)
    
    # Store results
    results_xg[scaler_name] = {
        'accuracy': accuracy,
        'recall': recall,
        'f1_score': f1,
        'beta_2_score': beta_2,
        'precision': precision,
        'confusion_matrix': cm,
        'ROC AUC Score' :  ROCAUC
    }

# Print the results
for scaler_name, metrics in results_xg.items():
    print(f"{scaler_name}: Accuracy = {metrics['accuracy']:.4f}, Recall = {metrics['recall']:.4f}, F1 Score = {metrics['f1_score']:.4f}, Beta=2 Score = {metrics['beta_2_score']:.4f}, Precision = {metrics['precision']:.4f}, ROC AUC Score = {metrics['ROC AUC Score']:.4f}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



StandardScaler: Accuracy = 0.9688, Recall = 0.8365, F1 Score = 0.8026, Beta=2 Score = 0.7198, Precision = 0.9930, ROC AUC Score = 0.9370
Confusion Matrix:
[[64492    32]
 [ 2191  4519]]

MinMaxScaler: Accuracy = 0.9688, Recall = 0.8368, F1 Score = 0.8030, Beta=2 Score = 0.7203, Precision = 0.9930, ROC AUC Score = 0.9373
Confusion Matrix:
[[64492    32]
 [ 2187  4523]]

RobustScaler: Accuracy = 0.9690, Recall = 0.8365, F1 Score = 0.8035, Beta=2 Score = 0.7199, Precision = 0.9963, ROC AUC Score = 0.9368
Confusion Matrix:
[[64507    17]
 [ 2193  4517]]

MaxAbsScaler: Accuracy = 0.9688, Recall = 0.8365, F1 Score = 0.8029, Beta=2 Score = 0.7199, Precision = 0.9938, ROC AUC Score = 0.9377
Confusion Matrix:
[[64496    28]
 [ 2191  4519]]



best scaler for XGBoost is MinMaxScaler

# Hyperparater tuning

## Random Forest

In [18]:
# Define the parameter grid
param_grid = {
    'classifier__n_estimators': [100],  # Number of trees in the forest
    'classifier__max_depth': [10],  # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    #'classifier__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    #'classifier__bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Define the pipeline
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', MaxAbsScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier())
])

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available cores
    verbose=2  # Print progress
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Recall Score: {best_score:.4f}')

# Evaluate on the test set
best_knn_model = grid_search.best_estimator_
y_pred = grid_search.best_estimator_.predict(X_test)
y_pred_proba = grid_search.best_estimator_.predict_proba(X_test)[:, 1]
f1 = f1_score(y_test, y_pred)
beta_2 = beta_2_score(y_test, y_pred)
print(f'Test Recall: {recall_score(y_test, y_pred):.4f}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')
print(f'f1_score: {f1}')
print(f'beta_2: {beta_2}')

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best Recall Score: 0.7223
Test Recall: 0.7201
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     64524
           1       0.77      0.72      0.74      6710

    accuracy                           0.95     71234
   macro avg       0.87      0.85      0.86     71234
weighted avg       0.95      0.95      0.95     71234

f1_score: 0.7424708051628764
beta_2: 0.7288963977553853


In [16]:
# Define the parameter grid
param_grid = {
    'classifier__C': [1],               # Regularization strength
    'classifier__penalty': [ 'l1'],    # Type of regularization
    'classifier__solver': [ 'saga']  # Optimization algorithm
}

# Define the pipeline
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('scaler', RobustScaler()),  # Apply StandardScaler
    ('smote', SMOTE(random_state=42)),  # Apply SMOTE for class balancing
    ('classifier', LogisticRegression(random_state=42))  # Logistic Regression classifier
])

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available cores
    verbose=2  # Print progress
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Recall Score: {best_score:.4f}')

# Evaluate on the test set
best_log_reg_model = grid_search.best_estimator_
y_pred = grid_search.best_estimator_.predict(X_test)
y_pred_proba = grid_search.best_estimator_.predict_proba(X_test)[:, 1]
f1 = f1_score(y_test, y_pred)
beta_2 = beta_2_score(y_test, y_pred)
print(f'Test Recall: {recall_score(y_test, y_pred):.4f}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')
print(f'f1_score: {f1}')
print(f'beta_2: {beta_2}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Best Recall Score: 0.7623
Test Recall: 0.7574
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     64524
           1       0.63      0.76      0.69      6710

    accuracy                           0.94     71234
   macro avg       0.80      0.86      0.83     71234
weighted avg       0.94      0.94      0.94     71234

f1_score: 0.6887578776174019
beta_2: 0.7283515349557141




In [14]:
from xgboost import XGBClassifier
# Define the parameter distribution
param_dist = { 
    #'classifier__n_estimators': np.arange(50, 201, 50),           # Number of boosting rounds, 200
    #'classifier__learning_rate': np.logspace(-3, -1, 5),          # Learning rate, 0.1
    #'classifier__max_depth': np.arange(3, 10, 1),                 # Maximum depth of a tree, 9 
    #'classifier__subsample': np.linspace(0.6, 1.0, 5),            # Subsample ratio of the training instance,0.6
    #'classifier__colsample_bytree': np.linspace(0.6, 1.0, 5),     # Subsample ratio of columns when constructing each tree, 0.6
    #'classifier__gamma': np.linspace(0, 0.4, 5)                   # Minimum loss reduction required to make a further partition,0.1
}

# Define the pipeline
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('scaler', MaxAbsScaler()),  # Apply StandardScaler
    ('smote', SMOTE(random_state=42)),  # Apply SMOTE for class balancing
    ('classifier', XGBClassifier(n_estimators =200,learning_rate = 0.01,max_depth = 6, subsample=0.7,colsample_bytree=0.6
                                 ,use_label_encoder=False,gamma= 0.1, eval_metric='logloss', random_state=42))  # XGBoost classifier
])

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=30,  # Number of parameter settings to sample
    scoring='recall',
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available cores
    verbose=2,  # Print progress
    random_state=42  # Seed for reproducibility
)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Cross-Validation Recall: {best_score:.4f}')

# Evaluate on the test set
best_xgb_model = random_search.best_estimator_
y_pred = random_search.best_estimator_.predict(X_test)
y_pred_proba = random_search.best_estimator_.predict_proba(X_test)[:, 1]
print(f'Test Recall: {recall_score(y_test, y_pred):.4f}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')
f1 = f1_score(y_test, y_pred)
beta_2 = beta_2_score(y_test, y_pred)
print(f'f1_score: {f1}')
print(f'beta_2: {beta_2}')



Fitting 5 folds for each of 1 candidates, totalling 5 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {}
Best Cross-Validation Recall: 0.7022
Test Recall: 0.7001
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     64524
           1       0.85      0.70      0.77      6710

    accuracy                           0.96     71234
   macro avg       0.91      0.84      0.87     71234
weighted avg       0.96      0.96      0.96     71234

f1_score: 0.7670204081632653
beta_2: 0.7254478072884495


## Evaluate

In [19]:
knn_results = evaluate_model(best_knn_model, X_test, y_test)
log_reg_results = evaluate_model(best_log_reg_model, X_test, y_test)
xgb_results = evaluate_model(best_xgb_model, X_test, y_test)

In [20]:
# Print the results
print('Accuracy:', knn_results['acc'])
print('Precision:', knn_results['prec'])
print('Recall:', knn_results['rec'])
print('F1 Score:', knn_results['f1'])
print('beta_2',knn_results['beta_2'])
print('Area Under Curve:', knn_results['auc'])
print('Confusion Matrix:\n', knn_results['cm'])

Accuracy: 0.9529438189628548
Precision: 0.7662543609261021
Recall: 0.7201192250372578
F1 Score: 0.7424708051628764
beta_2 0.7288963977553853
Area Under Curve: 0.9357186390100942
Confusion Matrix:
 [[63050  1474]
 [ 1878  4832]]


In [21]:
print('Accuracy:', log_reg_results['acc'])
print('Precision:', log_reg_results['prec'])
print('Recall:', log_reg_results['rec'])
print('F1 Score:', log_reg_results['f1'])
print('beta_2',log_reg_results['beta_2'])
print('Area Under Curve:', log_reg_results['auc'])
print('Confusion Matrix:\n', log_reg_results['cm'])

Accuracy: 0.9355223629165847
Precision: 0.6315397042376041
Recall: 0.7573770491803279
F1 Score: 0.6887578776174019
beta_2 0.7283515349557141
Area Under Curve: 0.9354267883640104
Confusion Matrix:
 [[61559  2965]
 [ 1628  5082]]


In [22]:
print('Accuracy:', xgb_results['acc'])
print('Precision:', xgb_results['prec'])
print('Recall:', xgb_results['rec'])
print('F1 Score:', xgb_results['f1'])
print('beta_2',xgb_results['beta_2'])
print('Area Under Curve:', xgb_results['auc'])
print('Confusion Matrix:\n', xgb_results['cm'])

Accuracy: 0.9599348625656288
Precision: 0.848014440433213
Recall: 0.7001490312965722
F1 Score: 0.7670204081632653
beta_2 0.7254478072884495
Area Under Curve: 0.9343119038136067
Confusion Matrix:
 [[63682   842]
 [ 2012  4698]]
