# Elle Nguyen - Section 02
## Final Project: Prediction Model of COVID-19 Case Surveillance

### Importing required Python Packages and Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import xgboost as xgb
from datetime import timedelta
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, mean_absolute_error, mean_squared_error, precision_recall_fscore_support, classification_report, confusion_matrix, roc_curve, auc   
from statsmodels.tsa.api import Holt, SimpleExpSmoothing, ExponentialSmoothing
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from random import shuffle
from imblearn.over_sampling import SMOTE

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Data Preprocessing

Load data with pandas

In [2]:
# Load the raw case-surveillance CSV (relative path: the file must sit in the working directory)
df = pd.read_csv('COVID-19_Case_Surveillance_Public_Use_Data.csv')

Check whether there are missing values

In [3]:
# Count the NaN entries in every column
df.isnull().sum()

cdc_case_earliest_dt              0
cdc_report_dt               8645152
pos_spec_dt                53798621
onset_dt                   59881586
current_status                    0
sex                               7
age_group                        56
race_ethnicity_combined           7
hosp_yn                           0
icu_yn                            0
death_yn                          0
medcond_yn                        0
dtype: int64

Check whether there are categorical features

In [4]:
# Print the index range, column dtypes and memory footprint of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96649487 entries, 0 to 96649486
Data columns (total 12 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   cdc_case_earliest_dt     object
 1   cdc_report_dt            object
 2   pos_spec_dt              object
 3   onset_dt                 object
 4   current_status           object
 5   sex                      object
 6   age_group                object
 7   race_ethnicity_combined  object
 8   hosp_yn                  object
 9   icu_yn                   object
 10  death_yn                 object
 11  medcond_yn               object
dtypes: object(12)
memory usage: 8.6+ GB


There are 96,649,487 total entries. The highest missing values recorded is 59,881,586 which is more than 50% of the total entries; therefore, it is necessary to drop 2 columns pos_spec_dt and onset_dt due to having too many missing values.

In [5]:
# Discard the two date columns that are mostly empty
df = df.drop(columns=['pos_spec_dt', 'onset_dt'])

Dropping rows having at least 1 missing value

In [None]:
# dropna() returns a new frame, so the result must be assigned back;
# otherwise the rows containing NaN are never actually removed.
df = df.dropna()

Take a look at the last 4 columns hosp_yn, icu_yn, death_yn, and medcond_yn that have some assigned entries as 'Missing' and 'Unknown'. A new dataframe will store these unknown values for predictions at the end.

In [None]:
# Rows whose death outcome is a placeholder are set aside for prediction later
death_outcome_unknown = df['death_yn'].isin(['Missing', 'Unknown'])
df_predict = df[death_outcome_unknown]

In [None]:
# Persist the rows with an unknown death outcome, without the index column
df_predict.to_csv('data_predict.csv', index=False)

Then remove these rows for data processing.

In [None]:
# Keep only rows in which no column holds a 'Missing' or 'Unknown' placeholder
has_placeholder = df.isin(['Missing', 'Unknown']).any(axis=1)
df = df[~has_placeholder]

Last dataset analysis

In [None]:
# Re-inspect dtypes and size after filtering
df.info()

In [None]:
# Re-count NaN entries per column after filtering
df.isnull().sum()

The current data frame is now carefully filtered and processed to get rid of all missing values. It will be stored into a new csv file to save memory.

In [None]:
# Write the filtered frame to disk (index omitted) so later sections can reload it
df.to_csv('preprocessed_covid.csv', index=False)

### Data Visualization

Load new data with pandas

In [None]:
# Reload the preprocessed data written by the previous section
new_df = pd.read_csv('preprocessed_covid.csv')

The filtered data frame now has 1,222,658 rows and 10 columns.

In [None]:
# Display the frame (the notebook repr also reports its row and column counts)
new_df

In [None]:
# Horizontal bar chart of how many cases fall under each case status
fig, ax = plt.subplots(figsize=(3, 3))
status_counts = new_df['current_status'].value_counts()
status_counts.plot(kind='barh', width=0.5, ax=ax)
ax.set_title('Case Status')
ax.set_xlabel('Number of Cases')
plt.show()

As shown in the graph above, most of the cases are confirmed by laboratory testing, while the remaining (probable) cases have not had a confirmatory test performed but have a positive antigen test.

In [None]:
# Exact number of cases per case status
new_df['current_status'].value_counts()

Based on the second pie chart for age_group, most of the confirmed cases come from the 50-59 years (13.6%) and the 0-9 years occupy the least percentage of 6.1%.

In [None]:
# Columns at positions 3 and 4 (sex and age_group)
combined_cols = new_df.columns[3:5]

# One pie chart per column, showing the share of each category
for col in combined_cols:
    fig, ax = plt.subplots(figsize=(4, 4))
    category_counts = new_df[col].value_counts()
    category_counts.plot(kind='pie', autopct='%1.1f%%', fontsize=10, title='', ylabel='', ax=ax)
    ax.set_title(col)
    plt.show()

In [None]:
# Vertical bar chart of case counts per race/ethnicity group
fig, ax = plt.subplots(figsize=(3, 3))
race_counts = new_df['race_ethnicity_combined'].value_counts()
race_counts.plot(kind='bar', width=0.5, ax=ax)
ax.set_title('Race and Ethnicity')
# Long category labels are slanted so they stay readable
plt.xticks(rotation=45, ha='right')
ax.set_ylabel('Number of Cases')
plt.show()

Most cases also have no underlying medical conditions and were never hospitalized or admitted to the ICU.

In [None]:
# Yes/no medical-record columns to compare
cols = ['hosp_yn', 'icu_yn', 'medcond_yn']

# Count each category per column, then transpose so that each row is one of
# the columns above and each column is a category value
counts = new_df[cols].apply(pd.Series.value_counts).T

# Stacked horizontal bars: one bar per medical-record column
ax = counts.plot(kind='barh', stacked=True, figsize=(8, 4))
ax.set_title('Medical Records')
ax.set_xlabel('Number of Cases')

plt.show()

In [None]:
# Pie chart of the death outcome split
fig, ax = plt.subplots(figsize=(4, 4))
death_counts = new_df['death_yn'].value_counts()
death_counts.plot(kind='pie', autopct='%1.1f%%', fontsize=10, title='', ylabel='', ax=ax)
ax.set_title('Death Status')
plt.show()

With a small death percentage of 5.7% with 69,719 death cases, it can be concluded that most people without a medical background or any underlying presence of disease are found to be recovering from COVID-19.

In [None]:
# Exact number of cases per death outcome
new_df['death_yn'].value_counts()

### Datewise Analysis

Converting the first 2 columns cdc_case_earliest_dt and cdc_report_dt to datetime format.

In [None]:
# Parse both date columns and store them back as 'dd-mm-YYYY' strings.
# NOTE(review): the first key ends with a trailing space — presumably this matches the CSV header; verify.
for date_col in ('cdc_case_earliest_dt ', 'cdc_report_dt'):
    parsed_dates = pd.to_datetime(new_df[date_col], format='%d-%m-%Y', infer_datetime_format=True)
    new_df[date_col] = parsed_dates.dt.strftime('%d-%m-%Y')

In [None]:
# Display the frame after the date columns were reformatted
new_df

Sorting the first column cdc_case_earliest_dt in ascending order.

In [None]:
# The column holds 'dd-mm-YYYY' strings at this point, so a plain sort would
# order rows lexicographically by day-of-month. Sorting on the parsed dates
# gives true chronological order while leaving the stored values unchanged.
new_df = new_df.sort_values(
    by='cdc_case_earliest_dt ',
    key=lambda dates: pd.to_datetime(dates, format='%d-%m-%Y'),
)

In [None]:
# Display the frame after sorting
new_df

In [None]:
# Every row is one case, so the column length is the total case count
total_confirmed = len(new_df['cdc_case_earliest_dt '])
print(f"Total number of Confirmed Cases: {total_confirmed}")

In [None]:
def monthly_cases(new_df):
    """Plot the number of cases per calendar month as a line graph.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain a 'cdc_case_earliest_dt ' column (the key ends with a
        trailing space) holding '%d-%m-%Y' strings or datetime values.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    The input frame is not modified: the dates are parsed into a local
    Series instead of being written back to the caller's DataFrame.
    """
    # Parse locally so the caller's column keeps its original dtype
    earliest_dates = pd.to_datetime(new_df['cdc_case_earliest_dt '], format='%d-%m-%Y')

    # Group by month and count cases
    cases_by_month = earliest_dates.groupby(earliest_dates.dt.to_period('M')).size()

    # Plot line graph (periods are converted to timestamps for the x-axis)
    plt.plot(cases_by_month.index.to_timestamp(), cases_by_month.values)
    plt.xticks(rotation='vertical')
    plt.xlabel('Month')
    plt.ylabel('Number of Cases')
    plt.title('Total Cases by Month')
    plt.show()

The below graph is right-skewed distribution (the concentration of data points towards the right tail more than the left tail). This indicates that most positive cases were detected towards the end of 2020 (around the months October and November of 2020). 

In [None]:
# Draw the month-by-month case count for the cleaned data
monthly_cases(new_df)

In [None]:
def yearly_cases(new_df):
    """Plot the number of cases per calendar year as a line graph.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain a 'cdc_case_earliest_dt ' column (the key ends with a
        trailing space) holding '%d-%m-%Y' strings or datetime values.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    The input frame is not modified: the dates are parsed into a local
    Series instead of being written back to the caller's DataFrame.
    """
    # Parse locally so the caller's column keeps its original dtype
    earliest_dates = pd.to_datetime(new_df['cdc_case_earliest_dt '], format='%d-%m-%Y')

    # Group by year and count the non-null dates in each group
    cases_by_year = earliest_dates.groupby(earliest_dates.dt.year).count()

    # Plot line graph
    plt.plot(cases_by_year.index, cases_by_year.values)
    plt.xlabel('Year')
    plt.ylabel('Number of Cases')
    plt.title('Total Cases by Year')
    plt.show()

Looking at the cases year-wise, the graph has a descending trend line with a negative slope. The highest number of cases were reported in 2020 and significantly decreased since then with almost none reported in 2023.

In [None]:
# Draw the year-by-year case count for the cleaned data
yearly_cases(new_df)

### Label Encoder: Converting all categorical features to numerical features

After applying Label Encoder to 8 categorical columns, the categories are listed as:  
**1. current_status**  
> Laboratory-confirmed case = 0  
> Probable Case = 1  

**2. sex**  
> Female = 0  
> Male = 1  
> Other = 2  

**3. age_group**  
> 0 - 9 Years = 0  
> 10 - 19 Years = 1  
> 20 - 29 Years = 2  
> 30 - 39 Years = 3  
> 40 - 49 Years = 4  
> 50 - 59 Years = 5  
> 60 - 69 Years = 6  
> 70 - 79 Years = 7  
> 80 - 89 Years = 8  

**4. race_ethnicity_combined**  
> American Indian/Alaska Native, Non-Hispanic = 0  
> Asian, Non-Hispanic = 1  
> Black, Non-Hispanic = 2  
> Hispanic/Latino = 3  
> Multiple/Other, Non-Hispanic = 4  
> Native Hawaiian/Other Pacific Islander, Non-Hispanic = 5  
> White, Non-Hispanic = 6  

**5. hosp_yn, icu_yn, death_yn, medcond_yn**  
> No = 0  
> Yes = 1

Apply label encoding to all columns except for the first 2 date columns.

In [None]:
# A single encoder is re-fitted per column; its classes_ are printed right
# after each fit because the next fit overwrites them.
labelencoder = LabelEncoder()

categorical_cols = new_df.columns[2:]  # everything after the two date columns
for col in categorical_cols:
    encoded_values = labelencoder.fit_transform(new_df[col])
    new_df[col] = encoded_values
    print(col, labelencoder.classes_)

In [None]:
# Display the frame after label encoding
new_df

### Logistic Regression model for binary classification

**Since the goal is to build the survival prediction model, logistic regression is used to predict death_yn as a binary classification model.**

Store the new dataframe without the first 2 date columns.

In [None]:
# Keep the last 8 columns by position (selection is positional, so it relies on column order)
lr_df = new_df.iloc[:, -8:]
lr_df

Store the converted dataframe into a new csv file.

In [None]:
# Persist the encoded modelling frame without the index column
lr_df.to_csv('encoded_covid.csv', index=False)

Split the preprocessed dataset into 76% training set and 24% testing set in order to use 10-fold cross-validation.

In [None]:
# Feature matrix: every column except the target; target vector: death_yn
feature_frame = lr_df.drop(columns='death_yn')
X = feature_frame.values
y = lr_df['death_yn'].values

# Hold out 24% of the samples for testing (fixed seed for a repeatable split)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.24, random_state=0
)

print(f"train_val: {X_train_val.shape[0]}, test: {X_test.shape[0]}")

Standardize the features with StandardScaler (zero mean, unit variance).

In [None]:
# Learn the scaling statistics from the training data only, then apply them
# to both the training and the testing features
normalizer = StandardScaler().fit(X_train_val)
X_train_val = normalizer.transform(X_train_val)
X_test = normalizer.transform(X_test)

Train the logistic regression model.

In [None]:
# Use the 10-fold cross-validation to select the hyperparameter λ
folds = 10

# Get the number of samples in the training and validation set
num_train_val = X_train_val.shape[0] 

# Shuffle the index of samples in the train_val set
index_of_samples = np.arange(num_train_val) 
shuffle(index_of_samples)

# Split the index of the train_valid set into 10 folds
index_of_folds = index_of_samples.reshape(folds, -1)
index_of_folds

Select the best hyperparameter with cross-validation.

In [None]:
# Search λ. The values below are passed as C, scikit-learn's *inverse*
# regularization strength, so the corresponding λ is 1 / C.
regularization_coefficient = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 50, 100]

best_acc = 0.0
best_reg = 0.0

for reg in regularization_coefficient:

    # 10-fold cross-validation: accumulate the validation accuracy of each fold
    sum_acc = 0.0

    for fold in range(folds):

        # Row `fold` is the validation set; all remaining rows form the training set.
        # np.delete returns a new array, so index_of_folds itself is never modified.
        valid_index = index_of_folds[fold, :].reshape(-1)
        train_index = np.delete(index_of_folds, fold, 0).reshape(-1)

        # Training set
        X_train = X_train_val[train_index]
        y_train = y_train_val[train_index]

        # Validation set
        X_valid = X_train_val[valid_index]
        y_valid = y_train_val[valid_index]

        # Build and train the model for this candidate
        clf = LogisticRegression(penalty='l2', C=reg, solver='lbfgs')
        clf.fit(X_train, y_train)

        y_valid_pred = clf.predict(X_valid)
        acc = accuracy_score(y_valid, y_valid_pred)

        sum_acc += acc

    cur_acc = sum_acc / folds

    print("reg_coeff: {}, acc: {:.3f}".format(1.0/reg, cur_acc))

    # Store the best hyperparameter (first candidate wins on ties)
    if cur_acc > best_acc:
        best_acc = cur_acc
        best_reg = reg

# Report both forms so the summary can be matched against the per-candidate
# lines above, which list reg_coeff = 1 / C
print("The best hyperparameter is C = {} (reg_coeff: {})".format(best_reg, 1.0/best_reg))

Retrain the model.

In [None]:
# Refit on the full training+validation set using the C selected by cross-validation
clf = LogisticRegression(penalty='l2', C=best_reg, solver='lbfgs')
clf.fit(X_train_val, y_train_val)

Evaluate the learned model on the testing set with accuracy, recall, precision, F1 score, and ROC_AUC score.

In [None]:
# Hard class predictions feed the threshold-based metrics
y_test_pred = clf.predict(X_test)
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# passing 0/1 labels would collapse the curve to a single operating point
y_test_score = clf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_score)

print("accuracy: {:.5f}, recall: {:.5f}, precision: {:.5f}, f1: {:.5f}, roc_auc: {:.5f}".format(acc, recall, precision, f1, roc_auc))

**Conclusion:**  
> Accuracy (0.955) means that 95.5% of the predictions were correct.  
> Recall (0.446) identified 44.6% of the actual positive cases.  
> Precision (0.661) means that when the model predicted a positive case, it was correct 66.1% of the time.  
> F1 score (0.533) indicates an average overall model performance. Higher values yield better performance.  

**These values suggest that the model has a high accuracy, but relatively low recall, precision, and F1 score. This means that the model is good at predicting negative cases, but not as good at predicting positive cases. Such performance aligns with the imbalanced data distribution: 94.3% recover from COVID while 5.7% actually died.**

Create an empty dataframe to store the performance metrics.

In [None]:
# Start the comparison table with the logistic-regression row.
# Built directly from a record because DataFrame.append is deprecated and
# no longer exists in pandas 2.x.
results = pd.DataFrame(
    [{'Model': 'Logistic Regression',
      'Accuracy': acc,
      'Precision': precision,
      'Recall': recall,
      'F1 score': f1,
      'ROC_AUC score': roc_auc}],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC_AUC score'],
)

Visualize the importance of each model parameter.

In [None]:
# Bar chart of the learned coefficient for every feature (target excluded)
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title("Learned model parameter vector w")
columns_to_plot = [name for name in lr_df.columns if name != 'death_yn']
ax.bar(columns_to_plot, clf.coef_[0])
plt.show()

The absolute magnitude of each model parameter corresponds to the strength of the relationship between that feature and the target variable. Large absolute values indicate that those features have a strong influence on the predicted outcome such as **age_group** and **hosp_yn**. It's easy to predict who is likely to die from COVID based on these data because such learned vectors hugely impact our prediction. On the other hand, small absolute values indicate that the corresponding features have little to no effect on the outcome such as **current_status** and **sex** - these features do not relate to the disease, thus hard to make predictions.

In [None]:
# Same chart using coefficient magnitudes, which makes feature influence easier to compare
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title("Absolute Learned model parameter vector w")
columns_to_plot = [name for name in lr_df.columns if name != 'death_yn']
coefficient_magnitudes = np.abs(clf.coef_[0])
ax.bar(columns_to_plot, coefficient_magnitudes)
plt.show()

Because of the poor performance of logistic regression, it is necessary to try other prediction models to obtain better outcomes.

### Synthetic Minority Oversampling Technique (SMOTE)

According to Logistic Regression performance alone, its relatively low recall, precision, and F1 score align with the imbalanced dataset. Therefore, SMOTE is used for data augmentation for the minority class (which is the positive case in this dataset).

Apply SMOTE to oversample the minority class.

In [None]:
# Resample the training data only (seeded); the test set is not resampled
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_val, y_train_val)

Standardize the features with StandardScaler (zero mean, unit variance).

In [None]:
# Re-fit the shared scaler on the resampled training data, then apply the
# same statistics to the resampled training set and to the test set
normalizer.fit(X_train_res)
X_train_res = normalizer.transform(X_train_res)
X_test = normalizer.transform(X_test)

Retrain the logistic regression model on the resampled data.

In [None]:
# Logistic regression with default regularization, fitted on the SMOTE-resampled training data
clf = LogisticRegression(random_state=42)
clf.fit(X_train_res, y_train_res)

Evaluate the model on the testing set with accuracy, precision, recall, and F1 score.

In [None]:
# Hard class predictions feed the threshold-based metrics
y_test_pred = clf.predict(X_test)
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# passing 0/1 labels would collapse the curve to a single operating point
y_test_score = clf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_score)

print("accuracy: {:.5f}, recall: {:.5f}, precision: {:.5f}, f1: {:.5f}, roc_auc: {:.5f}".format(acc, recall, precision, f1, roc_auc))

Store the model performance.

In [None]:
# Add this model's row with pd.concat; DataFrame.append is deprecated and
# no longer exists in pandas 2.x
results = pd.concat(
    [results,
     pd.DataFrame([{'Model': 'Logistic Regression with SMOTE',
                    'Accuracy': acc,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 score': f1,
                    'ROC_AUC score': roc_auc}])],
    ignore_index=True,
)

Get the confusion matrix and classification report.

In [None]:
# Per-class error breakdown for the latest test-set predictions
report = classification_report(y_test, y_test_pred)
confusion_mat = confusion_matrix(y_test, y_test_pred)

print(f"Confusion matrix:\n {confusion_mat}")
print(f"Classification report:\n {report}")

Visualize ROC curve and AUC (Area under ROC curve).

In [None]:
# A ROC curve sweeps the decision threshold, so it must be built from the
# positive-class probabilities; hard 0/1 predictions would yield only a
# single interior point and an understated AUC.
y_test_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_test_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

AUC is approximately 1.0, indicating a good model performance. However, the precision score on Class 1 is still relatively low; therefore, it is necessary to test other models and then compare their performance.

### Decision Tree Classifier

Split into training and testing set.

In [None]:
# Fresh, unscaled split with the same test_size and random_state as the earlier split
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, 
                                                            test_size=0.24, 
                                                            random_state=0)

Standardize the features with StandardScaler (zero mean, unit variance).

In [None]:
# Re-fit the shared scaler on the training data only, then apply the same
# statistics to both the training and the testing features
normalizer.fit(X_train_val)
X_train_val = normalizer.transform(X_train_val)
X_test = normalizer.transform(X_test)

Train the decision tree model.

In [None]:
# Depth-limited tree; random_state is pinned (as for the other models in this
# notebook) so tie-breaking between equally good splits is repeatable
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
dtc.fit(X_train_val, y_train_val)

Evaluate the model on the testing set with accuracy, precision, recall, and F1 score.

In [None]:
# Hard class predictions feed the threshold-based metrics
y_test_pred = dtc.predict(X_test)
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# passing 0/1 labels would collapse the curve to a single operating point
y_test_score = dtc.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_score)

print("accuracy: {:.5f}, recall: {:.5f}, precision: {:.5f}, f1: {:.5f}, roc_auc: {:.5f}".format(acc, recall, precision, f1, roc_auc))

Store the model performance.

In [None]:
# Add this model's row with pd.concat; DataFrame.append is deprecated and
# no longer exists in pandas 2.x
results = pd.concat(
    [results,
     pd.DataFrame([{'Model': 'Decision Tree',
                    'Accuracy': acc,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 score': f1,
                    'ROC_AUC score': roc_auc}])],
    ignore_index=True,
)

Apply SMOTE.

In [None]:
# Resample the training data only, reusing the SMOTE instance created earlier
X_train_res, y_train_res = smote.fit_resample(X_train_val, y_train_val)

Standardize the features with StandardScaler (zero mean, unit variance).

In [None]:
# Re-fit the shared scaler on the resampled training data, then apply the
# same statistics to the resampled training set and to the test set
normalizer.fit(X_train_res)
X_train_res = normalizer.transform(X_train_res)
X_test = normalizer.transform(X_test)

Retrain using the decision tree model on the resampled data.

In [None]:
# Depth-limited tree on the resampled data; random_state is pinned (as for the
# other models in this notebook) so tie-breaking between splits is repeatable
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
dtc.fit(X_train_res, y_train_res)

Evaluate the model on the testing set with accuracy, precision, recall, and F1 score.

In [None]:
# Hard class predictions feed the threshold-based metrics
y_test_pred = dtc.predict(X_test)
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# passing 0/1 labels would collapse the curve to a single operating point
y_test_score = dtc.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_score)

print("accuracy: {:.5f}, recall: {:.5f}, precision: {:.5f}, f1: {:.5f}, roc_auc: {:.5f}".format(acc, recall, precision, f1, roc_auc))

Store the model performance.

In [None]:
# Add this model's row with pd.concat; DataFrame.append is deprecated and
# no longer exists in pandas 2.x
results = pd.concat(
    [results,
     pd.DataFrame([{'Model': 'Decision Tree with SMOTE',
                    'Accuracy': acc,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 score': f1,
                    'ROC_AUC score': roc_auc}])],
    ignore_index=True,
)

### Random Forest Classifier

Split into training and testing set.

In [None]:
# Fresh, unscaled split with the same test_size and random_state as the earlier splits
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, 
                                                            test_size=0.24, 
                                                            random_state=0)

Standardize the features with StandardScaler (zero mean, unit variance).

In [None]:
# Re-fit the shared scaler on the training data only, then apply the same
# statistics to both the training and the testing features
normalizer.fit(X_train_val)
X_train_val = normalizer.transform(X_train_val)
X_test = normalizer.transform(X_test)

Create and train a random forest classifier.

In [None]:
# 100 depth-limited trees with a fixed seed, fitted on the scaled training data
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rfc.fit(X_train_val, y_train_val)

Evaluate the model on the testing set with accuracy, precision, recall, and F1 score.

In [None]:
# Hard class predictions feed the threshold-based metrics
y_test_pred = rfc.predict(X_test)
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# passing 0/1 labels would collapse the curve to a single operating point
y_test_score = rfc.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_score)

print("accuracy: {:.5f}, recall: {:.5f}, precision: {:.5f}, f1: {:.5f}, roc_auc: {:.5f}".format(acc, recall, precision, f1, roc_auc))

Store the model performance.

In [None]:
# Add this model's row with pd.concat; DataFrame.append is deprecated and
# no longer exists in pandas 2.x
results = pd.concat(
    [results,
     pd.DataFrame([{'Model': 'Random Forest',
                    'Accuracy': acc,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 score': f1,
                    'ROC_AUC score': roc_auc}])],
    ignore_index=True,
)

Apply SMOTE.

In [None]:
# Resample the training data only, reusing the SMOTE instance created earlier
X_train_res, y_train_res = smote.fit_resample(X_train_val, y_train_val)

Standardize the features with StandardScaler (zero mean, unit variance).

In [None]:
# Re-fit the shared scaler on the resampled training data, then apply the
# same statistics to the resampled training set and to the test set
normalizer.fit(X_train_res)
X_train_res = normalizer.transform(X_train_res)
X_test = normalizer.transform(X_test)

Retrain using the Random Forest model on the resampled data.

In [None]:
# Same forest configuration as before, now fitted on the SMOTE-resampled training data
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rfc.fit(X_train_res, y_train_res)

Evaluate the model on the testing set with accuracy, precision, recall, and F1 score.

In [None]:
# Score the random forest trained on the resampled data. The predictions must
# come from `rfc`; using `dtc` here would re-report the decision tree's
# results under the random-forest label.
y_test_pred = rfc.predict(X_test)
# ROC AUC ranks samples by score, so it needs the positive-class probability
y_test_score = rfc.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_score)

print("accuracy: {:.5f}, recall: {:.5f}, precision: {:.5f}, f1: {:.5f}, roc_auc: {:.5f}".format(acc, recall, precision, f1, roc_auc))

Store the model performance.

In [None]:
# Add this model's row with pd.concat; DataFrame.append is deprecated and
# no longer exists in pandas 2.x
results = pd.concat(
    [results,
     pd.DataFrame([{'Model': 'Random Forest with SMOTE',
                    'Accuracy': acc,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 score': f1,
                    'ROC_AUC score': roc_auc}])],
    ignore_index=True,
)

### XGBoost Classifier

Split into training and testing set.

In [None]:
# Fresh, unscaled split with the same test_size and random_state as the earlier splits
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, 
                                                            test_size=0.24, 
                                                            random_state=0)

Standardize the features with StandardScaler (zero mean, unit variance).

In [None]:
# Re-fit the shared scaler on the training data only, then apply the same
# statistics to both the training and the testing features
normalizer.fit(X_train_val)
X_train_val = normalizer.transform(X_train_val)
X_test = normalizer.transform(X_test)

Convert data to DMatrix format for XGBoost.

In [None]:
# Wrap features and labels in XGBoost's DMatrix containers for training and testing
dtrain = xgb.DMatrix(X_train_val, label=y_train_val)
dtest = xgb.DMatrix(X_test, label=y_test)

Set parameters for XGBoost model.

In [None]:
# Booster settings: binary logistic objective, shallow trees, fixed seed
params = dict(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.1,
    eval_metric='logloss',
    verbosity=0,
    seed=42,
)

Train the model.

In [None]:
# Number of boosting rounds passed to xgb.train
num_round = 100
bst = xgb.train(params, dtrain, num_round)

Evaluate the model on the testing set with accuracy, precision, recall, and F1 score.

In [None]:
# Raw booster output for the test set (thresholded into class labels in the next cell)
y_test_pred = bst.predict(dtest)

Convert predicted probabilities to binary values

In [None]:
# Threshold the predicted scores at 0.5 to obtain 0/1 class labels
y_test_pred_binary = (y_test_pred >= 0.5).astype(int).tolist()

In [None]:
# Threshold-based metrics use the 0/1 labels
acc = accuracy_score(y_test, y_test_pred_binary)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_test_pred_binary, average='binary')
# ROC AUC is computed from the booster's continuous scores (y_test_pred),
# not the thresholded labels, so the full ranking is evaluated
roc_auc = roc_auc_score(y_test, y_test_pred)

print("accuracy: %.5f%%" % (acc * 100.0))
print("precision: %.5f" % precision)
print("recall: %.5f" % recall)
print("f1 score: %.5f" % f1)
# Reported alongside the other metrics since it is stored in the results table
print("roc_auc: %.5f" % roc_auc)

Store the model performance.

In [None]:
# Add this model's row with pd.concat; DataFrame.append is deprecated and
# no longer exists in pandas 2.x
results = pd.concat(
    [results,
     pd.DataFrame([{'Model': 'XGBoost',
                    'Accuracy': acc,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 score': f1,
                    'ROC_AUC score': roc_auc}])],
    ignore_index=True,
)

### Model Performance Outcome

In [None]:
# Display the collected metrics, one row per model, in insertion order
results

Sort in descending order of accuracy so the model with the highest prediction accuracy appears first.

In [None]:
# Highest accuracy first; returns a sorted copy and leaves `results` itself unchanged
results.sort_values('Accuracy', ascending=False)

### Prediction Model