# Import software libraries

In [None]:
# Import required libraries.
import sys                                                  # Read system parameters.
import numpy as np                                          # Work with multi-dimensional arrays.
import pandas as pd                                         # Manipulate and analyze data.
import matplotlib                                           # Create and format charts.
import matplotlib.pyplot as plt  
import seaborn as sns                                       # Make charting easier.
import sklearn                                              # Train and evaluate machine learning models.
from sklearn.model_selection import train_test_split, \
                                    learning_curve, \
                                    cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, \
                            confusion_matrix, \
                            classification_report, \
                            scorer, \
                            f1_score, \
                            recall_score, \
                            precision_score, \
                            roc_auc_score, \
                            plot_roc_curve, \
                            plot_precision_recall_curve, \
                            plot_confusion_matrix
from sklearn.dummy import DummyClassifier
import xgboost                                              # Build gradient boosting models.
from xgboost import XGBClassifier
import pickle                                               # Save Python objects as binary files.
from collections import Counter
import warnings                                             # Suppress warnings.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so repeated runs match.
np.random.seed(1)

# Report the version of every library this project relies on.
# All lines are emitted by a single print call, one entry per line.
print('\n'.join([
    'Libraries used in this project:',
    f'- Python {sys.version}',
    f'- NumPy {np.__version__}',
    f'- pandas {pd.__version__}',
    f'- Matplotlib {matplotlib.__version__}',
    f'- Seaborn {sns.__version__}',
    f'- scikit-learn {sklearn.__version__}',
    f'- XGBoost {xgboost.__version__}',
]))

# Read and examine the data

In [None]:
# Read the data.



# Preview the first five rows of the data.



In [None]:
# Check the structure of the data.



# Prepare the data

In [None]:
# Define the target variable and get the count of each value in the variable.





In [None]:
# Split the data into target and features.




In [None]:
# Split the dataset into separate training and testing sets.






# Get the shape of both the training dataset and the test dataset.









In [None]:
# Use the Counter library to get the count of each value in the target variable (test data).



# Train a logistic regression model

In [None]:
# Normalize the training data.




In [None]:
# Create a LogisticRegression() model and fit it on the scaled training data.




In [None]:
# Make predictions on the test data.



# Get a count of each prediction value.



# Perform a quick evaluation of the logistic regression model

In [None]:
# Obtain the accuracy of the model's predictions.



In [None]:
# Use the classification_report() function to get a table of additional metric scores.



# Train a random forest model

In [None]:
# Create a RandomForestClassifier() model and fit it on the scaled training data.




In [None]:
# Make predictions on the test data.



# Get a count of each prediction value.



# Perform a quick evaluation of the random forest model

In [None]:
# Obtain the accuracy of the model's predictions.



In [None]:
# Use the classification_report() function to get a table of additional metric scores.



# Compare evaluation metrics for each model

In [None]:
# Candidate models to compare, stored as (display name, unfitted estimator)
# pairs in the order they will be trained.

models = [
    # DummyClassifier() used as a baseline algorithm.
    ('Dummy Classifier', DummyClassifier(strategy = 'stratified')),

    # Logistic Regression model.
    ('Logistic Regression', LogisticRegression()),

    # Random Forest model.
    ('Random Forest', RandomForestClassifier()),

    # XGBoost model.
    ('XGBoost', XGBClassifier(eval_metric = 'logloss', n_jobs = 1)),
]

In [None]:
# List will hold dictionaries of model scores (one dictionary per model).
# NOTE: despite its name, scoring_df is a plain list at this point.

scoring_df = []

# Train each model in the list and output multiple scores for each model.

for name, model in models:
    if name == 'Logistic Regression':
        # Logistic regression is trained on min-max scaled features, so the
        # test features must go through the SAME scaling before prediction.
        # The scaler is fitted on the training features only, which keeps
        # test-set information out of training.
        # NOTE(review): this assumes X_train_norm (built in an earlier cell)
        # is the MinMaxScaler-transformed X_train -- confirm.
        lr_scaler = MinMaxScaler().fit(X_train)
        X_train_1 = lr_scaler.transform(X_train)
        X_test_1 = lr_scaler.transform(X_test)
    else:
        X_train_1 = X_train
        X_test_1 = X_test

    model.fit(X_train_1, y_train)

    # Hard class predictions, used by the threshold-based metrics.
    y_pred = model.predict(X_test_1)

    # Positive-class scores (second predict_proba column), used for AUC.
    # AUC ranks examples by score, so it needs scores rather than 0/1 labels.
    y_score = model.predict_proba(X_test_1)[:, 1]

    # Calculate the evaluation metrics for the model.

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    # Scores are rounded to four decimal places.
    scoring_dict = {'Model': name,
                    'Accuracy': round(accuracy, 4),
                    'F1 Score': round(f1, 4),
                    'Precision' : round(precision, 4),
                    'Recall' : round(recall, 4),
                    'AUC' : round(auc, 4),
                   }

    scoring_df.append(scoring_dict)

In [None]:
# Create a DataFrame from scoring_df.



# Sort the DataFrame by accuracy score (descending), then print it.



# Begin evaluating the best model

In [None]:
# Retrain the model with the highest accuracy score.




In [None]:
# Make predictions on the test data.



# Get a count of each prediction value.



In [None]:
# Plot a ROC curve.






# Generate a confusion matrix of the best model

In [None]:
# Generate a confusion matrix.



In [None]:
# Plot the confusion matrix.









# Generate a feature importance plot for the best model

In [None]:
# This function generates a feature importance plot on a bar chart.

def feature_importance_plot(model, X_train, n):
    """Show a horizontal bar chart of the model's n largest feature importances.

    Parameters
    ----------
    model : fitted estimator exposing a ``feature_importances_`` attribute
        (for example the random forest and XGBoost models in this file).
    X_train : object with a ``columns`` attribute
        Its column labels are used to name the importances.
    n : int
        Number of top features to draw.
    """

    # Open the figure first so the pandas plot below draws onto it.
    plt.figure(figsize=(8, 5))

    # Pair each importance value with its feature name.
    importance_by_feature = pd.Series(model.feature_importances_,
                                      index = X_train.columns)

    # Keep only the n most important features and draw them as bars.
    top_features = importance_by_feature.nlargest(n)
    top_features.plot(kind = 'barh')

    plt.title(f'Top {n} Features')
    plt.show()

In [None]:
# Plot the feature importances.



# Plot a learning curve for the best model

In [None]:
# This function generates and plots a learning curve.

def plot_learning_curves(model, X_train, y_train):
    """Plot training and cross-validation accuracy against training set size.

    Parameters
    ----------
    model : estimator passed to ``learning_curve``.
    X_train : training features passed to ``learning_curve``.
    y_train : training target passed to ``learning_curve``.

    The curve is evaluated with 5-fold cross-validation at five training set
    sizes spread evenly between 1% and 100% of the available training data.
    The figure is displayed with ``plt.show()``; nothing is returned.
    """

    plt.figure(figsize=(5, 5))  # Set figure size.
    train_sizes, train_scores, test_scores = learning_curve(
        model,
        X_train,
        y_train,
        cv = 5,  # Number of folds in cross-validation.
        scoring = 'accuracy',  # Evaluation metric.
        n_jobs = 1,
        shuffle = True,
        train_sizes = np.linspace(0.01, 1.0, 5))  # 5 different sizes of the training set.

    # Average the scores over the cross-validation folds (axis 1), leaving
    # one value per training set size.
    train_mean = np.mean(train_scores, axis = 1)
    test_mean = np.mean(test_scores, axis = 1)

    # Draw lines. Both use the same color; the dashed style marks training.
    plt.plot(train_sizes, train_mean, '--', color = '#111111', label = 'Training score')
    plt.plot(train_sizes, test_mean, color = '#111111', label = 'Cross-validation score')

    # Create plot.
    plt.title('Learning Curves')
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy')
    plt.legend(loc = 'best')
    plt.tight_layout()

    plt.show()

In [None]:
# Call the function to plot learning curves for the best model.



# Save the best model

In [None]:
# Save the best model as a pickle file named best_classification_model.pickle.

