# Import software libraries

In [None]:
import sys                                                  # Read system parameters.
import numpy as np                                          # Work with multi-dimensional arrays.
import pandas as pd                                         # Manipulate and analyze data.
import matplotlib                                           # Create and format charts.
from matplotlib.figure import Figure
import matplotlib.pyplot as plt
import seaborn as sns                                       # Make charting easier.
from matplotlib.backends.backend_agg \
     import FigureCanvasAgg as FigureCanvas
import sklearn                                              # Train and evaluate machine learning models.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, \
                            f1_score, \
                            recall_score, \
                            precision_score
from sklearn.preprocessing import StandardScaler
import scikitplot as skplt                                  # Generate plots from sklearn models.
import flask                                                # Create simple web apps.
from flask import Flask, request, render_template           
import io                                                   # Handle input/output.
import base64                                               # Convert data to Base64.
import pickle                                               # Save Python objects as binary files.
import warnings                                             # Suppress warnings.
# Silence library warnings so they do not clutter the output.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so that results are reproducible.
np.random.seed(1)

# Report the interpreter version and the version of each library in use.
print('Libraries used in this project:')
print('- Python {}'.format(sys.version))

for library_name, library in (('NumPy', np),
                              ('pandas', pd),
                              ('Matplotlib', matplotlib),
                              ('Seaborn', sns),
                              ('scikit-learn', sklearn),
                              ('scikit-plot', skplt),
                              ('Flask', flask)):
    print('- {} {}'.format(library_name, library.__version__))

# Load the users dataset

In [None]:
# Read the users table from its pickled pandas object on disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
users_data = pd.read_pickle('data/users_data_final.pickle')

# Load and prepare the classification model

In [None]:
# Load the pickled classifier. The context manager guarantees that the file
# handle is closed as soon as the model has been deserialized.
# NOTE: pickle can execute arbitrary code; only load trusted model files.
with open('models/xgboost_classifier.pickle', 'rb') as model_file:
    clf = pickle.load(model_file)

In [None]:
# Split the table into predictors and the label to predict. The user ID is
# dropped from the predictors along with the label itself.
features = users_data.drop(['user_id', 'term_deposit'], axis = 1)
target_data = users_data['term_deposit']

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, target_data, test_size = 0.3
)

# Predicted labels and class probabilities for the held-out rows.
clf_pred = clf.predict(X_test)
clf_probas = clf.predict_proba(X_test)

# Score the predictions, expressed as rounded percentages.
acc, prec, rec, f1 = (
    round(metric(y_test, clf_pred) * 100)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Table of scores as displayed by the web app.
clf_scores = {
    label: str(score) + '%'
    for label, score in zip(('Accuracy', 'Precision', 'Recall', 'F1'),
                            (acc, prec, rec, f1))
}

# Human-readable names for the feature columns, used as chart labels.
# The list is applied positionally, so it must follow the column order of
# the feature table.
# NOTE(review): 'Job: Technican' is kept verbatim; it looks like a typo for
# 'Technician'.
clf_cols = [
    'Number of Transactions', 'Total Amount (USD)', 'Job: Manager',
    'Job: Technican', 'Job: Entrepreneur', 'Job: Blue Collar',
    'Job: Retired', 'Job: Admin', 'Job: Services', 'Job: Self-Employed',
    'Job: Unemployed', 'Job: Housemaid', 'Job: Student',
    'Education: Tertiary', 'Education: Secondary', 'Education: Unknown',
    'Education: Primary', 'Defaulted on Loan?', 'Housing Loan',
    'Personal Loan', 'Contact: Mobile Device', 'Contact: Landline',
    'Contact Duration', 'Times Contacted (Current)',
    'Days Since Last Contacted', 'Times Contacted (Previous)',
    'Device: Desktop', 'Device: Tablet', 'Single?', 'Age Group',
    'Month Joined',
]

# Relabel the training columns, then keep the ten largest importances.
X_train.columns = clf_cols
clf_feat_importances = (
    pd.Series(clf.feature_importances_, index = X_train.columns)
    .nlargest(10)
)

# Load and prepare the regression model

In [None]:
# Load the pickled regressor. The context manager guarantees that the file
# handle is closed as soon as the model has been deserialized.
# NOTE: pickle can execute arbitrary code; only load trusted model files.
with open('models/xgboost_regressor.pickle', 'rb') as model_file:
    reg = pickle.load(model_file)

In [None]:
# Compute the interquartile-range fences for the total amount.
q1 = np.percentile(users_data['total_amount_usd'], 25)
q3 = np.percentile(users_data['total_amount_usd'], 75)
iqr = q3 - q1

lb = q1 - 1.5 * iqr
ub = q3 + 1.5 * iqr

# Keep only the rows strictly inside the fences.
users_data_wout_outliers = users_data.loc[
    (users_data['total_amount_usd'] > lb)
    & (users_data['total_amount_usd'] < ub)
]

# Split the filtered table into predictors and the value to predict.
features = users_data_wout_outliers.drop(
    ['user_id', 'total_amount_usd'], axis = 1
)
target_data = users_data_wout_outliers['total_amount_usd']

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, target_data, test_size = 0.3
)

# Predicted amounts for the held-out rows.
reg_pred = reg.predict(X_test)

# Human-readable names for the feature columns, used as chart labels.
# The list is applied positionally, so it must follow the column order of
# the feature table.
# NOTE(review): 'Job: Technican' is kept verbatim; it looks like a typo for
# 'Technician'.
reg_cols = [
    'Number of Transactions', 'Job: Manager',
    'Job: Technican', 'Job: Entrepreneur', 'Job: Blue Collar',
    'Job: Retired', 'Job: Admin', 'Job: Services', 'Job: Self-Employed',
    'Job: Unemployed', 'Job: Housemaid', 'Job: Student',
    'Education: Tertiary', 'Education: Secondary', 'Education: Unknown',
    'Education: Primary', 'Defaulted on Loan?', 'Housing Loan',
    'Personal Loan', 'Contact: Mobile Device', 'Contact: Landline',
    'Contact Duration', 'Times Contacted (Current)',
    'Days Since Last Contacted', 'Times Contacted (Previous)',
    'Term Deposit Signup?', 'Device: Desktop', 'Device: Tablet',
    'Single?', 'Age Group', 'Month Joined',
]

# Relabel the training columns, then keep the five largest importances.
X_train.columns = reg_cols
reg_feat_importances = (
    pd.Series(reg.feature_importances_, index = X_train.columns)
    .nlargest(5)
)

# Pair each actual amount with its prediction and the difference between
# them. The frame takes its index from y_test.
resid_df = pd.DataFrame({'total_amount_usd': y_test,
                         'total_pred': reg_pred})
resid_df['residuals'] = resid_df['total_amount_usd'] - resid_df['total_pred']

# Order by actual amount and keep every 20th row, then number the remaining
# rows consecutively for use as the x-axis of the chart.
resid_df = resid_df.sort_values('total_amount_usd').iloc[::20]
resid_df['record_num'] = np.arange(len(resid_df))

# Load and prepare the clustering model

In [None]:
# Load the pickled clustering model. The context manager guarantees that the
# file handle is closed as soon as the model has been deserialized.
# NOTE: pickle can execute arbitrary code; only load trusted model files.
with open('models/kmeans.pickle', 'rb') as model_file:
    clust = pickle.load(model_file)

In [None]:
# Get demographics data only: every column whose name contains one of the
# listed substrings.
users_data_demographics = \
users_data.filter(regex = 'education|job|age|single')

# Standardize data.
scaler = StandardScaler()

scaler.fit(users_data_demographics)
users_data_scaled = scaler.transform(users_data_demographics)

# Generate predictions (one cluster ID per row).
clust_pred = clust.predict(users_data_scaled)

# Generate frequencies for each cluster.
clust_freqs = {}

# The denominator is the number of predictions. Counting the non-zero
# entries instead would leave out every row assigned to cluster 0 and
# inflate all percentages.
total_examples = len(clust_pred)

# Get the percentage of all customers that falls in each cluster. The 12
# cluster IDs match the 4 x 3 grid of subplots drawn by the web app.
for clust_id in range(0, 12):
    count = np.count_nonzero(clust_pred == clust_id)
    avg = (count / total_examples) * 100

    clust_freqs[str(clust_id)] = avg

# Assign age bins to their integer values.
age_group_decoded = {0: '18–24', 1: '25–34', 2: '35–44',
                     3: '45–54', 4: '55–64', 5: '65–74', 6: '74+'}

# Generate clustering results. Work on an explicit copy so that the extra
# columns added below never end up in the demographics table itself.
clust_results = users_data_demographics.copy()
clust_results['age_group_decoded'] = \
clust_results['age_group_encoded'].replace(age_group_decoded)
clust_results.insert(0, 'cluster', clust_pred)

# Set model descriptions

In [None]:
# Short descriptions of each model, shown in the web app when the
# corresponding task is selected.
clf_desc = (
    'This machine learning model predicts whether or not '
    'a customer will sign up for a term deposit.'
)

reg_desc = (
    'This machine learning model predicts the total balance '
    'of debit and credit that the customer has with their account.'
)

clust_desc = (
    'This machine learning model groups like customers together '
    'for the purposes of creating useful marketing segments.'
)

# Set plot style

In [None]:
# Apply one Seaborn theme to all charts: the 'darkgrid' style, the
# 'notebook' scaling context, and a custom four-color palette.
sns.set(style = 'darkgrid', context = 'notebook',
        palette = ['#1C3863', '#00A0DD', '#F47A21', '#15A766'])

# Construct the web app

In [None]:
# Create the Flask application object; routes are registered on it with
# the @app.route decorator.
app = Flask(__name__)

In [None]:
def _figure_to_data_uri(fig):
    """Render a Matplotlib figure to PNG in memory and return a data URI.

    The PNG bytes are Base64-encoded and prefixed with
    'data:image/png;base64,' so that the result can be passed to a template
    as an image source.
    """
    png_buffer = io.BytesIO()
    FigureCanvas(fig).print_png(png_buffer)
    encoded = base64.b64encode(png_buffer.getvalue()).decode('utf-8')

    return 'data:image/png;base64,' + encoded


def _finalize_figure(fig):
    """Make the figure background transparent, tighten it, and return it."""
    fig.patch.set_alpha(0)
    fig.tight_layout()

    return fig


def _feature_importance_figure(importances, ylabel_pad):
    """Build a horizontal bar chart from a Series of feature importances.

    importances -- pandas Series; its index supplies the bar labels and its
                   values supply the bar lengths.
    ylabel_pad  -- padding for the horizontal 'Feature' axis label.
    """
    fig = Figure(figsize = (12, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_ylabel('Feature', weight = 'bold', size = 14,
                  labelpad = ylabel_pad, rotation = 'horizontal')
    ax.set_xlabel('Weight', weight = 'bold', size = 14,
                  labelpad = 15)
    ax.tick_params(labelsize = 12)
    ax.barh(importances.index, importances.values)

    return _finalize_figure(fig)


def _lift_chart_figure():
    """Build the lift chart for the classification model."""
    fig = Figure(figsize = (10, 6))
    ax = fig.add_subplot(1, 1, 1)

    # NOTE(review): plot_lift_curve takes the true labels as its first
    # argument, but the predicted labels are passed here. The classification
    # hold-out labels are not kept once the regression split reuses the
    # y_test name, so this is left unchanged -- confirm whether it is
    # intended.
    skplt.metrics.plot_lift_curve(clf_pred, clf_probas, ax = ax)

    # Override the title and axis labels after the curve has been plotted.
    ax.set_title('')
    ax.set_xlabel('Percentage of Customers', weight = 'bold',
                  size = 14, labelpad = 15)
    ax.set_ylabel('Lift', weight = 'bold', size = 14,
                  labelpad = 30, rotation = 'horizontal')
    ax.tick_params(labelsize = 12)
    ax.legend(loc = 'upper right')

    return _finalize_figure(fig)


def _residuals_figure():
    """Build the chart of actual amounts (line) vs. predictions (dots)."""
    fig = Figure(figsize = (12, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_ylabel('Total Amount\n(USD)', weight = 'bold', size = 14,
                  labelpad = 60, rotation = 'horizontal')
    ax.set_xlabel('Customer Record Number', weight = 'bold',
                  size = 14, labelpad = 15)
    ax.tick_params(labelsize = 12)
    ax.plot(resid_df['record_num'],
            resid_df['total_amount_usd'],
            color = '#00A0DD',
            linewidth = 5)
    ax.scatter(resid_df['record_num'],
               resid_df['total_pred'])

    # Only the line gets a legend entry; the legend title explains the dots.
    ax.legend(['Actual'],
              loc = 'lower center',
              ncol = 2,
              title = 'Dots are predicted values.')

    return _finalize_figure(fig)


def _cluster_frequency_figure():
    """Build the bar chart of the percentage of customers per cluster."""
    fig = Figure(figsize = (12, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Cluster ID', weight = 'bold',
                  size = 14, labelpad = 15)
    ax.set_ylabel('Customers\n(%)', weight = 'bold', size = 14,
                  labelpad = 60, rotation = 'horizontal')
    ax.tick_params(labelsize = 12)
    ax.bar(list(clust_freqs.keys()), list(clust_freqs.values()))

    return _finalize_figure(fig)


def _cluster_age_figure():
    """Build a 4 x 3 grid of age-group count plots, one per cluster."""
    fig = Figure(figsize = (22, 20))
    fig.subplots(4, 3)

    # The position of each subplot doubles as the cluster ID it shows.
    for cluster_id, ax_i in enumerate(fig.axes):
        in_cluster = clust_results[clust_results.cluster == cluster_id]
        sns.countplot(x = in_cluster['age_group_decoded'],
                      ax = ax_i, order = list(age_group_decoded.values()))
        ax_i.set_title('Cluster ' + str(cluster_id), size = 18,
                       weight = 'bold', color = '#1C3863')
        ax_i.set_xlabel('Age Group', weight = 'bold', size = 13,
                        labelpad = 10)
        ax_i.set_ylabel('Count', weight = 'bold', size = 13, labelpad = 10)
        ax_i.tick_params(labelsize = 11)

    return _finalize_figure(fig)


@app.route('/', methods = ['GET'])
def demo():
    """Render the demo page for the task named in the query string.

    The 'model' query parameter selects the task: 'clf' (classification),
    'reg' (regression) or 'clust' (clustering). Any other value, or no value
    at all, renders the welcome layout with two blank images.

    Returns the rendered HTML template. Both charts are embedded as
    Base64-encoded PNG data URIs.
    """

    # Defaults before task selection.
    which_layout = 'welcome.html'
    model_type = 'None selected'
    model_desc = 'Use one of the buttons on the left to select a task.'
    table_title = ''
    img_1_title = ''
    img_2_title = ''
    output_table = {}
    img_1_fig = Figure()
    img_2_fig = Figure()

    # Normal button states are shown by default.
    img_clf = 'clf_normal'
    img_reg = 'reg_normal'
    img_clust = 'clust_normal'

    # Get selected task from the URL query string.
    selected_model = request.args.get('model')

    if selected_model == 'clf':
        img_clf = 'clf_selected'
        which_layout = 'demo_w_scores.html'
        model_type = 'Classification'
        model_desc = clf_desc
        table_title = 'Model Scores'
        img_1_title = 'Top 10 Most Important Features'
        img_2_title = 'Lift Chart for Term Deposit Signups'
        output_table = clf_scores
        img_1_fig = _feature_importance_figure(clf_feat_importances,
                                               ylabel_pad = 50)
        img_2_fig = _lift_chart_figure()

    elif selected_model == 'reg':
        img_reg = 'reg_selected'
        which_layout = 'demo.html'
        model_type = 'Regression'
        model_desc = reg_desc
        img_1_title = 'Top 5 Most Important Features'
        img_2_title = 'Total Balance Predictions vs. Actual Values'
        img_1_fig = _feature_importance_figure(reg_feat_importances,
                                               ylabel_pad = 70)
        img_2_fig = _residuals_figure()

    elif selected_model == 'clust':
        img_clust = 'clust_selected'
        which_layout = 'demo.html'
        model_type = 'Clustering'
        model_desc = clust_desc
        img_1_title = 'Percentage of Customers in Each Cluster'
        img_2_title = 'Age Distribution for Each Customer Segment'
        img_1_fig = _cluster_frequency_figure()
        img_2_fig = _cluster_age_figure()

    # Convert plots to Base64-encoded PNG data URIs.
    img_1_pngB64 = _figure_to_data_uri(img_1_fig)
    img_2_pngB64 = _figure_to_data_uri(img_2_fig)

    # Render HTML templates with params.
    return render_template(which_layout,
                           model_type = model_type,
                           model_desc = model_desc,
                           table_title = table_title,
                           output_table = output_table,
                           img_clf = img_clf,
                           img_reg = img_reg,
                           img_clust = img_clust,
                           img_1_title = img_1_title,
                           img_2_title = img_2_title,
                           img_1 = img_1_pngB64,
                           img_2 = img_2_pngB64)

# Run the web app

In [None]:
# Start the Flask server with its default settings, but only when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    app.run()