# Evaluation Notebook

## Model Evaluation

In this stage you will test your model against previously unseen data and see how it compares to other models, which have been tested on the same set.

In [1]:
import os
import sys
import pandas as pd
import joblib
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, PowerTransformer
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

In [2]:
# Resolve ../src/ (relative to the current working directory) to an absolute
# path and add it to the import search path so the module below can be found.
src_path = os.path.abspath('../src/')
sys.path.append(src_path)

# NOTE(review): the wildcard import hides which names this notebook actually
# takes from ed_data_modeling; consider importing the needed names explicitly.
from ed_data_modeling import *

In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen subset of the columns of ``X``.

    Columns are chosen either by label (``col_names``) or by position
    (``col_nums``). When both are supplied, ``col_names`` takes precedence.

    Parameters
    ----------
    col_names : optional
        Column labels; applied as ``X[col_names]``.
    col_nums : optional
        Positional column indexer; applied as ``X[:, col_nums]`` (or
        ``X.iloc[:, col_nums]`` when ``X`` offers ``iloc``).

    Attributes
    ----------
    use : {'col_names', 'col_nums'} or None
        Selection mode recorded by ``fit``; ``None`` until ``fit`` has run.
        NOTE(review): the attribute name is kept as-is because previously
        pickled instances presumably carry it -- confirm before renaming.
    """

    def __init__(self, col_names=None, col_nums=None):
        self.col_names = col_names
        self.col_nums = col_nums
        self.use = None
        # Kept from the original: fail at construction when nothing is selected.
        assert (self.col_names is not None) or (self.col_nums is not None), 'Must set either col_names or col_nums'

    def fit(self, X, y=None):
        """Record which selection mode to use; ``X`` and ``y`` are not inspected."""
        if self.col_names is not None:
            self.use = 'col_names'
        elif self.col_nums is not None:
            self.use = 'col_nums'
        return self

    def transform(self, X, y=None):
        """Return the selected columns of ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit`` (previously this surfaced as an
            ``UnboundLocalError`` with no hint about the cause).
        """
        if self.use == 'col_names':
            return X[self.col_names].copy()

        if self.use == 'col_nums':
            if hasattr(X, 'iloc'):
                # DataFrame-like input: ``X[:, nums]`` is not valid there, so
                # select by position through ``iloc`` instead.
                return X.iloc[:, self.col_nums].copy()
            return X[:, self.col_nums]

        # Imported here so the class does not depend on an extra notebook-level import.
        from sklearn.exceptions import NotFittedError
        raise NotFittedError(
            'ColumnSelector has not been fitted yet; call fit() before transform().'
        )

In [4]:
# Columns that are read with dtype 'object'. The numbered ``v`` columns are
# listed as inclusive (first, last) runs; the resulting dict holds c3 first,
# then the v columns in ascending order, then id_32.
_OBJECT_V_RANGES = [
    (1, 15), (17, 19), (21, 24), (29, 29), (31, 31), (34, 36), (39, 39),
    (41, 55), (57, 94), (98, 98), (100, 100), (104, 104), (107, 125),
    (138, 142), (146, 149), (153, 158), (169, 170), (172, 176), (181, 181),
    (183, 186), (188, 200), (220, 220), (223, 223), (235, 239), (241, 242),
    (244, 244), (246, 247), (249, 252), (257, 257), (260, 260), (262, 262),
    (281, 282), (284, 284), (286, 289), (297, 297), (300, 305), (325, 328),
]

dtype_dict = {'c3': 'object'}
dtype_dict.update(
    (f'v{number}', 'object')
    for first, last in _OBJECT_V_RANGES
    for number in range(first, last + 1)
)
dtype_dict['id_32'] = 'object'
    

### Evaluation Data Set

Read in the test data sets from the raw data folder (../data/raw/test_identity.csv and ../data/raw/test_transaction.csv). Note: the cell below instead loads the interim file ../data/interim/df_test.csv.

In [5]:
# Load the test set indexed by transaction id; the columns listed in
# dtype_dict are read with dtype 'object'.
df_test = pd.read_csv(
    '../data/interim/df_test.csv',
    index_col='transactionid',
    dtype=dtype_dict,
)
df_test.head()

Unnamed: 0_level_0,transactiondt,transactionamt,productcd,card1,card2,card3,card4,card5,card6,addr1,...,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38
transactionid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


In [6]:
# Partition the columns by dtype: object-typed columns are treated as
# categorical and every other column as numeric (column order is preserved).
categorical_columns = df_test.select_dtypes(include='object').columns.to_list()
numeric_columns = df_test.select_dtypes(exclude='object').columns.to_list()

In [7]:
# Split the categorical columns by cardinality: at most five distinct values
# -> one-hot encoding list, more than five -> target encoding list.
# The original used `<= 5` and `>= 5`, which put every column with exactly five
# levels in BOTH lists; the second test is now strict so the lists are disjoint.
# NOTE(review): cardinality is measured on the test set here -- confirm this
# matches how the lists were built when the pipeline was trained.
categorical_cardinality = df_test[categorical_columns].nunique()  # computed once per column
categorical_columns_ohe = [c for c in categorical_columns if categorical_cardinality[c] <= 5]
categorical_columns_te = [c for c in categorical_columns if categorical_cardinality[c] > 5]

In [8]:
# Load the persisted model pipeline from disk.
# joblib.load unpickles the file, which can execute arbitrary code, so only
# load files from a trusted source.
# NOTE(review): presumably the pickled pipeline refers to the ColumnSelector
# class defined earlier in this notebook, so that cell must run first -- confirm.
pipe = joblib.load('../models/best_model_pipeline.pkl')

Using the same pipeline (including any cleaning and preprocessing steps you performed on the training set), generate predictions for the samples in the test set.

In [9]:
# Keep only the second column of the predicted class probabilities.
# NOTE(review): assumes the pipeline's classes are ordered [0, 1], so that
# column 1 is the probability of fraud -- confirm against pipe.classes_.
predictions = pipe.predict_proba(df_test)[:,1]



In [10]:
# Two-column submission table: the transaction id taken from the test-set
# index, paired with the predicted score for the same row.
submission_df = (
    pd.DataFrame({'TransactionID': df_test.index, 'isFraud': predictions})
    .reset_index(drop=True)
)

Save your predictions as a .csv file. Compare your predictions with ../data/raw/sample_submission.csv to ensure you've done this correctly (your file should look the same, but have different predictions).

In [11]:
# Write the submission to the current working directory; index=False leaves
# out the row index so the file holds only TransactionID and isFraud.
submission_df.to_csv('ed_submission.csv', index=False)

In [15]:
# Sanity check: one row per test-set transaction and two columns
# (TransactionID, isFraud).
submission_df.shape

(506691, 2)

### Kaggle Submission

In order to have your model scored, you will need to submit your predictions to the Kaggle competition. Though the competition is closed, it will still accept predictions and score models. To score your model, follow these steps:

1. Go to the [Kaggle Competition Page](https://www.kaggle.com/c/ieee-fraud-detection)
2. If you have a Kaggle account, sign in. If you do not have a Kaggle account, click the Register button on the top right of the page and make an account.
3. Click the "Late Submission" button beneath the competition header on the right side.
4. Upload the .csv file that you generated in the above steps.
5. Click "Make Submission" and wait for your results to be evaluated.

Once these steps are complete you should have access to two things:
1. Your model's score.
2. Your model's place on the leaderboard.

Enter these two numbers below.

Private Score: 0.891836
Rank: 4666

If you want, you can repeat this process with multiple models to see how they compare to each other.

## Evaluate Results

Previous evaluation steps dealt with factors such as the accuracy and generality of the model. This step assesses the degree to which the model meets the business objectives and seeks to determine if there is some business reason why this model is deficient. 

Moreover, evaluation also assesses other data mining results generated. Data mining results involve models that are necessarily related to the original business objectives and all other findings that are not necessarily related to the original business objectives, but might also unveil additional challenges, information, or hints for future directions.

### Outputs

<b>1. Assessment of data mining results with respect to business success criteria</b>
- Summarize assessment results in terms of business success criteria (as established during the first phase), including a final statement regarding whether the project already meets the initial business objectives.

The final model was selected to maximize area under the ROC curve, the business success criterion established during the first phase, and it achieved a private Kaggle score of 0.891836.

<b>2. Approved models</b>
- After assessing models with respect to business success criteria, the generated models that meet the selected criteria become the approved models.

## Review Process

At this point, the resulting models appear to be satisfactory and to satisfy business needs. It is now appropriate to do a more thorough review of the data mining engagement in order to determine if there is any important factor or task that has somehow been overlooked. This review also covers quality assurance issues—for example: Did we correctly build the model? Did we use only the attributes that we are allowed to use and that are available for future analyses?

### Outputs

<b>3. Review of Process</b>
- Summarize the process review and highlight activities that have been missed and those that should be repeated.

The following are factors that should be repeated:
    - XGBClassifier is a very good model that does not require excessive preprocessing and does not have a tendency to overfit on the dataset
    - Hyperparameter tuning was helpful in improving model performance
    - Mix categorical encoding types between target encoding and onehot encoding
    

The following are factors that could be improved for future models:
    - Feature engineering 
    - Feature selection in regard to training resources (it doesn't change model performance, but it would improve training times)
    - Ensemble models or voting classifiers should be used to balance high-performing models capturing different patterns

## Determine Next Steps

Depending on the results of the assessment and the process review, the project team decides how to proceed. The team decides whether to finish this project and move on to deployment, initiate further iterations, or set up new data mining projects. This task includes analyses of remaining resources and budget, which may influence the decisions.

### Outputs

<b>4. List of possible actions</b>
- List the potential further actions, along with the reasons for and against each option.

Potential further actions are as follows:
    - Engineer new features to improve model performance
    - Determine which variables can be dropped altogether to improve model training times
    - Use an ensemble model or voting classifier to combine the predictive power of multiple high-performing models.

<b>5. Decision</b>
- Describe the decision as to how to proceed, along with the rationale.

Ideally, I would iterate further on the model solution to improve predictive performance. However, I am out of time, so I have gone with the best model trained to this point.

## Code Base Update

### Outputs

<b>6. Update code base</b>
- Are there any updates that should be made to the existing codebase and provided functions based on the code you wrote? (i.e. a new preprocessing function or a new visualization technique)

In [None]:
# Calculates the coefficient of variation for every feature in a dataset
def coefficient_of_variation(df):
    """Return the coefficient of variation (std / mean) of each numeric feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``isfraud`` column, which is excluded from the result.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'Coefficient of Variation'`` with one column
        per numeric feature. Features whose mean is zero yield inf or NaN.

    Raises
    ------
    KeyError
        If ``df`` has no ``isfraud`` column.
    """
    # Drop the label once instead of building two identical copies.
    features = df.drop('isfraud', axis=1)
    # numeric_only=True skips string/object columns, which would otherwise make
    # std()/mean() raise on pandas 2.x.
    df_cv = features.std(numeric_only=True) / features.mean(numeric_only=True)
    df_cv = df_cv.to_frame().T
    df_cv.index = ['Coefficient of Variation']
    return df_cv

In [None]:
# Draws one histogram panel; helper for hist_plots
def _hist_panel(values, bins, ax, title):
    """Draw a histogram of ``values`` on ``ax`` with 10% headroom above the tallest bar."""
    sns.histplot(values, bins=bins, ax=ax)
    # If nothing was drawn (e.g. an empty class subset) there are no patches,
    # and a bare max() over them would raise ValueError.
    tallest = max((patch.get_height() for patch in ax.patches), default=0)
    if tallest > 0:
        ax.set_ylim(0, tallest * 1.1)
    ax.set_title(title)


# Plots a histogram for every feature in a dataset
def hist_plots(df, bins=100):
    """Show three stacked histograms per feature: all rows, non-fraud rows, fraud rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``isfraud`` column; rows with value 0 form the
        "not fraud" panel and rows with value 1 the "is fraud" panel.
    bins : int, default 100
        Passed to ``sns.histplot`` for every panel.

    NOTE(review): relies on ``plt`` and ``sns`` being in scope; neither is
    imported in the visible notebook cells.
    """
    # Split by class once instead of re-filtering the whole frame for every column.
    not_fraud = df[df['isfraud'] == 0]
    fraud = df[df['isfraud'] == 1]

    for col in df.columns.drop('isfraud'):
        fig, axs = plt.subplots(3, 1, figsize=(8, 12))

        _hist_panel(df[col], bins, axs[0], f"Histogram of {col}")
        _hist_panel(not_fraud[col], bins, axs[1], f"Histogram of {col} when not fraud")
        _hist_panel(fraud[col], bins, axs[2], f"Histogram of {col} when is fraud")

        plt.tight_layout()
        plt.show(block=True)
        # Release the figure so looping over many columns does not pile up open figures.
        plt.close(fig)

In [None]:
# Plots a boxplot for every feature in a dataset
def box_plots(df):
    """Show three stacked boxplots per feature: all rows, non-fraud rows, fraud rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``isfraud`` column; rows with value 0 form the
        "not fraud" panel and rows with value 1 the "is fraud" panel.

    NOTE(review): relies on ``plt`` and ``sns`` being in scope; neither is
    imported in the visible notebook cells.
    """
    # Split by class once instead of re-filtering the whole frame for every column.
    not_fraud = df[df['isfraud'] == 0]
    fraud = df[df['isfraud'] == 1]

    for col in df.columns.drop('isfraud'):
        fig, axs = plt.subplots(3, 1, figsize=(8, 12))

        panels = (
            (df[col], f"Boxplot of {col}"),
            (not_fraud[col], f"Boxplot of {col} when not fraud"),
            (fraud[col], f"Boxplot of {col} when is fraud"),
        )
        for ax, (values, title) in zip(axs, panels):
            sns.boxplot(x=values, ax=ax)
            ax.set_title(title)

        plt.tight_layout()
        plt.show(block=True)
        # Release the figure so looping over many columns does not pile up open figures.
        plt.close(fig)

In [16]:
# Returns pearson correlations for a full dataframe, the dataframe filtered for the negative class, and the dataframe filtered for the positive class
# Plots heatmaps of each correlation
def pearson_correlation_heatmaps(df):
    """Compute and plot Pearson correlation matrices overall and per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``isfraud`` column, which is used to split the rows
        (0 = not fraud, 1 = fraud) and is excluded from the correlations.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(full, not_fraud, fraud)`` correlation matrices over the numeric
        and boolean feature columns.

    NOTE(review): relies on ``plt`` and ``sns`` being in scope; neither is
    imported in the visible notebook cells.
    """
    is_fraud = df['isfraud']
    # Restrict to numeric/boolean columns up front: pandas 2.x no longer drops
    # non-numeric columns silently in DataFrame.corr and raises instead.
    features = df.drop('isfraud', axis=1).select_dtypes(include=['number', 'bool'])

    df_full_p = features.corr()
    df_not_fraud_p = features[is_fraud == 0].corr()
    df_fraud_p = features[is_fraud == 1].corr()

    fig, axes = plt.subplots(3, 1, figsize=(10, 15))

    panels = (
        (df_full_p, "Pearson correlation heatmap of full dataframe"),
        (df_not_fraud_p, "Pearson correlation heatmap of dataframe filtered for not fraud"),
        (df_fraud_p, "Pearson correlation heatmap of dataframe filtered for fraud"),
    )
    for ax, (corr, title) in zip(axes, panels):
        sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, annot_kws={'size': 8}, ax=ax)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

    return df_full_p, df_not_fraud_p, df_fraud_p

In [None]:
# Returns spearman correlations for a full dataframe, the dataframe filtered for the negative class, and the dataframe filtered for the positive class
# Plots heatmaps of each correlation
def spearman_correlation_heatmaps(df):
    """Compute and plot Spearman correlation matrices overall and per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``isfraud`` column, which is used to split the rows
        (0 = not fraud, 1 = fraud) and is excluded from the correlations.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(full, not_fraud, fraud)`` correlation matrices over the numeric
        and boolean feature columns.

    NOTE(review): relies on ``plt`` and ``sns`` being in scope; neither is
    imported in the visible notebook cells.
    """
    is_fraud = df['isfraud']
    # Restrict to numeric/boolean columns up front: pandas 2.x no longer drops
    # non-numeric columns silently in DataFrame.corr and raises instead.
    features = df.drop('isfraud', axis=1).select_dtypes(include=['number', 'bool'])

    df_full_s = features.corr(method='spearman')
    df_not_fraud_s = features[is_fraud == 0].corr(method='spearman')
    df_fraud_s = features[is_fraud == 1].corr(method='spearman')

    fig, axes = plt.subplots(3, 1, figsize=(10,15))

    panels = (
        # Title previously read "full datagram"; corrected to match the other panels.
        (df_full_s, "Spearman correlation heatmap of full dataframe"),
        (df_not_fraud_s, "Spearman correlation heatmap of dataframe filtered for not fraud"),
        (df_fraud_s, "Spearman correlation heatmap of dataframe filtered for fraud"),
    )
    for ax, (corr, title) in zip(axes, panels):
        sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, annot_kws={'size': 8}, ax=ax)
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

    return df_full_s, df_not_fraud_s, df_fraud_s

- Can you suggest further improvements to functions/classes within code base?

The score_classification function in data_modeling.py should be updated to accept both predicted class labels and predicted probabilities as arguments. It should pass the predicted class labels to the metrics that use class labels, and the predicted probabilities to the metrics that use probabilities.