In [1]:
import sys
import os

%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import seaborn as sns

In [2]:
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import cross_validate

from sklearn.pipeline import make_pipeline

from sklearn import metrics

In [3]:
import mlflow
# Resolve `../mlruns` (relative to the notebook's working directory) to an
# absolute path and use it as the MLflow tracking URI.
local_tracking_uri = os.path.abspath('../' + 'mlruns')
mlflow.set_tracking_uri(local_tracking_uri)

# Client handle used further down to fetch and tag individual runs. It is
# created after the tracking URI is set — presumably so it picks that URI up.
mlflow_client = mlflow.tracking.MlflowClient()

In [4]:
# Custom modules

import sys
import os
# Make the project's `src` directory importable from this notebook.
ROOT_DIR = os.path.abspath('../')
SRC_DIR = os.path.join(ROOT_DIR, "src")
# Guard the insert so that re-running this cell does not stack duplicate
# entries at the front of sys.path.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Approach

- Based on the EDA results from notebook `01_eda`, the baseline is 55% accuracy, obtained by predicting that every customer is female.
- Given this dataset and a timeline of a few days, a reasonable goal is to build a classifier with 70% accuracy.

# Constants

In [5]:
# CSV inputs read with pd.read_csv in the "Load input" section; the paths are
# relative to the notebook's working directory.
TRAIN_INPUT_PATH = '../data/process/train_agg.csv'
TEST_INPUT_PATH = '../data/process/test_agg.csv'
# Target column: dropped from the feature matrix and label-encoded into y.
LABEL_COL = 'gender_first'

In [6]:
# scikit-learn scorer names; the same list drives both the cross-validation
# and the holdout evaluation.
SCORING = ['roc_auc', 'f1_micro', 'f1_weighted', 'precision', 'recall']

# Load input

In [7]:
# Read the training CSV into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_INPUT_PATH)

In [8]:
# Read the test CSV into a DataFrame.
test_df = pd.read_csv(filepath_or_buffer=TEST_INPUT_PATH)

In [9]:
# Display the training frame as the cell output.
train_df

Unnamed: 0,step_count,step_mean,step_median,age_first,merchant_nunique,category_nunique,amount_mean,amount_median,amount_max,amount_min,amount_std,fraud_sum,fraud_mean,gender_first
0,131,107.786260,106.0,5,14,10,35.091908,26.14,323.64,0.80,36.863536,1,0.007634,M
1,109,124.532110,134.0,4,22,11,90.336239,35.13,3902.93,1.37,381.812802,5,0.045872,M
2,94,51.436170,49.5,4,10,7,38.154894,28.32,326.34,0.99,41.544414,0,0.000000,M
3,30,91.833333,94.5,1,11,6,204.754667,103.68,1260.94,1.05,306.240251,11,0.366667,M
4,131,107.702290,109.0,3,17,12,34.253282,29.94,197.30,0.59,27.820062,0,0.000000,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3275,175,89.891429,91.0,3,14,9,32.555771,26.63,241.58,0.17,30.641718,0,0.000000,M
3276,171,89.005848,89.0,2,11,10,61.211637,26.22,5527.37,0.41,421.233339,2,0.011696,M
3277,177,90.107345,91.0,2,8,6,32.273107,26.20,219.99,0.54,30.122381,0,0.000000,F
3278,142,104.992958,107.5,1,16,8,36.018944,26.00,476.43,0.12,48.619220,2,0.014085,F


# Generate X, y

In [10]:
# Encoder for the label column; it is fitted on the training labels and then
# reused for the test labels in the next cell.
le = LabelEncoder()

In [11]:
# Features are every column except the label. The encoder is fitted on the
# training labels only and then applied unchanged to the test labels.
X_train = train_df.drop(columns=[LABEL_COL])
y_train = le.fit_transform(train_df[LABEL_COL])

X_test = test_df.drop(columns=[LABEL_COL])
y_test = le.transform(test_df[LABEL_COL])

# Transform

In [25]:
# Numeric transformer used as the first step of the pipeline built below.
scaler = StandardScaler()

# Fit

In [26]:
# Keyword arguments for the classifier; the same dict is logged to MLflow as
# run parameters.
clf_params = dict(max_iter=2000)

In [27]:
# Classifier configured entirely from `clf_params`.
clf = LogisticRegression(**clf_params)

In [28]:
# Chain the scaler and the classifier into a single estimator (scaler first).
pipeline = make_pipeline(scaler, clf)

In [29]:
# Fit the full pipeline on the training data; this fitted pipeline is the one
# scored on the holdout set further down.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=2000))])

# Evaluate

## Cross Validation

In [30]:
# 5-fold cross-validation of the whole pipeline on the training data, scoring
# every metric in SCORING and also returning the training-fold scores.
train_score = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring=SCORING,
    cv=5,
    return_train_score=True,
)

In [31]:
# Tabulate the cross-validation result: one row per fold, one column per
# timing or score entry.
train_score_df = pd.DataFrame(data=train_score)
train_score_df

Unnamed: 0,fit_time,score_time,test_roc_auc,train_roc_auc,test_f1_micro,train_f1_micro,test_f1_weighted,train_f1_weighted,test_precision,train_precision,test_recall,train_recall
0,0.0219,0.008599,0.471145,0.54827,0.53811,0.553735,0.41276,0.435556,0.354839,0.518519,0.037415,0.059524
1,0.018854,0.008651,0.510477,0.53756,0.54878,0.55907,0.423026,0.436996,0.464286,0.582609,0.044218,0.056973
2,0.01683,0.007069,0.540957,0.530886,0.551829,0.553354,0.405622,0.409525,0.5,0.543478,0.017007,0.021259
3,0.015446,0.006794,0.503777,0.543858,0.541159,0.550305,0.446438,0.448437,0.442623,0.490196,0.091837,0.085034
4,0.021113,0.008277,0.521094,0.531984,0.54878,0.551448,0.416197,0.417611,0.454545,0.493827,0.034014,0.034014


In [32]:
# Average every column over the folds and keep the result as a plain dict so
# it can be logged to MLflow as metrics.
train_score_agg_dict = (
    train_score_df
    .mean(axis="index")
    .to_dict()
)

In [33]:
# Display the fold-averaged cross-validation timings and scores.
train_score_agg_dict

{'fit_time': 0.018828630447387695,
 'score_time': 0.007877874374389648,
 'test_roc_auc': 0.5094899838388394,
 'train_roc_auc': 0.5385115993911377,
 'test_f1_micro': 0.5457317073170731,
 'train_f1_micro': 0.5535823170731706,
 'test_f1_weighted': 0.4208084904923418,
 'train_f1_weighted': 0.4296248164310878,
 'test_precision': 0.4432585658656521,
 'train_precision': 0.5257257427930914,
 'test_recall': 0.04489795918367347,
 'train_recall': 0.051360544217687064}

## Holdout

In [34]:
# Score the already-fitted pipeline on the holdout set with each metric in
# SCORING; keys are prefixed with "holdout_" to keep them apart from the
# cross-validation metrics.
holdout_score_agg_dict = {
    f"holdout_{metric_name}": metrics.get_scorer(metric_name)(pipeline, X_test, y_test)
    for metric_name in SCORING
}

In [35]:
# Display the holdout scores.
holdout_score_agg_dict

{'holdout_roc_auc': 0.5231769022325603,
 'holdout_f1_micro': 0.5329268292682927,
 'holdout_f1_weighted': 0.39239627148870937,
 'holdout_precision': 0.30434782608695654,
 'holdout_recall': 0.01871657754010695}

### Store run info in MLflow

In [36]:
# Record this experiment in MLflow: run parameters, the fold-averaged
# cross-validation scores, and the holdout scores.
run_params = {
    # NOTE(review): 'fe1' looks like a feature-set identifier — confirm.
    'data': 'fe1',
    # Take the component names from the objects that were actually put into
    # the pipeline, so the logged values cannot drift when a step is swapped.
    'num_tf': type(scaler).__name__,
    'clf': type(clf).__name__,
    **clf_params,
}
with mlflow.start_run():
    mlflow.log_params(run_params)
    mlflow.log_metrics({**train_score_agg_dict, **holdout_score_agg_dict})
    mlflow.set_tag("issue", "7")

# Archive

In [52]:
# List every scorer name scikit-learn accepts for `scoring=`.
# `SCORERS` has been deprecated and removed from newer scikit-learn releases,
# so prefer `get_scorer_names()` and fall back to `SCORERS` on older versions.
try:
    from sklearn.metrics import get_scorer_names
except ImportError:
    from sklearn.metrics import SCORERS
    available_scorers = SCORERS.keys()
else:
    available_scorers = get_scorer_names()
sorted(available_scorers)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

# MLflow

In [167]:
# List the recorded runs (metrics, params and tags) as a table.
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.holdout_precision,metrics.train_precision,metrics.holdout_f1_weighted,metrics.score_time,...,params.clf,params.data,params.max_iter,params.num_tf,params.cv,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.note.content,tags.eval_set
0,cfe84ff3ee5341cc889f79ba61d8acda,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 17:34:04.895000+00:00,2020-07-14 17:34:05.056000+00:00,0.479592,0.556411,0.500156,0.021375,...,LogisticRegression,fe2,2000.0,PowerTransformer,,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,,
1,a1448fafcd924ea385ed2b33e2717a04,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 17:28:26.927000+00:00,2020-07-14 17:28:27.089000+00:00,0.460432,0.544958,0.471899,0.009517,...,LogisticRegression,fe2,2000.0,StandardScaler,,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,,
2,41b94130e16048d88fa703a44f39ff8f,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 16:26:50.776000+00:00,2020-07-14 16:26:50.840000+00:00,,,,,...,,,,,,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,,
3,2b054f5f9ec7457fbbdcc66fb1212b7e,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 16:06:36.456000+00:00,2020-07-14 16:06:36.604000+00:00,,0.2,,0.012698,...,LogisticRegressionCV,fe1,1000.0,PowerTransformer,5.0,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,Ignore results due to a bug at https://github....,train


In [168]:
# Fetch one specific past run by its hard-coded run id (row 0 of the
# search_runs() listing shown in the previous cell's output).
run_obj = mlflow_client.get_run('cfe84ff3ee5341cc889f79ba61d8acda')

In [169]:
# Set the "issue" tag to "9" on the run fetched above.
mlflow_client.set_tag(run_obj.info.run_id, "issue", "9")

In [None]:
# Inspect the fetched run's data (this cell has no recorded execution).
# NOTE(review): run_obj was fetched before the tag was set, so it may not show
# the new "issue" tag — re-fetch the run to confirm.
run_obj.data