In [1]:
import sys
import os

%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import seaborn as sns

In [11]:
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import cross_validate

from sklearn.pipeline import make_pipeline

from sklearn import metrics

In [87]:
import mlflow

# Keep experiment tracking local: runs are stored in the `mlruns` directory
# one level above the notebook's working directory.
local_tracking_uri = os.path.abspath(os.path.join('..', 'mlruns'))
mlflow.set_tracking_uri(local_tracking_uri)

# Client handle used further down to fetch and tag individual runs.
mlflow_client = mlflow.tracking.MlflowClient()

In [2]:
# Custom modules
#
# Make the project's `src` directory importable from this notebook.

import sys
import os

# Project root: the parent of the notebook's working directory.
ROOT_DIR = os.path.abspath('../')
SRC_DIR = ROOT_DIR + "/src"

# Guard the insert so that re-running this cell does not stack duplicate
# entries at the front of sys.path.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Approach

- Based on the EDA results from notebook `01_eda`, the baseline is 55% accuracy, obtained by predicting that every customer is female.
- Given this dataset and a timeline of a few days, a reasonable goal is to build a classifier with 70% accuracy (reported below as `f1_micro`, which equals accuracy for single-label classification).

# Constants

In [104]:
# Input CSV files for the train and test splits, relative to the notebook.
TRAIN_INPUT_PATH = "../data/process/train_feature.csv"
TEST_INPUT_PATH = "../data/process/test_feature.csv"

# Name of the target column that is split off from both frames.
LABEL_COL = "gender_first"

In [105]:
# Scorer names used for both the cross-validation and the holdout evaluation.
SCORING = [
    "roc_auc",
    "f1_micro",
    "f1_weighted",
    "precision",
    "recall",
]

# Load input

In [106]:
# Read the training split from the CSV configured in the constants section.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_INPUT_PATH)

In [107]:
# Read the holdout (test) split from the CSV configured in the constants section.
test_df = pd.read_csv(filepath_or_buffer=TEST_INPUT_PATH)

In [108]:
# Show the loaded training frame for a quick visual check of its columns and values.
train_df

Unnamed: 0,step_count,step_mean,step_median,age_first,merchant_nunique,category_nunique,amount_mean,amount_median,amount_max,amount_min,...,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty
0,131,107.786260,106.0,5,14,10,35.091908,26.14,323.64,0.80,...,1,1,1,0,2,1,2,114,0,2
1,109,124.532110,134.0,4,22,11,90.336239,35.13,3902.93,1.37,...,2,2,2,0,0,3,0,65,1,17
2,94,51.436170,49.5,4,10,7,38.154894,28.32,326.34,0.99,...,0,1,1,0,1,0,0,80,0,4
3,30,91.833333,94.5,1,11,6,204.754667,103.68,1260.94,1.05,...,1,0,0,0,0,5,1,0,0,6
4,131,107.702290,109.0,3,17,12,34.253282,29.94,197.30,0.59,...,1,0,4,0,1,2,1,108,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3275,175,89.891429,91.0,3,14,9,32.555771,26.63,241.58,0.17,...,1,0,1,0,0,0,4,153,0,4
3276,171,89.005848,89.0,2,11,10,61.211637,26.22,5527.37,0.41,...,0,0,2,0,0,0,1,155,2,1
3277,177,90.107345,91.0,2,8,6,32.273107,26.20,219.99,0.54,...,0,0,0,0,0,0,0,150,0,1
3278,142,104.992958,107.5,1,16,8,36.018944,26.00,476.43,0.12,...,1,0,4,0,0,1,0,109,0,16


# Generate X, y

In [109]:
# Encoder for the label column; it is fitted on the training labels in the
# next cell and reused as-is for the test labels.
le = LabelEncoder()

In [110]:
# Features are every column except the label.
X_train = train_df.drop(columns=[LABEL_COL])
X_test = test_df.drop(columns=[LABEL_COL])

# The encoder learns its classes from the training labels only and is then
# applied unchanged to the test labels.
y_train = le.fit_transform(train_df[LABEL_COL])
y_test = le.transform(test_df[LABEL_COL])

# Transform

In [153]:
# Feature transform used as the first step of the pipeline built below.
scaler = PowerTransformer()

# Fit

In [154]:
# Keyword arguments for the classifier; also logged to MLflow as run params.
clf_params = dict(max_iter=2000)

In [155]:
# Classifier under evaluation, configured from clf_params.
clf = LogisticRegression(**clf_params)

In [156]:
# Chain the feature transform and the classifier into a single estimator.
pipeline = make_pipeline(scaler, clf)

In [157]:
# Fit on the full training set; this fitted pipeline is the one scored on the
# holdout data further down.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('powertransformer', PowerTransformer()),
                ('logisticregression', LogisticRegression(max_iter=2000))])

# Evaluate

## Cross Validation

In [158]:
# 5-fold cross-validation on the training data. Train-fold scores are kept
# alongside the validation-fold scores so the two can be compared.
train_score = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring=SCORING,
    cv=5,
    return_train_score=True,
)

In [159]:
# Tabulate the cross-validation results: one row per fold, one column per
# timing or metric.
train_score_df = pd.DataFrame.from_dict(train_score)
train_score_df

Unnamed: 0,fit_time,score_time,test_roc_auc,train_roc_auc,test_f1_micro,train_f1_micro,test_f1_weighted,train_f1_weighted,test_precision,train_precision,test_recall,train_recall
0,0.310314,0.020936,0.533074,0.59683,0.541159,0.569741,0.52101,0.543053,0.481675,0.534918,0.312925,0.306122
1,0.360924,0.020022,0.511134,0.603111,0.522866,0.587652,0.503922,0.56288,0.451777,0.568915,0.302721,0.329932
2,0.398944,0.021783,0.520944,0.597163,0.53811,0.58003,0.508953,0.5548,0.473054,0.554252,0.268707,0.321429
3,0.371799,0.022069,0.487879,0.613511,0.515244,0.585747,0.492895,0.566878,0.43617,0.55894,0.278912,0.358844
4,0.361261,0.022066,0.489185,0.60591,0.516768,0.586128,0.490363,0.562143,0.435028,0.565029,0.261905,0.332483


In [160]:
# Average every column over the folds and keep the result as a plain dict
# (later merged with the holdout scores and logged to MLflow).
fold_means = train_score_df.mean(axis="index")
train_score_agg_dict = fold_means.to_dict()

In [161]:
# Show the fold-averaged cross-validation timings and metrics.
train_score_agg_dict

{'fit_time': 0.36064844131469725,
 'score_time': 0.021375179290771484,
 'test_roc_auc': 0.5084432668094863,
 'train_roc_auc': 0.603305051302289,
 'test_f1_micro': 0.5268292682926828,
 'train_f1_micro': 0.5818597560975609,
 'test_f1_weighted': 0.5034283916458067,
 'train_f1_weighted': 0.5579507019468983,
 'test_precision': 0.4555408791970894,
 'train_precision': 0.5564109461769521,
 'test_recall': 0.2850340136054422,
 'train_recall': 0.32976190476190476}

## Holdout

In [162]:
# Score the already-fitted pipeline on the holdout set with the same scorers
# used for cross-validation; keys are prefixed with "holdout_" so they do not
# collide with the CV metric names when both dicts are merged.
holdout_score_agg_dict = {
    f"holdout_{name}": metrics.get_scorer(name)(pipeline, X_test, y_test)
    for name in SCORING
}

In [164]:
# Show the holdout metrics.
holdout_score_agg_dict

{'holdout_roc_auc': 0.5051617467207021,
 'holdout_f1_micro': 0.5341463414634147,
 'holdout_f1_weighted': 0.5001563630983088,
 'holdout_precision': 0.47959183673469385,
 'holdout_recall': 0.25133689839572193}

### Store run info in MLflow

In [165]:
# Record this experiment in MLflow: the pipeline configuration as params, and
# the fold-averaged CV scores plus the holdout scores as metrics.
#
# Step names are read from the pipeline that was actually evaluated instead of
# being typed by hand, so the logged params cannot drift from the code when a
# transformer or classifier is swapped.
num_tf_name = type(pipeline.steps[0][1]).__name__
clf_name = type(pipeline.steps[-1][1]).__name__

with mlflow.start_run():
    # Identifier of the input feature set (hard-coded; update it when the
    # input files change).
    mlflow.log_param('data', 'fe2')
    mlflow.log_param('num_tf', num_tf_name)
    mlflow.log_param('clf', clf_name)
    mlflow.log_params(clf_params)
    mlflow.log_metrics({**train_score_agg_dict, **holdout_score_agg_dict})
    # NOTE(review): presumably the tracker issue this run belongs to — confirm.
    mlflow.set_tag("issue", "9")

# Archive

In [52]:
# List every scorer name that can be used in SCORING.
try:
    # Preferred API: `SCORERS` is deprecated and removed in newer
    # scikit-learn releases in favour of `get_scorer_names()`.
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    # Older scikit-learn releases only expose the scorer registry dict.
    from sklearn.metrics import SCORERS
    scorer_names = SCORERS.keys()
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

# MLflow

In [167]:
# List the recorded runs as a table (one row per run, with metrics, params and tags).
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.holdout_precision,metrics.train_precision,metrics.holdout_f1_weighted,metrics.score_time,...,params.clf,params.data,params.max_iter,params.num_tf,params.cv,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.note.content,tags.eval_set
0,cfe84ff3ee5341cc889f79ba61d8acda,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 17:34:04.895000+00:00,2020-07-14 17:34:05.056000+00:00,0.479592,0.556411,0.500156,0.021375,...,LogisticRegression,fe2,2000.0,PowerTransformer,,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,,
1,a1448fafcd924ea385ed2b33e2717a04,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 17:28:26.927000+00:00,2020-07-14 17:28:27.089000+00:00,0.460432,0.544958,0.471899,0.009517,...,LogisticRegression,fe2,2000.0,StandardScaler,,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,,
2,41b94130e16048d88fa703a44f39ff8f,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 16:26:50.776000+00:00,2020-07-14 16:26:50.840000+00:00,,,,,...,,,,,,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,,
3,2b054f5f9ec7457fbbdcc66fb1212b7e,0,FINISHED,/home/quydv1/frostmourne/lich/banksim/mlruns/0...,2020-07-14 16:06:36.456000+00:00,2020-07-14 16:06:36.604000+00:00,,0.2,,0.012698,...,LogisticRegressionCV,fe1,1000.0,PowerTransformer,5.0,/home/quydv1/anaconda3/envs/banksim/lib/python...,LOCAL,quydv1,Ignore results due to a bug at https://github....,train


In [168]:
# Fetch a single run by its id (copied by hand from the run table above).
run_obj = mlflow_client.get_run('cfe84ff3ee5341cc889f79ba61d8acda')

In [169]:
# Set the "issue" tag on that run through the client, after the run has finished.
mlflow_client.set_tag(run_obj.info.run_id, "issue", "9")

In [None]:
# Inspect the data stored on the fetched run object.
run_obj.data