In [15]:
from __future__ import division
import utils
import pandas as pd
import numpy as np
import math
import datetime as dt

# from sklearn.preprocessing import MinMaxScaler
# from sklearn.feature_selection import VarianceThreshold
# from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import  train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline 
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, f1_score, confusion_matrix, precision_score, recall_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import joblib
# import lightgbm as lgb

from lightgbm import LGBMClassifier

from matplotlib import pyplot as plt
%matplotlib inline

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Notebook-wide pandas display settings: two-decimal floats, no truncation.
pd.options.display.float_format = "{:.2f}".format
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', None)  # show every row
try:
    # None is the supported way to disable column-width truncation; the old
    # -1 sentinel is deprecated and rejected by current pandas releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only understand the -1 sentinel.
    pd.set_option('display.max_colwidth', -1)
# np.set_printoptions(threshold=np.nan)  

In [2]:
import logging
# Configure the root logger: INFO and above, prefixed with
# "<timestamp>-<level>-<logger name>: ".
logging.basicConfig(
    format="%(asctime)s-%(levelname)s-%(name)s: %(message)s",
    datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO,
)

# Logger used throughout this notebook (including by FraudModel).
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [3]:
# Probability cut-offs used by FraudModel.predict, highest first:
# score >= thresholds[0] -> 'BLOCK', score >= thresholds[1] -> 'High Risk',
# anything lower -> 'Accept'.
thresholds = [0.93, 0.5]

# Feature columns, in the order they are handed to the samplers and the model.
# categorical_features_pos below indexes into this list, so the order matters.
# Four names per row; the trailing comment gives the index of the row's first entry.
all_cols = [
    'affiliate', 'channelcode', 'local_trans_amt', 'trans_currency',  # 0
    'nonmor', 'payment_method', 'eci', 'card_cvvmatch',  # 4
    'card_avsmatch', 'customer_title', 'website_language', 'brand_continent',  # 8
    'HasTicketInsurance', 'HasInsurance', 'HasBaggage', 'HasBaggage_Sale',  # 12
    'HasAirhelpPlus', 'UsedVoucher', 'HasSP', 'gender',  # 16
    'BookerCity', 'BookerCountry_Name', 'TotalAmountEUR', 'airline',  # 20
    'Consolidator', 'OriginAirportCode', 'OriginCityName', 'OriginCountryName',  # 24
    'OriginWorldPartName', 'OriginRegion', 'OriginSubRegion', 'DestinationAirportCode',  # 28
    'DestinationCityName', 'DestinationCountryName', 'DestinationWorldPartName', 'DestinationRegion',  # 32
    'DestinationSubRegion', 'Supplier', 'DomesticOrInternational', 'Haul',  # 36
    'OneWayOrReturn', 'IsCombinedOneWay', 'IsITFare', 'TotalNumberOfPassengers',  # 40
    'TotalNumberOfAdults', 'TotalNumberOfChildren', 'TotalNumberOfInfants', 'COW_NumberOfSegments',  # 44
    'COW_NumberOfInboundSegments', 'COW_NumberOfOutboundSegments', 'NumberOfTickets', 'FareBaseAmount',  # 48
    'HasSplitPnr', 'DeviceType', 'IsLowCost', 'email_user_id_len',  # 52
    'email_domain', 'dob_month', 'bookerIsTravelAgency', 'successful_attempt_no',  # 56
    'country_ip_flight_MatchScore', 'flight_distance', 'days_to_departure', 'vacation_length',  # 60
    'cities_in_itinerary', 'cabin_class', 'name_len', 'weekend_booking',  # 64
    'booking_daytime', 'bookerAgeBracket', 'email_sanity_score', 'OrderDateBrandTime_DayOfWeek',  # 68
    'ordermonth_sin', 'ordermonth_cos', 'departuremonth_sin', 'departuremonth_cos',  # 72
]

# Name of the target column in the training frame.
LABEL = 'target'

# Positions (into all_cols) of the columns passed as categorical features to
# SMOTENC and LightGBM. The original, partly unsorted, order is kept as-is.
categorical_features_pos = [
    0, 1,
    *range(3, 22),   # 3 .. 21
    *range(23, 43),  # 23 .. 42
    52, 53, 54,
    56, 65, 58, 67, 57, 68, 69, 71,
]

In [30]:
class FraudModel:
    """Binary fraud classifier built around a LightGBM model.

    Offers ``train``, ``predict``, ``serialize`` and ``deserialize``. The class
    relies on the notebook-level names ``all_cols``, ``LABEL``, ``thresholds``,
    ``categorical_features_pos`` and ``logger``.
    """

    # Per-class sample count used when the caller does not pass samples_each_class.
    _DEFAULT_SAMPLES_EACH_CLASS = 80000
    # Share of the good bookings kept when fewer than the default are available.
    _SMALL_DATA_FRACTION = 0.9

    def __init__(self):
        # Fitted classifier; stays None until train() or deserialize() sets it.
        self.clf=None
        # Positions (into all_cols) of the categorical features used by train().
        self.cat_feat_pos=categorical_features_pos

    def __repr__(self):
        return ('This is an instance of class Fraud Model. It has train, predict, serialize and deserialize methods.')

    @staticmethod
    def _fail(message):
        """Log ``message`` as an error and abort with an AssertionError."""
        # Raised explicitly (instead of via `assert`) so the check also runs under `python -O`.
        logger.error(message)
        raise AssertionError(message)

    def _log_class_scores(self, features, y_true, threshold):
        """Log per-class F-score, precision and recall of ``self.clf`` at ``threshold``."""
        fraud_proba = self.clf.predict_proba(features)[:, 1]
        y_pred = (fraud_proba >= threshold).astype(int)
        # average=None yields one value per class: index 0 = good booking, index 1 = fraud.
        p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
        logger.info(f'\n Fraud booking \n --------- \n F-score: {f[1]*100:.2f}%   Precision: {p[1]*100:.2f}%   Recall: {r[1]*100:.2f}%')
        logger.info(f'\n Good booking \n --------- \n F-score: {f[0]*100:.2f}%   Precision: {p[0]*100:.2f}%   Recall: {r[0]*100:.2f}%')

    def train(self, in_df, samples_each_class=None, valid_pct=0.33, test_threshold=0.7, fname=''):
        """Train the model on new data.

        Parameters:
        -----------
        in_df (object): Preprocessed data to train the model on. Must contain the
                        columns in all_cols plus the LABEL column (classes 0 and 1).
        samples_each_class (int): Optional. Number of samples each class should have after
                                  treating class imbalance. Default 80000, but if there are
                                  fewer than 80000 good bookings, 0.9 * number of good bookings.
        valid_pct (float): Fraction of the resampled data held out for validation. Default 0.33.
        test_threshold (float): Prediction threshold used to report the accuracy metrics of
                                the newly trained model. Default 0.7.
        fname (str): Path (ending in filename.pkl) to store the trained model at.
                     If not passed, the model is not saved.

        Returns:
        --------------
        None. The fitted classifier is stored in self.clf.

        Raises:
        --------------
        AssertionError: When the label does not have 2 classes
        AssertionError: When samples_each_class exceeds the number of good bookings
        AssertionError: When sampling does not return the number of samples specified
                        in samples_each_class

        Function call:
        --------------
        model=FraudModel()
        model.train(df, fname='models/lightgbm_v1.1.pkl')

        """

        logger.info("Run this only if you have the chargeback data for this time period available")

        if in_df[LABEL].nunique() != 2:
            self._fail("No chargeback orders available. Cancelling training")

        logger.info('Treating class imbalance')

        # Label-based lookup: class 0 = good bookings, class 1 = fraud bookings.
        class_counts = in_df[LABEL].value_counts()
        n_good = class_counts[0]
        n_fraud = class_counts[1]

        if samples_each_class is None:
            if n_good < self._DEFAULT_SAMPLES_EACH_CLASS:
                samples_each_class = int(n_good * self._SMALL_DATA_FRACTION)
            else:
                samples_each_class = self._DEFAULT_SAMPLES_EACH_CLASS
        elif samples_each_class > n_good:
            self._fail(f"samples_each_class entered by you is more the number of good booking samples. Provide a lower value <= {n_good} or leave it blank for automatic handling.")

        # Under-sampling: shrink the good bookings to the target size. Fraud rows are
        # kept in full unless there are already more of them than the target, in which
        # case they are capped so the over-sampler is never asked to shrink a class.
        undersampler = RandomUnderSampler(
            sampling_strategy={1: min(n_fraud, samples_each_class), 0: samples_each_class},
            random_state=16,
        )
        X, y = undersampler.fit_resample(in_df[all_cols], in_df[LABEL])
        X = pd.DataFrame(X, columns=all_cols)

        # Over-sampling: synthesise fraud rows until both classes reach the target size.
        oversampler = SMOTENC(
            categorical_features=self.cat_feat_pos,
            sampling_strategy={1: samples_each_class, 0: samples_each_class},
            random_state=16,
        )
        X, y = oversampler.fit_resample(X, y)
        X = pd.DataFrame(X, columns=all_cols)

        if X.shape[0] != samples_each_class * 2:
            self._fail("Sampling failed. Cancelling training. Try changing samples_each_class.")

        # Training
        features = X.values
        target = np.array(y)

        # NOTE(review): the split happens after resampling, so the validation set also
        # contains synthetic SMOTENC rows; the reported validation scores may therefore
        # be optimistic compared to untouched data - confirm this is intended.
        X_train, X_valid, y_train, y_valid = train_test_split(
            features, target, test_size=valid_pct, random_state=42, stratify=target)

        logger.info('Training the model now')

        params={'colsample_bytree': 0.8423601710817756,
                 'importance_type': 'gain',
                 'learning_rate': 0.18240968736758725,
                 'max_depth': 1,
                 'min_child_samples': 553,
                 'n_estimators': 800,
                 'num_leaves': 144,
                 'objective': 'binary',
                 'random_state': 99,
                 'reg_alpha': 0.8922237374107099,
                 'reg_lambda': 0.1315059745283662,
                 'scale_pos_weight': 1.0,
                 'subsample': 0.6046368708468386}

        self.clf=LGBMClassifier(**params)

        # NOTE(review): 'f1_score' does not appear to be one of LightGBM's built-in metric
        # names, and newer LightGBM releases take early stopping / verbosity through
        # callbacks instead of these fit() arguments - confirm against the installed version.
        self.clf.fit(X_train, y_train, eval_set=[(X_valid,y_valid)], eval_metric='f1_score', early_stopping_rounds=25,
                        verbose=True, categorical_feature=self.cat_feat_pos)

        # Class-wise accuracy of the new model
        logger.info('Validation data scores:')
        self._log_class_scores(X_valid, y_valid, test_threshold)

        logger.info('Training data scores:')
        self._log_class_scores(X_train, y_train, test_threshold)

        logger.info('Training complete')

        # Serialize
        if fname:
            self.serialize(fname)
        else:
            logger.warning('Storage path not provided for the newly trained model. If you still want to store the model, call the serialize function.')


    def predict(self, in_df):
        """Label every row of ``in_df`` as 'BLOCK', 'High Risk' or 'Accept'.

        Parameters:
        -----------
        in_df (object): Preprocessed data containing at least the columns in all_cols

        Returns:
        -----------
        pandas Series: One label per row, in row order, with a positional (0..n-1)
                       index. A fraud probability >= thresholds[0] gives 'BLOCK',
                       >= thresholds[1] gives 'High Risk', anything lower 'Accept'.

        Raises:
        -----------
        RuntimeError: When no model has been trained or loaded yet
        """
        if self.clf is None:
            raise RuntimeError('No trained model available. Call train() or load one with FraudModel.deserialize() first.')

        X=in_df[all_cols].values

        # Column 1 of predict_proba holds the probability of the fraud class.
        fraud_proba=pd.DataFrame(self.clf.predict_proba(X)).iloc[:,1]
        y_pred=fraud_proba.apply(lambda x: 'BLOCK' if x>=thresholds[0]
                                 else ('High Risk' if x>=thresholds[1]
                                 else 'Accept') )

        return y_pred

    def serialize(self, fname):
        """Save the current model to disk.

        Parameters:
        -----------
        fname (str): Path to store the model at, with filename.pkl at the end

        Function call:
        --------------
        trained_model.serialize('models/lightgbmv1.pkl')

        """
        joblib.dump(self.clf, fname, compress=3)
        logger.info(f'Storing the model to file {fname}')

    @staticmethod
    def deserialize(fname):
        """Load a trained model from disk.

        Parameters:
        -----------
        fname (str): Path of a pickled trained model

        Returns:
        -----------
        Instance object of class FraudModel: This will be an already trained model

        Function call:
        --------------
        loaded_model=FraudModel.deserialize('models/lightgbmv1.pkl')
        loaded_model.predict(df)
        """

        model=FraudModel()
        # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files that come from a trusted source.
        model.clf=joblib.load(fname)
        logger.info(f'Model loaded from file {fname}')
        return model