v2

In [1]:
import sys
sys.path.append('../../')

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from modules import machine_learning_utils as mlu

# Disable pandas' row/column truncation so frames are rendered in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# 1.0 Data retrieval

In [3]:
# Load the train and validation pickles from the data lake output folder.
# NOTE: unpickling executes code embedded in the file; only load trusted artifacts.
df, val = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../../data_lake/output/train_eng.pkl",
        "../../data_lake/output/val_eng.pkl",
    )
)

In [4]:
# Columns dropped from both the train and validation frames.
# The id_* run covers id_21..id_38 except id_31, which is not in the list.
cols = (
    ['card1', 'card2', 'card3', 'card5', 'M4']
    + ['id_' + str(n) for n in range(21, 39) if n != 31]
    + ['num_transaction_per_time']
)

In [5]:
# Remove the columns listed in `cols` from the training frame.
df = df.drop(cols, axis=1)

In [6]:
# Remove the same `cols` from the validation frame.
val = val.drop(cols, axis=1)

# 2.0 Data Preparation

In [7]:
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import beta

In [8]:
# Module-level min-max scaler; (0, 1) is MinMaxScaler's default range, spelled out here.
scaler = MinMaxScaler(feature_range=(0, 1))

In [9]:
# Keep only the target and the four features used below.
# .copy() makes the selection an independent frame, so the column assignments
# in later cells do not raise SettingWithCopyWarning or write into a view.
df = df[['isFraud','max_c','max_d','browser_enc','device_info_v4']].copy()

# 3.0 Modeling

In [10]:
def log_beta_scaling(df, a=9, b=1):
    """
    Log-transform, min-max scale and Beta-quantile-map every float64 column.

    For each float64 column the pipeline is:
    ``log1p`` -> ``MinMaxScaler`` to [0, 1] -> ``scipy.stats.beta.ppf(x, a, b)``.
    Columns of any other dtype are returned untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a transformed copy is returned.
    a, b : float, default 9 and 1
        Shape parameters forwarded to ``beta.ppf``. With ``b == 1`` the
        mapping reduces to ``x ** (1 / a)``.

    Returns
    -------
    (pandas.DataFrame, MinMaxScaler)
        The transformed copy and the scaler fitted on the log1p values of
        *all* float64 columns at once (feature order = column order of those
        columns in ``df``). If ``df`` has no float64 column the scaler is
        returned unfitted.

    Notes
    -----
    The previous implementation refitted one shared scaler per column, so the
    scaler handed back (and later pickled) only remembered the last float
    column. MinMaxScaler scales each feature independently, so fitting all
    columns together yields the same transformed values while keeping every
    column's min/max. Consumers of the persisted scaler must pass the same
    float columns, in the same order.
    """
    out = df.copy()  # never write into the caller's frame
    fitted_scaler = MinMaxScaler()

    float_cols = list(out.select_dtypes(include=['float64']).columns)
    if not float_cols:
        return out, fitted_scaler

    logged = np.log1p(out[float_cols])
    unit = fitted_scaler.fit_transform(logged)
    # Round-off can push the scaled maximum marginally above 1 (or the minimum
    # below 0); beta.ppf returns NaN outside [0, 1], so clamp first.
    unit = np.clip(unit, 0.0, 1.0)
    out[float_cols] = beta.ppf(unit, a, b)
    return out, fitted_scaler

In [11]:
def warning_score(df):
    """
    Add a ``warning_score`` column to ``df`` (in place) and return ``df``.

    The score is ``0.8 * max_c + 0.2 * max_d``, then raised by 0.05 where
    ``device_info_v4 == 'other'`` and by 0.1 where ``browser_enc == 'other'``.
    Any resulting score that is >= 1 is replaced with 0.95.
    """
    weights = [0.8, 0.2]
    df['warning_score'] = np.dot(df[['max_c', 'max_d']], weights)

    # Additive adjustments, applied in this order.
    for flag_col, bump in (('device_info_v4', 0.05), ('browser_enc', 0.1)):
        current = df['warning_score']
        is_other = df[flag_col] == 'other'
        df['warning_score'] = current.where(~is_other, current + bump)

    # Scores at or above 1 are set to 0.95; NaN scores are left as they are.
    current = df['warning_score']
    df['warning_score'] = current.where(~(current >= 1), 0.95)
    return df

In [12]:
# Rebind `df` and `scaler` to the values returned by log_beta_scaling;
# `scaler` is the object pickled at the end of the notebook.
df, scaler = log_beta_scaling(df)

In [13]:
# Add the `warning_score` column computed from max_c, max_d and the two 'other' flags.
df = warning_score(df)

In [8]:
# Target: `isFraud` cast to float. Features: every other column of `df`.
y_train = df['isFraud'].astype('float64')
X_train = df.drop('isFraud', axis=1)

In [9]:
# Partition the feature columns by dtype; both lists are passed to mlu.model_training.
# Only int64/float64 count as numerical and only object/bool as categorical,
# so a column of any other dtype (e.g. int32, category) ends up in neither list.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'bool']).columns

In [10]:
# Hyper-parameter grid handed to mlu.model_training: 2 values per key, 16 combinations.
parameters = dict([
    ('model__max_depth', [6, 8]),
    ('model__n_estimators', [500, 800]),
    ('model__learning_rate', [0.05, 0.1]),
    ('model__min_samples_leaf', [3, 5]),
])

In [11]:
# mlu.model_training lives in modules/machine_learning_utils and is not visible here.
# Judging by the printed output it runs a GridSearchCV over `parameters` and reports
# mean train/CV scores and the best parameters — TODO confirm what object it returns.
model = mlu.model_training(X_train,y_train,numerical_cols,categorical_cols,parameters)

GridSearchCV results...
Mean Train Scores: 
[0.87915758 0.89078492 0.8819401  0.90386593 0.92404796 0.92849121
 0.91881546 0.92288487 0.9127528  0.9127528  0.90262    0.90262
 0.94310833 0.94310833 0.92666503 0.92666503]

Mean CV Scores: 
[0.8275819  0.83032306 0.82558813 0.83480718 0.8340601  0.83480755
 0.83405985 0.8338106  0.83505655 0.83505655 0.83244046 0.83244046
 0.84128581 0.84128581 0.83667489 0.83667489]

Best Parameters: 
{'model__learning_rate': 0.1, 'model__max_depth': 8, 'model__min_samples_leaf': 3, 'model__n_estimators': 500}



# 4.0 Model Output

In [14]:
# Persist the scaler returned by log_beta_scaling. Only `scaler` is written here;
# `model` is not saved by this cell.
with open('../../data_lake/output/log_scaler_bi.pkl','wb') as file:
    pickle.dump(scaler, file)