### Importing libs

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import pyspark.sql.functions as F

import sys
sys.path.insert(0, 'models/')

import balance as bal
import cat_boost as cat
import isolation_forest
import parameters as param
import preprocessing as preprocess
import visualization as vis
import statistics as stats
import io_module as io

# This param when defined as True will execute the complete code, so slowly processing time
# because is require to execute all checks and print all not essential functions.
# When defined as False, a fast processing is applied with all core functionalities working
# well.
full_execution = False
verbose_mode = True

label = 'FraudResult'
genuine_label = 'FraudResult==0'
fraud_label = 'FraudResult==1'
categorical_positions = [0, 1, 2, 4];

### Reading Data

In [2]:
columns_to_remove = ['Amount','TransactionStartTime','BatchId','SubscriptionId','TransactionId','AccountId','CustomerId','ProviderId','ProductCategory','CurrencyCode','CountryCode']

In [3]:
# Read Fraud Detection Challenge data
train_data = io.read_spark_data_frame(param.get_file_name('training_data'))

# Create new features and remove the non used features
train_data = preprocess.get_features_augmentation(train_data)
train_data = train_data.drop(*columns_to_remove)

In [4]:
numerical_features_augmented = ['PricingStrategy','Value','Operation', 'PositiveAmount',
                            'Hour', 'DayOfYear', 'WeekOfYear', 'Month',
                            'avg_ps_Hour','min_ps_Hour','max_ps_Hour','rt_avg_ps_Hour',
                            'avg_ps_DayOfYear','min_ps_DayOfYear','max_ps_DayOfYear','rt_avg_ps_DayOfYear',
                            'avg_ps_WeekOfYear','min_ps_WeekOfYear','max_ps_WeekOfYear','rt_avg_ps_WeekOfYear',#0-21
                            'avg_ps_Month','min_ps_Month','max_ps_Month','rt_avg_ps_Month',
                            'avg_ps_AccountId','min_ps_AccountId','max_ps_AccountId','rt_avg_ps_AccountId',
                            'avg_ps_ChannelId','min_ps_ChannelId','max_ps_ChannelId','rt_avg_ps_ChannelId',
                            'avg_ps_ProductCategory','min_ps_ProductCategory','max_ps_ProductCategory','rt_avg_ps_ProductCategory',
                            'avg_ps_ProductId','min_ps_ProductId','max_ps_ProductId','rt_avg_ps_ProductId',#41
                            'pi_Hour','pi_DayOfYear','pi_WeekOfYear','pi_Month',
                            'pi_AccountId','pi_ChannelId','pi_ProductCategory','pi_ProductId',
                            'Ps_pr_dayWeek_pr_Month','Ps_pr_dayYear_pr_WeekYear','OpCredSum_pr_Month','OpDebtSum_pr_Month']
label = 'FraudResult'

In [5]:
train_data.toPandas().columns

Index(['Month', 'WeekOfYear', 'DayOfYear', 'Hour', 'ProductId', 'ChannelId',
       'Value', 'PricingStrategy', 'FraudResult', 'Operation',
       'PositiveAmount', 'DayOfWeek', 'Ps_pr_dayWeek_pr_Month',
       'Ps_pr_dayYear_pr_WeekYear', 'OpCredSum_pr_Month', 'OpDebtSum_pr_Month',
       'avg_ps_AccountId', 'min_ps_AccountId', 'max_ps_AccountId',
       'pi_AccountId', 'rt_avg_ps_AccountId', 'avg_ps_ChannelId',
       'min_ps_ChannelId', 'max_ps_ChannelId', 'pi_ChannelId',
       'rt_avg_ps_ChannelId', 'avg_ps_ProductCategory',
       'min_ps_ProductCategory', 'max_ps_ProductCategory',
       'pi_ProductCategory', 'rt_avg_ps_ProductCategory', 'avg_ps_ProductId',
       'min_ps_ProductId', 'max_ps_ProductId', 'pi_ProductId',
       'rt_avg_ps_ProductId', 'avg_ps_Hour', 'min_ps_Hour', 'max_ps_Hour',
       'pi_Hour', 'rt_avg_ps_Hour', 'avg_ps_DayOfYear', 'min_ps_DayOfYear',
       'max_ps_DayOfYear', 'pi_DayOfYear', 'rt_avg_ps_DayOfYear',
       'avg_ps_WeekOfYear', 'min_ps_WeekOfYear'

In [6]:
x = train_data[numerical_features_augmented].toPandas()
y = train_data.select(label).toPandas()

In [7]:
x = np.array(x)
y = np.array(y.iloc[:,0])

### Isolation Forest

In [8]:
all_features = ['ChannelId','ProductId','PricingStrategy','Value','Operation', 'PositiveAmount',
                            'Hour', 'DayOfYear', 'WeekOfYear', 'Month',
                            'avg_ps_Hour','min_ps_Hour','max_ps_Hour','rt_avg_ps_Hour',
                            'avg_ps_DayOfYear','min_ps_DayOfYear','max_ps_DayOfYear','rt_avg_ps_DayOfYear',
                            'avg_ps_WeekOfYear','min_ps_WeekOfYear','max_ps_WeekOfYear','rt_avg_ps_WeekOfYear',#0-21
                            'avg_ps_Month','min_ps_Month','max_ps_Month','rt_avg_ps_Month',
                            'avg_ps_AccountId','min_ps_AccountId','max_ps_AccountId','rt_avg_ps_AccountId',
                            'avg_ps_ChannelId','min_ps_ChannelId','max_ps_ChannelId','rt_avg_ps_ChannelId',
                            'avg_ps_ProductCategory','min_ps_ProductCategory','max_ps_ProductCategory','rt_avg_ps_ProductCategory',
                            'avg_ps_ProductId','min_ps_ProductId','max_ps_ProductId','rt_avg_ps_ProductId',#41
                            'pi_Hour','pi_DayOfYear','pi_WeekOfYear','pi_Month',
                            'pi_AccountId','pi_ChannelId','pi_ProductCategory','pi_ProductId',
               'Ps_pr_dayWeek_pr_Month','Ps_pr_dayYear_pr_WeekYear','OpCredSum_pr_Month','OpDebtSum_pr_Month']

In [9]:
categorical_features = ['ProductId', 'ProductCategory', 'ChannelId']

In [10]:
import pandas as pd
x_data, y_data = bal.balance_using_smotenc(train_data, all_features, label, categorical_positions)
#x_data = pd.read_csv('../data/x_data_smotenc.csv')
#y_data = pd.read_csv('../data/y_data_smotenc.csv')

x = np.array(x_data.iloc[:,:])
y = np.array(y_data.iloc[:,0])

In [11]:
import sklearn.ensemble as ens
isolation = ens.IsolationForest(behaviour='new', max_samples=15000,
                                         random_state=42, contamination=0.5)
isolation.fit(x[:,3:])

IsolationForest(behaviour='new', bootstrap=False, contamination=0.5,
                max_features=1.0, max_samples=15000, n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

In [12]:
isolation_pred = isolation.predict(x[:,3:])
new_x = np.column_stack([x[:,3:], isolation_pred])

### CatBoost

In [13]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(max_depth=8, iterations=300)
model.fit(new_x, y);

Learning rate set to 0.216522
0:	learn: 0.2399977	total: 94.7ms	remaining: 28.3s
1:	learn: 0.0837839	total: 140ms	remaining: 20.8s
2:	learn: 0.0352742	total: 183ms	remaining: 18.2s
3:	learn: 0.0175653	total: 244ms	remaining: 18.1s
4:	learn: 0.0111039	total: 290ms	remaining: 17.1s
5:	learn: 0.0078380	total: 331ms	remaining: 16.2s
6:	learn: 0.0068944	total: 369ms	remaining: 15.4s
7:	learn: 0.0057787	total: 413ms	remaining: 15.1s
8:	learn: 0.0051617	total: 463ms	remaining: 15s
9:	learn: 0.0048251	total: 504ms	remaining: 14.6s
10:	learn: 0.0044142	total: 543ms	remaining: 14.3s
11:	learn: 0.0039026	total: 584ms	remaining: 14s
12:	learn: 0.0035306	total: 627ms	remaining: 13.8s
13:	learn: 0.0033302	total: 664ms	remaining: 13.6s
14:	learn: 0.0029920	total: 724ms	remaining: 13.8s
15:	learn: 0.0028615	total: 781ms	remaining: 13.9s
16:	learn: 0.0027328	total: 824ms	remaining: 13.7s
17:	learn: 0.0025683	total: 867ms	remaining: 13.6s
18:	learn: 0.0024186	total: 910ms	remaining: 13.5s
19:	learn: 0.0

161:	learn: 0.0001531	total: 6.61s	remaining: 5.63s
162:	learn: 0.0001522	total: 6.65s	remaining: 5.59s
163:	learn: 0.0001511	total: 6.68s	remaining: 5.54s
164:	learn: 0.0001507	total: 6.72s	remaining: 5.5s
165:	learn: 0.0001501	total: 6.75s	remaining: 5.45s
166:	learn: 0.0001487	total: 6.79s	remaining: 5.41s
167:	learn: 0.0001479	total: 6.84s	remaining: 5.37s
168:	learn: 0.0001471	total: 6.88s	remaining: 5.33s
169:	learn: 0.0001467	total: 6.92s	remaining: 5.29s
170:	learn: 0.0001463	total: 6.95s	remaining: 5.24s
171:	learn: 0.0001459	total: 6.99s	remaining: 5.2s
172:	learn: 0.0001455	total: 7.02s	remaining: 5.16s
173:	learn: 0.0001447	total: 7.07s	remaining: 5.12s
174:	learn: 0.0001445	total: 7.1s	remaining: 5.07s
175:	learn: 0.0001439	total: 7.14s	remaining: 5.03s
176:	learn: 0.0001431	total: 7.18s	remaining: 4.99s
177:	learn: 0.0001427	total: 7.21s	remaining: 4.94s
178:	learn: 0.0001426	total: 7.25s	remaining: 4.9s
179:	learn: 0.0001422	total: 7.29s	remaining: 4.86s
180:	learn: 0.00

### Fixing feature augmentation

In [14]:
# Evaluating with test data
test_data = io.read_spark_data_frame(param.get_file_name('testing_data'))
transactions_list = [item for item in test_data.toPandas()['TransactionId']]
#test_data = np.array(test_data[all_features].toPandas())

In [15]:
test_data = preprocess.get_features_augmentation_test(test_data)
test_data = test_data.drop(*columns_to_remove)

In [16]:
test_data[numerical_features_augmented].toPandas().head(1)

Unnamed: 0,PricingStrategy,Value,Operation,PositiveAmount,Hour,DayOfYear,WeekOfYear,Month,avg_ps_Hour,min_ps_Hour,...,pi_WeekOfYear,pi_Month,pi_AccountId,pi_ChannelId,pi_ProductCategory,pi_ProductId,Ps_pr_dayWeek_pr_Month,Ps_pr_dayYear_pr_WeekYear,OpCredSum_pr_Month,OpDebtSum_pr_Month
0,2,25,-1,25.0,12,61,9,3,7106.418461,6.0,...,2.178888,2.340354,2.002797,2.159123,1.981198,1.999737,1.190476,0.045537,8921.666667,6084.666667


In [17]:
t_data = np.array(test_data[numerical_features_augmented].toPandas())

In [18]:
isolation_pred = isolation.predict(t_data[:,1:])

In [19]:
t_data = np.column_stack([t_data, isolation_pred])

In [20]:
predictions = model.predict(t_data)

In [21]:
data = (predictions+1) % 2           
io.save_predictions_xente('../data/predictions_001.txt', transactions_list, data)