In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pyspark.sql.functions as F
import pyod
import sys
sys.path.insert(0, 'models/')

import balance as bal
import cat_boost as cat
import isolation_forest
import parameters as param
import preprocessing as preprocess
import visualization as vis
import statistics as stats
import io_module as io

# Execution switches.
# full_execution = True runs the complete notebook, including every check and all
# non-essential output, which is slow. full_execution = False gives a fast run in which
# all core functionality still works.
full_execution = False
verbose_mode = True

# Every feature column handed to the models (two categorical columns first).
all_features = [
    'ProductCategory', 'ChannelId',
    'Value', 'PricingStrategy', 'Operation', 'PositiveAmount',
    'avg_ps_ChannelId', 'rt_avg_ps_ChannelId',
    'avg_ps_ProductCategory', 'rt_avg_ps_ProductCategory',
    'avg_ps_ProductId', 'rt_avg_ps_ProductId',
    'Hour', 'DayOfWeek', 'DayOfYear', 'WeekOfYear', 'Month',
    'Ps_per_dayWk', 'Ps_per_dayYr', 'Op_x_value',
]

# Columns dropped from the Spark frames after feature augmentation.
columns_to_remove = [
    'CurrencyCode', 'CountryCode', 'BatchId', 'AccountId', 'SubscriptionId',
    'ProviderId', 'CustomerId', 'TransactionStartTime', 'Amount', 'ProductId',
]

# NOTE(review): 'ProductId' is listed here and in columns_to_remove -- confirm this is intended.
categorical_features = ['ProductId', 'ProductCategory', 'ChannelId']
numerical_features = ['PositiveAmount', 'Operation', 'Value', 'PricingStrategy']

# Same entries as all_features minus 'ProductCategory' and 'ChannelId', in the same order.
numerical_features_augmented = [
    'Value', 'PricingStrategy', 'Operation', 'PositiveAmount',
    'avg_ps_ChannelId', 'rt_avg_ps_ChannelId',
    'avg_ps_ProductCategory', 'rt_avg_ps_ProductCategory',
    'avg_ps_ProductId', 'rt_avg_ps_ProductId',
    'Hour', 'DayOfWeek', 'DayOfYear', 'WeekOfYear', 'Month',
    'Ps_per_dayWk', 'Ps_per_dayYr', 'Op_x_value',
]

# Target column and the filter expressions that select each class.
label = 'FraudResult'
genuine_label = 'FraudResult==0'
fraud_label = 'FraudResult==1'

# NOTE(review): presumably column indices of categorical inputs for a model;
# verify which feature list these positions refer to.
categorical_positions = [0, 1, 2, 4]

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the Fraud Detection Challenge training split, run the project's feature
# augmentation step on it, then discard the columns named in columns_to_remove.
train_file = param.get_file_name('training_data')
train_raw = io.read_spark_data_frame(train_file)
train_augmented = preprocess.get_features_augmentation(train_raw)
train_data = train_augmented.drop(*columns_to_remove)

In [3]:
# Fraction of training rows that match the fraud filter ('FraudResult==1').
# Passed to the outlier detectors below as their contamination setting.
fraud_rows = train_data.filter(fraud_label).count()
total_rows = train_data.count()
contamination = fraud_rows / total_rows

In [4]:
# Collect features and label with a single Spark job, then split in pandas.
# Two separate toPandas() calls run the Spark plan twice, and Spark gives no
# guarantee that both collections return rows in the same order, so x_train and
# y_train could end up misaligned. Selecting only the needed columns also keeps
# the collected frame as small as possible.
train_pdf = train_data.select(all_features + [label]).toPandas()
x_train = train_pdf[all_features]
y_train = train_pdf[[label]]  # single-column DataFrame, as before

In [5]:
import isolation_forest as iso

# Grid-search the project's IsolationForest wrapper on the numerical features.
# NOTE(review): the two lists are presumably candidate tree counts and candidate
# contamination rates -- confirm against isolation_forest.fit_grid_search.
# The trailing ';' keeps the notebook from echoing the call's return value.
model = iso.IsolationForest()
model.fit_grid_search(
    x_train[numerical_features_augmented],
    y_train,
    [100, 300, 2500],
    [contamination, 0.01, 0.03],
);

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sor

### PyOD models

In [7]:
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sos import SOS
from pyod.models.lscp import LSCP
from pyod.models.cof import COF
from pyod.models.sod import SOD

In [14]:
# Fit PyOD's HBOS detector on the numerical training features, using the
# fraud rate measured on the training data as the contamination setting.
hbos_train_inputs = x_train[numerical_features_augmented]
clf = HBOS(contamination=contamination)
clf.fit(hbos_train_inputs);

In [15]:
# Sum of the detector's predictions on the training features
# (presumably the number of rows flagged as outliers -- confirm label encoding).
train_predictions = clf.predict(x_train[numerical_features_augmented])
np.sum(train_predictions)

189

### Test Experiments

In [17]:
# Load the Fraud Detection Challenge test split.
test_file = param.get_file_name('testing_data')
test_data = io.read_spark_data_frame(test_file)

In [25]:
# Transaction ids of the test rows, collected to pandas.
# Despite the name, this is a one-column pandas DataFrame, not a list.
transaction_id_column = test_data.select('TransactionId')
transactions_list = transaction_id_column.toPandas()

In [18]:
# Prepare the test set with the same steps used for the training set:
# feature augmentation followed by dropping columns_to_remove.
test_data = preprocess.get_features_augmentation(test_data)
# Drop from test_data itself. Dropping from train_data here would silently
# replace the test set with training rows, so every later prediction would be
# made on training data and would not line up with the test transaction ids.
test_data = test_data.drop(*columns_to_remove)

In [19]:
# Collect only the numerical model inputs of the test set into pandas.
test_numeric_columns = test_data.select(numerical_features_augmented)
x_test_data = test_numeric_columns.toPandas()

In [21]:
# Predict on the test features with clf (the HBOS detector fitted earlier in this notebook).
# NOTE(review): predictions are presumably 0/1 outlier flags, since their sum on the
# training set is displayed as a count -- confirm against the PyOD documentation.
y_test_pred = clf.predict(x_test_data)

In [26]:
# Persist the test predictions together with their transaction ids.
predictions_path = '../data/predictions_000.txt'
io.save_predictions_xente(predictions_path, transactions_list, y_test_pred)