### Importing libs

In [32]:
import numpy as np
from sklearn.model_selection import train_test_split
import pyspark.sql.functions as F

import sys
sys.path.insert(0, 'models/')

import balance as bal
import cat_boost as cat
import isolation_forest
import parameters as param
import preprocessing as preprocess
import visualization as vis
import statistics as stats
import io_module as io

# This param when defined as True will execute the complete code, so slowly processing time
# because is require to execute all checks and print all not essential functions.
# When defined as False, a fast processing is applied with all core functionalities working
# well.
full_execution = False
verbose_mode = True

all_features = ['ProductId', 'ProductCategory', 'ChannelId', 'Value', 'PricingStrategy', 'Operation', 'PositiveAmount',
                'avg_ps_ChannelId', 'rt_avg_ps_ChannelId', 'avg_ps_ProductCategory', 'rt_avg_ps_ProductCategory',
                'avg_ps_ProductId', 'rt_avg_ps_ProductId']
columns_to_remove = ['CurrencyCode', 'CountryCode', 'BatchId', 'AccountId', 'SubscriptionId',
                     'CustomerId', 'TransactionStartTime', 'Amount']
categorical_features = ['ProductId', 'ProductCategory', 'ChannelId']
numerical_features = ['PositiveAmount', 'Operation', 'Value', 'PricingStrategy']
numerical_features_augmented = ['Value', 'PricingStrategy', 'Operation', 'PositiveAmount', 'avg_ps_ChannelId',
                                'rt_avg_ps_ChannelId', 'avg_ps_ProductCategory', 'rt_avg_ps_ProductCategory',
                                'avg_ps_ProductId', 'rt_avg_ps_ProductId']

label = 'FraudResult'
genuine_label = 'FraudResult==0'
fraud_label = 'FraudResult==1'
categorical_positions = [0, 1, 2, 4];

### Reading Data

In [2]:
# Read Fraud Detection Challenge data
train_data = io.read_spark_data_frame(param.get_file_name('training_data'))

# Create new features and remove the non used features
train_data = preprocess.get_features_augmentation(train_data)
train_data = train_data.drop(*columns_to_remove)

In [16]:
x = train_data[numerical_features_augmented].toPandas()
y = train_data.select(label).toPandas()

In [28]:
x = np.array(x)
y = np.array(y.iloc[:,0])

### Isolation Forest

In [57]:
import pandas as pd
#x_data, y_data = bal.balance_using_smotenc(train_data, all_features, label, categorical_positions)
x_data = pd.read_csv('../data/x_data_smotenc.csv')
y_data = pd.read_csv('../data/y_data_smotenc.csv')

x = np.array(x_data.iloc[:,:])
y = np.array(y_data.iloc[:,0])

In [136]:
import sklearn.ensemble as ens
isolation = ens.IsolationForest(behaviour='new', max_samples=15000,
                                         random_state=42, contamination=0.5)
isolation.fit(x[:,3:])

IsolationForest(behaviour='new', bootstrap=False, contamination=0.5,
                max_features=1.0, max_samples=15000, n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

In [164]:
isolation_pred = isolation.predict(x[:,3:])
new_x = np.column_stack([x[:,3:], isolation_pred])

### CatBoost

In [29]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(max_depth=8, iterations=300)
model.fit(x, y);

Learning rate set to 0.178057
0:	learn: 0.1769156	total: 76.2ms	remaining: 22.8s
1:	learn: 0.0509308	total: 96.1ms	remaining: 14.3s
2:	learn: 0.0181710	total: 111ms	remaining: 11s
3:	learn: 0.0084218	total: 126ms	remaining: 9.3s
4:	learn: 0.0048271	total: 140ms	remaining: 8.24s
5:	learn: 0.0032907	total: 154ms	remaining: 7.54s
6:	learn: 0.0025729	total: 168ms	remaining: 7.02s
7:	learn: 0.0022306	total: 183ms	remaining: 6.67s
8:	learn: 0.0020166	total: 206ms	remaining: 6.65s
9:	learn: 0.0019141	total: 227ms	remaining: 6.59s
10:	learn: 0.0018603	total: 241ms	remaining: 6.34s
11:	learn: 0.0017527	total: 256ms	remaining: 6.14s
12:	learn: 0.0016876	total: 270ms	remaining: 5.96s
13:	learn: 0.0016371	total: 284ms	remaining: 5.81s
14:	learn: 0.0016058	total: 298ms	remaining: 5.67s
15:	learn: 0.0015835	total: 312ms	remaining: 5.53s
16:	learn: 0.0015623	total: 326ms	remaining: 5.43s
17:	learn: 0.0015427	total: 340ms	remaining: 5.33s
18:	learn: 0.0015298	total: 355ms	remaining: 5.25s
19:	learn: 0

165:	learn: 0.0012855	total: 2.74s	remaining: 2.21s
166:	learn: 0.0012855	total: 2.75s	remaining: 2.19s
167:	learn: 0.0012854	total: 2.77s	remaining: 2.18s
168:	learn: 0.0012854	total: 2.78s	remaining: 2.16s
169:	learn: 0.0012853	total: 2.8s	remaining: 2.14s
170:	learn: 0.0012853	total: 2.81s	remaining: 2.12s
171:	learn: 0.0012852	total: 2.82s	remaining: 2.1s
172:	learn: 0.0012852	total: 2.84s	remaining: 2.08s
173:	learn: 0.0012851	total: 2.85s	remaining: 2.06s
174:	learn: 0.0012851	total: 2.86s	remaining: 2.04s
175:	learn: 0.0012850	total: 2.88s	remaining: 2.03s
176:	learn: 0.0012850	total: 2.89s	remaining: 2.01s
177:	learn: 0.0012850	total: 2.9s	remaining: 1.99s
178:	learn: 0.0012849	total: 2.91s	remaining: 1.97s
179:	learn: 0.0012849	total: 2.93s	remaining: 1.95s
180:	learn: 0.0012849	total: 2.94s	remaining: 1.93s
181:	learn: 0.0012848	total: 2.96s	remaining: 1.92s
182:	learn: 0.0012848	total: 2.97s	remaining: 1.9s
183:	learn: 0.0012848	total: 2.98s	remaining: 1.88s
184:	learn: 0.00

### Fixing feature augmentation

In [34]:
# Evaluating with test data
test_data = io.read_spark_data_frame(param.get_file_name('testing_data'))
transactions_list = [item for item in test_data.toPandas()['TransactionId']]
#test_data = np.array(test_data[all_features].toPandas())

In [35]:
test_data = preprocess.get_features_augmentation(test_data)

AnalysisException: "cannot resolve '`FraudResult`' given input columns: [ChannelId, TransactionStartTime, AccountId, CustomerId, CurrencyCode, BatchId, CountryCode, Operation, PositiveAmount, Value, ProductId, Amount, TransactionId, SubscriptionId, ProductCategory, PricingStrategy, ProviderId]; line 1 pos 0;\n'Filter ('FraudResult = 0)\n+- Project [TransactionId#816, BatchId#817, AccountId#818, SubscriptionId#819, CustomerId#820, CurrencyCode#821, CountryCode#822L, ProviderId#823, ProductId#824, ProductCategory#825, ChannelId#826, Amount#827, Value#828L, TransactionStartTime#829, PricingStrategy#830L, Operation#846, abs(Amount#827) AS PositiveAmount#863]\n   +- Project [TransactionId#816, BatchId#817, AccountId#818, SubscriptionId#819, CustomerId#820, CurrencyCode#821, CountryCode#822L, ProviderId#823, ProductId#824, ProductCategory#825, ChannelId#826, Amount#827, Value#828L, TransactionStartTime#829, PricingStrategy#830L, CASE WHEN (Amount#827 > cast(0 as double)) THEN 1 WHEN (Amount#827 < cast(0 as double)) THEN -1 ELSE 0 END AS Operation#846]\n      +- LogicalRDD [TransactionId#816, BatchId#817, AccountId#818, SubscriptionId#819, CustomerId#820, CurrencyCode#821, CountryCode#822L, ProviderId#823, ProductId#824, ProductCategory#825, ChannelId#826, Amount#827, Value#828L, TransactionStartTime#829, PricingStrategy#830L], false\n"

In [171]:
t_data = np.array(test_data[numerical_features_augmented].toPandas())
isolation_pred = isolation.predict(t_data)
t_data = np.column_stack([t_data, isolation_pred])

In [30]:
predictions = model.predict(t_data)

NameError: name 't_data' is not defined

In [176]:
data = (predictions+1) % 2           
io.save_predictions_xente('../data/predictions_000.txt', transactions_list, data)