### Importing libs

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
import pyspark.sql.functions as F

import sys
sys.path.insert(0, 'models/')

import balance as bal
import cat_boost as cat
import isolation_forest
import parameters as param
import preprocessing as preprocess
import visualization as vis
import statistics as stats
import io_module as io

# This param when defined as True will execute the complete code, so slowly processing time
# because is require to execute all checks and print all not essential functions.
# When defined as False, a fast processing is applied with all core functionalities working
# well.
full_execution = False
verbose_mode = True

all_features = ['ProductId', 'ProductCategory', 'ChannelId', 'Value', 'PricingStrategy', 'Operation', 'PositiveAmount',
                'avg_ps_ChannelId', 'rt_avg_ps_ChannelId', 'avg_ps_ProductCategory', 'rt_avg_ps_ProductCategory',
                'avg_ps_ProductId', 'rt_avg_ps_ProductId']
columns_to_remove = ['CurrencyCode', 'CountryCode', 'BatchId', 'AccountId', 'SubscriptionId',
                     'CustomerId', 'TransactionStartTime', 'Amount']
categorical_features = ['ProductId', 'ProductCategory', 'ChannelId']
numerical_features = ['PositiveAmount', 'Operation', 'Value', 'PricingStrategy']
numerical_features_augmented = ['Value', 'PricingStrategy', 'Operation', 'PositiveAmount', 'avg_ps_ChannelId',
                                'rt_avg_ps_ChannelId', 'avg_ps_ProductCategory', 'rt_avg_ps_ProductCategory',
                                'avg_ps_ProductId', 'rt_avg_ps_ProductId']

label = 'FraudResult'
genuine_label = 'FraudResult==0'
fraud_label = 'FraudResult==1'
categorical_positions = [0, 1, 2, 4];

### Reading Data

In [2]:
# Read Fraud Detection Challenge data
train_data = io.read_spark_data_frame(param.get_file_name('training_data'))
gen_train_data = train_data.filter('FraudResult == 0')
# Create new features and remove the non used features
train_data = preprocess.get_features_augmentation(train_data, gen_train_data)
train_data = train_data.drop(*columns_to_remove)

In [23]:
train_data = io.read_spark_data_frame(param.get_file_name('training_data'))
train_data = train_data.withColumn("Operation",
                           F.when(train_data.Amount > 0, 1).when(train_data.Amount < 0, -1).otherwise(0))
train_data = train_data.withColumn('PositiveAmount', F.abs(train_data['Amount']))
aux = train_data.filter('FraudResult==0').select(['ChannelId', 'PositiveAmount']).groupBy('ChannelId').mean()


In [40]:
item = "ChannelId"
train_aux = train_data.filter('FraudResult==0').select([item, 'PositiveAmount']).groupBy(item).mean()
test_aux = test_data.select([item, 'PositiveAmount']).groupBy(item).mean()

In [111]:
test_aux.toPandas()

Unnamed: 0,ChannelId,avg(PositiveAmount)
0,ChannelId_1,7558.333333
1,ChannelId_5,84226.910299
2,ChannelId_4,2900.0
3,ChannelId_3,9206.837291
4,ChannelId_2,9691.750708


In [49]:
train_aux.toPandas()

Unnamed: 0,ChannelId,avg(PositiveAmount)
0,ChannelId_1,27438.898876
1,ChannelId_5,5060.92271
2,ChannelId_3,8448.058854
3,ChannelId_2,3835.880507


In [121]:
item='ChannelId'
test_aux.join(train_aux, on=[item], how='outer').where(train_aux.isNull()).toPandas()

#test_aux.join(train_aux, on=[item], how='outer').toPandas()

AttributeError: 'DataFrame' object has no attribute 'isNull'

In [3]:
x = train_data[numerical_features_augmented].toPandas()
y = train_data.select(label).toPandas()

In [4]:
x = np.array(x)
y = np.array(y.iloc[:,0])

### Isolation Forest

In [57]:
import pandas as pd
#x_data, y_data = bal.balance_using_smotenc(train_data, all_features, label, categorical_positions)
x_data = pd.read_csv('../data/x_data_smotenc.csv')
y_data = pd.read_csv('../data/y_data_smotenc.csv')

x = np.array(x_data.iloc[:,:])
y = np.array(y_data.iloc[:,0])

In [17]:
x = train_data[numerical_features_augmented].toPandas()

In [18]:
import sklearn.ensemble as ens
isolation = ens.IsolationForest(behaviour='new', max_samples=500,
                                         random_state=42, contamination=0.003)
#isolation.fit(x[:,3:])
isolation.fit(x)

IsolationForest(behaviour='new', bootstrap=False, contamination=0.003,
                max_features=1.0, max_samples=500, n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

In [19]:
#isolation_pred = isolation.predict(x[:,3:])
isolation.predict(x)
#new_x = np.column_stack([x[:,3:], isolation_pred])

array([1, 1, 1, ..., 1, 1, 1])

### CatBoost

In [5]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(max_depth=8, iterations=300)
model.fit(x, y);

Learning rate set to 0.178057
0:	learn: 0.1769156	total: 76.2ms	remaining: 22.8s
1:	learn: 0.0509308	total: 100ms	remaining: 15s
2:	learn: 0.0181710	total: 145ms	remaining: 14.4s
3:	learn: 0.0084218	total: 174ms	remaining: 12.9s
4:	learn: 0.0048271	total: 198ms	remaining: 11.7s
5:	learn: 0.0032907	total: 225ms	remaining: 11s
6:	learn: 0.0025729	total: 240ms	remaining: 10s
7:	learn: 0.0022306	total: 254ms	remaining: 9.27s
8:	learn: 0.0020166	total: 271ms	remaining: 8.76s
9:	learn: 0.0019141	total: 285ms	remaining: 8.28s
10:	learn: 0.0018603	total: 300ms	remaining: 7.87s
11:	learn: 0.0017527	total: 314ms	remaining: 7.54s
12:	learn: 0.0016876	total: 329ms	remaining: 7.25s
13:	learn: 0.0016371	total: 343ms	remaining: 7s
14:	learn: 0.0016058	total: 357ms	remaining: 6.78s
15:	learn: 0.0015835	total: 370ms	remaining: 6.57s
16:	learn: 0.0015623	total: 384ms	remaining: 6.4s
17:	learn: 0.0015427	total: 398ms	remaining: 6.24s
18:	learn: 0.0015298	total: 424ms	remaining: 6.26s
19:	learn: 0.0015225

164:	learn: 0.0012856	total: 2.54s	remaining: 2.08s
165:	learn: 0.0012855	total: 2.56s	remaining: 2.07s
166:	learn: 0.0012855	total: 2.57s	remaining: 2.05s
167:	learn: 0.0012854	total: 2.58s	remaining: 2.03s
168:	learn: 0.0012854	total: 2.6s	remaining: 2.01s
169:	learn: 0.0012853	total: 2.61s	remaining: 2s
170:	learn: 0.0012853	total: 2.62s	remaining: 1.98s
171:	learn: 0.0012852	total: 2.63s	remaining: 1.96s
172:	learn: 0.0012852	total: 2.65s	remaining: 1.94s
173:	learn: 0.0012851	total: 2.66s	remaining: 1.92s
174:	learn: 0.0012851	total: 2.67s	remaining: 1.91s
175:	learn: 0.0012850	total: 2.68s	remaining: 1.89s
176:	learn: 0.0012850	total: 2.69s	remaining: 1.87s
177:	learn: 0.0012850	total: 2.71s	remaining: 1.85s
178:	learn: 0.0012849	total: 2.72s	remaining: 1.84s
179:	learn: 0.0012849	total: 2.73s	remaining: 1.82s
180:	learn: 0.0012849	total: 2.75s	remaining: 1.81s
181:	learn: 0.0012848	total: 2.77s	remaining: 1.79s
182:	learn: 0.0012848	total: 2.78s	remaining: 1.77s
183:	learn: 0.00

### Fixing feature augmentation

In [20]:
# Evaluating with test data
test_data = io.read_spark_data_frame(param.get_file_name('testing_data'))
transactions_list = [item for item in test_data.toPandas()['TransactionId']]
#test_data = np.array(test_data[all_features].toPandas())

In [21]:
test_data = preprocess.get_features_augmentation(test_data, gen_train_data)
t_data = test_data[numerical_features_augmented].toPandas()

AnalysisException: "cannot resolve '`FraudResult`' given input columns: [PositiveAmount, Operation, TransactionStartTime, PricingStrategy, ProductCategory, TransactionId, ChannelId, CountryCode, CustomerId, AccountId, CurrencyCode, SubscriptionId, ProviderId, BatchId, Amount, ProductId, Value]; line 1 pos 0;\n'Filter ('FraudResult = 0)\n+- Project [TransactionId#717, BatchId#718, AccountId#719, SubscriptionId#720, CustomerId#721, CurrencyCode#722, CountryCode#723L, ProviderId#724, ProductId#725, ProductCategory#726, ChannelId#727, Amount#728, Value#729L, TransactionStartTime#730, PricingStrategy#731L, Operation#747, abs(Amount#728) AS PositiveAmount#764]\n   +- Project [TransactionId#717, BatchId#718, AccountId#719, SubscriptionId#720, CustomerId#721, CurrencyCode#722, CountryCode#723L, ProviderId#724, ProductId#725, ProductCategory#726, ChannelId#727, Amount#728, Value#729L, TransactionStartTime#730, PricingStrategy#731L, CASE WHEN (Amount#728 > cast(0 as double)) THEN 1 WHEN (Amount#728 < cast(0 as double)) THEN -1 ELSE 0 END AS Operation#747]\n      +- LogicalRDD [TransactionId#717, BatchId#718, AccountId#719, SubscriptionId#720, CustomerId#721, CurrencyCode#722, CountryCode#723L, ProviderId#724, ProductId#725, ProductCategory#726, ChannelId#727, Amount#728, Value#729L, TransactionStartTime#730, PricingStrategy#731L], false\n"

In [171]:
t_data = np.array(test_data[numerical_features_augmented].toPandas())
isolation_pred = isolation.predict(t_data)
t_data = np.column_stack([t_data, isolation_pred])

In [30]:
predictions = model.predict(t_data)

NameError: name 't_data' is not defined

In [176]:
data = (predictions+1) % 2           
io.save_predictions_xente('../data/predictions_000.txt', transactions_list, data)