# Building a basic model

In [28]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import plotly
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [7]:
REL_DATA_PATH = "../data"
TRAIN_TRANSACTION_PATH=f"{REL_DATA_PATH}/train_transaction.csv"
TRAIN_IDENTITY_PATH=f"{REL_DATA_PATH}/train_identity.csv"
TEST_TRANSACTION_PATH=f"{REL_DATA_PATH}/test_transaction.csv"
TEST_IDENTITY_PATH=f"{REL_DATA_PATH}/test_identity.csv"

In [8]:
train_full = pd.read_csv(TRAIN_TRANSACTION_PATH)

In [9]:
train_full.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
raw_features = ["TransactionAmt","ProductCD","card1","card4", "isFraud"]

In [11]:
train_subset = train_full[raw_features]

In [12]:
train_subset.head()

Unnamed: 0,TransactionAmt,ProductCD,card1,card4,isFraud
0,68.5,W,13926,discover,0
1,29.0,W,2755,mastercard,0
2,59.0,W,4663,visa,0
3,50.0,W,18132,mastercard,0
4,50.0,H,4497,mastercard,0


In [19]:
train_subset = train_subset.dropna(axis=0, how='any')

In [30]:
product_cd_encoder = LabelEncoder()
product_cd_encoder.fit(train_subset['ProductCD'])
product_cd_encoder.classes_

array(['C', 'H', 'R', 'S', 'W'], dtype=object)

In [31]:
train_subset['ProductCD'] = product_cd_encoder.transform(train_subset['ProductCD'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_subset['ProductCD'] = product_cd_encoder.transform(train_subset['ProductCD'])


In [34]:
card4_encoder = LabelEncoder()
train_subset['card4'] = card4_encoder.fit_transform(train_subset['card4'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_subset['card4'] = card4_encoder.fit_transform(train_subset['card4'])


In [36]:
train_subset.dtypes

TransactionAmt    float64
ProductCD           int64
card1               int64
card4               int64
isFraud             int64
dtype: object

In [37]:
train_df, test_df = train_test_split(train_subset,stratify=train_subset['isFraud'].to_numpy())

In [38]:
test_df.groupby('isFraud').count()

Unnamed: 0_level_0,TransactionAmt,ProductCD,card1,card4
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,142085,142085,142085,142085
1,5156,5156,5156,5156


In [39]:
train_df.groupby('isFraud').count()

Unnamed: 0_level_0,TransactionAmt,ProductCD,card1,card4
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,426256,426256,426256,426256
1,15466,15466,15466,15466


In [41]:
d_train = xgb.DMatrix(train_df.drop(['isFraud'], axis=1),label=train_df['isFraud'])

In [42]:
d_test = xgb.DMatrix(test_df.drop(['isFraud'], axis=1),label=test_df['isFraud'])

In [43]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'

In [54]:
eval_list = [(d_test, 'eval'), (d_train, 'train')]


In [55]:
num_round = 10

In [56]:
with open("../neptune.token", 'r') as f:
    neptune_token = f.readline()

In [57]:
neptune_project = "atillek/ieee"

In [58]:
import neptune.new as neptune
from neptune.new.integrations.xgboost import NeptuneCallback

In [59]:
with neptune.init(project=neptune_project, api_token=neptune_token) as run:
    neptune_callback = NeptuneCallback(run=run, log_tree=[0,1,2,3])
    bst = xgb.train(params=param, dtrain=d_train,num_boost_round=num_round, callbacks=[neptune_callback], evals=eval_list)

https://app.neptune.ai/atillek/ieee/e/IEEE-2
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
[0]	eval-auc:0.70285	train-auc:0.69928
[1]	eval-auc:0.70285	train-auc:0.69928
[2]	eval-auc:0.72117	train-auc:0.71840
[3]	eval-auc:0.72535	train-auc:0.72157




[4]	eval-auc:0.72790	train-auc:0.72470
[5]	eval-auc:0.73464	train-auc:0.73216
[6]	eval-auc:0.73528	train-auc:0.73299
[7]	eval-auc:0.73672	train-auc:0.73435
[8]	eval-auc:0.74027	train-auc:0.73633
[9]	eval-auc:0.74370	train-auc:0.73953
Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 111 operations to synchronize with Neptune. Do not kill this process.


All 111 operations synced, thanks for waiting!


Hooray, sseems to work. but we need to start building some standards.