In [1]:
import json
from os import listdir
from os.path import isfile, join
import pandas as pd

### 1 | Load Data from File to DataFrame

In [2]:
path = 'data/'
files = [f for f in listdir(path) if isfile(join(path, f))]

In [3]:
def load_file(file):
    
    f = open(path + file, "r")
    if f.mode == 'r':
        files = f.readlines()
    f.close()
    
    return files

In [4]:
# load files to dataframe
df = pd.DataFrame()

for file in files:
    df_loop = pd.read_json(path + files[0], lines=True)
    df = df.append(df_loop)

In [5]:
# replace basket with individual columns
for basket in set.union(*df.basket.apply(set)):
    df[basket] = df.apply(lambda _: int(_.basket.count(basket)), axis=1)
    
df.drop('basket', axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,fraudLabel,totalAmount,transactionId,zipCode,0,1,2,3,4,5
0,0,693,6543306520,8600,1,2,1,0,1,4
1,0,60,4690422808,1948,2,2,0,0,1,1
2,0,142,1686204649,8518,0,2,0,0,0,0
3,0,286,9619883092,1204,3,1,3,0,1,3
4,0,15,9770478572,6487,0,0,0,0,1,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31000 entries, 0 to 999
Data columns (total 10 columns):
fraudLabel       31000 non-null int64
totalAmount      31000 non-null int64
transactionId    31000 non-null int64
zipCode          31000 non-null int64
0                31000 non-null int64
1                31000 non-null int64
2                31000 non-null int64
3                31000 non-null int64
4                31000 non-null int64
5                31000 non-null int64
dtypes: int64(10)
memory usage: 2.6 MB


### 2 | Engineer Features

In [8]:
df.drop('transactionId', axis=1, inplace=True)

In [9]:
from category_encoders.one_hot import OneHotEncoder

In [10]:
df_ohe = df.copy()

In [11]:
df_ohe["zipCode"] = df_ohe["zipCode"].astype('category',categories=[i for i in list(range(1000,10000))])
dummies = pd.get_dummies(df_ohe.zipCode)

  """Entry point for launching an IPython kernel.


In [12]:
df_ohe = pd.concat([df_ohe, dummies], axis=1)
df_ohe.head()

Unnamed: 0,fraudLabel,totalAmount,zipCode,0,1,2,3,4,5,1000,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,693,8600,1,2,1,0,1,4,0,...,0,0,0,0,0,0,0,0,0,0
1,0,60,1948,2,2,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,142,8518,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,286,1204,3,1,3,0,1,3,0,...,0,0,0,0,0,0,0,0,0,0
4,0,15,6487,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
#ohe = OneHotEncoder(cols=['zipCode'])
#df_ohe = ohe.fit_transform(df_ohe)
#df_ohe.head()

### 3 | Train Model

#### 3.1 | Logistic Regression

In [14]:
from sklearn.model_selection import train_test_split
x_train_lr, x_test_lr, y_train_lr, y_test_lr = train_test_split(df_ohe.drop('fraudLabel', axis=1), 
                                                                df_ohe['fraudLabel'], 
                                                                test_size=0.3, 
                                                                random_state=42)

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
log = LogisticRegression(solver='lbfgs', C=1, random_state=42)
log.fit(x_train_lr, y_train_lr)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
log.score(x_test_lr, y_test_lr)

0.979247311827957

In [18]:
y_pred_lr = log.predict(x_test_lr)

#### 3.2 | Gradient Boosted Tree

In [19]:
from sklearn.model_selection import train_test_split
x_train_gbt, x_test_gbt, y_train_gbt, y_test_gbt = train_test_split(df.drop('fraudLabel', axis=1), 
                                                                    df['fraudLabel'], 
                                                                    test_size=0.3, 
                                                                    random_state=42)

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [22]:
gbt = GradientBoostingClassifier()
gbt.fit(x_train_gbt, y_train_gbt)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [23]:
gbt.score(x_test_gbt, y_test_gbt)

0.9944086021505376

In [24]:
y_pred_gbt = gbt.predict(x_test_gbt)

In [25]:
gbt = GradientBoostingClassifier()
gbt.fit(x_train_lr, y_train_lr)

1.0

In [26]:
gbt.score(x_test_lr, y_test_lr)

1.0