In [0]:
import timeit
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, \
                                    cross_val_score
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import plotly.tools as pt
import plotly.io as pio
import chart_studio as cs
import plotly.graph_objs as go
import seaborn as sns
sns.set('paper')

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
transaction_df = pd.read_csv('gdrive/My Drive/ieee-fraud-detection/train_transaction.csv')

In [91]:
transaction_df.shape

(590540, 394)

In [0]:
cat_cols = []
for column in transaction_df.columns:
  if (transaction_df[column].dtype == np.dtype('O')) or \
  transaction_df[column].value_counts().keys().shape[0] <= 2:
    cat_cols.append(column)

In [0]:
dummies = pd.get_dummies(transaction_df[cat_cols].astype('category'),drop_first=True,dummy_na=True).astype('category')

In [0]:
transaction_df = pd.concat([transaction_df.drop(cat_cols,axis=1), dummies],axis=1)

In [0]:
null_cols = []
for column in transaction_df.columns:
  if transaction_df[column].value_counts().keys().shape[0] == 1:
    null_cols.append(column)

In [0]:
transaction_df = transaction_df.drop(null_cols,axis=1)

In [0]:
transaction_df = transaction_df.rename(columns={'isFraud_1.0':'isFraud'})

In [0]:
card1 = transaction_df.card1.astype('category')

In [0]:
transaction_df = pd.concat([transaction_df.drop('card1',axis=1),card1],axis=1)

In [0]:
data_types = [transaction_df[column].dtype for column in transaction_df.columns]

In [0]:
data_types.reverse()

In [0]:
cat_start_idx = len(data_types) - data_types.index(np.dtype('float64'))

In [0]:
remove_n = 580000
drop_indices = np.random.choice(transaction_df.index,remove_n,replace=False)
transaction_df = transaction_df.drop(drop_indices)

In [203]:
transaction_df.shape

(10540, 537)

Number of NAN per variable barplot

In [9]:
features = transaction_df.isna().sum().keys()
missing = transaction_df.isna().sum().values

fig = go.Figure([go.Bar(x=features, y = missing)])
fig.show()

Class balance barplot

In [204]:
not_fraud_count = transaction_df['isFraud'].value_counts().values[0]
fraud_count = transaction_df['isFraud'].value_counts().values[1]

fig = go.Figure([go.Bar(x=['Fraud','Not Fraud'], y = [fraud_count/transaction_df.shape[0],
                                                      not_fraud_count/transaction_df.shape[0]])])
fig.show()

Train-Test Split

In [0]:
X = np.array(transaction_df.drop(['isFraud','TransactionID'],axis=1))

In [0]:
Y = np.array(transaction_df['isFraud']).reshape((-1,))

In [0]:
del transaction_df

In [0]:
x_train09, x_test09, y_train09, y_test09 = train_test_split(X,Y, test_size = 0.1, stratify = Y)
x_train05, x_test05, y_train05, y_test05 = train_test_split(X,Y, test_size = 0.5, stratify = Y)

Missingness Imputation

In [108]:
x_train09[:,cat_start_idx]
         

array([0, 0, 0, ..., 0, 0, 0], dtype=object)

In [0]:
imputer = SimpleImputer().fit(x_train09)
x_train09 = imputer.transform(x_train09)
x_test09 = imputer.transform(x_test09)

scaler = StandardScaler().fit(x_train09[:,:cat_start_idx])
x_train09 = np.concatenate((scaler.transform(x_train09[:,:cat_start_idx]), x_train09[:,cat_start_idx:]),axis=1)
x_test09 = np.concatenate((scaler.transform(x_test09[:,:cat_start_idx]), x_test09[:,cat_start_idx:]),axis=1)

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 1.],
       ...,
       [0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1.]])

Model Fitting for N-learn = 0.9

In [0]:
### Data capture lists: Train/Test/Cross-Validation Errors and Execution Time

logit_train_ls = []
logit_test_ls = []
logit_auc = []


logit_ridge_train_ls = []
logit_ridge_test_ls = []
logit_ridge_auc = []

logit_lasso_train_ls = []
logit_lasso_test_ls = []
logit_lasso_auc = []

rf_train_ls = []
rf_test_ls = []
rf_auc = []

svm_train_ls = []
svm_test_ls = []
svm_auc = []

logit_time = []
logit_ridge_time = []
logit_lasso_time = []
svm_time = []
rf_time = []

logit_ridge_cv_time = []
logit_lasso_cv_time = []
svm_cv_time = []



In [0]:
sample_weights = compute_sample_weight('balanced',y_train09)

### Logistic Regression

# logit = LogisticRegression(penalty = 'none', class_weight = sample_weights,n_jobs=-1,
#                            solver = 'lbfgs')

# t_start = timeit.default_timer()
# logit.fit(x_train09,y_train09,sample_weight=sample_weights)
# t_end = timeit.default_timer()

# train_preds = logit.predict(x_train09)
# test_preds = logit.predict(x_test09)

# logit_train_ls.append(1-accuracy_score(y_train09,train_preds))
# logit_test_ls.append(1-accuracy_score(y_test09, test_preds))

# logit_auc.append(roc_auc_score(y_test09, test_preds))

# logit_time.append(t_end-t_start)

### Logistic Ridge (L2 Penalty)

param_grid_logit = {'C' : np.linspace(1e-1,1e+3,25)}

ridge = LogisticRegression(penalty = 'l2', solver = 'lbfgs', class_weight = sample_weights,
                           n_jobs=-1)

ridgeCV = GridSearchCV(ridge, param_grid_logit, cv = 10, n_jobs = -1)

t_start = timeit.default_timer()
ridgeCV.fit(x_train09, y_train09)
t_end = timeit.default_timer()

logit_ridge_cv_time.append(t_end-t_start)

ridge = ridgeCV.best_estimator_

t_start = timeit.default_timer()
ridge.fit(x_train09, y_train09,sample_weight=sample_weights)
t_end = timeit.default_timer()

logit_ridge_time.append(t_end-t_start)

train_preds = ridge.predict(x_train09)
test_preds = ridge.predict(x_test09)

logit_ridge_train_ls.append(1-accuracy_score(y_train09, train_preds))
logit_ridge_test_ls.append(1-accuracy_score(y_test09, test_preds))
logit_ridge_auc.append(roc_auc_score(y_test09, test_preds))




In [211]:
# print(logit_train_ls,logit_test_ls,logit_auc,logit_time, '\n')
print(logit_ridge_train_ls, logit_ridge_test_ls, logit_ridge_auc, logit_ridge_time, logit_ridge_cv_time)

[0.1955513388150959] [0.20493358633776093] [0.6463897525844429] [4.018269187999977] [138.60740435899788]
