In [6]:
import numpy as np
import pandas as pd

In [8]:
#Based on this great kernel https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

def reduce_mem_usage(df):
    start_mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in df.columns:
        if df[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",df[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = df[col].max()
            mn = df[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all(): 
                NAlist.append(col)
                df[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = df[col].fillna(0).astype(np.int64)
            result = (df[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                df[col] = df[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",df[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

In [44]:
# Import train and test data 
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
train

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.0930,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.6910,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.3890,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.3560,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.9250,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,train_199995,0,11.4880,-0.4956,8.2622,3.5142,10.3404,11.6081,5.6709,15.1516,...,6.1415,13.2305,3.9901,0.9388,18.0249,-1.7939,2.1661,8.5326,16.6660,-17.8661
199996,train_199996,0,4.9149,-2.4484,16.7052,6.6345,8.3096,-10.5628,5.8802,21.5940,...,4.9611,4.6549,0.6998,1.8341,22.2717,1.7337,-2.1651,6.7419,15.9054,0.3388
199997,train_199997,0,11.2232,-5.0518,10.5127,5.6456,9.3410,-5.4086,4.5555,21.5571,...,4.0651,5.4414,3.1032,4.8793,23.5311,-1.5736,1.2832,8.7155,13.8329,4.1995
199998,train_199998,0,9.7148,-8.6098,13.6104,5.7930,12.5173,0.5339,6.0479,17.0152,...,2.6840,8.6587,2.7337,11.1178,20.4158,-0.0786,6.7980,10.0342,15.5289,-13.9001


In [10]:
# reducing the size of train data 

train, NAlist = reduce_mem_usage(train)

Memory usage of properties dataframe is : 308.2276153564453  MB
******************************
Column:  target
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  var_0
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_1
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_2
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_3
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_4
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_5
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_6
dtype before:  float64
dtype after:  fl

dtype after:  float32
******************************
******************************
Column:  var_67
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_68
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_69
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_70
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_71
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_72
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_73
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_74
dtype before:  float64
dtype after:  fl

dtype after:  float32
******************************
******************************
Column:  var_134
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_135
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_136
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_137
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_138
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_139
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_140
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_141
dtype before:  float64
dtype af

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float32(200), object(1), uint8(1)
memory usage: 154.3+ MB


In [12]:
# reducing the size of train data 


test, NAlist = reduce_mem_usage(test)

Memory usage of properties dataframe is : 306.7017364501953  MB
******************************
Column:  var_0
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_1
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_2
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_3
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_4
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_5
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_6
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_7
dtype before:  float64
dtype after: 

dtype after:  float32
******************************
******************************
Column:  var_67
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_68
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_69
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_70
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_71
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_72
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_73
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_74
dtype before:  float64
dtype after:  fl

dtype after:  float32
******************************
******************************
Column:  var_134
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_135
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_136
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_137
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_138
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_139
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_140
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_141
dtype before:  float64
dtype af

In [13]:
# Dropped the ID_code from train and test Data 

train.drop(['ID_code'],inplace=True, axis=1)
test.drop(['ID_code'],inplace=True, axis=1)

In [14]:
# Creating X and Y from train dataset

X = train.values[:, 1:]
y = train.values[:, 0]

In [15]:
# Using stratified shuffle split target value splitted in 20% ratio with split of train and test

from sklearn.model_selection import StratifiedShuffleSplit
split= StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
for train_index, test_index in split.split(train, train['target']):
    strat_train_set = train.loc[train_index]
    strat_test_set = train.loc[test_index]

In [16]:
# Creating train and test data 

X_train = strat_train_set.values[:, 1:]
Y_train = strat_train_set.values[:, 0]
X_test = strat_test_set.values[:,1:]
Y_test = strat_test_set.values[:,0]

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

## Logistic Regression

In [18]:
# Passing the Standard Scaler and Algorithms to pipeline 

pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(random_state=1))

# Fitting the X_train and Y_train with pipeline
pipe_lr.fit(X_train, Y_train)
Y_pred_lr = pipe_lr.predict(X_test)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, Y_test))

Test Accuracy: 0.914


In [19]:
# Confusion Matrix 
confmat = confusion_matrix(y_true=Y_test, y_pred=Y_pred_lr)
print(confmat)

[[35497   483]
 [ 2946  1074]]


In [20]:
print('ROC_AUC_Score: %.3f' %roc_auc_score(Y_test, Y_pred_lr))
print('Precision: %.3f' % precision_score(y_true=Y_test, y_pred=Y_pred_lr))
print('Recall: %.3f' % recall_score(y_true=Y_test, y_pred=Y_pred_lr))
print('f1: %.3f' % f1_score(y_true=Y_test, y_pred=Y_pred_lr))

ROC_AUC_Score: 0.627
Precision: 0.690
Recall: 0.267
f1: 0.385


In [55]:
# Creating Pipeline for different Algorithms 

pipe_dt = make_pipeline(StandardScaler(),DecisionTreeClassifier()) 
pipe_knn = make_pipeline(StandardScaler(),KNeighborsClassifier()) 
pipe_svc = make_pipeline(StandardScaler(),SVC()) 

### Decision Tree 

In [56]:
pipe_dt.fit(X_train, Y_train)
Y_pred_dt = pipe_lr.predict(X_test)
print('Test Accuracy: %.3f' % pipe_dt.score(X_test, Y_test))

Test Accuracy: 0.836


In [57]:
confmat = confusion_matrix(Y_test,Y_pred_dt)
print(confmat)

[[28173  7807]
 [  857  3163]]


In [58]:
print('ROC_AUC_Score: %.3f' %roc_auc_score(Y_test, Y_pred_dt))
print('Precision: %.3f' % precision_score(Y_test, Y_pred_dt))
print('Recall: %.3f' % recall_score(Y_test, Y_pred_dt))
print('f1: %.3f' % f1_score(Y_test, Y_pred_dt))

ROC_AUC_Score: 0.785
Precision: 0.288
Recall: 0.787
f1: 0.422


### KNN

In [59]:
pipe_knn.fit(X_train, Y_train)
Y_pred_knn = pipe_lr.predict(X_test)
print('Test Accuracy: %.3f' % pipe_knn.score(X_test, Y_test))

Test Accuracy: 0.899


In [60]:
confmat = confusion_matrix(Y_test,Y_pred_knn)
print(confmat)

[[28173  7807]
 [  857  3163]]


In [61]:
print('ROC_AUC_Score: %.3f' %roc_auc_score(Y_test, Y_pred_knn))
print('Precision: %.3f' % precision_score(Y_test, Y_pred_knn))
print('Recall: %.3f' % recall_score(Y_test, Y_pred_knn))
print('f1: %.3f' % f1_score(Y_test, Y_pred_knn))

ROC_AUC_Score: 0.785
Precision: 0.288
Recall: 0.787
f1: 0.422


### SVC 

In [62]:
pipe_svc.fit(X_train, Y_train)
Y_pred_svc = pipe_svc.predict(X_test)
print('Test Accuracy: %.3f' % pipe_svc.score(X_test, Y_test))

Test Accuracy: 0.916


In [63]:
confmat = confusion_matrix(Y_test,Y_pred_svc)
print(confmat)

[[35695   285]
 [ 3075   945]]


In [64]:
print('ROC_AUC_Score: %.3f' %roc_auc_score(Y_test, Y_pred_svc))
print('Precision: %.3f' % precision_score(Y_test, Y_pred_svc))
print('Recall: %.3f' % recall_score(Y_test, Y_pred_svc))
print('f1: %.3f' % f1_score(Y_test, Y_pred_svc))

ROC_AUC_Score: 0.614
Precision: 0.768
Recall: 0.235
f1: 0.360


## Resampling Techinique

In [21]:
from sklearn.utils import resample


In [22]:
X1_train=pd.DataFrame(X_train)
Y1_train=pd.DataFrame(Y_train)
Y1_train.columns=['Class']


In [23]:
Df = pd.concat([X1_train, Y1_train], axis=1)
Df.groupby('Class').size()

Class
0.0    143922
1.0     16078
dtype: int64

In [24]:
# separate minority and majority classes
not_fraud = Df[Df.Class==0]
fraud = Df[Df.Class==1]

# upsample minority
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
upsampled.Class.value_counts()

1.0    143922
0.0    143922
Name: Class, dtype: int64

In [25]:
# trying logistic regression again with the balanced dataset
Y2_train = upsampled.Class
X2_train = upsampled.drop('Class', axis=1)

upsampled =pipe_lr.fit(X2_train, Y2_train)

upsampled_pred = upsampled.predict(X_test)

upsampled

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=1,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [33]:
# Checking accuracy
print('ROC_AUC_Score: %.3f' %roc_auc_score(Y_test, upsampled_pred))
print('Precision: %.3f' % precision_score(Y_test, upsampled_pred))
print('Recall: %.3f' % recall_score(Y_test, upsampled_pred))
print('f1: %.3f' % f1_score(Y_test, upsampled_pred))

ROC_AUC_Score: 0.784
Precision: 0.288
Recall: 0.785
f1: 0.421


In [34]:
pd.DataFrame(confusion_matrix(Y_test, upsampled_pred))

Unnamed: 0,0,1
0,28171,7809
1,864,3156


In [35]:
# still using our separated classes fraud and not_fraud from above

# downsample majority
not_fraud_downsampled = resample(not_fraud,
                                replace = False, # sample without replacement
                                n_samples = len(fraud), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, fraud])

# checking counts
downsampled.Class.value_counts()

1.0    16078
0.0    16078
Name: Class, dtype: int64

In [36]:
Y3_train = downsampled.Class
X3_train = downsampled.drop('Class', axis=1)

undersampled =pipe_lr.fit(X3_train, Y3_train)

undersampled_pred = undersampled.predict(X_test)

In [38]:
print('ROC_AUC_Score: %.3f' %roc_auc_score(Y_test, undersampled_pred))
print('Precision: %.3f' % precision_score(Y_test, undersampled_pred))
print('Recall: %.3f' % recall_score(Y_test, undersampled_pred))
print('f1: %.3f' % f1_score(Y_test, undersampled_pred))

ROC_AUC_Score: 0.785
Precision: 0.288
Recall: 0.787
f1: 0.422


In [39]:
pd.DataFrame(confusion_matrix(Y_test, undersampled_pred))

Unnamed: 0,0,1
0,28173,7807
1,857,3163
