In [1]:
import numpy as np
import pandas as pd
import logging
import pickle
import matplotlib.pyplot as plt
from time import time
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
# from imblearn.pipeline import Pipeline as Pipe

In [2]:
df = pd.read_csv('creditcard.csv').sample(50000)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
241104,150884.0,1.792157,-0.094959,-1.962566,1.363191,0.419663,-1.377915,0.97381,-0.52912,-0.115885,...,0.207575,0.388849,-0.125439,-0.015856,0.395331,-0.494935,-0.057501,-0.04041,144.0,0
283120,171382.0,0.008018,0.800285,-1.718762,-1.570926,3.100557,3.225469,0.362243,0.882363,-0.094758,...,-0.316859,-0.906714,0.117253,0.592073,-0.435896,0.151121,0.108751,0.068515,2.97,0
100341,67485.0,1.343434,-1.209012,1.171484,-0.448716,-1.422045,1.132678,-1.758413,0.508892,0.99979,...,0.081868,0.574081,-0.165297,-0.891845,0.492919,0.051442,0.092295,0.009995,1.0,0
241636,151108.0,-0.408453,1.012431,-0.461966,-0.632573,0.95044,-0.611025,0.869306,-0.108494,-0.13483,...,-0.256491,-0.662153,0.242027,0.665976,-0.457507,0.054971,-0.157047,0.071566,8.13,0
17892,29008.0,-0.999558,-1.022561,2.011569,-0.685867,-2.599713,1.702756,1.879199,-0.531774,-0.722221,...,-0.042948,0.438317,-0.045252,0.059023,0.24393,-0.16512,-0.093542,-0.52979,545.94,0


In [3]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,94954.17706,-0.010086,0.006985,-0.003299,0.003446,-0.003947,-0.004594,-0.01185,-0.00094,-0.002132,...,-0.004176,-0.002853,0.002874,-0.002041,0.000266,-0.003115,-0.000414,-0.000393,87.067203,0.00214
std,47443.39862,1.978498,1.653131,1.526541,1.421624,1.357999,1.322559,1.2238,1.263333,1.104728,...,0.76535,0.727799,0.586143,0.605701,0.51745,0.482184,0.391954,0.324199,238.775018,0.046211
min,2.0,-35.698345,-42.172688,-32.454198,-5.401678,-23.236403,-21.929312,-43.557242,-73.216718,-13.320155,...,-34.830382,-8.887017,-26.751119,-2.836627,-7.495741,-2.24162,-9.544855,-8.478686,0.0,0.0
25%,54472.75,-0.921317,-0.597221,-0.899501,-0.853005,-0.696626,-0.768206,-0.557821,-0.209032,-0.65148,...,-0.231245,-0.542364,-0.161564,-0.358604,-0.315829,-0.330267,-0.071384,-0.053957,5.78,0.0
50%,84818.5,0.017246,0.062965,0.18116,-0.012243,-0.064681,-0.27216,0.038714,0.023614,-0.051649,...,-0.031319,0.0042,-0.01116,0.040223,0.01441,-0.057162,0.001077,0.010625,22.0,0.0
75%,139541.5,1.316137,0.81171,1.032397,0.742926,0.60547,0.398703,0.561717,0.32818,0.601351,...,0.18404,0.525378,0.147605,0.438602,0.348539,0.234472,0.090624,0.07882,76.65,0.0
max,172778.0,2.420346,22.057729,3.940337,12.114672,32.911462,16.614054,25.971058,18.748872,10.370658,...,27.202839,10.50309,16.722816,3.949245,4.381129,3.517346,10.135597,16.129609,12910.93,1.0


In [4]:
df.any().isnull()

Time      False
V1        False
V2        False
V3        False
V4        False
V5        False
V6        False
V7        False
V8        False
V9        False
V10       False
V11       False
V12       False
V13       False
V14       False
V15       False
V16       False
V17       False
V18       False
V19       False
V20       False
V21       False
V22       False
V23       False
V24       False
V25       False
V26       False
V27       False
V28       False
Amount    False
Class     False
dtype: bool

In [5]:
df.Class.value_counts()

0    49893
1      107
Name: Class, dtype: int64

In [6]:
df.Class.value_counts() / len(df)

0    0.99786
1    0.00214
Name: Class, dtype: float64

In [7]:
X = df.drop(columns='Class')
y = df.Class

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((35000, 30), (15000, 30), (35000,), (15000,))

In [9]:
y_train.value_counts()

0    34931
1       69
Name: Class, dtype: int64

## Using class_weight

In [10]:
def run_Logit(X_train, X_test, y_train, y_test, class_weight = None, sample_weight = None):
    
    # weights introduced here
    logit = LogisticRegression(
        penalty='l2',
        solver='newton-cg',
        random_state=69,
        max_iter=10000,
        n_jobs=-1,
        class_weight=class_weight # weights / cost
    )
    
    logit.fit(X_train, y_train, sample_weight=sample_weight)

    print('Train set')
    pred = logit.predict_proba(X_train)
    print(
        'Roc-Auc score: {}'.format(roc_auc_score(y_train, pred[:, 1])))

    print('Test set')
    pred = logit.predict_proba(X_test)
    print(
        'Roc-Auc score: {}'.format(roc_auc_score(y_test, pred[:, 1])))

In [11]:
run_Logit(X_train,
          X_test,
          y_train,
          y_test,
          class_weight=None,
          sample_weight=None)

Train set
Roc-Auc score: 0.9847222619831477
Test set
Roc-Auc score: 0.9608692899204301


In [12]:
run_Logit(X_train,
          X_test,
          y_train,
          y_test,
          class_weight='balanced',
          sample_weight=None)

Train set
Roc-Auc score: 0.9931896795297064
Test set
Roc-Auc score: 0.9583013810425007


In [13]:
run_Logit(X_train,
          X_test,
          y_train,
          y_test,
          class_weight={0:1, 1:1000}, #imbalance ratio
          sample_weight=None)

Train set
Roc-Auc score: 0.994214474166255
Test set
Roc-Auc score: 0.960081328840079


## Using sample_weight

In [14]:
run_Logit(X_train,
          X_test,
          y_train,
          y_test,
          class_weight=None,
          sample_weight=None)

Train set
Roc-Auc score: 0.9847222619831477
Test set
Roc-Auc score: 0.9608692899204301


In [15]:
run_Logit(X_train,
          X_test,
          y_train,
          y_test,
          class_weight=None,
          sample_weight=np.where(y_train==1,100,1))

Train set
Roc-Auc score: 0.9898669799965896
Test set
Roc-Auc score: 0.9537310308922955


## Estimating Cost with Cross Validation

In [16]:
try:
   grid_search = pickle.load( open( "csl_grid.p", "rb" ) )
except:

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    pca = PCA(n_components=8)

    selection = SelectKBest(k=2)

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    X_features = combined_features.fit(X, y).transform(X)
    print("Combined space has", X_features.shape[1], "features")

    pipeline = Pipeline([('scale', StandardScaler()),("features", combined_features), ("clf", RandomForestClassifier(random_state=69))])

    parameters = {
        'clf__n_estimators': [10, 50, 100],
        'clf__max_depth': [2, 3],
        'clf__class_weight': [None, {0:1, 1:10}, {0:1, 1:50}, {0:1, 1:100}, {0:1, 1:400}],
    }
    if __name__ == "__main__":
        grid_search = GridSearchCV(pipeline, parameters,scoring='roc_auc', n_jobs=-1,cv=5,verbose=4)

        print("Performing grid search...")
        print("pipeline:", [name for name, _ in pipeline.steps])
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best estimator: ", grid_search.best_estimator_)
        pickle.dump( grid_search, open( "csl_grid.p", "wb" ) )

In [17]:
df_ = pd.DataFrame(grid_search.cv_results_)
df_.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__class_weight,param_clf__max_depth,param_clf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.968152,0.02226,0.026172,0.00421,,2,10,"{'clf__class_weight': None, 'clf__max_depth': ...",0.948845,0.95037,0.95193,0.903922,0.879564,0.926926,0.029758,27
1,2.853076,0.074056,0.05238,0.00313,,2,50,"{'clf__class_weight': None, 'clf__max_depth': ...",0.921157,0.943858,0.950405,0.901732,0.894113,0.922253,0.022242,30
2,4.865098,0.046628,0.090981,0.000814,,2,100,"{'clf__class_weight': None, 'clf__max_depth': ...",0.943953,0.941916,0.969713,0.943591,0.874386,0.934712,0.031876,26
3,0.944487,0.021041,0.026308,0.005049,,3,10,"{'clf__class_weight': None, 'clf__max_depth': ...",0.931746,0.950353,0.951393,0.918088,0.878605,0.926037,0.026757,29
4,3.449157,0.047925,0.058418,0.003141,,3,50,"{'clf__class_weight': None, 'clf__max_depth': ...",0.948034,0.938953,0.984317,0.937762,0.866885,0.93519,0.038118,25


In [18]:
grid_search.best_params_

{'clf__class_weight': {0: 1, 1: 50},
 'clf__max_depth': 3,
 'clf__n_estimators': 50}

In [19]:
rf = RandomForestClassifier(n_estimators=50,
                            random_state=69,
                            max_depth=3,
                            n_jobs=-1,
                            class_weight=None)
rf_cs = RandomForestClassifier(n_estimators=50,
                            random_state=69,
                            max_depth=3,
                            n_jobs=-1,
                            class_weight={0: 1, 1: 50})                            

In [20]:
rf.fit(X_train,y_train)
rf_cs.fit(X_train,y_train)
print(rf.score(X_test,y_test),rf_cs.score(X_test,y_test))

0.9985333333333334 0.9989333333333333


## MetaCost

In [21]:
from metacost import MetaCost

In [22]:
cost_matrix = np.array([[0, 1], [1, 0]])
cost_matrix2 = np.array([[0, 50], [1, 0]])

In [23]:

logreg = LogisticRegression(
    penalty='l2',
    solver='newton-cg',
    random_state=69,
    max_iter=10000,
    n_jobs=-1,
)

In [24]:
metacost_ = MetaCost(estimator=logreg,
                     cost_matrix=cost_matrix,
                     n_estimators=50,
                     n_samples=None,
                     p=True,
                     q=True)
metacost2 = MetaCost(estimator=logreg,
                     cost_matrix=cost_matrix2,
                     n_estimators=50,
                     n_samples=None,
                     p=True,
                     q=True)                                     

In [25]:
try:
   metacost_ = pickle.load( open( "meta_1.p", "rb" ) )
except:
    metacost_.fit(X_train, y_train)
    pickle.dump( metacost_, open( "meta_1.p", "wb" ) )

In [26]:
print('Without Costs')
print('Train set')
pred = metacost_.predict_proba(X_train)
print(
    'MetaCost roc-auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))

print('Test set')
pred = metacost_.predict_proba(X_test)
print(
    'MetaCost roc-auc: {}'.format(roc_auc_score(y_test, pred[:, 1])))

Without Costs
Train set
MetaCost roc-auc: 0.9451830295667775
Test set
MetaCost roc-auc: 0.9517145188864421


In [27]:
try:
   metacost2 = pickle.load( open( "meta_2.p", "rb" ) )
except:
    metacost2.fit(X_train, y_train)
    pickle.dump( metacost2, open( "meta_2.p", "wb" ) )

In [28]:
print('With Costs')
print('Train set')
pred = metacost2.predict_proba(X_train)
print(
    'MetaCost roc-auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))

print('Test set')
pred = metacost2.predict_proba(X_test)
print(
    'MetaCost roc-auc: {}'.format(roc_auc_score(y_test, pred[:, 1])))

With Costs
Train set
MetaCost roc-auc: 0.9673530301351858
Test set
MetaCost roc-auc: 0.973641118904734


In [29]:
y_train.reset_index(drop=True)

0        0
1        0
2        0
3        0
4        0
        ..
34995    0
34996    0
34997    0
34998    0
34999    0
Name: Class, Length: 35000, dtype: int64

In [30]:
tmp = pd.concat([metacost2.y_, y_train.reset_index(drop=True)], axis=1)

tmp.head()

Unnamed: 0,0,Class
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [31]:
tmp[tmp[0]!=tmp['Class']][['Class', 0]]

Unnamed: 0,Class,0
15,0,1
259,0,1
510,1,0
528,1,0
532,1,0
...,...,...
34372,0,1
34584,1,0
34654,0,1
34976,0,1


In [32]:
np.sum( np.where(metacost2.y_ != y_train.reset_index(drop=True),1,0) )

227

In [33]:
np.sum( np.where(metacost2.y_ == y_train.reset_index(drop=True),1,0) )

34773