In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

%matplotlib inline
pd.options.display.max_rows = 3000


def my_remove_highly_correlated(X,threshold):
    """Drop numeric columns that are highly correlated with an earlier column.

    Non-numeric columns are discarded first. The pairwise correlation matrix
    of the remaining columns is computed with ``DataFrame.corr`` and every
    pair below the diagonal is inspected exactly once. When a pair's
    correlation is strictly greater than ``threshold``, the column that
    appears later in the frame is dropped. The comparison is signed, so
    strongly negative correlations are not flagged.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data; only its numeric columns are considered and returned.
    threshold : float
        Correlation above which the later column of a pair is removed.

    Returns
    -------
    pandas.DataFrame
        The numeric columns of ``X`` minus the removed ones.

    Side effects: prints the flagged pairs and the before/after shapes.
    """
    X = X.select_dtypes(include=[np.number])  #.dropna()

    names = X.columns.to_numpy()
    corr = X.corr().to_numpy()

    # Strictly-lower-triangle positions: every unordered pair once, diagonal
    # excluded. Reading by index means no sentinel value has to be written
    # into the array backing the correlation frame, and column names can never
    # clash with the helper column labels used below.
    row_idx, col_idx = np.tril_indices(len(names), k=-1)
    pairs = pd.DataFrame({
        'var': names[row_idx],        # later column of the pair (the one dropped)
        'variable': names[col_idx],   # earlier column of the pair (the one kept)
        'value': corr[row_idx, col_idx],
    }).sort_values(by='var', ascending=True)

    #flag remove vs keep based on corr threshold
    df_remove = pairs[pairs['value'] > threshold]
    remove_list = df_remove['var'].unique()

    # Report distinct columns removed, not flagged pairs: a column correlated
    # with several others appears on several rows of df_remove.
    print('{} out of {} vars removed due to corr greater than {}'.
          format(len(remove_list),X.shape[1],threshold))

    new_df = X.drop(columns=remove_list)

    print(df_remove)
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(new_df.shape))
    print('\n')
    return new_df

def my_drop_na_columns(X,NANthreshold):
    """Drop every column whose missing-value count reaches ``NANthreshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data. It is not modified; a new frame is returned.
    NANthreshold : int
        Columns with this many missing values or more are dropped
        (the comparison is ``>=``, so a threshold of 0 drops every column).

    Returns
    -------
    pandas.DataFrame
        ``X`` without the dropped columns.

    Side effects: prints the per-column NaN counts of the dropped columns, the
    counts of the surviving columns that still contain NaNs, and the
    before/after shapes.
    """
    colcount = X.shape[1]

    #Get count of NA in each column, largest first
    na_counts = X.isnull().sum(axis=0).sort_values(ascending=False)

    #split into columns to remove and columns kept that still have some NaNs
    series_cols_remove = na_counts[na_counts >= NANthreshold]
    series_cols_keep = na_counts[(na_counts < NANthreshold) & (na_counts > 0)]

    #drop the columns
    df = X.drop(labels=series_cols_remove.index.tolist(), axis=1)

    #print the results; wording matches the inclusive (>=) comparison above
    print('Dropped {} of {} Columns - containing at least {} NANs\n'.format(
          series_cols_remove.shape[0],colcount,NANthreshold))
    print(series_cols_remove)

    print('\n{} Columns remain with NANs\n'.format(series_cols_keep.shape[0]))
    print(series_cols_keep)
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(df.shape))
    print('\n')
    return df


def my_feature_selector(mytype,X,y=[],threshold=None,k=None):
    """Reduce the columns of ``X`` with one of three scikit-learn selectors.

    Parameters
    ----------
    mytype : str
        Case-insensitive selector name:
        ``'selectkbest'`` - ``SelectKBest`` scored with ``chi2``; needs ``k`` and ``y``.
        ``'rfecv'`` - ``RFECV`` around a linear-kernel ``SVR``; needs ``y``.
        ``'variancethreshold'`` - ``VarianceThreshold``; needs ``threshold``.
    X : pandas.DataFrame
        Feature matrix; its column names and row index are carried over to the result.
    y : sequence, optional
        Outcome vector. Only read, never mutated. Empty or ``None`` counts as missing.
    threshold : float, optional
        Variance cut-off for ``'variancethreshold'``.
    k : int, optional
        Number of features to keep for ``'selectkbest'``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, indexed like ``X``. When ``mytype`` is unknown or
        a required argument is missing, a 'Try Again' message is printed and
        ``X`` itself is returned unchanged.

    Side effects: prints a per-feature score table and the before/after shapes.
    """
    mytype = str.lower(mytype)

    #Retain original column names
    orig_cols = X.columns

    #Evaluate type and instantiate object
    if mytype == 'selectkbest':
        if k is None:
            print('Try Again: To run selectKBest you must pass a k for number of features to select')
            return X
        if y is None or len(y) == 0:
            print('Try Again: To run selectKBest you must pass a y vector reflecting outcomes')
            return X

        selector = SelectKBest(chi2, k=k)
        df_new = selector.fit_transform(X,y)
        score_values = selector.pvalues_
        score_type = 'PValue'

    elif mytype == 'rfecv':
        if y is None or len(y) == 0:
            print('Try Again: To run rfecv you must pass a y vector reflecting outcomes')
            return X

        estimator = SVR(kernel="linear")
        selector = RFECV(estimator, n_jobs=-1)
        df_new = selector.fit_transform(X,y)
        score_values = selector.ranking_
        score_type = 'Ranking'

    elif mytype == 'variancethreshold':
        if threshold is None:
            print('Try Again: To run VarianceThreshold you must pass a threshold value from 0 to 1. (0 returns all)')
            return X

        selector = VarianceThreshold(threshold=threshold)
        df_new = selector.fit_transform(X)
        score_values = selector.variances_
        score_type = 'Variance'

    else:
        print('Try Again: Type must be passed as selectkbest, rfecv, or variancethreshold')
        return X

    # Boolean mask over orig_cols: True where the selector kept the feature
    keep_list = selector.get_support()

    #Print Scores
    print('---- Running Feature Selection Method: ' + mytype + '-------\n')
    scores_df = pd.DataFrame([orig_cols, score_values, keep_list]).transpose()
    scores_df.columns= ['Feature',score_type,'Keep']
    print(scores_df.sort_values(by=score_type, ascending=True))

    #Names of the features that were kept, in original column order
    keep = [col_name for col_name, kept in zip(orig_cols, keep_list) if kept]

    #Place resultset of kept features into DataFrame with correct column headers.
    #Reusing X's index keeps the rows aligned with y and with X itself.
    df_keep = pd.DataFrame(df_new, columns=keep, index=X.index)

    #Print Stats
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(df_keep.shape))
    print('\n')
    return df_keep

def my_confusion_matrix(array_Expected,array_Predicted,colName):
    """Print per-class hit rates and the raw confusion matrix for a two-class target.

    Parameters
    ----------
    array_Expected : array-like
        True labels.
    array_Predicted : array-like
        Labels produced by the model.
    colName : str
        Name of the target, used only in the printed sentences.

    The matrix is read with rows as the expected class and columns as the
    predicted class; index 0 is reported as "Negatives" and index 1 as
    "Positives". Only the top-left 2x2 cells are used. Nothing is returned.
    """
    matrix = np.array(confusion_matrix(array_Expected, array_Predicted ))

    # Row totals: how many samples were expected in each class
    expected_neg = matrix[0,0] + matrix[0,1]
    expected_pos = matrix[1,0] + matrix[1,1]

    # Diagonal cells: samples whose prediction matched the expected class
    hits_neg = matrix[0,0]
    hits_pos = matrix[1,1]

    # Share of each expected class that was predicted correctly, 3 decimals
    rate_pos = np.round(hits_pos / expected_pos,3)
    rate_neg = np.round(hits_neg / expected_neg,3)

    print('Regarding {}, the model correctly predicted {} Negatives out of {} expected Negatives: {}'.format(
        colName,hits_neg,expected_neg,rate_neg))
    print('Regarding {}, the model correctly predicted {} Positives out of {} expected Positives: {}'.format(
        colName,hits_pos,expected_pos,rate_pos))
    print(matrix)

def my_minmax_scaler(df, min_val, max_val):
    """Rescale every column of ``df`` into the range ``[min_val, max_val]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scale; all columns are passed to ``MinMaxScaler``.
    min_val, max_val : number
        Lower and upper bound handed to the scaler as ``feature_range``.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column names and row index as ``df``.

    Side effects: prints a banner before and a completion line after scaling.
    """
    print('------Scaling Data to Min {}, Max {}------\n'.format(min_val,max_val))

    #instantiate scaler object
    #you can use StandardScaler instead to scale with mean 0 and std 1
    scaler = MinMaxScaler(feature_range=(min_val,max_val), copy=True)

    # Fit and scale in one pass, then rebuild a data frame from the result.
    # Column names AND the row index are reattached so the scaled frame stays
    # aligned with any Series/DataFrame that shares df's index.
    scaled_values = scaler.fit_transform(df)
    df_scaled = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
    print('Scaling Complete')
    return df_scaled


In [2]:
%%time
# Load the raw data set from a CSV file resolved relative to the working directory.
raw_data = pd.read_csv('creditcard.csv')
# Preview the first rows as a quick check that the file parsed as expected.
raw_data.head()


CPU times: user 2.23 s, sys: 138 ms, total: 2.37 s
Wall time: 2.41 s


In [3]:
%%time
df = my_remove_highly_correlated(X=raw_data, threshold=.80)

y = df.Class
X = df.drop(columns=['Class','Time'], axis=1)


#X = my_minmax_scaler(X, 0, 1)

print(y.shape)
print(X.shape)

#Split data into folds
from sklearn.model_selection import StratifiedShuffleSplit
skf = StratifiedShuffleSplit(n_splits=5, test_size=.2)
skf.get_n_splits(X, y)


#X = my_feature_selector(mytype='selectkbest', X=X, y=Y, k=10)

0 out of 31 vars removed due to corr greater than 0.8
Empty DataFrame
Columns: [var, variable, value]
Index: []

Shape before: (284807, 31)
Shape after: (284807, 31)


(284807,)
(284807, 29)
CPU times: user 457 ms, sys: 58.3 ms, total: 515 ms
Wall time: 513 ms


In [4]:
%%time
from sklearn import ensemble


model = ensemble.RandomForestClassifier(n_estimators=150, max_depth=10, max_features=2 )

i = 1
print ('---------------------------------------')
print ('---------- Random Forest --------------')
print ('---------------------------------------')
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('---------- Strata # ' + str(i) + '--------------')
    print('Test Set Accuracy: ' + str(model.score(X_test, y_test)))
    my_confusion_matrix(y_test, y_pred, 'fraud')
    i = i + 1

    

---------------------------------------
---------- Random Forest --------------
---------------------------------------


  from numpy.core.umath_tests import inner1d


---------- Strata # 1--------------
Test Set Accuracy: 0.9995259997893332
Regarding fraud, the model correctly predicted 56862 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 73 Positives out of 98 expected Positives: 0.745
[[56862     2]
 [   25    73]]
---------- Strata # 2--------------
Test Set Accuracy: 0.999403110845827
Regarding fraud, the model correctly predicted 56862 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 66 Positives out of 98 expected Positives: 0.673
[[56862     2]
 [   32    66]]
---------- Strata # 3--------------
Test Set Accuracy: 0.9994908886626171
Regarding fraud, the model correctly predicted 56860 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 73 Positives out of 98 expected Positives: 0.745
[[56860     4]
 [   25    73]]
---------- Strata # 4--------------
Test Set Accuracy: 0.9994908886626171
Regarding fraud, the mod

In [5]:
%%time
#'----------- K Nearest Neighbor --------------
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=10)

i = 1
print ('---------------------------------------')
print ('---------- K Nearest Neighbor --------------')
print ('---------------------------------------')
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('---------- Strata # ' + str(i) + '--------------')
    my_confusion_matrix(y_test, y_pred, 'fraud')
    i = i + 1

---------------------------------------
---------- K Nearest Neighbor --------------
---------------------------------------
---------- Strata # 1--------------
Regarding fraud, the model correctly predicted 56856 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 55 Positives out of 98 expected Positives: 0.561
[[56856     8]
 [   43    55]]
---------- Strata # 2--------------
Regarding fraud, the model correctly predicted 56861 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 56 Positives out of 98 expected Positives: 0.571
[[56861     3]
 [   42    56]]
---------- Strata # 3--------------
Regarding fraud, the model correctly predicted 56858 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 59 Positives out of 98 expected Positives: 0.602
[[56858     6]
 [   39    59]]
---------- Strata # 4--------------
Regarding fraud, the model correctly predicted 568

In [6]:
%%time
#'----------- Logistic Regression --------------
from sklearn import linear_model 
model = linear_model.LogisticRegression(penalty='l2', C=7000 )

i = 1
print ('---------------------------------------')
print ('---------- Logistic Regression --------------')
print ('---------------------------------------')
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('---------- Strata # ' + str(i) + '--------------')
    my_confusion_matrix(y_test, y_pred, 'fraud')
    i = i + 1

---------------------------------------
---------- Logistic Regression --------------
---------------------------------------
---------- Strata # 1--------------
Regarding fraud, the model correctly predicted 56852 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 61 Positives out of 98 expected Positives: 0.622
[[56852    12]
 [   37    61]]
---------- Strata # 2--------------
Regarding fraud, the model correctly predicted 56847 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 73 Positives out of 98 expected Positives: 0.745
[[56847    17]
 [   25    73]]
---------- Strata # 3--------------
Regarding fraud, the model correctly predicted 56849 Negatives out of 56864 expected Negatives: 1.0
Regarding fraud, the model correctly predicted 69 Positives out of 98 expected Positives: 0.704
[[56849    15]
 [   29    69]]
---------- Strata # 4--------------
Regarding fraud, the model correctly predicted 56