# Credit Card Fraud

Develop an algorithm to predict fraud. Prioritize correctly finding fraud rather than correctly labeling non-fraudulent transactions.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

raw_data = pd.read_csv('./data/creditcard.csv')
raw_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Data Exploration

In [2]:
frauds = raw_data[raw_data['Class'] == 1]
okays = raw_data[raw_data['Class'] == 0]

baseline = 1 - len(frauds) / (len(frauds) + len(okays))
print('Frauds:', len(frauds))
print('Not Frauds:', len(okays))
print('Baseline R-squared:', baseline)

Frauds: 492
Not Frauds: 284315
Baseline R-squared: 0.9982725143693799


In [3]:
# plt.figure(figsize=(20,10))

# plt.subplot(2, 2, 1)
# plt.title('Fraud Amounts')
# plt.hist(frauds['Amount'], bins=20, label='Fraud', color='blue')

# plt.subplot(2, 2, 2)
# plt.title('Okay Amounts')
# plt.hist(okays['Amount'], bins=20, label='Okay', color='orange')

# plt.subplot(2, 2, 3)
# plt.title('Fraud Times')
# plt.hist(frauds['Time'], bins=20, label='Fraud', color='blue')

# plt.subplot(2, 2, 4)
# plt.title('Okay Times')
# plt.hist(okays['Time'], bins=20, label='Okay', color='orange')

# plt.show()

In [4]:
# # TODO: 
# # Try taking out outliers – see distribution of remaining amounts
# # Compare model with / without outliers

# plt.figure(figsize=(20,10))

# plt.subplot(2, 2, 1)
# plt.title('Fraud Amounts')
# plt.boxplot(frauds['Amount'])

# plt.subplot(2, 2, 2)
# plt.title('Okay Amounts')
# plt.boxplot(okays['Amount'])

# plt.subplot(2, 2, 3)
# plt.title('Fraud Times')
# plt.boxplot(frauds['Time'])

# plt.subplot(2, 2, 4)
# plt.title('Okay Times')
# plt.boxplot(okays['Time'])

# plt.show()

In [5]:
# plt.figure(figsize=(20,60))

# for i in range(28):
#     col_name = 'V' + str(i + 1)
    
#     plt.subplot(28, 2, 2 * i + 1)
#     plt.title('Frauds ' + col_name)
#     plt.hist(frauds[col_name], bins=20, color='blue')
    
#     plt.subplot(28, 2, 2 * i + 2)
#     plt.title('Okays ' + col_name)
#     plt.hist(okays[col_name], bins=20, color='orange')
    
# plt.show()

In [6]:
# plt.figure(figsize=(20,90))

# for i in range(28):
#     col_name = 'V' + str(i + 1)
    
#     plt.subplot(28, 2, 2 * i + 1)
#     plt.title('Frauds ' + col_name)
#     plt.boxplot(frauds[col_name])
    
#     plt.subplot(28, 2, 2 * i + 2)
#     plt.title('Okays ' + col_name)
#     plt.boxplot(okays[col_name])
    
# plt.show()

I am looking to minimize Type II errors ("false negative"s – i.e. considering something that is a fraud as okay). But before I begin to build models, I need to make sure my data is not skewed by either undersampling my majority group (okays) or oversampling my minority group (frauds). 

## Helper Methods

In [7]:
# Helper method for fitting and training model, returns formatted result
def fit_and_train(model, fit_X_train, fit_Y_train, X_train, Y_train):
    model_fit = model.fit(fit_X_train, fit_Y_train)
    model_score_train = model.score(X_train, Y_train)
    print('R² for train:', model_score_train)
    
    model_score_test = model.score(X_test, Y_test)
    print('\nR² for test:', model_score_test)
    
    model_improve_over_baseline = (model_score_test - baseline) / baseline
    print('Improvement over baseline:', model_improve_over_baseline)
    
#     if hasattr(model_fit, 'coef_'):
#         print('\nCoefficients:', model_fit.coef_)
    
#     if hasattr(model_fit, 'intercept_'):
#         print('\nIntercept:', model_fit.intercept_)
    
#     if hasattr(X_train, 'columns'):
#         print('Data cols:', list(X_train.columns))

# Helper method for evaluating model, returns formatted result
def evaluate_model_printout(model, train, test):
    Y_train_vals = train['Class'].values
    Y_test_vals = test['Class'].values
    
    predict_train = model.predict_proba(X_train)
    predict_train = list(map(lambda x: 0 if x[0] > .998 else 1, predict_train))
    predict_train = np.fromiter(predict_train, dtype=np.int)

    predict_test = model.predict_proba(X_test)
    predict_test = list(map(lambda x: 0 if x[0] > .998 else 1, predict_test))
    predict_test = np.fromiter(predict_test, dtype=np.int)
    
    predict_whole = model.predict_proba(X)
    predict_whole = list(map(lambda x: 0 if x[0] > .998 else 1, predict_whole))
    predict_whole = np.fromiter(predict_whole, dtype=np.int)
    
    crosstab_labels = [0, 1, 'All']
    table_train = pd.crosstab(Y_train_vals, predict_train, rownames=['actual'], colnames=['predicted'], margins=True)
    table_train = table_train.reindex(index=crosstab_labels,columns=crosstab_labels, fill_value=0)

    print('TRAIN:')
#     print(table_train, '\n')

    train_tI_errors = table_train.loc[0,1] / table_train.loc['All','All']
    train_tII_errors = table_train.loc[1,0] / table_train.loc['All','All']
    print(('Accuracy:\n% Type I errors: {}\n% Type II errors: {}\n').format(train_tI_errors, train_tII_errors))

    train_precision = table_train.loc[1,1] / table_train.loc['All', 1] # correctly predicted positives / all predicted positives
    train_recall = table_train.loc[1,1] / table_train.loc[1,'All'] # true positives / (true positives + false negatives)
    print('Precision:', train_precision)
    print('Recall:', train_recall, '\n\n----------\n')
    
    print('TEST:')
#     print(table_test, '\n')

    table_test = pd.crosstab(Y_test_vals, predict_test, rownames=['actual'], colnames=['predicted'], margins=True)
    table_test = table_test.reindex(index=crosstab_labels,columns=crosstab_labels, fill_value=0)

    test_tI_errors = table_test.loc[0,1]/table_test.loc['All','All']
    test_tII_errors = table_test.loc[1,0]/table_test.loc['All','All']
    print(('Accuracy:\n% Type I errors: {}\n% Type II errors: {}\n').format(test_tI_errors, test_tII_errors))

    test_precision = table_test.loc[1,1] / table_test.loc['All', 1] # correctly predicted positives / all predicted positives
    test_recall = table_test.loc[1,1] / table_test.loc[1,'All'] # true positives / (true positives + false negatives)
    print('Precision:', test_precision)
    print('Recall:', test_recall)
    
    print('WHOLE DATASET:')
    table_whole = pd.crosstab(Y, predict_whole, rownames=['actual'], colnames=['predicted'], margins=True)
    table_whole = table_whole.reindex(index=crosstab_labels,columns=crosstab_labels, fill_value=0)
    
    whole_tI_errors = table_whole.loc[0,1]/table_whole.loc['All','All']
    whole_tII_errors = table_whole.loc[1,0]/table_whole.loc['All','All']
    print(('Accuracy:\n% Type I errors: {}\n% Type II errors: {}\n').format(whole_tI_errors, whole_tII_errors))

    whole_precision = table_whole.loc[1,1] / table_whole.loc['All', 1] # correctly predicted positives / all predicted positives
    whole_recall = table_whole.loc[1,1] / table_whole.loc[1,'All'] # true positives / (true positives + false negatives)
    print('Precision:', whole_precision)
    print('Recall:', whole_recall)

## STRATEGY 1: Undersampling Okays

In [8]:
print('Randomly sample', len(frauds), '(# of frauds) rows from okays (3 times)')
all_undersampled_okays = []

for time in range(3):
    rand_sample = okays.sample(n=len(frauds))
    all_undersampled_okays.append(rand_sample)

for idx, sample in enumerate(all_undersampled_okays):
    df = pd.concat([frauds, sample])
    df = df.reset_index()
    df_test = df.iloc[::2]
    df_train = df.iloc[1::2]

    # TRAINING
    X_train = df_train.loc[:, ~(df_train.columns).isin(['Class', 'index', 'Time', 'Amount'])]
    Y_train = df_train['Class'].values.reshape(-1, 1)

    # TESTING
    X_test = df_test.loc[:, ~(df_train.columns).isin(['Class', 'index', 'Time', 'Amount'])]
    Y_test = df_test['Class'].values.reshape(-1, 1)
    
    # WHOLE DATASET
    X = df.loc[:, ~(df.columns).isin(['Class', 'index', 'Time', 'Amount'])]
    Y = df['Class'].values

#     # LASSO
#     lasso = linear_model.LogisticRegression(penalty='l1', C=100) 
#     fit_and_train(lasso, X_train, Y_train, X_train, Y_train)
#     evaluate_model_printout(lasso)
    
#     # RIDGE
#     ridge = linear_model.LogisticRegression(penalty='l2', C=100, fit_intercept=False)
#     fit_and_train(ridge, X_train, Y_train, X_train, Y_train)
#     evaluate_model_printout(ridge)
    
#     # SVC
#     svm = SVC(kernel='linear', probability=True)
#     fit_and_train(svm, X_train, Y_train, X_train, Y_train)

#     # Naive Bayes
#     bnb = BernoulliNB()
#     fit_and_train(bnb, X_train, Y_train, X_train, Y_train)

    print('*** GRADIENT BOOSTING CLASSIFIER', idx + 1, '***')
    gbm = ensemble.GradientBoostingClassifier(n_estimators=500, max_depth=2, loss='deviance')
    fit_and_train(gbm, X_train, Y_train, X_train, Y_train)
    evaluate_model_printout(gbm, df_train, df_test)
    
    print('*** RANDOM FOREST', idx + 1, '***')
    rfc = ensemble.RandomForestClassifier()
    fit_and_train(rfc, X_train, Y_train, X_train, Y_train)
    evaluate_model_printout(rfc, df_train, df_test)

Randomly sample 492 (# of frauds) rows from okays (3 times)
*** GRADIENT BOOSTING CLASSIFIER 1 ***


  y = column_or_1d(y, warn=True)


R² for train: 1.0

R² for test: 0.9451219512195121
Improvement over baseline: -0.05324253887070464
TRAIN:
Accuracy:
% Type I errors: 0.20934959349593496
% Type II errors: 0.0

Precision: 0.7048710601719198
Recall: 1.0 

----------

TEST:
Accuracy:
% Type I errors: 0.25
% Type II errors: 0.006097560975609756

Precision: 0.6639344262295082
Recall: 0.9878048780487805
WHOLE DATASET:
Accuracy:
% Type I errors: 0.22967479674796748
% Type II errors: 0.003048780487804878

Precision: 0.6839160839160839
Recall: 0.9939024390243902
*** RANDOM FOREST 1 ***
R² for train: 0.9878048780487805

R² for test: 0.9369918699186992
Improvement over baseline: -0.061386689073967333
TRAIN:
Accuracy:
% Type I errors: 0.16666666666666666
% Type II errors: 0.0

Precision: 0.75
Recall: 1.0 

----------

TEST:


  This is separate from the ipykernel package so we can avoid doing imports until
  y = column_or_1d(y, warn=True)


Accuracy:
% Type I errors: 0.258130081300813
% Type II errors: 0.006097560975609756

Precision: 0.6567567567567567
Recall: 0.9878048780487805
WHOLE DATASET:
Accuracy:
% Type I errors: 0.21239837398373984
% Type II errors: 0.003048780487804878

Precision: 0.7005730659025788
Recall: 0.9939024390243902
*** GRADIENT BOOSTING CLASSIFIER 2 ***
R² for train: 1.0

R² for test: 0.9349593495934959
Improvement over baseline: -0.06342272662478306
TRAIN:
Accuracy:
% Type I errors: 0.10772357723577236
% Type II errors: 0.0

Precision: 0.822742474916388
Recall: 1.0 

----------

TEST:
Accuracy:
% Type I errors: 0.18699186991869918
% Type II errors: 0.006097560975609756

Precision: 0.7253731343283583
Recall: 0.9878048780487805
WHOLE DATASET:
Accuracy:
% Type I errors: 0.14735772357723578
% Type II errors: 0.003048780487804878

Precision: 0.7712933753943217
Recall: 0.9939024390243902
*** RANDOM FOREST 2 ***
R² for train: 0.9857723577235772

R² for test: 0.9369918699186992
Improvement over baseline: -0.

  This is separate from the ipykernel package so we can avoid doing imports until
  y = column_or_1d(y, warn=True)


Accuracy:
% Type I errors: 0.2184959349593496
% Type II errors: 0.003048780487804878

Precision: 0.6946022727272727
Recall: 0.9939024390243902
*** GRADIENT BOOSTING CLASSIFIER 3 ***
R² for train: 1.0

R² for test: 0.9329268292682927
Improvement over baseline: -0.06545876417559868
TRAIN:
Accuracy:
% Type I errors: 0.10569105691056911
% Type II errors: 0.0

Precision: 0.825503355704698
Recall: 1.0 

----------

TEST:
Accuracy:
% Type I errors: 0.21138211382113822
% Type II errors: 0.006097560975609756

Precision: 0.7002881844380403
Recall: 0.9878048780487805
WHOLE DATASET:
Accuracy:
% Type I errors: 0.15853658536585366
% Type II errors: 0.003048780487804878

Precision: 0.7581395348837209
Recall: 0.9939024390243902
*** RANDOM FOREST 3 ***
R² for train: 0.9979674796747967

R² for test: 0.9390243902439024
Improvement over baseline: -0.059350651523151714
TRAIN:
Accuracy:
% Type I errors: 0.1402439024390244
% Type II errors: 0.0

Precision: 0.780952380952381
Recall: 1.0 

----------

TEST:
Ac

  This is separate from the ipykernel package so we can avoid doing imports until


I am looking to minimize Type II errors ("false negative"s – i.e. considering something that is a fraud as okay). My test set gave 0.0% for Type II errors, which I am very happy with, especially since my percentage of Type I errors is also extremely low (~0.000035%).

In [9]:
# NOTE: Could use ROC / AUC threshold for less performant classifiers

In [10]:
# PCA on models that work to reduce 
# Find out if multicollinearity using LASSO – remove features to improve

## STRATEGY 2: Over-Sampling Frauds

In [11]:
df_test = raw_data.iloc[::2]
df_train = raw_data.iloc[1::2]

X_train = df_train.loc[:, ~(df_train.columns).isin(['Class'])]
Y_train = df_train['Class'].values.reshape(-1, 1)

# TESTING
X_test = df_test.loc[:, ~(df_test.columns).isin(['Class'])]
Y_test = df_test['Class'].values.reshape(-1, 1)

sm = SMOTE(random_state=12, ratio = 1.0)
X_train_res, Y_train_res = sm.fit_sample(X_train, Y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [12]:
oversampled_df = pd.DataFrame(X_train_res, columns=df.columns[1:-1])
oversampled_df['Class'] = Y_train_res
oversampled_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
1,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
2,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
3,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
4,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [13]:
oversampled_train = oversampled_df.iloc[::2]
oversampled_test = oversampled_df.iloc[1::2]

X_train = oversampled_train.loc[:, ~(oversampled_train.columns).isin(['Class'])]
Y_train = oversampled_train['Class'].values.reshape(-1, 1)

X_test = oversampled_test.loc[:, ~(oversampled_test.columns).isin(['Class'])]
Y_test = oversampled_test['Class'].values.reshape(-1, 1)

X = oversampled_df.loc[:, ~(oversampled_df.columns).isin(['Class'])]
Y = oversampled_df['Class'].values

### LASSO Logistic Regression

In [14]:
lasso = linear_model.LogisticRegression(penalty='l1', C=100) 
fit_and_train(lasso, X_train, Y_train, X_test, Y_test)

  y = column_or_1d(y, warn=True)


R² for train: 0.9778070089053333

R² for test: 0.9778070089053333
Improvement over baseline: -0.020500920509641463


#### Evaluating LASSO Logistic Regression

In [15]:
evaluate_model_printout(lasso, oversampled_train, oversampled_test)

TRAIN:
Accuracy:
% Type I errors: 0.4450064011479861
% Type II errors: 0.0

Precision: 0.5291039554582943
Recall: 1.0 

----------

TEST:
Accuracy:
% Type I errors: 0.44400050646445605
% Type II errors: 0.0

Precision: 0.5296537232021103
Recall: 1.0
WHOLE DATASET:
Accuracy:
% Type I errors: 0.4445034538062211
% Type II errors: 0.0

Precision: 0.52937868886034
Recall: 1.0


### Ridge Logistic Regression

In [16]:
ridge = linear_model.LogisticRegression(penalty='l2', C=100, fit_intercept=False)
fit_and_train(ridge, X_train, Y_train, X_test, Y_test)

  y = column_or_1d(y, warn=True)


R² for train: 0.961100716084467

R² for test: 0.961100716084467
Improvement over baseline: -0.03723612315259902


#### Evaluating Ridge Logistic Regression

In [17]:
evaluate_model_printout(ridge, oversampled_train, oversampled_test)

TRAIN:
Accuracy:
% Type I errors: 0.4999859315428877
% Type II errors: 0.0

Precision: 0.5000140684571123
Recall: 1.0 

----------

TEST:
Accuracy:
% Type I errors: 0.5000140684571123
% Type II errors: 0.0

Precision: 0.4999859315428877
Recall: 1.0
WHOLE DATASET:
Accuracy:
% Type I errors: 0.5
% Type II errors: 0.0

Precision: 0.5
Recall: 1.0


### Gradient Boosting Classifier

In [18]:
gbm = ensemble.GradientBoostingClassifier(n_estimators=500, max_depth=2, loss='deviance')
fit_and_train(gbm, X_train, Y_train, X_test, Y_test)

  y = column_or_1d(y, warn=True)


R² for train: 0.9986001885173253

R² for test: 0.9986001885173253
Improvement over baseline: 0.0003282411798670318


#### Evaluating Gradient Boosting Classifier

In [19]:
evaluate_model_printout(gbm, oversampled_train, oversampled_test)

TRAIN:
Accuracy:
% Type I errors: 0.2208185028347941
% Type II errors: 0.0

Precision: 0.6936618687484752
Recall: 1.0 

----------

TEST:
Accuracy:
% Type I errors: 0.21974930009425866
% Type II errors: 7.0342285561542466e-06

Precision: 0.6946773783693974
Recall: 0.9999859311470336
WHOLE DATASET:
Accuracy:
% Type I errors: 0.2202839014645264
% Type II errors: 3.5171142780771233e-06

Precision: 0.6941692343000004
Recall: 0.9999929657714438


### <s>Support Vector Machine</s>

In [20]:
# svm = SVC(kernel='linear', probability=True)
# fit_and_train(svm, X_train, Y_train, X_test, Y_test)

#### Evaluating Support Vector Machine

In [21]:
# evaluate_model_printout(svm, oversampled_train, oversampled_test)

### Random Forest Classifier

In [22]:
rfc = ensemble.RandomForestClassifier()
fit_and_train(rfc, X_train, Y_train, X_test, Y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


R² for train: 0.999866349657433

R² for test: 0.999866349657433
Improvement over baseline: 0.0015965933801753208


#### Evaluating Random Forest Classifier

In [23]:
evaluate_model_printout(rfc, oversampled_train, oversampled_test)

TRAIN:
Accuracy:
% Type I errors: 0.008638032666957415
% Type II errors: 0.0

Precision: 0.9830177981220009
Recall: 1.0 

----------

TEST:
Accuracy:
% Type I errors: 0.020582152755307326
% Type II errors: 0.0

Precision: 0.9604621309370989
Recall: 1.0
WHOLE DATASET:
Accuracy:
% Type I errors: 0.01461009271113237
% Type II errors: 0.0

Precision: 0.9716093933677793
Recall: 1.0


### Naive Bayes

In [24]:
bnb = BernoulliNB()
fit_and_train(bnb, X_train, Y_train, X_test, Y_test)

  y = column_or_1d(y, warn=True)


R² for train: 0.9448235112055261

R² for test: 0.9448235112055261
Improvement over baseline: -0.053541495327674285


#### Evaluating Naive Bayes

In [25]:
evaluate_model_printout(bnb, oversampled_train, oversampled_test)

TRAIN:
Accuracy:
% Type I errors: 0.0340386319832304
% Type II errors: 0.024802689888999873

Precision: 0.9331592905685397
Recall: 0.9503960159250454 

----------

TEST:
Accuracy:
% Type I errors: 0.033532167527187295
% Type II errors: 0.02408519857627214

Precision: 0.9341774598878794
Recall: 0.951828247442986
WHOLE DATASET:
Accuracy:
% Type I errors: 0.03378539975520885
% Type II errors: 0.024443944232636006

Precision: 0.9336684666270767
Recall: 0.951112111534728


## Conclusion

Oversampling the minority class (frauds) proved a better strategy to deal with the skewed dataset than undersampling the majority class (not frauds), as the models built after oversampling were more accurate.

Of all the classifiers built using oversampling, the Random Forest Classifier was the most performant, as it 1) minimized errors, especially Type II errors (Type I - 1.46%, Type II - 0%), and 2) showed the highest precision (97.16%) and recall (100%) rate. For a problem like this, we are more concerned with a high recall rate than precision rate, though both are important. 

In [26]:
# MENTOR NOTES:
# Precision – % of my + predictions that are correct
# Recall – % of my target +s I predicted

# CURRICULUM NOTES:
# There are a few things you can do to deal with class imbalance:

# Ignore. If we really only care about the absolute accuracy of the model and our sample is representative of the population, 
# this can be a reasonable strategy. Engineer features that strongly identify the minority class, and this can turn out ok.

# Change your sampling. If you oversample the minority class or undersample the majority class, you can create a more balanced training set. 
# This is particularly useful if the goal of your model is to correctly identify the minority class. This can also be done by creating synthetic samples 
# to try to make your data more balanced or weighting samples to balance out your classes. 

# Probability outputs. 
# Although Naive Bayes' probability outputs are generally inaccurate, other models will give you a more accurate probability of a certain class. 
# e.g. logistic regression or support vector machines (SVM)
# Instead of just taking the most likely outcome, you can set up a specific cutoff or a more complex rule. 
# In the binary case, it could be going with the minority case if it has a priority greater than some threshold.

# Lastly, you can create cost functions for errors. This quantifies ways in which errors are not equal – scale the cost of an error up or down. 
# This can mean something like a Type II error being twice as bad as a Type I error, or however you choose to quantify that relationship. 
# SKLearn's Naive Bayes model does not have an easy built-in way to do this, but it's a good thing to keep in mind.