In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import (
    LabelEncoder, MinMaxScaler
    )
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )

To get the classifiers, let's find all the estimators that have probabilities as part of their prediction:

In [3]:
from sklearn.utils.testing import all_estimators

estimators = all_estimators()

for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)



AdaBoostClassifier
BaggingClassifier
BayesianGaussianMixture
BernoulliNB
CalibratedClassifierCV
CategoricalNB
ClassifierChain
ComplementNB
DecisionTreeClassifier
DummyClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GaussianMixture
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
GridSearchCV
HistGradientBoostingClassifier
KNeighborsClassifier
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultiOutputClassifier
MultinomialNB
NoSampleWeightWrapper
NuSVC
OneVsRestClassifier
Pipeline
QuadraticDiscriminantAnalysis
RFE
RFECV
RadiusNeighborsClassifier
RandomForestClassifier
RandomizedSearchCV
SGDClassifier
SVC
StackingClassifier
VotingClassifier
_BinaryGaussianProcessClassifierLaplace
_ConstantPredictor


Okay, not all of those are classifiers. But good enough. We'll test on all that are interesting for our case.  
Special case: Classifiers with feature importances for specific methods.

In [4]:
from sklearn.utils.testing import all_estimators

estimators = all_estimators()

for name, class_ in estimators:
    if hasattr(class_, 'predict_proba') and (hasattr(class_, 'coef_') or hasattr(class_, 'feature_importances_')):
        print(name)

AdaBoostClassifier
BernoulliNB
CategoricalNB
ComplementNB
DecisionTreeClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GradientBoostingClassifier
MultinomialNB
NuSVC
OneVsRestClassifier
RandomForestClassifier
SVC


Let's collect all those estimators into one dictionary for easy access.

In [6]:
estimator_dict = dict(estimators)

## Preprocessing

In [7]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

## No imputation

We throw away the NaN values and encode the data like the other baselines.

In [8]:
quake_frame.dropna(inplace=True)
quake_frame.isna().sum()

latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
simple_label       0
dtype: int64

In [9]:
len(quake_frame)

1227408

In [10]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,1227408.0,37.361674,4.841731,0.0,35.964167,37.573,38.817,62.030667
longitude,1227408.0,-119.557707,10.027502,-179.098,-122.701333,-120.558833,-118.150167,179.6615
depth,1227408.0,6.016756,7.92288,-3.882,1.816,4.413,7.83,211.0
mag,1227408.0,1.258097,0.694405,-2.5,0.8,1.18,1.67,5.84
nst,1227408.0,17.010182,13.671235,0.0,8.0,13.0,22.0,276.0
gap,1227408.0,121.03215,65.767724,0.0,72.0,105.0,153.0,360.0
dmin,1227408.0,0.078264,0.342578,0.0,0.01712,0.03784,0.07999,141.16
rms,1227408.0,0.097118,0.195847,0.0,0.03,0.06,0.13,64.29
horizontalError,1227408.0,0.801039,2.296862,0.0,0.27,0.41,0.72,194.5841
depthError,1227408.0,2.773763,6.903563,0.0,0.49,0.77,1.46,725.3


Alright, this changes the proportions slightly, but not too bad. If anything, one might suggest that at least the mild increase in proportion of non-earthquakes offsets the reduced dataset a little.  
Okay, so the problematic values are no longer there, that's something.  
Let's try this.  
We'll start by mixing up the data frame, then encoding all the categories numerically and splitting it sklearn style.

In [11]:
quake_frame = quake_frame.sample(frac=1, random_state=42).reset_index(drop=True)

cat_columns = ['magType', 'net', 'status', 'locationSource', 'magSource']

for cat in cat_columns:
    quake_frame = pd.concat([quake_frame,
                             pd.get_dummies(quake_frame[cat], prefix=cat)],
                            axis=1)

scale_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError',
 'depthError', 'magError', 'magNst']

scaler = MinMaxScaler()

quake_frame[scale_cols] = scaler.fit_transform(quake_frame[scale_cols])

x_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError', 'depthError',
 'magError', 'magNst', 'magType_Mb', 'magType_Md', 'magType_Ml', 'magType_Unknown', 'magType_ma', 'magType_mb',
 'magType_mc', 'magType_md', 'magType_me', 'magType_mh', 'magType_ml', 'magType_mlg', 'magType_mlr', 'magType_mw',
 'net_av', 'net_ci', 'net_hv', 'net_ismpkansas', 'net_ld', 'net_mb', 'net_nc', 'net_nm', 'net_nn', 'net_pr',
 'net_se', 'net_uu', 'net_uw', 'status_automatic', 'status_manual', 'status_reviewed', 'locationSource_av',
 'locationSource_ci', 'locationSource_hv', 'locationSource_ismp', 'locationSource_ld', 'locationSource_mb',
 'locationSource_nc', 'locationSource_nm', 'locationSource_nn', 'locationSource_pr', 'locationSource_se',
 'locationSource_uu', 'locationSource_uw', 'magSource_av', 'magSource_ci', 'magSource_hv', 'magSource_ismp',
 'magSource_ld', 'magSource_mb', 'magSource_nc', 'magSource_nm', 'magSource_nn', 'magSource_pr', 'magSource_se',
 'magSource_uu', 'magSource_uw']

y_col = ['simple_label']

In [12]:
train_length = int(np.round(len(quake_frame.index) * 0.8))

In [13]:
train_X = quake_frame.loc[:train_length, x_cols]
train_y = quake_frame.loc[:train_length, y_col]

valid_X = quake_frame.loc[train_length:, x_cols]
valid_y = quake_frame.loc[train_length:, y_col]

## Try AdaBoostClassifier

In [19]:
clf = estimator_dict['AdaBoostClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

In [20]:
preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.7889921372408863
Recall:  0.6336394948335247
ROC score:  0.81370282480767
F1 score:  0.7028334925183063
Accuracy score:  0.9809884227764154


## Try BaggingClassifier

In [21]:
clf = estimator_dict['BaggingClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9568292682926829
Recall:  0.9008036739380023
ROC score:  0.9496542823595033
F1 score:  0.9279716144293317
Accuracy score:  0.995038332749448


## Try BayesianGaussianMixture

In [22]:
clf = estimator_dict['BayesianGaussianMixture'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

  _warn_prf(average, modifier, msg_start, len(result))


Precision:  0.0
Recall:  0.0
ROC score:  0.5
F1 score:  0.0
Accuracy score:  0.9645187834545914


## Try BernoulliNB

In [24]:
clf = estimator_dict['BernoulliNB']()

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.14831373311303334
Recall:  0.3504018369690011
ROC score:  0.6381906301058072
F1 score:  0.20841300191204587
Accuracy score:  0.9055572302653555


## Try CalibratedClassifierCV

In [26]:
clf = estimator_dict['CalibratedClassifierCV']()

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.748072515107314
Recall:  0.4121699196326062
ROC score:  0.7035318707686116
F1 score:  0.5314975201717373
Accuracy score:  0.9742180689419183


## Try ComplementNB

In [31]:
clf = estimator_dict['ComplementNB']()

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.07144813607603605
Recall:  0.5989667049368542
ROC score:  0.6563055273877587
F1 score:  0.12766738449490994
Accuracy score:  0.7095754474869848


## Try DecisionTreeClassifier

In [32]:
clf = estimator_dict['DecisionTreeClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8955750199067228
Recall:  0.9039035591274397
ROC score:  0.9500132057458697
F1 score:  0.8997200159990858
Accuracy score:  0.9928507996512983


## Try DummyClassifier

In [33]:
clf = estimator_dict['DummyClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)



Precision:  0.03354779411764706
Recall:  0.033524684270952926
ROC score:  0.4989984173470724
F1 score:  0.03353623521304698
Accuracy score:  0.9314410017842448


## Try ExtraTreeClassifier

In [34]:
clf = estimator_dict['ExtraTreeClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.7914396003633061
Recall:  0.8003444316877153
ROC score:  0.8962929564719724
F1 score:  0.7958671081173649
Accuracy score:  0.9854327404860641


## Try ExtraTreesClassifier

In [35]:
clf = estimator_dict['ExtraTreesClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9739884393063584
Recall:  0.8512055109070035
ROC score:  0.9251846316888674
F1 score:  0.9084670996201446
Accuracy score:  0.9939140140621309


## Try GaussianMixture

In [36]:
clf = estimator_dict['GaussianMixture'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

  _warn_prf(average, modifier, msg_start, len(result))


Precision:  0.0
Recall:  0.0
ROC score:  0.5
F1 score:  0.0
Accuracy score:  0.9645187834545914


## Try GaussianNB

In [38]:
clf = estimator_dict['GaussianNB']()

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.04088910187743136
Recall:  0.9991963260619977
ROC score:  0.5685062264844478
F1 score:  0.07856323966166262
Accuracy score:  0.16837894428104708


## Try GaussianProcessClassifier

In [None]:
# clf = estimator_dict['GaussianProcessClassifier'](random_state=42)

# clf.fit(train_X, np.ravel(train_y))

# preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

# prec = precision_score(valid_y, preds)
# reca = recall_score(valid_y, preds)
# roc = roc_auc_score(valid_y, preds)
# f1 = f1_score(valid_y, preds)
# acc = accuracy_score(valid_y, preds)
# conf_mat = confusion_matrix(valid_y, preds)

# print("Precision: ", prec)
# print("Recall: ", reca)
# print("ROC score: ", roc)
# print("F1 score: ", f1)
# print("Accuracy score: ", acc)

Kernel dies when I run this one. Huh.

## Try GradientBoostingClassifier

In [11]:
clf = estimator_dict['GradientBoostingClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9057961359093938
Recall:  0.7804822043628014
ROC score:  0.8887481046985903
F1 score:  0.8384828862164664
Accuracy score:  0.9893311933257836


## Try HistGradientBoostingClassifier

In [12]:
clf = estimator_dict['HistGradientBoostingClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9213176916696291
Recall:  0.8926521239954076
ROC score:  0.9449238691708494
F1 score:  0.9067584115691878
Accuracy score:  0.9934862841267385


## Try KNeighborsClassifier

In [None]:
clf = estimator_dict['KNeighborsClassifier']()

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Takes several hours to run. Run overnight.

## Try LinearDiscriminantAnalysis

In [11]:
clf = estimator_dict['LinearDiscriminantAnalysis']()

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.3548825822830719
Recall:  0.44936854190585535
ROC score:  0.709659267996497
F1 score:  0.3965753077663509
Accuracy score:  0.9514791308527712


## Try LogisticRegression

In [19]:
clf = estimator_dict['LogisticRegression'](random_state=42, class_weight='balanced', solver='saga', max_iter=1000)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)



Precision:  0.20697580441481034
Recall:  0.9408725602755453
ROC score:  0.9041298756642707
F1 score:  0.3393093739648891
Accuracy score:  0.8699945413513007


## Try MLPClassifier

In [14]:
clf = estimator_dict['MLPClassifier'](random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8453395300800413
Recall:  0.7517795637198622
ROC score:  0.873359921910275
F1 score:  0.795819154107924
Accuracy score:  0.9863126420674428


## Try NuSVC

In [13]:
clf = estimator_dict['NuSVC'](nu=0.01, random_state=42, max_iter=10000)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)



Precision:  0.04318803939578578
Recall:  0.5452353616532721
ROC score:  0.5504376933281143
F1 score:  0.08003640316505296
Accuracy score:  0.5552708548895642


## Try OneVsRestClassifier

In [16]:
clf = estimator_dict['OneVsRestClassifier'](estimator_dict['HistGradientBoostingClassifier'](random_state=42))

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9213176916696291
Recall:  0.8926521239954076
ROC score:  0.9449238691708494
F1 score:  0.9067584115691878
Accuracy score:  0.9934862841267385


## Try QuadraticDiscriminantAnalysis

In [17]:
clf = estimator_dict['QuadraticDiscriminantAnalysis']()

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)



Precision:  0.04084362284764949
Recall:  0.9991963260619977
ROC score:  0.5680057450085976
F1 score:  0.07847928906042174
Accuracy score:  0.16741349671258993


## Try RFE

In [23]:
clf = estimator_dict['RFE'](estimator_dict['GradientBoostingClassifier'](random_state=42))

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9057961359093938
Recall:  0.7804822043628014
ROC score:  0.8887481046985903
F1 score:  0.8384828862164664
Accuracy score:  0.9893311933257836


## Try RadiusNeighborsClassifier

In [None]:
# clf = estimator_dict['RadiusNeighborsClassifier']()

# clf.fit(train_X, np.ravel(train_y))

# preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

# prec = precision_score(valid_y, preds)
# reca = recall_score(valid_y, preds)
# roc = roc_auc_score(valid_y, preds)
# f1 = f1_score(valid_y, preds)
# acc = accuracy_score(valid_y, preds)
# conf_mat = confusion_matrix(valid_y, preds)

# print("Precision: ", prec)
# print("Recall: ", reca)
# print("ROC score: ", roc)
# print("F1 score: ", f1)
# print("Accuracy score: ", acc)

Hm. Kerneldeath.

## Try SGDClassifier

In [14]:
clf = estimator_dict['SGDClassifier'](loss='modified_huber', class_weight='balanced', average=False, random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.1865723432358011
Recall:  0.9568312284730195
ROC score:  0.9016856799537398
F1 score:  0.31225762940482205
Accuracy score:  0.850453393731516


## Try SVC

In [None]:
# clf = estimator_dict['SVC'](random_state=42)

# clf.fit(train_X, np.ravel(train_y))

# preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

# prec = precision_score(valid_y, preds)
# reca = recall_score(valid_y, preds)
# roc = roc_auc_score(valid_y, preds)
# f1 = f1_score(valid_y, preds)
# acc = accuracy_score(valid_y, preds)
# conf_mat = confusion_matrix(valid_y, preds)

# print("Precision: ", prec)
# print("Recall: ", reca)
# print("ROC score: ", roc)
# print("F1 score: ", f1)
# print("Accuracy score: ", acc)

Yeah, nah. Dataset is a bit large for an SVM. (Says the guy who ran NuSVC anyway...)

## Try StackingClassifier

In [19]:
namestring = 'esti_'
estimator_list = [(namestring + str(num), estimator_dict['GradientBoostingClassifier'](random_state=42)) for num in range(5)]

clf = estimator_dict['StackingClassifier'](estimator_list)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9009222661396574
Recall:  0.7850746268656716
ROC score:  0.8909492878216994
F1 score:  0.8390184049079755
Accuracy score:  0.989310825233622


## Try VotingClassifier

In [20]:
namestring = 'esti_'
estimator_list = [(namestring + str(num), estimator_dict['GradientBoostingClassifier'](random_state=42)) for num in range(5)]

clf = estimator_dict['VotingClassifier'](estimator_list)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9057961359093938
Recall:  0.7804822043628014
ROC score:  0.8887481046985903
F1 score:  0.8384828862164664
Accuracy score:  0.9893311933257836


The voting classifier is obviously meant as an ensemble, so popping in multiple instances of the same algorithm isn't very useful. But this is a template to build on, it might be worth trying a whole bunch of combinations. Another rabbit hole to pop down.

## Try XGBoost default

In [21]:
import xgboost as xgb

In [28]:
clf = xgb.XGBClassifier(random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9503260225251926
Recall:  0.9203214695752009
ROC score:  0.9592759173260763
F1 score:  0.9350831146106736
Accuracy score:  0.9954660626848404


## Try XGBoost Random Forest

In [29]:
clf = xgb.XGBRFClassifier(n_estimators=100, random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9416342412451362
Recall:  0.6668197474167623
ROC score:  0.8326496486817733
F1 score:  0.7807501008200027
Accuracy score:  0.9867118566738091


## Try Light GBM Gradient Boosting Decision Tree

NB: Light GBM supports categorical variables in numerical encoding. According to their documentation, categorical variables are faster than one-hot encoding, however, I left the preprocessing the same as with the rest to keep it comparable.

In [32]:
import lightgbm as lgb

In [33]:
clf = lgb.LGBMClassifier(n_estimators=100, random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9244900393653823
Recall:  0.8897818599311137
ROC score:  0.9435542009604381
F1 score:  0.9068039548353126
Accuracy score:  0.9935107258373322


## Try Light GBM Gradient-based One-Side Sampling

In [34]:
clf = lgb.LGBMClassifier(boosting_type='goss', n_estimators=100, random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9208624570548514
Recall:  0.892422502870264
ROC score:  0.9448006116635373
F1 score:  0.9064194507608887
Accuracy score:  0.9934618424161445


## Try Light GBM Dropouts meet Multiple Additive Regression Trees

In [35]:
clf = lgb.LGBMClassifier(boosting_type='dart', n_estimators=100, random_state=42)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9376590330788804
Recall:  0.8461538461538461
ROC score:  0.9220421723462624
F1 score:  0.8895594447797225
Accuracy score:  0.9925452782688751


## Try Light GBM Random Forest

In [44]:
# RF type classifier requires variables bagging_freq > 0 and bagging_fraction in (0,1)
clf = lgb.LGBMClassifier(boosting_type='rf', n_estimators=100, random_state=42, bagging_freq=1, bagging_fraction=0.9)

clf.fit(train_X, np.ravel(train_y))

preds = pd.DataFrame(clf.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.813812744749021
Recall:  0.7873708381171067
ROC score:  0.8903721049842541
F1 score:  0.8003734609324853
Accuracy score:  0.986064151343072


## Try Shogun ML AveragedPerceptron

In [62]:
from shogun import (
    AveragedPerceptron, GaussianKernel, LibSVM, RealFeatures, BinaryLabels, PrecisionMeasure
    )

In [64]:
features_train = RealFeatures(train_X.to_numpy().transpose())
features_test = RealFeatures(valid_X.to_numpy().transpose())
labels_train = BinaryLabels((train_y.astype(int) * 2 - 1).to_numpy().reshape(-1))
labels_test = BinaryLabels((valid_y.astype(int) * 2 - 1).to_numpy().reshape(-1))

In [53]:
learn_rate = 1.0
max_iter = 1000
perceptron = AveragedPerceptron(features_train, labels_train)
perceptron.set_learn_rate(learn_rate)
perceptron.set_max_iter(max_iter)
perceptron.train()
perceptron.set_features(features_test)

In [71]:
predictions = perceptron.apply_binary()

In [72]:
preds = pd.DataFrame(predictions.get_values(), columns=['predictions'])

In [73]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

## Try Shogun ML Kernel Support Vector Machine

In [56]:
features_train = RealFeatures(train_X.to_numpy().transpose())
features_test = RealFeatures(valid_X.to_numpy().transpose())
labels_train = BinaryLabels((train_y.astype(int) * 2 - 1).to_numpy().reshape(-1))
labels_test = BinaryLabels((train_y.astype(int) * 2 - 1).to_numpy().reshape(-1))

C = 1.0
epsilon = 0.001
gauss_kernel = GaussianKernel(features_train, features_train, 15)

svm = LibSVM(C, gauss_kernel, labels_train)
svm.set_epsilon(epsilon)
svm.train()

In [74]:
preds = pd.DataFrame(svm.apply_binary(features_test).get_values(), columns=['predictions'])

In [75]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

Okay, maybe I'm just a dunce, but this is not very cooperative.