https://machinelearningmastery.com/cost-sensitive-decision-trees-for-imbalanced-classification/?__s=sedr9swwqoxuschybcvu

https://machinelearningmastery.com/cost-sensitive-svm-for-imbalanced-classification/?__s=sedr9swwqoxuschybcvu

In [1]:
from sklearn.datasets import make_classification

In [2]:
X , y = make_classification( n_samples=10000 , 
                             n_features=2 ,  
                             n_redundant=0 ,
                             n_clusters_per_class=1 , 
                             weights=[0.99] , 
                             flip_y=0 , 
                             random_state=3 )

In [3]:
from collections import Counter

In [4]:
counter = Counter(y)

print(counter)

Counter({0: 9900, 1: 100})


# <font color = red>Decision Tree</font>

In [5]:
from sklearn.tree import DecisionTreeClassifier

In [6]:
model = DecisionTreeClassifier( class_weight = 'balanced' )

In [7]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [8]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [9]:
from sklearn.model_selection import cross_val_score

In [10]:
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)

In [38]:
from numpy import mean

In [12]:
print('Mean ROC AUC: {0}'.format( mean(scores) ) )

Mean ROC AUC: 0.7444107744107745


## Grid Search Weighted Decision Tree

In [13]:
balance = [ {0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100} ]

param_grid = dict( class_weight = balance )

print( param_grid )

{'class_weight': [{0: 100, 1: 1}, {0: 10, 1: 1}, {0: 1, 1: 1}, {0: 1, 1: 10}, {0: 1, 1: 100}]}


In [14]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
grid = GridSearchCV( estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [17]:
grid_result = grid.fit(X, y)

In [18]:
print( grid_result.best_score_ )

0.7524410774410775


In [19]:
print( grid_result.best_params_ )

{'class_weight': {0: 1, 1: 10}}


In [24]:
means = grid_result.cv_results_['mean_test_score']

print( means )

[0.74057239 0.73228956 0.74065657 0.75244108 0.74784512]


In [25]:
stds = grid_result.cv_results_['std_test_score']

print( stds )

[0.0749361  0.07310019 0.06649053 0.0755724  0.07517203]


In [26]:
params = grid_result.cv_results_['params']

print( params )

[{'class_weight': {0: 100, 1: 1}}, {'class_weight': {0: 10, 1: 1}}, {'class_weight': {0: 1, 1: 1}}, {'class_weight': {0: 1, 1: 10}}, {'class_weight': {0: 1, 1: 100}}]


In [27]:
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.740572 (0.074936) with: {'class_weight': {0: 100, 1: 1}}
0.732290 (0.073100) with: {'class_weight': {0: 10, 1: 1}}
0.740657 (0.066491) with: {'class_weight': {0: 1, 1: 1}}
0.752441 (0.075572) with: {'class_weight': {0: 1, 1: 10}}
0.747845 (0.075172) with: {'class_weight': {0: 1, 1: 100}}


# <font color = violet>Support Vector Machines (SVM)</font>

In [31]:
from sklearn.svm import SVC

In [32]:
model = SVC(gamma='scale')

In [33]:
cv = RepeatedStratifiedKFold( n_splits=10 , n_repeats=3 , random_state=1 )

In [35]:
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)

print( scores )

[0.49828283 0.71787879 0.72505051 0.82919192 0.64424242 0.5989899
 0.74929293 0.84060606 0.66323232 0.72010101 0.72323232 0.6330303
 0.85909091 0.8479798  0.59666667 0.44838384 0.85818182 0.61707071
 0.94555556 0.55222222 0.83434343 0.86232323 0.73212121 0.63757576
 0.54575758 0.74242424 0.54151515 0.86505051 0.66808081 0.62555556]


In [39]:
print( mean( scores ) )

0.7041010101010101


In [40]:
balance = [{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]

param_grid = dict(class_weight=balance)

print( param_grid )

{'class_weight': [{0: 100, 1: 1}, {0: 10, 1: 1}, {0: 1, 1: 1}, {0: 1, 1: 10}, {0: 1, 1: 100}]}


In [41]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [42]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [43]:
print( grid_result.best_score_ )

0.7524410774410775


In [44]:
print( grid_result.best_params_ )

{'class_weight': {0: 1, 1: 10}}


In [45]:
means = grid_result.cv_results_['mean_test_score']

print( means )

[0.74057239 0.73228956 0.74065657 0.75244108 0.74784512]


In [46]:
stds = grid_result.cv_results_['std_test_score']

print( stds )

[0.0749361  0.07310019 0.06649053 0.0755724  0.07517203]


In [47]:
params = grid_result.cv_results_['params']

print( params )

[{'class_weight': {0: 100, 1: 1}}, {'class_weight': {0: 10, 1: 1}}, {'class_weight': {0: 1, 1: 1}}, {'class_weight': {0: 1, 1: 10}}, {'class_weight': {0: 1, 1: 100}}]


In [48]:
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.740572 (0.074936) with: {'class_weight': {0: 100, 1: 1}}
0.732290 (0.073100) with: {'class_weight': {0: 10, 1: 1}}
0.740657 (0.066491) with: {'class_weight': {0: 1, 1: 1}}
0.752441 (0.075572) with: {'class_weight': {0: 1, 1: 10}}
0.747845 (0.075172) with: {'class_weight': {0: 1, 1: 100}}
