In [1]:
import pandas as pd
import pre_processing_data

# from sklearn.datasets import load_breast_cancer
# data = load_breast_cancer()
# list(data.target_names) # 0 is malignant, 1 is benign
# frame = pd.DataFrame(data.target, columns=['target'])
# frame['target'].value_counts()

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from costcla.metrics import cost_loss

# Load the fetal-health data and let the project helper build the binary
# train/test split that the rest of the notebook works on.
data = pd.read_csv("venv/data/fetal_health.csv")
X_train, X_test, y_train, y_test = pre_processing_data.pre_processing_binary(data)

# Misclassification costs, defined once so that the costcla cost matrix and the
# classic 2x2 cost matrix below cannot drift apart.
COST_FP = 1  # cost of a false positive
COST_FN = 4  # cost of a false negative

# Example-dependent cost matrix required by costcla: one row per test sample,
# columns ordered fp, fn, tp, tn. Correct predictions (tp, tn) cost nothing.
n_test = y_test.shape[0]
fp = np.full((n_test, 1), COST_FP)
fn = np.full((n_test, 1), COST_FN)
tp = np.zeros((n_test, 1))
tn = np.zeros((n_test, 1))
cost_matrix = np.hstack((fp, fn, tp, tn))

# Classic cost matrix, laid out like the *transposed* confusion matrix printed
# in the loop below: rows are the predicted class, columns the actual class.
# Hence [0][1] (predicted 0, actual 1) is the false-negative cost and
# [1][0] (predicted 1, actual 0) is the false-positive cost.
cost_m = [[0, COST_FN], [COST_FP, 0]]

names = ['random forest', 'linear SVM']
classifiers = [RandomForestClassifier(n_estimators=100, random_state=0),
               SVC(kernel='linear', C=1)]

for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    conf_m = confusion_matrix(y_test, y_pred).T  # transpose to align with slides
    print(conf_m)
    # Total cost computed two independent ways; the two printed numbers are
    # expected to agree.
    print(np.sum(conf_m * cost_m))
    loss = cost_loss(y_test, y_pred, cost_matrix)
    print("%d\n" % loss)

here 1655 471 0
random forest
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       414
           1       0.89      0.79      0.84       118

    accuracy                           0.93       532
   macro avg       0.92      0.88      0.90       532
weighted avg       0.93      0.93      0.93       532

[[403  25]
 [ 11  93]]
111
111

linear SVM
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       414
           1       0.79      0.77      0.78       118

    accuracy                           0.90       532
   macro avg       0.86      0.86      0.86       532
weighted avg       0.90      0.90      0.90       532

[[390  27]
 [ 24  91]]
132
132



In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from costcla.models import BayesMinimumRiskClassifier

def _report_cost(y_true, y_pred, cost_mat, losses):
    """Print the evaluation of one set of predictions and record its cost.

    Prints, in this order: the classification report, the total cost returned
    by ``cost_loss`` (formatted with ``%d``) and the transposed confusion
    matrix. The cost is also appended to ``losses``.

    Parameters
    ----------
    y_true : true labels of the evaluated samples.
    y_pred : predicted labels for the same samples.
    cost_mat : cost matrix passed straight through to ``cost_loss``.
    losses : list that collects the cost of every evaluated variant.

    Returns
    -------
    The cost value returned by ``cost_loss``.
    """
    print(classification_report(y_true, y_pred))
    loss = cost_loss(y_true, y_pred, cost_mat)
    losses.append(loss)
    print("%d\n" % loss)
    print(confusion_matrix(y_true, y_pred).T)  # transpose to align with slides
    return loss


# Cost of each variant, in the order the variants are evaluated below.
cost_min_loss = []

# --- plain random forest, default decision rule -----------------------------
print("no cost minimization")
# The first three variants all use the same forest (same data, same seed), so
# it is fitted once here and reused instead of being refitted per variant.
clf = RandomForestClassifier(random_state=0, n_estimators=100)
model = clf.fit(X_train, y_train)
pred_test = model.predict(X_test)
loss = _report_cost(y_test, pred_test, cost_matrix, cost_min_loss)


# --- Bayes minimum risk on the raw forest probabilities ---------------------
print("no calibration")
prob_test = model.predict_proba(X_test)
bmr = BayesMinimumRiskClassifier(calibration=False)
pred_test = bmr.predict(prob_test, cost_matrix)
loss = _report_cost(y_test, pred_test, cost_matrix, cost_min_loss)

# --- Bayes minimum risk with costcla's own calibration ----------------------
print("costcla calibration on training set")
# NOTE(review): the calibrator is fitted on probabilities the forest predicts
# for its own training samples; confirm this in-sample calibration is the
# intended demonstration rather than an oversight.
prob_train = model.predict_proba(X_train)
bmr = BayesMinimumRiskClassifier(calibration=True)
bmr.fit(y_train, prob_train)
# Recomputed rather than reusing the array handed to the previous predict call.
prob_test = model.predict_proba(X_test)
pred_test = bmr.predict(prob_test, cost_matrix)
loss = _report_cost(y_test, pred_test, cost_matrix, cost_min_loss)

# --- Bayes minimum risk on scikit-learn calibrated probabilities ------------
# The two sections differ only in the calibration method.
for method in ("sigmoid", "isotonic"):
    print("\n%s calibration" % method)
    clf = RandomForestClassifier(random_state=0, n_estimators=100)
    cc = CalibratedClassifierCV(clf, method=method, cv=3)
    model = cc.fit(X_train, y_train)
    prob_test = model.predict_proba(X_test)
    bmr = BayesMinimumRiskClassifier(calibration=False)
    pred_test = bmr.predict(prob_test, cost_matrix)
    loss = _report_cost(y_test, pred_test, cost_matrix, cost_min_loss)





no cost minimization
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       414
           1       0.89      0.79      0.84       118

    accuracy                           0.93       532
   macro avg       0.92      0.88      0.90       532
weighted avg       0.93      0.93      0.93       532

111

[[403  25]
 [ 11  93]]
no calibration
              precision    recall  f1-score   support

           0       0.98      0.93      0.95       414
           1       0.78      0.93      0.85       118

    accuracy                           0.93       532
   macro avg       0.88      0.93      0.90       532
weighted avg       0.94      0.93      0.93       532

63

[[383   8]
 [ 31 110]]
costcla calibration on training set
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       414
           1       0.91      0.75      0.82       118

    accuracy                           0.93       532
 

In [4]:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from costcla.metrics import cost_loss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter


def _report_predictions(y_true, y_hat, cost_mat):
    """Print the evaluation of one set of predictions and return its cost.

    Prints, in this order: the classification report, the transposed confusion
    matrix and the total cost returned by ``cost_loss`` (formatted with ``%d``).
    """
    print(classification_report(y_true, y_hat))
    print(confusion_matrix(y_true, y_hat).T)  # transpose to align with slides
    loss = cost_loss(y_true, y_hat, cost_mat)
    print("%d\n" % loss)
    return loss


clf = RandomForestClassifier(n_estimators=100, random_state=0)
print("without sampling")
# Class distribution of the training split, before any resampling.
print(Counter(y_train))

model = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
loss = _report_predictions(y_test, y_pred, cost_matrix)

# NOTE(review): the two commented-out variants below look like leftovers from
# an earlier breast-cancer version of this notebook: their hard-coded class
# counts do not match the training split printed above, and they read
# `data.target_names`, while `data` is now the DataFrame loaded from the CSV.
# They need updating before they can be re-enabled.

# print("with undersampling")
# sampler = RandomUnderSampler(sampling_strategy={0: 149, 1: 37}, random_state=1)
# X_rs, y_rs = sampler.fit_resample(X_train, y_train)
# print(Counter(y_rs))

# model = clf.fit(X_rs, y_rs)
# y_pred = clf.predict(X_test)

# print(classification_report(y_test, y_pred, target_names=data.target_names))
# print(confusion_matrix(y_test, y_pred).T) # transpose to align with slides
# loss = cost_loss(y_test, y_pred, cost_matrix)
# print("%d\n" %loss)

# print("with oversampling")
# sampler = RandomOverSampler(sampling_strategy={0: 1000, 1: 249}, random_state=1)
# X_rs, y_rs = sampler.fit_resample(X_train, y_train)
# print(Counter(y_rs))

# model = clf.fit(X_rs, y_rs)
# y_pred = clf.predict(X_test)

# print(classification_report(y_test, y_pred, target_names=data.target_names))
# print(confusion_matrix(y_test, y_pred).T) # transpose to align with slides
# loss = cost_loss(y_test, y_pred, cost_matrix)
# print("%d\n" %loss)


count_y = Counter(y_train)
major_class = count_y[0]
minor_class = count_y[1]

# Presumably c = [cost of a false positive, cost of a false negative], i.e. the
# same 1 and 4 used to build cost_matrix -- confirm and keep them in sync.
c = [1, 4]
# Despite their names, these two are target *sample counts*: each class is
# resampled to its original size divided by the cost stored in the other slot
# of c. Integer division; the counts are positive integers.
cost_major = major_class // c[1]
cost_minor = minor_class // c[0]


print("with combination")
# Step 1: shrink class 0 to its target size, leave class 1 untouched.
sampler = RandomUnderSampler(sampling_strategy={0: cost_major, 1: minor_class}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
# Step 2: bring class 1 to its target size. With c[0] == 1 the target equals
# the current size, so this step leaves the class counts as they are.
sampler = RandomOverSampler(sampling_strategy={0: cost_major, 1: cost_minor}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_rs, y_rs)
print(Counter(y_rs))

# Refit the same estimator object on the resampled training data.
model = clf.fit(X_rs, y_rs)
y_pred = clf.predict(X_test)
loss = _report_predictions(y_test, y_pred, cost_matrix)


without sampling
Counter({0: 1241, 1: 353})
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       414
           1       0.89      0.79      0.84       118

    accuracy                           0.93       532
   macro avg       0.92      0.88      0.90       532
weighted avg       0.93      0.93      0.93       532

[[403  25]
 [ 11  93]]
111

with combination
Counter({1: 353, 0: 310})
              precision    recall  f1-score   support

           0       0.97      0.90      0.93       414
           1       0.72      0.89      0.80       118

    accuracy                           0.90       532
   macro avg       0.85      0.90      0.87       532
weighted avg       0.91      0.90      0.90       532

[[374  13]
 [ 40 105]]
92



In [6]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from costcla.metrics import cost_loss


def _summarise(y_true, y_hat, cost_mat):
    """Print report, total cost and transposed confusion matrix; return the cost."""
    print(classification_report(y_true, y_hat))
    total_cost = cost_loss(y_true, y_hat, cost_mat)
    print("{:d}\n".format(int(total_cost)))
    print(confusion_matrix(y_true, y_hat).T)  # transpose to align with slides
    return total_cost


# --- baseline: every training sample counts the same ------------------------
print("without weights")
clf = RandomForestClassifier(n_estimators=10, random_state=0)
#clf = SVC(kernel='linear', probability=False, C=1)
#clf = DecisionTreeClassifier()
model = clf.fit(X_train, y_train)
pred_test = model.predict(X_test)
loss = _summarise(y_test, pred_test, cost_matrix)

# --- per-sample weights derived from the label ------------------------------
print("\nwith weights")
# Samples labelled 1 get weight 4, samples labelled 0 get weight 1; any other
# label would keep the initial weight of 0.
labels = np.asarray(y_train)
weights = np.zeros(y_train.shape[0])
weights[labels == 1] = 4
weights[labels == 0] = 1

# Same estimator object as above, refitted with the sample weights.
model = clf.fit(X_train, y_train, sample_weight=weights)
pred_test = clf.predict(X_test)
loss = _summarise(y_test, pred_test, cost_matrix)

# --- same weighting expressed through the estimator's class_weight ----------
print("\nwith weights (alternative)")
clf = RandomForestClassifier(n_estimators=10, random_state=0, class_weight={0: 1, 1: 4})
model = clf.fit(X_train, y_train)
pred_test = model.predict(X_test)
loss = _summarise(y_test, pred_test, cost_matrix)

without weights
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       414
           1       0.92      0.74      0.82       118

    accuracy                           0.93       532
   macro avg       0.92      0.86      0.89       532
weighted avg       0.93      0.93      0.92       532

132

[[406  31]
 [  8  87]]

with weights
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       414
           1       0.89      0.71      0.79       118

    accuracy                           0.92       532
   macro avg       0.91      0.84      0.87       532
weighted avg       0.92      0.92      0.91       532

146

[[404  34]
 [ 10  84]]

with weights (alternative)
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       414
           1       0.89      0.71      0.79       118

    accuracy                           0.92       532
   macro avg  