In [1]:
import pandas as pd
import xgboost as xgb
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import pickle


In [2]:
from sklearn import datasets


In [9]:
iris = datasets.load_iris()
X = iris.data[:, :3]  # we only take the first two features.
df = pd.DataFrame(X,columns = ["sepal length","sepal width", "petal length"])
df['classIndex'] = iris.target

In [11]:
from sklearn.model_selection import train_test_split
# Split to train/test
training_df, test_df = train_test_split(df)

In [12]:
training_df = training_df[(training_df['classIndex']==0)|(training_df['classIndex']==1)]
test_df = test_df[(test_df['classIndex']==0)|(test_df['classIndex']==1)]

In [15]:
X_train = training_df.drop(columns = ['classIndex'])
y_train = training_df.classIndex

In [16]:
X_test = test_df.drop(columns = ['classIndex'])
y_test = test_df.classIndex

In [17]:
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)

In [19]:
# all

# in sample
preds_prob_train = xgb_cl.predict_proba(X_train)[::,1] # get probability of 1 - out of sample

# out of sample
preds_prob_test = xgb_cl.predict_proba(X_test)[::,1] # get probability of 1 - out of sample

In [21]:
# tune optimal probability threshold
threshold = [x*0.001 for x in range(1000)]
score = [metrics.f1_score(y_test, preds_prob_test > t) for t in threshold]
optimal_threshold = threshold[np.argmax(score)]

np.max(score), threshold[np.argmax(score)]


(1.0, 0.026000000000000002)

In [22]:
preds_test_optimal = [int(x) for x in preds_prob_test > optimal_threshold]
# out of sample
print(metrics.f1_score(y_test, preds_test_optimal))
print(metrics.confusion_matrix(y_test, preds_test_optimal))
print(metrics.roc_auc_score(y_test, preds_prob_test))

1.0
[[13  0]
 [ 0 11]]
1.0


In [24]:
model_path = 'models/flow_mosal.pkl'

with open(model_path, 'wb') as fid:
    pickle.dump(xgb_cl, fid)