In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
import plotly.graph_objects as go

In [3]:
df = pd.read_csv("DataSet2_1.csv", delimiter='|')
df.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [4]:
tr = list(df["fraud"].values).count(1)
fls = list(df["fraud"].values).count(0)

print("total:", len(df))
print('1:', tr)
print('0:', fls)
print(fls, '/', tr, '=', round(fls / tr))

total: 1879
1: 104
0: 1775
1775 / 104 = 17


In [5]:
x = df.drop("fraud", axis=1)
y = df["fraud"].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)
knn_prediction = knn_model.predict(x_test)
knn_prediction_proba = knn_model.predict_proba(x_test)[:,1]

In [7]:
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(x_train, y_train)
svm_prediction = svm_model.predict(x_test)
svm_prediction_proba = svm_model.predict_proba(x_test)[:,1]

In [8]:
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
nb_prediction = nb_model.predict(x_test)
nb_prediction_proba = nb_model.predict_proba(x_test)[:,1]

In [9]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)
dt_prediction = dt_model.predict(x_test)
dt_prediction_proba = dt_model.predict_proba(x_test)[:,1]

In [10]:
print("accuracy_score")
print("knn:", accuracy_score(y_test, knn_prediction) * 100)
print("svm:", accuracy_score(y_test, svm_prediction) * 100)
print("Naive Bayes:", accuracy_score(y_test, nb_prediction) * 100)
print("Decision Tree:", accuracy_score(y_test, dt_prediction) * 100)

accuracy_score
knn: 96.27659574468085
svm: 96.80851063829788
Naive Bayes: 86.17021276595744
Decision Tree: 97.07446808510637


In [11]:
print("roc_auc_score")
print("knn:", roc_auc_score(y_test, knn_prediction_proba) * 100)
print("svm:", roc_auc_score(y_test, svm_prediction_proba) * 100)
print("Naive Bayes:", roc_auc_score(y_test, nb_prediction_proba) * 100)
print("Decision Tree:", roc_auc_score(y_test, dt_prediction_proba) * 100)

roc_auc_score
knn: 47.95777426992896
svm: 97.00078926598263
Naive Bayes: 95.46172059984215
Decision Tree: 84.74743488555644


In [12]:
fpr_knn, tpr_knn, _ = roc_curve(y_test, knn_prediction_proba)
fpr_svm, tpr_svm, _ = roc_curve(y_test, svm_prediction_proba)
fpr_nb, tpr_nb, _ = roc_curve(y_test, nb_prediction_proba)
fpr_dt, tpr_dt, _ = roc_curve(y_test, dt_prediction_proba)

In [13]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr_knn, y=tpr_knn, name='roc_curve_knn'))
fig.add_trace(go.Scatter(x=fpr_svm, y=tpr_svm, name='roc_curve_svm'))
fig.add_trace(go.Scatter(x=fpr_nb, y=tpr_nb, name='roc_curve_nb'))
fig.add_trace(go.Scatter(x=fpr_dt, y=tpr_dt, name='roc_curve_dt'))

fig.update_layout(title="ROC curve",
                  xaxis_title="False positive rate",
                  yaxis_title="True positive rate")

Кроссвалидация

In [14]:
from sklearn.model_selection import cross_validate

In [15]:
model = KNeighborsClassifier()
knn_cv_model = cross_validate(model, x_train, y_train, cv=5, return_estimator=True)
ind = list(knn_cv_model["test_score"]).index(max(knn_cv_model["test_score"]))
best_knn_model = knn_cv_model["estimator"][ind]
knn_cv_prediction = best_knn_model.predict(x_test)
knn_cv_prediction_proba = best_knn_model.predict_proba(x_test)[:,1]

In [16]:
model = SVC(kernel="linear", probability=True)
svm_cv_model = cross_validate(model, x_train, y_train, cv=5, return_estimator=True)
ind = list(svm_cv_model["test_score"]).index(max(svm_cv_model["test_score"]))
best_svm_model = svm_cv_model["estimator"][ind]
svm_cv_prediction = best_svm_model.predict(x_test)
svm_cv_prediction_proba = best_svm_model.predict_proba(x_test)[:,1]

In [17]:
model = GaussianNB()
nb_cv_model = cross_validate(model, x_train, y_train, cv=5, return_estimator=True)
ind = list(nb_cv_model["test_score"]).index(max(nb_cv_model["test_score"]))
best_nb_model = nb_cv_model["estimator"][ind]
nb_cv_prediction = best_nb_model.predict(x_test)
nb_cv_prediction_proba = best_nb_model.predict_proba(x_test)[:,1]

In [18]:
model = DecisionTreeClassifier()
dt_cv_model = cross_validate(model, x_train, y_train, cv=5, return_estimator=True)
ind = list(dt_cv_model["test_score"]).index(max(dt_cv_model["test_score"]))
best_dt_model = dt_cv_model["estimator"][ind]
dt_cv_prediction = best_dt_model.predict(x_test)
dt_cv_prediction_proba = best_dt_model.predict_proba(x_test)[:,1]

In [19]:
print("accuracy_score cv")
print("knn_cv:", accuracy_score(y_test, knn_cv_prediction) * 100)
print("svm_cv:", accuracy_score(y_test, svm_cv_prediction) * 100)
print("Naive Bayes_cv:", accuracy_score(y_test, nb_cv_prediction) * 100)
print("Decision Tree_cv:", accuracy_score(y_test, dt_cv_prediction) * 100)

accuracy_score cv
knn_cv: 96.27659574468085
svm_cv: 97.07446808510637
Naive Bayes_cv: 91.22340425531915
Decision Tree_cv: 96.01063829787235


In [20]:
print("roc_auc_score cv")
print("knn_cv:", roc_auc_score(y_test, knn_cv_prediction_proba) * 100)
print("svm_cv:", roc_auc_score(y_test, svm_cv_prediction_proba) * 100)
print("Naive Bayes_cv:", roc_auc_score(y_test, nb_cv_prediction_proba) * 100)
print("Decision Tree_cv:", roc_auc_score(y_test, dt_cv_prediction_proba) * 100)

roc_auc_score cv
knn_cv: 51.558800315706385
svm_cv: 97.39542225730071
Naive Bayes_cv: 95.38279400157853
Decision Tree_cv: 80.76164167324387


In [21]:
fpr_knn, tpr_knn, _ = roc_curve(y_test, knn_cv_prediction_proba)
fpr_svm, tpr_svm, _ = roc_curve(y_test, svm_cv_prediction_proba)
fpr_nb, tpr_nb, _ = roc_curve(y_test, nb_cv_prediction_proba)
fpr_dt, tpr_dt, _ = roc_curve(y_test, dt_cv_prediction_proba)

In [22]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr_knn, y=tpr_knn, name='roc_curve_knn_cv'))
fig.add_trace(go.Scatter(x=fpr_svm, y=tpr_svm, name='roc_curve_svm_cv'))
fig.add_trace(go.Scatter(x=fpr_nb, y=tpr_nb, name='roc_curve_nb_cv'))
fig.add_trace(go.Scatter(x=fpr_dt, y=tpr_dt, name='roc_curve_dt_cv'))

fig.update_layout(title="ROC curve (cv)",
                  xaxis_title="False positive rate",
                  yaxis_title="True positive rate")

Композиция моделей

In [23]:
from sklearn.ensemble import VotingClassifier

In [25]:
classifier = VotingClassifier(estimators=
 [("svm", best_svm_model), ("nb", best_nb_model), ("dt", best_dt_model)])

classifier.fit(x_train, y_train)
res_ensemble = classifier.predict(x_test)

In [27]:
print("roc_auc_score", roc_auc_score(y_test, res_ensemble) * 100)

roc_auc_score 88.04262036306234


In [28]:
print("accuracy_score", accuracy_score(y_test, res_ensemble) * 100)

accuracy_score 96.80851063829788


In [29]:
fpr_svm, tpr_svm, _ = roc_curve(y_test, svm_cv_prediction_proba)
fpr_nb, tpr_nb, _ = roc_curve(y_test, nb_cv_prediction_proba)
fpr_dt, tpr_dt, _ = roc_curve(y_test, dt_cv_prediction_proba)
fpr_evc, tpr_evc, _ = roc_curve(y_test, res_ensemble)

In [30]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr_svm, y=tpr_svm, name='roc_curve_svm_cv'))
fig.add_trace(go.Scatter(x=fpr_nb, y=tpr_nb, name='roc_curve_nb_cv'))
fig.add_trace(go.Scatter(x=fpr_dt, y=tpr_dt, name='roc_curve_dt_cv'))
fig.add_trace(go.Scatter(x=fpr_evc, y=tpr_evc, name='roc_curve_evc'))

fig.update_layout(title="ROC curve (cv + evc)",
                  xaxis_title="False positive rate",
                  yaxis_title="True positive rate")