In [1]:
import pandas as pd
import numpy as np

In [113]:
# loading data: read the CSV into a DataFrame; the path is relative to the
# notebook's working directory (the file name suggests missing values were
# already removed upstream -- TODO confirm)
data = pd.read_csv("../data/data_without_na.csv")

<br>

### Data preprocessing

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [114]:
# separating features and labels; "skin_thickness" is excluded as part of
# feature selection, so it is dropped together with the target column
X = data.drop(columns=["outcome", "skin_thickness"])
y = data["outcome"]

# stratified split into training (90%) and test (10%) set with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# flattening y_train into a 1-D array
y_train = np.ravel(y_train)

# standardizing X_train; the scaler is fitted on the training split only and
# is reused further down to transform the test split
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

<br>

### Logistic Regression

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import recall_score, precision_score

In [115]:
# creating model: logistic regression classifier with scikit-learn's default
# hyperparameters (no arguments are overridden here)
logistic_regression = LogisticRegression()

In [116]:
# fit on the training split (fit returns the estimator itself) and keep the
# class probabilities it assigns to that same training data
probs = logistic_regression.fit(X_train, y_train).predict_proba(X_train)

In [117]:
# out-of-fold class probabilities from 5-fold cross validation
# NOTE(review): X_train was standardized with statistics from the whole
# training split before this call, so the held-out folds are not fully
# independent of the scaling step
cross_val_proba = cross_val_predict(
    estimator=logistic_regression,
    X=X_train,
    y=y_train,
    cv=5,
    method="predict_proba",
)

In [118]:
# predictions for given threshold
def custom_predict(probabilities, threshold):
    """Turn per-class probability rows into hard 0/1 labels with a custom cut-off.

    Parameters
    ----------
    probabilities : array-like of shape (n_samples, n_classes)
        Per-sample class probabilities. Only column 1 is compared against
        the threshold (in this notebook the input is `predict_proba` output).
        Plain nested lists are accepted as well as NumPy arrays.
    threshold : float
        Cut-off value. A sample is labelled 1 only when its column-1
        probability is strictly greater than this value, otherwise 0.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        The thresholded labels.
    """
    # coerce first so that list input supports the 2-D slice below
    probabilities = np.asarray(probabilities)
    return (probabilities[:, 1] > threshold).astype(int)

In [119]:
# recall, then precision, on the training data at a 0.3 decision threshold
thresh_predictions = custom_predict(probabilities=probs, threshold=0.3)
print(
    recall_score(y_train, thresh_predictions),
    precision_score(y_train, thresh_predictions),
    sep="\n",
)

0.7925311203319502
0.60062893081761


In [120]:
# recall, then precision, of the cross-validated probabilities at a 0.3
# decision threshold
thresh_predictions = custom_predict(probabilities=cross_val_proba, threshold=0.3)
print(
    recall_score(y_train, thresh_predictions),
    precision_score(y_train, thresh_predictions),
    sep="\n",
)

0.7842323651452282
0.6019108280254777


<br>

### Support vector machine

In [7]:
from sklearn.svm import SVC

In [121]:
# creating model: linear-kernel SVM. probability=True enables predict_proba,
# which scikit-learn calibrates through an internal, randomly shuffled
# cross-validation; random_state pins that shuffle so the probabilities (and
# every thresholded score derived from them) are reproducible across runs
support_vector_classifier = SVC(kernel="linear", probability=True, random_state=42)

In [122]:
# fit on the training split (fit returns the estimator itself) and keep the
# class probabilities it assigns to that same training data
probs = support_vector_classifier.fit(X_train, y_train).predict_proba(X_train)

In [123]:
# out-of-fold class probabilities from 5-fold cross validation
# NOTE(review): X_train was standardized with statistics from the whole
# training split before this call, so the held-out folds are not fully
# independent of the scaling step
cross_val_proba = cross_val_predict(
    estimator=support_vector_classifier,
    X=X_train,
    y=y_train,
    cv=5,
    method="predict_proba",
)

In [124]:
# recall, then precision, on the training data at a 0.3 decision threshold
thresh_predictions = custom_predict(probabilities=probs, threshold=0.3)
print(
    recall_score(y_train, thresh_predictions),
    precision_score(y_train, thresh_predictions),
    sep="\n",
)

0.8049792531120332
0.610062893081761


In [125]:
# recall, then precision, of the cross-validated probabilities at a 0.3
# decision threshold
thresh_predictions = custom_predict(probabilities=cross_val_proba, threshold=0.3)
print(
    recall_score(y_train, thresh_predictions),
    precision_score(y_train, thresh_predictions),
    sep="\n",
)

0.7925311203319502
0.6044303797468354


<br>

### Checking performance on test set

In [111]:
from sklearn.metrics import accuracy_score

In [126]:
# flatten the test labels into a 1-D array
y_test = np.ravel(y_test)
# scale the test features with the scaler fitted on the training split
# (transform only, no refit); X_test is overwritten, so re-running this cell
# would scale the data a second time
X_test = scalar.transform(X_test)

In [127]:
# class probabilities for the scaled test set, thresholded at 0.3 into labels
probs = support_vector_classifier.predict_proba(X=X_test)
predictions = custom_predict(probabilities=probs, threshold=0.3)

In [128]:
# recall, then precision, of the thresholded predictions on the test set
print(
    recall_score(y_test, predictions),
    precision_score(y_test, predictions),
    sep="\n",
)

0.8518518518518519
0.6052631578947368


In [129]:
# share of test samples whose thresholded prediction matches the true label
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.7532467532467533


<br>

### Saving the model

In [130]:
import joblib

In [132]:
# persist the fitted scaler as well: the classifier was trained on
# standardized features, so the saved model is only usable together with the
# same scaling parameters
joblib.dump(scalar, "../models/standard_scaler.pkl")
# save model as 'support_vector_classifier.pkl' (kept as the last expression
# so the cell still displays the written model path)
joblib.dump(support_vector_classifier, "../models/support_vector_classifier.pkl")

['../models/support_vector_classifier.pkl']