In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import pandas as pd
import time


def train_model(dataset, model_type='svm', **kwargs):
    """
    Train a classifier on a data frame and score it on a held-out split.

    The last column of ``dataset`` is taken as the label and every column
    before it as a feature. Rows are split 80/20 into train and test parts
    with ``random_state=42``. As a side effect the elapsed fit time and the
    elapsed predict time are printed, each as a bare number of seconds.

    Args:
        dataset: Data frame holding the feature columns followed by the
            label column.
        model_type: Model to train: 'svm' (``SVC``), 'logistic'
            (``LogisticRegression``) or 'xgboost' (``xgb.XGBClassifier``).
        **kwargs: Extra keyword arguments forwarded to the model constructor.

    Returns:
        A 5-tuple ``(model, accuracy, precision, recall, cm)``: the fitted
        model followed by accuracy, precision, recall and the confusion
        matrix, all computed on the test split with scikit-learn's default
        metric arguments.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Reject unknown model names before doing any work; otherwise ``model``
    # would stay unbound and the failure would surface later as an
    # unrelated UnboundLocalError at ``model.fit``.
    supported_types = ('svm', 'logistic', 'xgboost')
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if model_type == 'svm':
        model = SVC(**kwargs)
    elif model_type == 'logistic':
        model = LogisticRegression(**kwargs)
    else:  # 'xgboost' -- the only remaining supported name
        model = xgb.XGBClassifier(**kwargs)

    # perf_counter is monotonic, so the measured intervals cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_train_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_train_time
    print(train_time)

    start_predict_time = time.perf_counter()
    y_pred = model.predict(X_test)
    predict_time = time.perf_counter() - start_predict_time
    print(predict_time)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    return model, accuracy, precision, recall, cm

## Diabetes

In [2]:
# Load the diabetes data set
diabetes = pd.read_csv("diabetes_data.csv")

# Fit the SVM, logistic-regression and XGBoost models through train_model
svm_model, svm_accuracy, svm_precision, svm_recall, svm_cm = train_model(
    diabetes, model_type='svm', kernel='rbf', C=1.0
)
logistic_model, logistic_accuracy, logistic_precision, logistic_recall, logistic_cm = train_model(
    diabetes, model_type='logistic'
)
xgboost_model, xgboost_accuracy, xgboost_precision, xgboost_recall, xgboost_cm = train_model(
    diabetes, model_type='xgboost'
)
# NOTE(review): the recorded output of this cell shows the logistic-regression
# solver stopping at its iteration limit -- consider max_iter or feature scaling.

# Report metric by metric (accuracy, precision, recall, confusion matrix),
# each in the order SVM, logistic regression, XGBoost. Labels are verbatim,
# including their trailing spaces, so the printed text is unchanged.
for _label, _value in (
    ("Dokładność modelu SVM:", svm_accuracy),
    ("Dokładność modelu regresji logistycznej: ", logistic_accuracy),
    ("Dokładność modelu XGBoost:", xgboost_accuracy),
    ("Precyzja modelu SVM:", svm_precision),
    ("Precyzja modelu regresji logistycznej: ", logistic_precision),
    ("Precyzja modelu XGBoost:", xgboost_precision),
    ("Czułość modelu SVM:", svm_recall),
    ("Czułość modelu regresji logistycznej: ", logistic_recall),
    ("Czułość modelu XGBoost:", xgboost_recall),
    ("Macierz pomyłek modelu SVM: ", svm_cm),
    ("Macierz pomyłek regresji logistycznej: ", logistic_cm),
    ("Macierz pomyłek modelu XGBoost: ", xgboost_cm),
):
    print(_label, _value)


129.84508776664734
26.119346618652344


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.539376974105835
0.005644321441650391
0.559699535369873
0.027471303939819336
Dokładność modelu SVM: 0.7482848857769291
Dokładność modelu regresji logistycznej:  0.7440413041940731
Dokładność modelu XGBoost: 0.7488506966546432
Precyzja modelu SVM: 0.7212221095334685
Precyzja modelu regresji logistycznej:  0.7307588805166846
Precyzja modelu XGBoost: 0.7285079696890515
Czułość modelu SVM: 0.8070648318910484
Czułość modelu regresji logistycznej:  0.7704638955880266
Czułość modelu XGBoost: 0.7910341892467017
Macierz pomyłek modelu SVM:  [[4891 2199]
 [1360 5689]]
Macierz pomyłek regresji logistycznej:  [[5089 2001]
 [1618 5431]]
Macierz pomyłek modelu XGBoost:  [[5012 2078]
 [1473 5576]]


## Hypertension

In [3]:
# Load the hypertension data set, count missing values per column, then keep
# only the complete rows
hypertension = pd.read_csv("hypertension_data.csv")
NaN_count = hypertension.isna().sum()
hypertension = hypertension.dropna()

# Fit the SVM, logistic-regression and XGBoost models through train_model
svm_model, svm_accuracy, svm_precision, svm_recall, svm_cm = train_model(
    hypertension, model_type='svm', kernel='rbf', C=1.0
)
logistic_model, logistic_accuracy, logistic_precision, logistic_recall, logistic_cm = train_model(
    hypertension, model_type='logistic'
)
xgboost_model, xgboost_accuracy, xgboost_precision, xgboost_recall, xgboost_cm = train_model(
    hypertension, model_type='xgboost'
)
# NOTE(review): the recorded XGBoost metrics for this data set are all exactly
# 1.0 -- worth checking for label leakage or duplicated rows across the split.

# Report metric by metric (accuracy, precision, recall, confusion matrix),
# each in the order SVM, logistic regression, XGBoost. Labels are verbatim.
for _label, _value in (
    ("Dokładność modelu SVM:", svm_accuracy),
    ("Dokładność modelu regresji logistycznej:", logistic_accuracy),
    ("Dokładność modelu XGBoost:", xgboost_accuracy),
    ("Precyzja modelu SVM:", svm_precision),
    ("Precyzja modelu regresji logistycznej:", logistic_precision),
    ("Precyzja modelu XGBoost:", xgboost_precision),
    ("Czułość modelu SVM:", svm_recall),
    ("Czułość modelu regresji logistycznej:", logistic_recall),
    ("Czułość modelu XGBoost:", xgboost_recall),
    ("Macierz pomyłek modelu SVM: ", svm_cm),
    ("Macierz pomyłek regresji logistycznej: ", logistic_cm),
    ("Macierz pomyłek modelu XGBoost: ", xgboost_cm),
):
    print(_label, _value)


17.52316379547119
5.047417879104614


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.21774744987487793
0.003140687942504883
0.29167842864990234
0.013245582580566406
Dokładność modelu SVM: 0.7536454336147352
Dokładność modelu regresji logistycznej: 0.8451650038372985
Dokładność modelu XGBoost: 1.0
Precyzja modelu SVM: 0.7468548634550476
Precyzja modelu regresji logistycznej: 0.8288146279949559
Precyzja modelu XGBoost: 1.0
Czułość modelu SVM: 0.841341168337366
Czułość modelu regresji logistycznej: 0.908745247148289
Czułość modelu XGBoost: 1.0
Macierz pomyłek modelu SVM:  [[1494  825]
 [ 459 2434]]
Macierz pomyłek regresji logistycznej:  [[1776  543]
 [ 264 2629]]
Macierz pomyłek modelu XGBoost:  [[2319    0]
 [   0 2893]]


## Stroke

In [4]:
# Load the stroke data set, count missing values per column, then keep only
# the complete rows
stroke = pd.read_csv("stroke_data.csv")
NaN_count = stroke.isna().sum()
stroke = stroke.dropna()

# Fit the SVM, logistic-regression and XGBoost models through train_model
svm_model, svm_accuracy, svm_precision, svm_recall, svm_cm = train_model(
    stroke, model_type='svm', kernel='rbf', C=1.0
)
logistic_model, logistic_accuracy, logistic_precision, logistic_recall, logistic_cm = train_model(
    stroke, model_type='logistic'
)
xgboost_model, xgboost_accuracy, xgboost_precision, xgboost_recall, xgboost_cm = train_model(
    stroke, model_type='xgboost'
)
# NOTE(review): the recorded XGBoost metrics here are near-perfect while the
# other two models sit around 0.65-0.68 -- worth checking for duplicated rows.

# Report metric by metric (accuracy, precision, recall, confusion matrix),
# each in the order SVM, logistic regression, XGBoost. Labels are verbatim.
for _label, _value in (
    ("Dokładność modelu SVM:", svm_accuracy),
    ("Dokładność modelu regresji logistycznej:", logistic_accuracy),
    ("Dokładność modelu XGBoost:", xgboost_accuracy),
    ("Precyzja modelu SVM:", svm_precision),
    ("Precyzja modelu regresji logistycznej:", logistic_precision),
    ("Precyzja modelu XGBoost:", xgboost_precision),
    ("Czułość modelu SVM:", svm_recall),
    ("Czułość modelu regresji logistycznej:", logistic_recall),
    ("Czułość modelu XGBoost:", xgboost_recall),
    ("Macierz pomyłek modelu SVM: ", svm_cm),
    ("Macierz pomyłek regresji logistycznej: ", logistic_cm),
    ("Macierz pomyłek modelu XGBoost: ", xgboost_cm),
):
    print(_label, _value)

47.70302224159241
10.791438102722168


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5876646041870117
0.0033016204833984375
0.7456278800964355
0.016674041748046875
Dokładność modelu SVM: 0.651185529210462
Dokładność modelu regresji logistycznej: 0.6786849181129309
Dokładność modelu XGBoost: 0.9973111708628697
Precyzja modelu SVM: 0.7076502732240437
Precyzja modelu regresji logistycznej: 0.7033587355348575
Precyzja modelu XGBoost: 0.9946236559139785
Czułość modelu SVM: 0.509090909090909
Czułość modelu regresji logistycznej: 0.6122850122850123
Czułość modelu XGBoost: 1.0
Macierz pomyłek modelu SVM:  [[3256  856]
 [1998 2072]]
Macierz pomyłek regresji logistycznej:  [[3061 1051]
 [1578 2492]]
Macierz pomyłek modelu XGBoost:  [[4090   22]
 [   0 4070]]


Strojenie hiperparametrów - GridSearchCV

In [5]:
def tune_svm_model(data):
    """Grid-search ``SVC`` hyperparameters, selecting by cross-validated recall.

    The last column of ``data`` is the label, all earlier columns are
    features. Only the 80% training part of a fixed split
    (``random_state=42``) is searched; the remaining 20% is not used here.

    Args:
        data: Data frame with feature columns followed by the label column.

    Returns:
        A pair ``(best_estimator, best_score)``: the refitted best ``SVC`` and
        its mean 3-fold cross-validated recall on the training part.
    """
    features, target = data.iloc[:, :-1], data.iloc[:, -1]
    train_features, _, train_target, _ = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    search_space = dict(
        C=[0.1, 1, 10, 100],
        kernel=['linear', 'rbf', 'poly'],
        gamma=['scale', 'auto'],
    )
    search = GridSearchCV(SVC(), search_space, cv=3, scoring='recall')
    search.fit(train_features, train_target)

    return search.best_estimator_, search.best_score_

def tune_logistic_regression(data):
    """Grid-search ``LogisticRegression`` settings, selecting by recall.

    The last column of ``data`` is the label, all earlier columns are
    features. Only the 80% training part of a fixed split
    (``random_state=42``) is searched; the remaining 20% is not used here.

    Args:
        data: Data frame with feature columns followed by the label column.

    Returns:
        A pair ``(best_estimator, best_score)``: the refitted best model and
        its mean 3-fold cross-validated recall on the training part.
    """
    features, target = data.iloc[:, :-1], data.iloc[:, -1]
    train_features, _, train_target, _ = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    search_space = dict(
        C=[0.1, 1, 10, 100],
        solver=['liblinear', 'lbfgs'],
    )
    # The iteration cap is raised to 1000 for every candidate in the grid.
    search = GridSearchCV(
        LogisticRegression(max_iter=1000), search_space, cv=3, scoring='recall'
    )
    search.fit(train_features, train_target)

    return search.best_estimator_, search.best_score_

def tune_xgboost_model(data):
    """Grid-search ``xgb.XGBClassifier`` hyperparameters, selecting by recall.

    The last column of ``data`` is the label, all earlier columns are
    features. Only the 80% training part of a fixed split
    (``random_state=42``) is searched; the remaining 20% is not used here.

    Args:
        data: Data frame with feature columns followed by the label column.

    Returns:
        A pair ``(best_estimator, best_score)``: the refitted best classifier
        and its mean 3-fold cross-validated recall on the training part.
    """
    features, target = data.iloc[:, :-1], data.iloc[:, -1]
    train_features, _, train_target, _ = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Five parameters with three values each: 3**5 = 243 candidates per fold.
    search_space = dict(
        learning_rate=[0.01, 0.1, 0.3],
        max_depth=[3, 5, 7],
        min_child_weight=[1, 3, 5],
        subsample=[0.5, 0.7, 0.9],
        colsample_bytree=[0.5, 0.7, 0.9],
    )
    search = GridSearchCV(xgb.XGBClassifier(), search_space, cv=3, scoring='recall')
    search.fit(train_features, train_target)

    return search.best_estimator_, search.best_score_

In [None]:
# Reload all three data sets from disk before tuning.
diabetes = pd.read_csv("diabetes_data.csv")
# Load the hypertension data and drop rows that contain missing values
hypertension = pd.read_csv("hypertension_data.csv")
NaN_count = hypertension.isna().sum()
hypertension = hypertension.dropna()
# Load the stroke data and drop rows that contain missing values.
# NaN_count is rebound here, so after this cell it holds the stroke counts.
stroke = pd.read_csv("stroke_data.csv")
NaN_count = stroke.isna().sum()
stroke = stroke.dropna()
# NOTE: the tune_* helpers return (best_estimator, best_score) with
# scoring='recall', so every `accuracy_*` name below actually holds the best
# cross-validated recall, not accuracy.
# Diabetes tuning runs (currently disabled):
# best_svm_diabetes, accuracy_svm_diabetes = tune_svm_model(diabetes)
# best_logistic_diabetes, accuracy_logistic_diabetes = tune_logistic_regression(diabetes)
# print(best_logistic_diabetes, accuracy_logistic_diabetes)
# best_xgboost_diabetes, accuracy_xgboost_diabetes = tune_xgboost_model(diabetes)
# print(best_xgboost_diabetes, accuracy_xgboost_diabetes)

# Hypertension tuning runs: logistic regression and XGBoost are enabled, the
# SVM search is disabled.
# best_svm_hypertension, accuracy_svm_hypertension = tune_svm_model(hypertension)
best_logistic_hypertension, accuracy_logistic_hypertension = tune_logistic_regression(hypertension)
print(best_logistic_hypertension, accuracy_logistic_hypertension)
best_xgboost_hypertension, accuracy_xgboost_hypertension = tune_xgboost_model(hypertension)
print(best_xgboost_hypertension, accuracy_xgboost_hypertension)

# Stroke tuning runs (currently disabled):
# best_svm_stroke, accuracy_svm_stroke = tune_svm_model(stroke)
# best_logistic_stroke, accuracy_logistic_stroke = tune_logistic_regression(stroke)
# print(best_logistic_stroke, accuracy_logistic_stroke)
# best_xgboost_stroke, accuracy_xgboost_stroke = tune_xgboost_model(stroke)
# print(best_xgboost_stroke, accuracy_xgboost_stroke)

LogisticRegression(C=0.1, max_iter=1000, solver='liblinear') 0.7682085710147014
LogisticRegression(C=1, max_iter=1000, solver='liblinear') 0.9257533703731449
