In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

In [None]:
# Load the dataset from the CSV file next to the notebook.
# `data` holds the frame as read; `df` is the working frame used below.
DATASET_PATH = 'dataset.csv'
data = pd.read_csv(DATASET_PATH)
df = pd.DataFrame(data)

In [None]:
# 'No Info' is a placeholder for a missing smoking_history value.
# Impute it with the most frequent of the remaining (real) categories.
no_info_mask = df['smoking_history'] == 'No Info'
most_common_value = df.loc[~no_info_mask, 'smoking_history'].mode().iloc[0]
df['smoking_history'] = df['smoking_history'].mask(no_info_mask, most_common_value)

In [None]:
# Distribution of the age feature, shown as a 10-bin histogram.
fig, ax = plt.subplots()
ax.hist(df['age'], bins=10, edgecolor='black')
ax.set(xlabel='Age', ylabel='Frequency', title='Histogram of Age')
plt.show()

In [None]:
# Share of each gender value, shown as a pie chart with percentage labels.
polovi_counts = df['gender'].value_counts()
fig, ax = plt.subplots()
ax.pie(polovi_counts, labels=polovi_counts.index, autopct='%1.1f%%')
ax.legend(polovi_counts.index)
plt.show()

In [None]:
# Number of rows per heart_disease value, shown as a bar chart.
heart_disease_counts = df['heart_disease'].value_counts()

ax = heart_disease_counts.plot.bar()
ax.set_title('Broj osoba s srčanom bolešću')
ax.set_xlabel('Nema srčanu bolest (0) / Ima srčanu bolest (1)')
ax.set_ylabel('Broj osoba')

plt.show()

In [None]:
# Share of each smoking_history category, shown as a pie chart.
smoking_history_counts = df['smoking_history'].value_counts()

ax = smoking_history_counts.plot.pie(autopct='%1.1f%%')
ax.set_title('Smoking History')
ax.legend(labels=smoking_history_counts.index)

plt.show()

In [None]:
# Encode the categorical features as integer codes.
# Each column gets its own encoder so its value -> code mapping stays
# available afterwards (e.g. gender_encoder.classes_ to decode a prediction
# input); refitting a single shared encoder would discard the first mapping.

# Encoding the gender feature
gender_encoder = LabelEncoder()
df['gender'] = gender_encoder.fit_transform(df['gender'])

# Encoding the smoking_history feature
# NOTE: LabelEncoder sorts its classes, so the order of this list does NOT
# determine the integer codes (they follow the sorted order of the strings).
# The list only declares the accepted categories: transform() raises a
# ValueError for any value that is not in it.
smoking_order = ['never', 'former', 'not current', 'current', 'ever']
label_encoder = LabelEncoder()
label_encoder.fit(smoking_order)
df['smoking_history'] = label_encoder.transform(df['smoking_history'])

In [None]:
# Separate the predictors from the target column.
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Hold out the test set BEFORE resampling so that it keeps the class
# distribution of the full dataset; evaluating on an artificially balanced
# test set would give metrics that do not reflect real-world performance.
# stratify=y keeps the class proportions equal in both partitions.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.25, random_state=37, stratify=y
)

# Balance the classes in the training partition only.
undersampler = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = undersampler.fit_resample(X_train_full, y_train_full)

# The model is trained on the balanced training data.
X_train, y_train = X_undersampled, y_undersampled

In [None]:
# Tree construction
# random_state is fixed so that repeated runs build the same tree: the
# estimator permutes features at each split, so without a seed equally good
# splits can be chosen differently from run to run.
model = DecisionTreeClassifier(max_depth=8, criterion="entropy", random_state=42)
model.fit(X_train, y_train)

In [None]:
# Tree visualisation
plt.figure(figsize=(20, 13))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
plot_tree(model, feature_names=list(X.columns), class_names=['0', '1'], filled=True)
plt.show()

In [None]:
# Test set prediction: predicted class label for every row of X_test.
# y_pred is reused by the evaluation cells below.
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted test labels, drawn as an annotated
# heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", ax=ax)
ax.set(xlabel="Predicted", ylabel="True", title="Confusion Matrix")

plt.show()

In [None]:
# Text summary of the per-class metrics for the test-set predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Izvještaj klasifikacije:\n", report, sep=" ")

In [None]:
# Precision-Recall Curve
# The curve must be built from continuous scores, not hard 0/1 predictions:
# with hard labels there is only one usable threshold, so the "curve"
# collapses to a single operating point joined by straight lines.
# Column 1 of predict_proba is the probability of model.classes_[1]
# (the positive class for 0/1 labels).
y_score = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# ROC Curve
# Use the predicted probability of the positive class rather than hard 0/1
# predictions; hard labels yield only one threshold and therefore a
# degenerate three-point curve and a misleading AUC.
# Column 1 of predict_proba is the probability of model.classes_[1].
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Show the computed AUC in the legend.
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Tačnost predikcije: ", accuracy, sep=" ")

In [None]:
# Predict the diabetes label for one hand-written record.
# 'gender' and 'smoking_history' are given as already-encoded integer codes,
# matching the encoding applied to the training frame.
new_data = {
    'gender': [0],
    'age': [82.0],
    'hypertension': [1],
    'heart_disease': [1],
    'smoking_history': [2],
    'bmi': [35.11],
    'HbA1c_level': [6.7],
    'blood_glucose_level': [78],
    'diabetes': [0],
}

new_data_df = pd.DataFrame(new_data)
# The target column is not a model input, so remove it before predicting.
X_new = new_data_df.drop(columns=['diabetes'])
y_new_pred = model.predict(X_new)

# Report the single prediction in words.
message = "Osoba neće imati dijabetes." if y_new_pred[0] == 0 else "Osoba će imati dijabetes."
print(message)