In [19]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/kaggle/input/suicide-rates-overview-1985-to-2016/master.csv')
df["suicide_rate_class"] = pd.cut(df["suicides/100k pop"], bins=[-1, 10, 20, 100], labels=["low", "medium", "high"])
df.dropna(inplace=True)

In [3]:
X = df.drop(["suicide_rate_class", "suicides/100k pop"], axis=1)
y = df["suicide_rate_class"]

In [4]:
X = pd.get_dummies(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
cv_fold = 5

# Decision Tree

In [7]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Logistic Regression

In [13]:
lr = LogisticRegression(solver='liblinear', random_state=42)
lr.fit(X_train, y_train)

# KNN

In [14]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Calculate accuracy, precision, recall, specificity, sensitivity, F1-score

In [15]:
scores_cv = {
    'dt': cross_val_score(dt, X, y, cv=cv_fold),
    'lr': cross_val_score(lr, X, y, cv=cv_fold),
    'knn': cross_val_score(knn, X, y, cv=cv_fold),
}

In [30]:
mean_accu = [np.mean(scores_cv.get('dt')), np.mean(scores_cv.get('lr')), np.mean(scores_cv.get('knn'))]

mean_precision = [
    precision_score(y, dt.predict(X), average='weighted'),
    precision_score(y, lr.predict(X), average='weighted'),
    precision_score(y, knn.predict(X), average='weighted'),
]

mean_recall = [
    recall_score(y, dt.predict(X), average='weighted'),
    recall_score(y, lr.predict(X), average='weighted'),
    recall_score(y, knn.predict(X), average='weighted'),
]

cm = {
    'dt': confusion_matrix(y, cross_val_predict(dt, X, y, cv=cv_fold)),
    'lr': confusion_matrix(y, cross_val_predict(lr, X, y, cv=cv_fold)),
    'knn': confusion_matrix(y, cross_val_predict(knn, X, y, cv=cv_fold)),
}

mean_specificity = [
    cm['dt'][0, 0] / (cm['dt'][0, 0] + cm['dt'][0, 1]), 
    cm['lr'][0, 0] / (cm['lr'][0, 0] + cm['lr'][0, 1]),
    cm['knn'][0, 0] / (cm['knn'][0, 0] + cm['knn'][0, 1]),
]

mean_sensitivity = mean_recall

mean_f1_score = [
    f1_score(y, dt.predict(X), average='weighted'),
    f1_score(y, lr.predict(X), average='weighted'),
    f1_score(y, knn.predict(X), average='weighted'),
]


# Comparison

## Decision Tree

In [37]:
print(f'Accuracy:\t{mean_accu[0]}')
print(f'Precision:\t{mean_precision[0]}')
print(f'Recall:\t\t{mean_recall[0]}')
print(f'Specificity:\t{mean_specificity[0]}')
print(f'Sensitivity:\t{mean_sensitivity[0]}')
print(f'F1-score:\t{mean_f1_score[0]}')

Accuracy:	0.8704446032571033
Precision:	0.9916001161365423
Recall:		0.9915875495733686
Specificity:	0.9914529914529915
Sensitivity:	0.9915875495733686
F1-score:	0.9915906538554574


# Logistic Regression

In [45]:
print(f'Accuracy:\t{mean_accu[1]}')
print(f'Precision:\t{mean_precision[1]}')
print(f'Recall:\t\t{mean_recall[1]}')
print(f'Specificity:\t{mean_specificity[1]}')
print(f'Sensitivity:\t{mean_sensitivity[1]}')
print(f'F1-score:\t{mean_f1_score[1]}')

Accuracy:	0.8125145818895818
Precision:	0.8479145807229163
Recall:		0.8095180867684173
Specificity:	0.6845764854614412
Sensitivity:	0.8095180867684173
F1-score:	0.7726085317468969


# KNN

In [40]:
print(f'Accuracy:\t{mean_accu[2]}')
print(f'Precision:\t{mean_precision[2]}')
print(f'Recall:\t\t{mean_recall[2]}')
print(f'Specificity:\t{mean_specificity[2]}')
print(f'Sensitivity:\t{mean_sensitivity[2]}')
print(f'F1-score:\t{mean_f1_score[2]}')

Accuracy:	0.5301027951027951
Precision:	0.6583559390431903
Recall:		0.683331330368946
Specificity:	0.21486486486486486
Sensitivity:	0.683331330368946
F1-score:	0.651061280572589
