# Classification

## Goal: classify extreme OSA cases (IAH ≤ 10 vs. IAH ≥ 30)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression # model algorithm
from sklearn.metrics import classification_report # evaluation metric
from sklearn.metrics import confusion_matrix # evaluation metric
from sklearn.metrics import log_loss # evaluation metric
from sklearn.metrics import precision_score # evaluation metric
from sklearn.model_selection import train_test_split # splitting the data
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the male extreme-OSA dataset from the Excel workbook.
# pd.read_excel already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df_OSA_male = pd.read_excel("OSA_extreme_male.xlsx")
# Keep the original `data` name bound to the same frame for any code that still refers to it.
data = df_OSA_male

In [3]:
# Show the dtype of every column in the loaded frame.
df_OSA_male.dtypes

Patient      object
Gender       object
IAH         float64
Weight        int64
Height        int64
Age           int64
Cervical      int64
OSA          object
BMI         float64
dtype: object

In [4]:
# Optional label encoding (Severe -> 1, Healthy -> 0). It is currently disabled, so OSA keeps its string labels.

# osa = {'Healthy': 0,'Severe': 1} 
# df_OSA_male.OSA = [osa[item] for item in df_OSA_male.OSA] 

In [5]:
# Preview the first rows of the frame.
# NOTE(review): the preview contains rows with IAH between 10 and 30 labelled 'Severe'
# (e.g. 19.7, 22.0, 29.6); confirm the class thresholds stated in the notebook title.
df_OSA_male.head()

Unnamed: 0,Patient,Gender,IAH,Weight,Height,Age,Cervical,OSA,BMI
0,P0002,hombre,29.6,119,174,56,48,Severe,39.30506
1,P0004,hombre,19.7,78,168,39,42,Severe,27.636054
2,P0005,hombre,9.0,80,173,32,40,Healthy,26.729927
3,P0006,hombre,2.0,109,190,32,42,Healthy,30.193906
4,P0009,hombre,22.0,72,165,40,42,Severe,26.446281


## Create Training and Test sets and apply scaling

In [6]:
# Predictors are the Age, Cervical and BMI columns; the target is the OSA class label
feature_columns = ['Age', 'Cervical', 'BMI']

x = df_OSA_male[feature_columns]
y = df_OSA_male['OSA']

In [7]:
# Hold out part of the data for testing; random_state pins the shuffle so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Learn the min-max ranges from the training split only, then apply them to both splits
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Build Models

- ### Logistic Regression

In [8]:
# Fit a logistic regression with default settings on the scaled training split
logreg = LogisticRegression().fit(x_train, y_train)

# Mean accuracy on each split
logreg_train_acc = logreg.score(x_train, y_train)
logreg_test_acc = logreg.score(x_test, y_test)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'.format(logreg_train_acc))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(logreg_test_acc))

Accuracy of Logistic regression classifier on training set: 0.66
Accuracy of Logistic regression classifier on test set: 0.64


- ### Decision Tree

In [9]:
# Fit a decision tree on the scaled training split.
# DecisionTreeClassifier is imported in the notebook's import cell.
# random_state is fixed because the tree permutes features at each split, so ties
# between equally good splits could otherwise change the fitted tree between runs.
clf = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)

# Mean accuracy on each split
clf_train_acc = clf.score(x_train, y_train)
clf_test_acc = clf.score(x_test, y_test)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf_train_acc))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf_test_acc))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.62


- ### K-Nearest Neighbors

In [10]:
# Fit a k-nearest-neighbours classifier with default settings on the scaled training split.
# KNeighborsClassifier is imported in the notebook's import cell.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

# Mean accuracy on each split
knn_train_acc = knn.score(x_train, y_train)
knn_test_acc = knn.score(x_test, y_test)

print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn_train_acc))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn_test_acc))

Accuracy of K-NN classifier on training set: 0.76
Accuracy of K-NN classifier on test set: 0.56


- ### Linear Discriminant Analysis

In [11]:
# Fit a linear discriminant analysis classifier on the scaled training split.
# LinearDiscriminantAnalysis is imported in the notebook's import cell.
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)

# Mean accuracy on each split
lda_train_acc = lda.score(x_train, y_train)
lda_test_acc = lda.score(x_test, y_test)

print('Accuracy of LDA classifier on training set: {:.2f}'.format(lda_train_acc))
print('Accuracy of LDA classifier on test set: {:.2f}'.format(lda_test_acc))

Accuracy of LDA classifier on training set: 0.65
Accuracy of LDA classifier on test set: 0.65


- ### Gaussian Naive Bayes

In [12]:
# Fit a Gaussian naive Bayes classifier on the scaled training split.
# GaussianNB is imported in the notebook's import cell.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# Mean accuracy on each split
gnb_train_acc = gnb.score(x_train, y_train)
gnb_test_acc = gnb.score(x_test, y_test)

print('Accuracy of GNB classifier on training set: {:.2f}'.format(gnb_train_acc))
print('Accuracy of GNB classifier on test set: {:.2f}'.format(gnb_test_acc))

Accuracy of GNB classifier on training set: 0.63
Accuracy of GNB classifier on test set: 0.65


- ### Support Vector Machine

In [13]:
# Fit a support vector classifier with default settings on the scaled training split.
# SVC is imported in the notebook's import cell.
svm = SVC()
svm.fit(x_train, y_train)

# Mean accuracy on each split
svm_train_acc = svm.score(x_train, y_train)
svm_test_acc = svm.score(x_test, y_test)

print('Accuracy of SVM classifier on training set: {:.2f}'.format(svm_train_acc))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm_test_acc))

Accuracy of SVM classifier on training set: 0.70
Accuracy of SVM classifier on test set: 0.62


## Confusion matrix (K-NN classifier)

In [14]:
# Evaluate the K-NN model on the held-out test split.
# classification_report and confusion_matrix are already imported in the notebook's
# first cell, so they are not imported again here.
pred = knn.predict(x_test)

# Rows are the true classes and columns the predicted classes, in sorted label order
print(confusion_matrix(y_test, pred))
# Per-class precision, recall, F1 and support
print(classification_report(y_test, pred))

[[19 15]
 [20 26]]
              precision    recall  f1-score   support

     Healthy       0.49      0.56      0.52        34
      Severe       0.63      0.57      0.60        46

    accuracy                           0.56        80
   macro avg       0.56      0.56      0.56        80
weighted avg       0.57      0.56      0.56        80

