# Лабораторная работа №5
## Метрики качества классификации

### Исходные данные
**Датасет**: https://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions
<br>
**Предметная область**: Человеческая активность, различные движения
<br>**Список классов**:
1. WALKING
2. WALKING_UPSTAIRS
3. WALKING_DOWNSTAIRS
4. SITTING
5. STANDING
6. LAYING
7. STAND_TO_SIT
8. SIT_TO_STAND
9. SIT_TO_LIE
10. LIE_TO_SIT
11. STAND_TO_LIE
12. LIE_TO_STAND

**Количество атрибутов**: 561
<br>
**Основные атрибуты**: Измерения гироскопа и акселерометра по трём осям, а также отфильтрованные значения этих сигналов
<br>
**Полный список атрибутов**: Features.txt
<br>
**Для данной задачи возьмем только два класса, которые сильно отличаются друг от друга**: WALKING_UPSTAIRS и LAYING
### Ход работы
#### Текст программы:


In [11]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import sklearn.model_selection as cv
import warnings
warnings.filterwarnings("ignore")

def load_data(data_path='../../HAPT Data Set/Train/X_train.txt',
              classes_path='../../HAPT Data Set/Train/y_train.txt'):
    """Load the feature matrix and the class labels from text files.

    Args:
        data_path: Path to the space-delimited feature file, one sample per
            line. Defaults to the HAPT training features used by this
            notebook.
        classes_path: Path to the label file, one label per line. Defaults
            to the HAPT training labels used by this notebook.

    Returns:
        A ``(data, classes)`` tuple: ``data`` is a 2-D float array with one
        row per sample, ``classes`` is a 1-D float array of labels.

    Raises:
        ValueError: If the two files contain a different number of rows.
    """
    # ndmin keeps the shapes stable even when a file holds a single sample
    # (np.loadtxt would otherwise squeeze it to 1-D / 0-D).
    data = np.loadtxt(data_path, delimiter=' ', ndmin=2)
    classes = np.loadtxt(classes_path, ndmin=1)

    # Features and labels are matched by row index downstream, so a length
    # mismatch would silently mislabel samples.
    if data.shape[0] != classes.shape[0]:
        raise ValueError(
            "feature rows (%d) and labels (%d) differ in length"
            % (data.shape[0], classes.shape[0])
        )
    return data, classes


def main():
    """Compare LDA and Gaussian naive Bayes on a two-class subset of the data.

    Loads the data set, keeps only the samples labelled 2 (WALKING_UPSTAIRS)
    or 6 (LAYING), and prints for both classifiers:

    * cross-validated accuracy, logarithmic loss and area under the ROC
      curve (10 shuffled folds, ``random_state=7``);
    * confusion matrices and classification reports on the train/test split
      held by the ``Data`` object.
    """
    file_data, file_classes = load_data()

    # Select the two target classes with a boolean mask instead of an
    # index loop over every sample.
    keep = np.isin(file_classes, [2, 6])
    cut_file_data = file_data[keep]
    cut_file_classes = file_classes[keep]

    # NOTE(review): Data is not defined in the visible part of this file.
    # It is expected to expose data/classes and the train_*/test_* splits;
    # the meaning of the arguments 1, 0.7 and 561 should be confirmed
    # against its definition (561 matches the attribute count of the set).
    data = Data(cut_file_data, cut_file_classes, 1,  0.7, 561)
    kFold = cv.KFold(n_splits=10, random_state=7, shuffle=True)
    lda = LDA()
    gnb = GaussianNB()

    print("Accuracy of methods:")
    calc_accuracy(data.data, data.classes, kFold, lda, gnb)

    print("Logarithmic Loss Results:")
    calc_loss(data.data, data.classes, kFold, lda, gnb)

    # The header used to be printed with the computation disabled, leaving
    # an empty section in the output.
    print("Area Under ROC Curve Results: ")
    calc_curve(data.data, data.classes, kFold, lda, gnb)

    print("Confusion Matrixes:")
    calc_matrix(data.train_data, data.test_data, data.train_classes, data.test_classes, lda, gnb)

# Accuracy
def calc_accuracy(data, classes, kFold, lda, gnb):
    """Print the cross-validated accuracy of both classifiers.

    Args:
        data: Feature matrix passed to ``cross_val_score``.
        classes: Labels matching the rows of ``data``.
        kFold: Cross-validation splitter passed as ``cv``.
        lda: First estimator, reported under the "LDA" heading.
        gnb: Second estimator, reported under the "Gaussian" heading.

    The mean and the standard deviation of the per-fold scores are printed
    for each estimator; nothing is returned.
    """
    for heading, estimator in ((" LDA:", lda), (" Gaussian:", gnb)):
        scores = cv.cross_val_score(estimator, data, classes, cv=kFold, scoring='accuracy')
        print(heading)
        print(" - mean: %0.5f" % scores.mean())
        print(" - standart deviation: %0.5f" % scores.std())


# Logarithmic Loss
def calc_loss(data, classes, kFold, lda, gnb):
    """Print the cross-validated ``neg_log_loss`` score of both classifiers.

    Args:
        data: Feature matrix passed to ``cross_val_score``.
        classes: Labels matching the rows of ``data``.
        kFold: Cross-validation splitter passed as ``cv``.
        lda: First estimator, reported under the "LDA" heading.
        gnb: Second estimator, reported under the "Gaussian" heading.

    The mean and the standard deviation of the per-fold scores are printed
    for each estimator; nothing is returned.
    """
    def report(heading, estimator):
        # Score one estimator and print its summary statistics.
        fold_scores = cv.cross_val_score(estimator, data, classes, cv=kFold, scoring='neg_log_loss')
        print(heading)
        print(" - mean: %0.5f" % fold_scores.mean())
        print(" - standart deviation: %0.5f" % fold_scores.std())

    report(" LDA:", lda)
    report(" Gaussian:", gnb)


# Area Under ROC Curve
def calc_curve(ds_attr, ds_class, kFold, lda, gnb):
    """Print the cross-validated ``roc_auc`` score of both classifiers.

    Args:
        ds_attr: Feature matrix passed to ``cross_val_score``.
        ds_class: Labels matching the rows of ``ds_attr``.
        kFold: Cross-validation splitter passed as ``cv``.
        lda: First estimator, reported under the "LDA" heading.
        gnb: Second estimator, reported under the "Gaussian" heading.

    The mean and the standard deviation of the per-fold scores are printed
    for each estimator; nothing is returned.
    """
    lda_scores = cv.cross_val_score(lda, ds_attr, ds_class, cv=kFold, scoring='roc_auc')
    lda_mean = lda_scores.mean()
    lda_std = lda_scores.std()
    print(" LDA:")
    print(" - mean: %0.5f" % lda_mean)
    print(" - standart deviation: %0.5f" % lda_std)

    gnb_scores = cv.cross_val_score(gnb, ds_attr, ds_class, cv=kFold, scoring='roc_auc')
    gnb_mean = gnb_scores.mean()
    gnb_std = gnb_scores.std()
    # The Gaussian heading also carries the summary inline, followed by the
    # same two detail lines as for LDA.
    print(" Gaussian: %0.5f (%0.5f)" % (gnb_mean, gnb_std))
    print(" - mean: %0.5f" % gnb_mean)
    print(" - standart deviation: %0.5f" % gnb_std)


# Confusion Matrix
def calc_matrix(X_train, X_test, Y_train, Y_test, lda, gnb):
    """Fit both classifiers and print confusion matrices and reports.

    Args:
        X_train: Training features used to fit each estimator.
        X_test: Test features the fitted estimators predict on.
        Y_train: Training labels.
        Y_test: True test labels the predictions are compared against.
        lda: Estimator reported under the "LDA" heading.
        gnb: Estimator reported under the "GaussianNB" heading.

    Both estimators are fitted in place. The confusion matrices are printed
    first (GaussianNB, then LDA), followed by the classification reports
    (LDA, then GaussianNB). Nothing is returned.
    """
    predictions = {}
    for key, heading, model in (("gnb", " - GaussianNB:", gnb), ("lda", " - LDA:", lda)):
        model.fit(X_train, Y_train)
        predictions[key] = model.predict(X_test)
        matrix = confusion_matrix(Y_test, predictions[key])
        print(heading)
        print(matrix)

    # Classification Report
    print("Classification Reports:")
    for key, heading in (("lda", ' - LDA:'), ("gnb", " - GaussianNB:")):
        report = classification_report(Y_test, predictions[key])
        print(heading)
        print(report)



В качестве оцениваемых методов классификации выберем наивный байесовский классификатор (GaussianNB) и линейный дискриминантный анализ (LDA).
<br><br>
Определим на нашем наборе данных следующие метрики:
<br>
1. Точность классификации.<br>
2. Логарифмическая функция потерь (log loss).<br>
3. Площадь под ROC-кривой (AUC).


In [12]:
    # Load the data set and keep only the samples labelled 2
    # (WALKING_UPSTAIRS) or 6 (LAYING); a boolean mask replaces the
    # per-sample index loop.
    file_data, file_classes = load_data()
    keep = np.isin(file_classes, [2, 6])
    cut_file_data = file_data[keep]
    cut_file_classes = file_classes[keep]

    # NOTE(review): Data is not defined in the visible part of this file;
    # confirm the meaning of the arguments 1, 0.7 and 561 against its
    # definition. data, lda and gnb are reused by the following cell.
    data = Data(cut_file_data, cut_file_classes, 1,  0.7, 561)
    # 10 shuffled folds with a fixed seed so the scores are reproducible.
    kFold = cv.KFold(n_splits=10, random_state=7, shuffle=True)
    lda = LDA()
    gnb = GaussianNB()

    print("Accuracy of methods:")
    calc_accuracy(data.data, data.classes, kFold, lda, gnb)

    print("Logarithmic Loss Results:")
    calc_loss(data.data, data.classes, kFold, lda, gnb)

    print("Area Under ROC Curve Results: ")
    calc_curve(data.data, data.classes, kFold, lda, gnb)

Accuracy of methods:
 LDA:
 - mean: 1.00000
 - standart deviation: 0.00000
 Gaussian:
 - mean: 0.99236
 - standart deviation: 0.00490
Logarithmic Loss Results:
 LDA:
 - mean: -0.00000
 - standart deviation: 0.00000
 Gaussian:
 - mean: -0.26394
 - standart deviation: 0.16930
Area Under ROC Curve Results: 
 LDA:
 - mean: 1.00000
 - standart deviation: 0.00000
 Gaussian: 0.99462 (0.00329)
 - mean: 0.99462
 - standart deviation: 0.00329


и получим отчет классификации:



In [13]:
    # Fit both classifiers on the train split and print their confusion
    # matrices and classification reports for the test split.
    # Relies on data, lda and gnb created by the previous cell.
    print("Confusion Matrixes:")
    calc_matrix(data.train_data, data.test_data, data.train_classes, data.test_classes, lda, gnb)


Confusion Matrixes:
 - GaussianNB:
[[327   0]
 [  5 414]]
 - LDA:
[[327   0]
 [  0 419]]
Classification Reports:
 - LDA:
              precision    recall  f1-score   support

         2.0       1.00      1.00      1.00       327
         6.0       1.00      1.00      1.00       419

   micro avg       1.00      1.00      1.00       746
   macro avg       1.00      1.00      1.00       746
weighted avg       1.00      1.00      1.00       746

 - GaussianNB:
              precision    recall  f1-score   support

         2.0       0.98      1.00      0.99       327
         6.0       1.00      0.99      0.99       419

   micro avg       0.99      0.99      0.99       746
   macro avg       0.99      0.99      0.99       746
weighted avg       0.99      0.99      0.99       746



### Вывод
Основываясь на полученных метриках, можно сделать вывод, что классификаторы вполне применимы для данного датасета.