In [5]:
from StratifiedGroupKFold import StratifiedGroupKFold as SGKF
from copy import deepcopy

# Для работы с табличными данными
import pandas as pd
import numpy as np

# Подрубаем рисовалки
import matplotlib.pyplot as plt
import seaborn as sns

# Необходимые метрики и препроцессинг
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score,\
                            recall_score, roc_auc_score, roc_curve, cohen_kappa_score, fbeta_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cross_decomposition import PLSRegression

# Raise pandas' display limits so that wide/long frames are rendered with fewer truncations.
for _option, _value in (('display.max_rows', 500),
                        ('display.max_columns', 500),
                        ('display.width', 1000)):
    pd.set_option(_option, _value)
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold, cross_val_score

# Подрубаем progress bar
from tqdm.notebook import tqdm

# Отрубаем warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Для интерактивных графиков (в самом конце)
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [3]:
# Load the sample sheet (header=2: column names are taken from the third row),
# keep only rows that have a Gender value, replace every remaining gap with 0
# and renumber the rows from 0.
samples = (
    pd.read_excel(u'20.01.23 список образцов Моча.xlsx', header=2)
      .dropna(subset=['Gender'])
      .fillna(0)
      .reset_index(drop=True)
)

In [4]:
# The raw spectra are split across two CSV files; stack them row-wise.
spectra1 = pd.read_csv('urine_chm001-250.csv')
spectra2 = pd.read_csv('urine_chm251-500.csv')
spectra_init = pd.concat([spectra1, spectra2], axis=0)
del(spectra1, spectra2)

# Columns whose name starts with 'Ch' are summed pairwise (even-positioned + odd-positioned),
# giving 18 combined channels. The sums are forced to float so that the later
# row-wise scaling can write floats back into these columns without a dtype change.
channel_values = spectra_init.filter(regex='^Ch').to_numpy(dtype=float)
spectra = pd.DataFrame(channel_values[:, ::2] + channel_values[:, 1::2],
                       columns=['Ch_'+str(i) for i in range(18)]) # Sum of two LED spectra

# Attach gender, age and every '*_a' column from the sample sheet, matched on 'Dataset'.
frame_lol = samples[['Gender','Age','Dataset']+list(samples.filter(regex=r'_a$').columns)]
spectra['Dataset'] = spectra_init.Dataset.values
spectra = pd.merge(left=spectra, right=frame_lol, how='left', on='Dataset')

# Drop spectra without complete metadata and renumber the rows.
# reset_index keeps each column's dtype; rebuilding the frame from `.values`
# would turn every column into `object` because the frame mixes text and numbers.
spectra = spectra.dropna(how='any').reset_index(drop=True)

In [5]:
# Min-max scale each spectrum (row) to [0, 1]. MinMaxScaler works column-wise,
# so the first 18 columns are transposed before scaling and transposed back afterwards.
spectra.iloc[:, :18] = (
    MinMaxScaler(feature_range=(0, 1))
    .fit_transform(spectra.iloc[:, :18].T)
    .T
)

In [6]:
# Names of all columns ending in '_a', except the last such column.
# NOTE(review): the dropped column is presumably the aggregate 'TOTAL_a' flag — confirm.
Anom_Names = spectra.filter(regex='_a$').iloc[:, :-1].columns

In [9]:
# Show the anomaly column names, one per line.
print('\n'.join(map(str, Anom_Names)))

Density_a
pH_a
Protein_a
Bilirubin_a
Glucose_a
Ketones_a
Leukocyte_a
Nitrite_a
Urobilinogen_a
Blood_a
Erythrocyte_a
Squamous cells_a
Hyaline cylinders_a
Bacteria_a
Crystals_a
Ferment_a
Small cells_a
Pathological cylinders_a
Slime_a
Spermatozoon_a


In [None]:
# Scratch cell that was never executed: calling fit() without training data raises
# TypeError and would stop a "Restart & Run All" at this point.
# TODO: pass the intended data, e.g. PLSRegression().fit(X, Y), or delete this cell.
# PLSRegression().fit()

In [9]:
# Per-target settings used by `to_show`:
#   target column -> (probability threshold for the positive class, LogisticRegression C)
Best_trsh_regul_dict = {
    'Density_a':                (0.24, 0.001),
    'pH_a':                     (0.02, 0.1),
    'Protein_a':                (0.1, 0.01),
    'Glucose_a':                (0.02, 0.001),
    'Ketones_a':                (0.02, 0.01),
    'Leukocyte_a':              (0.02, 0.1),
    'Nitrite_a':                (0.06, 0.01),
    'Urobilinogen_a':           (0.02, 0.01),
    'Blood_a':                  (0.12, 0.001),
    'Erythrocyte_a':            (0.16, 0.001),
    'Squamous cells_a':         (0.02, 0.01),
    'Hyaline cylinders_a':      (0.04, 0.1),
    'Bacteria_a':               (0.2, 0.001),
    'Crystals_a':               (0.14, 0.001),
    'Ferment_a':                (0.02, 0.001),
    'Small cells_a':            (0.06, 0.01),
    'Pathological cylinders_a': (0.06, 0.1),
    'Slime_a':                  (0.12, 0.01),
    'Spermatozoon_a':           (0.98, 1000),
}

In [13]:
def predict(clf, X, trsh):
    """Turn predicted probabilities into 0/1 labels using a custom threshold.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    X : samples passed straight to ``clf.predict_proba``.
    trsh : a sample is labelled 1 when the value in column 1 of the
        ``predict_proba`` output is greater than or equal to this value.

    Returns
    -------
    list of int
        One label (0 or 1) per row of ``X``.
    """
    positive_scores = clf.predict_proba(X)[:, 1]
    return (positive_scores >= trsh).astype(int).tolist()

def metrics_ret(y_true, y_pred, y_prob):
    """Compute the evaluation metrics for one set of binary predictions.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted hard labels, used for accuracy, precision, recall and F-beta.
    y_prob : predicted scores, used only for ROC-AUC.

    Returns
    -------
    list
        ``[accuracy, precision, recall, F-beta (beta=5), ROC-AUC]`` in that order.
    """
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='binary'),
        recall_score(y_true, y_pred, average='binary'),
        # beta=5 is used here in place of the plain F1 score.
        fbeta_score(y_true, y_pred, beta=5, average='binary'),
        roc_auc_score(y_true, y_prob, average='micro'),
    ]

@interact
def to_show(selected_target = ['Density_a', 'pH_a', 'Protein_a',
                               'Glucose_a', 'Ketones_a', 'Leukocyte_a', 'Nitrite_a',
                               'Urobilinogen_a', 'Blood_a', 'Erythrocyte_a',
                               'Squamous cells_a', 'Hyaline cylinders_a', 'Bacteria_a',
                               'Crystals_a', 'Ferment_a', 'Small cells_a', 'Pathological cylinders_a',
                               'Slime_a', 'Spermatozoon_a']):
    """Cross-validate an LDA + logistic-regression detector for one anomaly column.

    For every fold returned by ``SGKF`` the training part is projected onto a single
    LDA component, a logistic regression is fitted on that component and evaluated
    on the test part with ``metrics_ret``. The threshold and regularisation strength
    come from ``Best_trsh_regul_dict[selected_target]``.

    Draws a figure with the per-fold ROC curves (right) and the distribution of the
    LDA component in the test part of the last fold (left).

    Parameters
    ----------
    selected_target : name of the anomaly column; the default list supplies the
        choices offered by the ``@interact`` widget.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` summary of the per-fold Accuracy, Precision, Recall,
        F-score and ROC-AUC.
    """
    X = spectra.iloc[:,:18]
    y = spectra[selected_target].astype(float)
    group = spectra.Dataset

    # (decision threshold, LogisticRegression C) for this target; looked up once.
    threshold, regul_C = Best_trsh_regul_dict[selected_target]

    fig = plt.figure(figsize=(15,6))
    ax2 = fig.add_subplot(122)
    ax2.set_title('ROC Curve')
    ax2.set_xlabel('FPR')
    ax2.set_ylabel('TPR')

    Metrics = []

    # NOTE(review): SGKF is a project helper. The loop below relies on every fold frame
    # having 'Dataset' and 'TOTAL_a' columns and uses 'TOTAL_a' as the label. Presumably
    # SGKF stores the `y` passed here under that name — confirm; otherwise every target
    # would be trained on the same column.
    TRAIN_ARR, TEST_ARR = SGKF(X, group, y, n_splits=5)

    for train_data, test_data in zip(TRAIN_ARR, TEST_ARR):
        Train_X = train_data.drop(['Dataset', 'TOTAL_a'], axis=1)
        Test_X = test_data.drop(['Dataset', 'TOTAL_a'], axis=1)
        Train_y, Test_y = train_data['TOTAL_a'], test_data['TOTAL_a']

        # Fit the projection on the training part only, then apply it to the test part.
        LDA_CV = LinearDiscriminantAnalysis(n_components=1)
        LDA_Train_X = pd.DataFrame(LDA_CV.fit_transform(Train_X, Train_y))
        LDA_Test_X = pd.DataFrame(LDA_CV.transform(Test_X))

        log_reg = LogisticRegression(C=regul_C).fit(LDA_Train_X, Train_y)
        Preds = predict(log_reg, LDA_Test_X, threshold)
        Probs = log_reg.predict_proba(LDA_Test_X)
        Metrics.append(metrics_ret(Test_y, Preds, Probs[:,1]))

        fpr, tpr, _ = roc_curve(Test_y, Probs[:,1])
        ax2.plot(fpr, tpr, alpha=0.5)

    # Reference lines: the diagonal and the ideal corner curve.
    ax2.plot(range(2), range(2), c='r', linewidth=3, label='Худший случай')
    ax2.plot([0,0,1], [0,1,1], c='g', linewidth=3, label='Лучший случай')
    ax2.legend()
    ax2.grid()

    # Left panel: LDA component of the LAST fold's test part, split by true class.
    LDA_Test_X['target'] = Test_y.values
    ax1 = fig.add_subplot(121)
    ax1.set_title('Distribution')
    ax1.set_xlabel('LDA Component')
    for class_value, class_name in ((0, 'Normal'), (1, 'Anomaly')):
        component = LDA_Test_X.loc[LDA_Test_X.target == class_value, 0]
        # Count each class by its label: value_counts() orders by frequency, so positional
        # access can swap the two counts and fails when the fold holds a single class.
        if component.empty:
            continue
        # axlabel is passed explicitly so the x-label is not taken from the Series name (0).
        sns.distplot(component, label='{}: {}'.format(class_name, len(component)),
                     axlabel='LDA Component', ax=ax1)
    ax1.legend()
    ax1.grid()

    Metrics = pd.DataFrame(Metrics, columns=['Accuracy', 'Precision', 'Recall', 'F-score', 'ROC-AUC'])
    return Metrics.describe()

interactive(children=(Dropdown(description='selected_target', options=('Density_a', 'pH_a', 'Protein_a', 'Gluc…