In [1]:
from copy import deepcopy

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score,\
                            recall_score, roc_auc_score, roc_curve, cohen_kappa_score, fbeta_score

from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.cross_decomposition import PLSRegression, PLSCanonical                       
from sklearn.decomposition import PCA

from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression

from StratifiedGroupKFold import StratifiedGroupKFold as SGKF

import catboost
from catboost import *

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# Widen pandas' display limits so large frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

import warnings
warnings.filterwarnings('ignore')

In [2]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [3]:
# Load the sample sheet (header taken from row index 2), keep only the rows
# that have a Gender value, replace every remaining gap with 0 and renumber
# the rows from zero.
samples = (
    pd.read_excel(u'20.01.23 список образцов Моча.xlsx', header=2)
    .dropna(subset=['Gender'])
    .fillna(0)
    .reset_index(drop=True)
)

In [17]:
# Read both halves of the spectra export and stack them into a single frame.
spectra_init = pd.concat(
    [pd.read_csv(csv_path) for csv_path in ('urine_chm001-250.csv', 'urine_chm251-500.csv')],
    ignore_index=True,
)

# Keep only the columns whose name starts with "Ch" and rescale them to [-1, 1].
# MinMaxScaler works column-wise, so the frame is transposed before and after:
# the scaling is therefore applied to each row (one spectrum) independently.
mmscaler = MinMaxScaler(feature_range=(-1, 1))
spectra = pd.DataFrame(mmscaler.fit_transform(spectra_init.filter(regex='^Ch').T).T)

# SS = StandardScaler()
# spectra = pd.DataFrame(SS.fit_transform(spectra))

# Attach Gender, Age and every "*_a" column of the sample sheet to the scaled
# spectra, matching rows on the shared 'Dataset' key.
frame_lol = samples[['Gender', 'Age', 'Dataset'] + list(samples.filter(regex=r'_a$').columns)]
spectra['Dataset'] = spectra_init['Dataset']
spectra = spectra.merge(frame_lol, how='left', on='Dataset')

# Drop the rows that are still missing any "*_a" value after the left merge.
spectra = spectra.dropna(subset=samples.filter(regex=r'_a$').columns)

# Rebuild the frame from its raw values, which yields a fresh 0..n-1 row index.
# NOTE(review): going through `.values` on a mixed-type frame presumably leaves
# every column with object dtype — confirm this is intended.
spectra = pd.DataFrame(spectra.values, columns=spectra.columns)

In [79]:
from StratifiedGroupKFold import StratifiedGroupKFold as SGKF

In [76]:
# Module-level, unfitted QDA estimator. The cross-validation cell further down
# builds its own fresh estimator per fold and does not use this one.
QDA = QuadraticDiscriminantAnalysis()

In [78]:
# Scratch cell: beginning of a QDA fit on the first 36 columns of `spectra`.
# NOTE(review): this cell looks unfinished — `y` is bound to the whole frame
# rather than to a target column, and `fit()` is called with no arguments
# although scikit-learn estimators expect the training data and labels there.
# The error recorded under the cell (a SyntaxError on line 1) does not match
# this code, so the stored output is presumably stale.
X = spectra.iloc[:,:36]
y = spectra
# NOTE(review): elsewhere in this notebook SGKF is called as
# SGKF(X, group, y, n_splits=...) — confirm the no-argument form is valid.
sgkf = SGKF()


QDA.fit()

SyntaxError: invalid syntax (<ipython-input-78-d2c00b0cf465>, line 1)

In [118]:
# Predictors: the first 36 columns of `spectra`.
X = spectra.iloc[:, :36]
# Targets: every "*_a" column except the last two.
# NOTE(review): the two dropped columns are presumably 'Spermatozoon_a' and
# 'TOTAL_a' — confirm against the sample sheet.
y = spectra.filter(regex='_a$').iloc[:, :-2]

# Fit a 3-component canonical PLS and keep only the first element of the
# (X scores, y scores) pair it returns; the score columns are named 0, 1, 2.
pls_scores = pd.DataFrame(PLSCanonical(n_components=3).fit_transform(X, y)[0])
# Put the targets next to the scores so rows can be filtered by anomaly flag.
PLS_Multi = pd.concat([pls_scores, y], axis=1)


In [119]:
# Show the target column names as strings; the same expression supplies the
# dropdown options of the PLS scatter cell below.
y.columns.astype(str)

Index(['Density_a', 'pH_a', 'Protein_a', 'Bilirubin_a', 'Glucose_a', 'Ketones_a', 'Leukocyte_a', 'Nitrite_a', 'Urobilinogen_a', 'Blood_a', 'Erythrocyte_a', 'Squamous cells_a', 'Hyaline cylinders_a', 'Bacteria_a', 'Crystals_a', 'Ferment_a', 'Small cells_a', 'Pathological cylinders_a', 'Slime_a'], dtype='object')

In [121]:
@interact
def to_plot(selected_anom = y.columns.astype(str)):
    """Scatter the three PLS score components pairwise, split by one anomaly flag.

    Draws three panels (component 1 vs 2, 1 vs 3 and 2 vs 3). In each panel the
    rows of ``PLS_Multi`` whose ``selected_anom`` value equals 0 are plotted
    first and the rows whose value equals 1 second, so the two groups get
    different colours. Rows with any other value are not drawn.

    Parameters
    ----------
    selected_anom : str
        Name of a column of ``PLS_Multi`` holding the 0/1 flag. The default is
        the column index of ``y``, which ``interact`` renders as a dropdown.
    """
    # Build each row subset once instead of re-filtering the frame per scatter call.
    flag = PLS_Multi[selected_anom]
    classes = (
        ('Normal', PLS_Multi[flag == 0]),
        ('Anomaly', PLS_Multi[flag == 1]),
    )

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    # The score columns of PLS_Multi are named 0, 1 and 2.
    for ax, (comp_x, comp_y) in zip(axes, ((0, 1), (0, 2), (1, 2))):
        for label, rows in classes:
            ax.scatter(rows[comp_x], rows[comp_y],
                       label='{}: {}'.format(label, len(rows)))
        ax.set_xlabel('PLS component {}'.format(comp_x + 1))
        ax.set_ylabel('PLS component {}'.format(comp_y + 1))
        ax.legend()
    fig.suptitle(str(selected_anom))

interactive(children=(Dropdown(description='selected_anom', options=('Density_a', 'pH_a', 'Protein_a', 'Biliru…

In [86]:
def metrics_ret(y_true, y_pred, y_prob, beta=5):
    """Compute the binary-classification metrics reported for one CV fold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Hard (already thresholded) predictions.
    y_prob : array-like
        Scores for the positive class, used only for ROC-AUC.
    beta : float, default 5
        Weight of recall relative to precision in the F-beta score. The default
        keeps the value that was previously hard-coded; ``beta=1`` gives F1.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f_beta, roc_auc]``. ``roc_auc`` is NaN
        when ``y_true`` contains fewer than two distinct labels.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='binary')
    rec = recall_score(y_true, y_pred, average='binary')
    f_score = fbeta_score(y_true, y_pred, beta=beta, average='binary')

    # ROC-AUC is undefined when only one class is present (older scikit-learn
    # raises ValueError there). Report NaN for such a fold so that a single
    # degenerate split does not abort the whole cross-validation summary.
    if len(set(y_true)) < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_true, y_prob, average='micro')

    return [acc, prec, rec, f_score, roc_auc]

def predict(clf, X, trsh):
    """Turn positive-class probabilities into hard 0/1 predictions.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``.
    X : array-like
        Samples passed unchanged to ``clf.predict_proba``.
    trsh : float
        Decision threshold; a sample is labelled 1 when the probability in
        column 1 is greater than or equal to it, otherwise 0.

    Returns
    -------
    list of int
        One 0/1 label per row of ``X``.
    """
    positive_probs = clf.predict_proba(X)[:, 1]
    return [1 if prob >= trsh else 0 for prob in positive_probs]



@interact
def to_show(selected_target = ['Density_a', 'pH_a', 'Protein_a',
                               'Glucose_a', 'Ketones_a', 'Leukocyte_a', 'Nitrite_a',
                               'Urobilinogen_a', 'Blood_a', 'Erythrocyte_a',
                               'Squamous cells_a', 'Hyaline cylinders_a', 'Bacteria_a',
                               'Crystals_a', 'Ferment_a', 'Small cells_a', 'Pathological cylinders_a',
                               'Slime_a', 'Spermatozoon_a']):
    """Cross-validate a QDA classifier and plot one ROC curve per fold.

    The first 36 columns of ``spectra``, the ``Dataset`` column (as groups) and
    the integer-cast ``selected_target`` column are handed to ``SGKF`` with
    ``n_splits=5``. For every returned train/test pair a fresh
    ``QuadraticDiscriminantAnalysis`` is fitted, hard predictions are taken at a
    probability threshold of 0.2, the fold metrics are collected with
    ``metrics_ret`` and the fold's ROC curve is drawn.

    Parameters
    ----------
    selected_target : str
        Name of the target column in ``spectra``; the list default is rendered
        by ``interact`` as a dropdown.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` summary of the per-fold Accuracy, Precision, Recall,
        F-score and ROC-AUC values.
    """
    channels = spectra.iloc[:, :36]
    labels = spectra[selected_target].astype(int)
    groups = spectra.Dataset

    fig = plt.figure(figsize=(15, 6))
    # Only the right-hand slot of the 1x2 grid is drawn. The left-hand slot
    # (121) was used by a discriminant-component distribution panel that is
    # currently disabled.
    ax2 = fig.add_subplot(122)
    ax2.set_title('ROC Curve')
    ax2.set_xlabel('FPR')
    ax2.set_ylabel('TPR')

    fold_rows = []

    # An earlier variant used StratifiedKFold(n_splits=10) here.
    TRAIN_ARR, TEST_ARR = SGKF(channels, groups, labels, n_splits=5)

    for train_data, test_data in zip(TRAIN_ARR, TEST_ARR):
        # NOTE(review): every fold is trained and scored on the 'TOTAL_a'
        # column, not on `selected_target`. Whether SGKF stores the passed
        # target under that name cannot be seen from this notebook — confirm.
        Train_X = train_data.drop(['Dataset', 'TOTAL_a'], axis=1)
        Test_X = test_data.drop(['Dataset', 'TOTAL_a'], axis=1)
        Train_y = train_data['TOTAL_a']
        Test_y = test_data['TOTAL_a']

        QDA_CV = QuadraticDiscriminantAnalysis().fit(Train_X, Train_y)

        # Hard labels at the 0.2 threshold; raw probabilities for ROC / AUC.
        Preds = predict(QDA_CV, Test_X, 0.2)
        Probs = QDA_CV.predict_proba(Test_X)

        fold_rows.append(metrics_ret(Test_y, Preds, Probs[:, 1]))
        fpr, tpr, _ = roc_curve(Test_y, Probs[:, 1])
        ax2.plot(fpr, tpr, alpha=0.5)

    # Reference lines: the diagonal is labelled "worst case" and the
    # upper-left corner path "best case" (labels are in Russian).
    ax2.plot([0, 1], [0, 1], c='r', linewidth=3, label='Худший случай')
    ax2.plot([0, 0, 1], [0, 1, 1], c='g', linewidth=3, label='Лучший случай')
    ax2.legend()
    ax2.grid()

    summary = pd.DataFrame(
        fold_rows,
        columns=['Accuracy', 'Precision', 'Recall', 'F-score', 'ROC-AUC'],
    )
    return summary.describe()

interactive(children=(Dropdown(description='selected_target', options=('Density_a', 'pH_a', 'Protein_a', 'Gluc…