In [1]:
from copy import deepcopy

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score,\
                            recall_score, roc_auc_score, roc_curve, cohen_kappa_score, fbeta_score
from sklearn.model_selection import GroupKFold


import catboost
from catboost import *

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings
warnings.filterwarnings('ignore')

In [2]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [3]:
samples = pd.read_excel(u'20.01.23 список образцов Моча.xlsx', header=2)
samples.dropna(subset=['Gender'], inplace=True)

samples.fillna(0, inplace=True)
samples.reset_index(drop=True, inplace=True)

In [4]:
spectra1 = pd.read_csv('urine_chm001-250.csv')
spectra2 = pd.read_csv('urine_chm251-500.csv')
spectra_init = pd.concat([spectra1, spectra2],  ignore_index=True)

spectra = spectra_init.filter(regex='^Ch')

mmscaler = MinMaxScaler(feature_range=(-1,1))
# spectra = pd.DataFrame(mmscaler.fit_transform(spectra.T).T)
del(spectra1, spectra2)

frame_lol = samples[['Gender','Age','Dataset']+list(samples.filter(regex=r'_a$').columns)]
spectra['Dataset'] = spectra_init['Dataset']
spectra = pd.merge(left=spectra, right=frame_lol, how='left', on='Dataset')
spectra.dropna(subset=samples.filter(regex=r'_a$').columns, inplace=True)

In [19]:
@interact
def to_show(selected_target = ['Density_a', 'pH_a', 'Protein_a',
                               'Glucose_a', 'Ketones_a', 'Leukocyte_a', 'Nitrite_a',
                               'Urobilinogen_a', 'Blood_a', 'Erythrocyte_a',
                               'Squamous cells_a', 'Hyaline cylinders_a', 'Bacteria_a',
                               'Crystals_a', 'Ferment_a', 'Small cells_a', 'Pathological cylinders_a',
                               'Slime_a', 'Spermatozoon_a']):
    
    X = spectra.iloc[:,:39]
    y = spectra[selected_target]
    groups = X.Dataset.values
    X.drop('Dataset', axis=1, inplace=True)
    
    gkf = GroupKFold(3)
    
    # counter
    cntr = 0
    plt_flag = False

    for train_indicies, test_indicies in gkf.split(X, y, groups):
        cntr+=1
        if cntr == 3: plt_flag = True

        Train_X, Test_X = X.iloc[train_indicies, :], X.iloc[test_indicies, :]
        Train_y , Test_y = y.iloc[train_indicies], y.iloc[test_indicies]

#         # initialize Pool
#         train_pool = Pool(Train_X, Train_y, cat_features=['Gender'])
#         test_pool = Pool(Test_X, cat_features=[1]) 

        # specify the training parameters 
        model = CatBoostClassifier(
            loss_function='Logloss',
            iterations=5000,
            random_seed=21,
            learning_rate=0.005,
            custom_loss=['AUC', 'Accuracy', 'Precision', 'Recall'],
            eval_metric='Precision',
            use_best_model=True,
            early_stopping_rounds=100,
            class_weights=(0.05, 0.95)
        )
        model.fit(
            Train_X, Train_y,
            cat_features=['Gender'],
            eval_set=(Test_X, Test_y),
            verbose=False,
            plot=plt_flag
        )
        # make the prediction using the resulting model
        preds = model.predict(Test_X)
        print(classification_report(Test_y, preds))
        

interactive(children=(Dropdown(description='selected_target', options=('Density_a', 'pH_a', 'Protein_a', 'Gluc…

## CV for all Anomalies : threshold finding

In [20]:
from tqdm import tqdm_notebook
from itertools import product

In [21]:
def predict(clf, X, trsh):
    probs = clf.predict_proba(X)
    preds = list(map(lambda x: 1 if x>=trsh else 0, probs[:,1]))
    return preds

def metrics_ret(y_true, y_pred, y_prob):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cohen_kappa = cohen_kappa_score(y_true, y_pred)
    return np.array([acc, prec, rec, f1, cohen_kappa])

In [22]:
Anomaly_Cols = ['Density_a', 'pH_a', 'Protein_a',
               'Glucose_a', 'Ketones_a', 'Leukocyte_a', 'Nitrite_a',
               'Urobilinogen_a', 'Blood_a', 'Erythrocyte_a',
               'Squamous cells_a', 'Hyaline cylinders_a', 'Bacteria_a',
               'Crystals_a', 'Ferment_a', 'Small cells_a', 'Pathological cylinders_a',
               'Slime_a', 'Spermatozoon_a']

Metrics = ['Accuracy', 'Precision', 'Recall', 'F-Score', 'Cohen`s kappa', 'Norm/Anom']

Exam_results_df = pd.DataFrame(index=pd.MultiIndex.from_product([Anomaly_Cols,['CV', 'EXAM']]), columns=Metrics)

In [24]:
## Поиск порога

def max_metric(name, y_true, y_pred):
    if name=='precision':
        return precision_score(y_true, y_pred)
    elif name=='recall':
        return recall_score(y_true, y_pred)
    elif name=='f-score':
        return fbeta_score(y_true, y_pred, beta=1)
    elif name=='roc-auc':
        return roc_auc_score(y_true, y_pred)
    elif name=='cohen':
        return cohen_kappa_score(y_true, y_pred)
    

Best_trsh_regul_dict = {}


X = spectra.iloc[:,:39]

for ANOMAL in tqdm_notebook(Anomaly_Cols):
    Metrics = dict()
    y = spectra[ANOMAL]
    Groups = X.Dataset.values
    X.drop('Dataset', axis=1, inplace=True)


    BEST_TRSH = 0.5
    gkf = GroupKFold(4)
    for TRSH, LR in product(np.arange(0.05,1.0,0.05), [0.08, 0.1, 0.12]):
        temp_results = []
        for train_inds, test_inds in gkf.split(X, y, Groups):
            Train_X, Train_y = X.iloc[train_inds,:], y.iloc[train_inds]
            Test_X, Test_y =  X.iloc[test_inds,:], y.iloc[test_inds]

#             LDA = LinearDiscriminantAnalysis(n_components=1)
#             LDA_model = LDA.fit(Train_X, Train_y)
#             LDA_train = pd.DataFrame(LDA_model.transform(Train_X))
#             LDA_train['target'] = Train_y.values
#             LDA_test = pd.DataFrame(LDA_model.transform(Test_X))
#             LDA_test['target'] = Test_y.values

            model = CatBoostClassifier(
            loss_function='Logloss',
            iterations=500,
            random_seed=21,
            learning_rate=LR,
            custom_loss=['AUC', 'Accuracy', 'Precision', 'Recall'],
            #eval_metric='AUC',
            use_best_model=True,
            early_stopping_rounds=20,
            class_weights=(0.05, 0.95)
            )
    
            model.fit(
                Train_X, Train_y,
                cat_features=['Gender'],
                eval_set=(Test_X, Test_y),
                verbose=False,
                plot=False
            )
            Preds = predict(model, Test_X, TRSH)
            temp_results.append(max_metric('f-score', Test_y, Preds))
            
        Metrics[np.mean(temp_results)] = (TRSH.round(2), LR)
    
    BEST_TRSH_LR = Metrics[max(Metrics)]
    Best_trsh_regul_dict[ANOMAL] = BEST_TRSH_LR

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))

AttributeError: 'DataFrame' object has no attribute 'Dataset'