In [301]:
import warnings

warnings.filterwarnings("ignore")

In [302]:
import pandas as pd
import xlwt
import numpy as np
import math
import statistics
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import (
    GroupKFold,
    KFold,
    cross_val_predict,
    train_test_split,
)
from imblearn.metrics import specificity_score, sensitivity_score

# Импортируем данные

## Данные по пациентам из анализов поликлинники

In [303]:
pacients = pd.read_excel(
    "../Data/20.02.04 список образцов Моча.xlsx", header=2
).convert_dtypes()

# Те пациенты у которых возраст отсутствует - Миша не занес в таблицу
# У них у всех нет ни одной аномалии
# Оставим это на экзамен
pacients.Age.fillna(0, inplace=True)
Exam_Pacients = pacients[pacients.Age == 0]["Dataset"].values

pacients = pacients[pacients.Age != 0]

## Спектральные данные

In [304]:
Specs_Chms_1_250 = pd.read_csv("../Data/urine_chm001-250.csv")
Specs_Chms_251_500 = pd.read_csv("../Data/urine_chm251-500.csv")

In [305]:
spectra = pd.read_excel("../Data/pi_cup.xlsx").convert_dtypes()
spectra.drop(columns=["Unnamed: 0"], inplace=True)
spectra.head()

Unnamed: 0,Ch_0_0,Ch_0_1,Ch_0_2,Ch_0_3,Ch_0_4,Ch_0_5,Ch_0_6,Ch_0_7,Ch_0_8,Ch_0_9,...,Squamous cells_a,Hyaline cylinders_a,Bacteria_a,Crystals_a,Ferment_a,Small cells_a,Pathological cylinders_a,Slime_a,Spermatozoon_a,TOTAL_a
0,116.2066,86.6787,501.8073,1966.517,1211.834,3746.119,420.2404,1028.663,1396.969,233.9503,...,0,1,1,0,0,0,1,0,0,8
1,110.4915,89.53624,495.1797,2054.57,1219.735,3993.029,410.4901,1043.288,1414.642,244.8904,...,0,1,1,0,0,0,1,0,0,8
2,103.8239,81.91614,484.7648,1963.676,1213.81,3925.869,400.7397,1040.363,1449.987,248.2566,...,0,1,1,0,0,0,1,0,0,8
3,109.539,85.72619,510.3286,2041.314,1270.105,4032.534,412.4402,1055.964,1477.758,249.0982,...,0,1,1,0,0,0,1,0,0,8
4,105.729,82.86865,488.552,1967.463,1214.797,3926.857,408.54,1042.313,1459.244,248.2566,...,0,1,1,0,0,0,1,0,0,8


# Разбиение категорий

In [142]:
X = spectra.iloc[:, 0:36]
X["Dataset"] = spectra["Dataset"]

# 1-ая категория : Интегральные характеристики
y1 = spectra.loc[:, ["Density_a", "pH_a"]]
y1["Dataset"] = spectra["Dataset"]

# 2-ая категория : Интегральные характеристики
y2 = spectra.loc[
    :,
    [
        "Protein_a",
        "Bilirubin_a",
        "Glucose_a",
        "Ketones_a",
        "Nitrite_a",
        "Urobilinogen_a",
    ],
]
y2["Dataset"] = spectra["Dataset"]

# 3-ья категория : Биологические компоненты
y3 = spectra.loc[
    :,
    [
        "Leukocyte_a",
        "Blood_a",
        "Erythrocyte_a",
        "Leukocyte_a.1",
        "Squamous cells_a",
        "Hyaline cylinders_a",
        "Bacteria_a",
        "Crystals_a",
        "Ferment_a",
        "Small cells_a",
        "Pathological cylinders_a",
        "Slime_a",
        "Spermatozoon_a",
    ],
]
y3["Dataset"] = spectra["Dataset"]

# Построение модели

## Необходимые утилиты

In [312]:
from sklearn.model_selection import ShuffleSplit

In [313]:
def sigmoid(x):
    return 1 / (1 + np.exp(10 * (-x + 0.5)))


def medical_metrics(y_test, y_pred):
    sensitivity = sensitivity_score(y_test, y_pred)
    specificity = specificity_score(y_test, y_pred)
    return sensitivity, specificity


def PLS_fit_predict(X_train, X_test, y_train, y_test, n_comp, condition=0.5):
    pls = PLSRegression(n_components=n_comp)
    pls.fit(X_train, y_train)
    pls_result = pls.predict(X_test)
    pls_result_train = pls.predict(X_train)
    pls_sigmoid = sigmoid(pls_result)
    pls_sigmoid_train = sigmoid(pls_result_train)
    binary_prediction = pls_sigmoid > condition
    binary_prediction_train = pls_sigmoid_train > condition
    y_pred = pd.DataFrame(binary_prediction)
    y_pred.columns = list(y_test)
    y_pred_train = pd.DataFrame(binary_prediction_train)
    y_pred_train.columns = list(y_train)

    return y_pred_train, y_pred


def calculate_metrics(y_test, y_pred, iteration, n_comp, Anomaly, condition):
    metrics_list = []
    metrics_list.append(["iteration", "n_comp", "Anomaly", "condition", "sens", "spec"])
    true = y_test[Anomaly]
    pred = y_pred[Anomaly]
    sens, spec = medical_metrics(true, pred)
    metrics_list.append([iteration, n_comp, Anomaly, condition, sens, spec])
    metrics_data = pd.DataFrame(metrics_list)
    new_header = metrics_data.iloc[0]
    metrics_data = metrics_data[1:]
    metrics_data.columns = new_header

    return metrics_data


def split_data(X, y, Anomaly):
    groups = X.drop_duplicates("Dataset").Dataset.values
    labels = y.drop_duplicates("Dataset")
    train_g, test_g, _, _ = train_test_split(
        groups, labels, stratify=labels[Anomaly], test_size=0.2
    )
    train_index = X[X.Dataset.isin(train_g)].index
    test_index = X[X.Dataset.isin(test_g)].index
    X_train, X_test, y_train, y_test = (
        X.iloc[train_index],
        X.iloc[test_index],
        y.iloc[train_index],
        y.iloc[test_index],
    )
    X_train.drop(columns="Dataset", inplace=True)
    X_test.drop(columns="Dataset", inplace=True)
    y_train.drop(columns="Dataset", inplace=True)
    y_test.drop(columns="Dataset", inplace=True)

    return X_train, X_test, y_train, y_test


def cross_val(X, y, n_comp, N_splits=50, condition=0.5):
    anomaly_list = list(y)
    anomaly_list.pop()
    stratified_by = np.random.choice(anomaly_list, size=1)

    groups = X.drop_duplicates("Dataset").Dataset.values
    labels = y.drop_duplicates("Dataset")

    metrics_data_train = pd.DataFrame()
    metrics_data = pd.DataFrame()

    #     kf = KFold(n_splits=N_splits, shuffle=True, random_state=42)
    ss = ShuffleSplit(n_splits=N_splits, test_size=0.3, random_state=42)

    for anomaly_name in anomaly_list:
        iteration = 0
        for train_inds, test_inds in ss.split(groups, labels):
            iteration += 1

            train_g, test_g = groups[train_inds], groups[test_inds]
            X_train, X_test = (
                X[X.Dataset.isin(train_g)].drop("Dataset", axis=1),
                X[X.Dataset.isin(test_g)].drop("Dataset", axis=1),
            )
            y_train, y_test = (
                y[y.Dataset.isin(train_g)].drop("Dataset", axis=1).astype("bool"),
                y[y.Dataset.isin(test_g)].drop("Dataset", axis=1).astype("bool"),
            )

            y_pred_train, y_pred = PLS_fit_predict(
                X_train, X_test, y_train, y_test, n_comp, condition
            )

            to_concat = calculate_metrics(
                y_test, y_pred, iteration, n_comp, anomaly_name, condition
            )
            metrics_data = pd.concat([metrics_data, to_concat])
            metrics_data = metrics_data.reset_index(drop=True)

            to_concat_train = calculate_metrics(
                y_train, y_pred_train, iteration, n_comp, anomaly_name, condition
            )
            metrics_data_train = pd.concat([metrics_data_train, to_concat_train])
            metrics_data_train = metrics_data_train.reset_index(drop=True)

    return metrics_data_train, metrics_data

## Подбор гиперпараметров

In [314]:
from itertools import product
from tqdm.notebook import tqdm

N_Comps_ARR = np.arange(2, 36)
Condition_ARR = np.arange(0.01, 0.35, 0.01)

Category_1_Metrics_Train = pd.DataFrame()
Category_1_Metrics_Test = pd.DataFrame()

Pairs = list(product(N_Comps_ARR, Condition_ARR))
for N_comps, condition in tqdm(Pairs, total=len(Pairs)):
    to_concat_train, to_concat_test = cross_val(
        X, y1, n_comp=N_comps, N_splits=50, condition=condition
    )
    Category_1_Metrics_Train = pd.concat([Category_1_Metrics_Train, to_concat_train])
    Category_1_Metrics_Test = pd.concat([Category_1_Metrics_Test, to_concat_test])

HBox(children=(FloatProgress(value=0.0, max=1156.0), HTML(value='')))




In [346]:
N_Comps_ARR = np.arange(2, 36)
Condition_ARR = np.arange(0.01, 0.35, 0.01)

Category_2_Metrics_Train = pd.DataFrame()
Category_2_Metrics_Test = pd.DataFrame()

Pairs = list(product(N_Comps_ARR, Condition_ARR))
for N_comps, condition in tqdm(Pairs, total=len(Pairs)):
    to_concat_train, to_concat_test = cross_val(
        X, y2, n_comp=N_comps, N_splits=50, condition=condition
    )
    Category_2_Metrics_Train = pd.concat([Category_2_Metrics_Train, to_concat_train])
    Category_2_Metrics_Test = pd.concat([Category_2_Metrics_Test, to_concat_test])

HBox(children=(FloatProgress(value=0.0, max=1156.0), HTML(value='')))




In [347]:
N_Comps_ARR = np.arange(2, 36)
Condition_ARR = np.arange(0.01, 0.35, 0.01)

Category_3_Metrics_Train = pd.DataFrame()
Category_3_Metrics_Test = pd.DataFrame()

Pairs = list(product(N_Comps_ARR, Condition_ARR))
for N_comps, condition in tqdm(Pairs, total=len(Pairs)):
    to_concat_train, to_concat_test = cross_val(
        X, y3, n_comp=N_comps, N_splits=50, condition=condition
    )
    Category_3_Metrics_Train = pd.concat([Category_3_Metrics_Train, to_concat_train])
    Category_3_Metrics_Test = pd.concat([Category_3_Metrics_Test, to_concat_test])

HBox(children=(FloatProgress(value=0.0, max=1156.0), HTML(value='')))




In [348]:
def my_fbeta_score(sens, spec, beta=1):
    try:
        res = (1 + beta ** 2) * (spec * sens) / (beta ** 2 * spec + sens)
    except ZeroDivisionError:
        res = 0
    return res

In [349]:
Category_1_Metrics_Train = Category_1_Metrics_Train.convert_dtypes()
Category_2_Metrics_Train = Category_2_Metrics_Train.convert_dtypes()
Category_3_Metrics_Train = Category_3_Metrics_Train.convert_dtypes()

Category_1_Metrics_Test = Category_1_Metrics_Test.convert_dtypes()
Category_2_Metrics_Test = Category_2_Metrics_Test.convert_dtypes()
Category_3_Metrics_Test = Category_3_Metrics_Test.convert_dtypes()

In [350]:
Category_1_Metrics_Train["harm_avg"] = my_fbeta_score(
    Category_1_Metrics_Train.sens.values, Category_1_Metrics_Train.spec.values
)

Category_1_Metrics_Test["harm_avg"] = my_fbeta_score(
    Category_1_Metrics_Test.sens.values, Category_1_Metrics_Test.spec.values
)


Category_2_Metrics_Train["harm_avg"] = my_fbeta_score(
    Category_2_Metrics_Train.sens.values, Category_2_Metrics_Train.spec.values
)

Category_2_Metrics_Test["harm_avg"] = my_fbeta_score(
    Category_2_Metrics_Test.sens.values, Category_2_Metrics_Test.spec.values
)


Category_3_Metrics_Train["harm_avg"] = my_fbeta_score(
    Category_3_Metrics_Train.sens.values, Category_3_Metrics_Train.spec.values
)

Category_3_Metrics_Test["harm_avg"] = my_fbeta_score(
    Category_3_Metrics_Test.sens.values, Category_3_Metrics_Test.spec.values
)

In [351]:
def CV_Average(Metrics_Data, N_Splits=50):

    Anomaly_list = Metrics_Data.iloc[::N_Splits].Anomaly.values.astype(str)
    N_Comps = Metrics_Data.iloc[::N_Splits].n_comp.values.astype(int)
    Conditions = Metrics_Data.iloc[::N_Splits].condition.values.astype(float)

    Mean_Sens = np.zeros(shape=(Metrics_Data.shape[0] // N_Splits))
    Mean_Spec = np.zeros(shape=(Metrics_Data.shape[0] // N_Splits))
    Mean_Harm = np.zeros(shape=(Metrics_Data.shape[0] // N_Splits))

    for i in range(N_Splits):
        Mean_Sens += Metrics_Data.iloc[i::N_Splits].sens.values.astype(float)
        Mean_Spec += Metrics_Data.iloc[i::N_Splits].spec.values.astype(float)
        Mean_Harm += Metrics_Data.iloc[i::N_Splits].harm_avg.values.astype(float)

    Mean_Sens /= N_Splits
    Mean_Spec /= N_Splits
    Mean_Harm /= N_Splits

    Result = pd.DataFrame(
        columns=[
            "N_Comp",
            "Condition",
            "Anomaly",
            "Mean_Sens",
            "Mean_Spec",
            "Mean_Harm",
        ]
    )
    Result["N_Comp"] = N_Comps
    Result["Condition"] = Conditions
    Result["Anomaly"] = Anomaly_list
    Result["Mean_Sens"] = Mean_Sens
    Result["Mean_Spec"] = Mean_Spec
    Result["Mean_Harm"] = Mean_Harm

    return Result

In [355]:
Category_1_Average_Metrics_Train = CV_Average(Category_1_Metrics_Train)
Category_2_Average_Metrics_Train = CV_Average(Category_2_Metrics_Train)
Category_3_Average_Metrics_Train = CV_Average(Category_3_Metrics_Train)

Category_1_Average_Metrics_Test = CV_Average(Category_1_Metrics_Test)
Category_2_Average_Metrics_Test = CV_Average(Category_2_Metrics_Test)
Category_3_Average_Metrics_Test = CV_Average(Category_3_Metrics_Test)

### Результат : лучшие гиперпараметры

#### Категория 1

In [366]:
best = Category_1_Average_Metrics_Train.groupby("Anomaly").max().Mean_Harm.values

Cat1Train_BestHyperParams = Category_1_Average_Metrics_Train[
    Category_1_Average_Metrics_Train.Mean_Harm.isin(best)
]
Cat1Train_BestHyperParams

Unnamed: 0,N_Comp,Condition,Anomaly,Mean_Sens,Mean_Spec,Mean_Harm
1905,30,0.01,pH_a,0.970339,0.714091,0.821216
2300,35,0.29,Density_a,0.737623,0.78857,0.761544


In [367]:
best = Category_1_Average_Metrics_Test.groupby("Anomaly").max().Mean_Harm.values

Cat1Test_BestHyperParams = Category_1_Average_Metrics_Test[
    Category_1_Average_Metrics_Test.Mean_Harm.isin(best)
]
Cat1Test_BestHyperParams

Unnamed: 0,N_Comp,Condition,Anomaly,Mean_Sens,Mean_Spec,Mean_Harm
262,5,0.3,Density_a,0.637999,0.775508,0.696446
1769,28,0.01,pH_a,0.497969,0.686016,0.552139


#### Категория 2

In [368]:
best = Category_2_Average_Metrics_Train.groupby("Anomaly").max().Mean_Harm.values

Cat2Train_BestHyperParams = Category_2_Average_Metrics_Train[
    Category_2_Average_Metrics_Train.Mean_Harm.isin(best[1:])
]
Cat1Train_BestHyperParams

Unnamed: 0,N_Comp,Condition,Anomaly,Mean_Sens,Mean_Spec,Mean_Harm
1905,30,0.01,pH_a,0.970339,0.714091,0.821216
2300,35,0.29,Density_a,0.737623,0.78857,0.761544


In [370]:
best = Category_2_Average_Metrics_Test.groupby("Anomaly").max().Mean_Harm.values

Cat2Test_BestHyperParams = Category_2_Average_Metrics_Test[
    Category_2_Average_Metrics_Test.Mean_Harm.isin(best[1:])
]
Cat2Test_BestHyperParams

Unnamed: 0,N_Comp,Condition,Anomaly,Mean_Sens,Mean_Spec,Mean_Harm
4,2,0.01,Nitrite_a,0.98,0.746831,0.837707
207,3,0.01,Ketones_a,0.62177,0.681534,0.618298
414,4,0.02,Protein_a,0.712015,0.839791,0.759406
821,6,0.01,Urobilinogen_a,0.7939,0.73768,0.740234
1838,11,0.01,Glucose_a,0.577915,0.647676,0.586858


#### Категория 3

In [372]:
best = Category_3_Average_Metrics_Train.groupby("Anomaly").max().Mean_Harm.values

Cat3Train_BestHyperParams = Category_3_Average_Metrics_Train[
    Category_3_Average_Metrics_Train.Mean_Harm.isin(best)
]
Cat3Train_BestHyperParams

Unnamed: 0,N_Comp,Condition,Anomaly,Mean_Sens,Mean_Spec,Mean_Harm
12,2,0.01,Spermatozoon_a,0.0,0.96,
25,2,0.02,Spermatozoon_a,0.0,0.96,
38,2,0.03,Spermatozoon_a,0.0,0.96,
51,2,0.04,Spermatozoon_a,0.0,0.96,
64,2,0.05,Spermatozoon_a,0.0,0.96,
...,...,...,...,...,...,...
14975,35,0.30,Spermatozoon_a,0.0,0.96,
14988,35,0.31,Spermatozoon_a,0.0,0.96,
15001,35,0.32,Spermatozoon_a,0.0,0.96,
15014,35,0.33,Spermatozoon_a,0.0,0.96,


In [373]:
best = Category_3_Average_Metrics_Test.groupby("Anomaly").max().Mean_Harm.values

Cat3Test_BestHyperParams = Category_3_Average_Metrics_Test[
    Category_3_Average_Metrics_Test.Mean_Harm.isin(best[:-2])
]
Cat3Test_BestHyperParams

Unnamed: 0,N_Comp,Condition,Anomaly,Mean_Sens,Mean_Spec,Mean_Harm
483,3,0.04,Erythrocyte_a,0.671399,0.812607,0.729886
906,4,0.02,Small cells_a,0.636521,0.879549,0.725514
917,4,0.03,Crystals_a,0.737503,0.659348,0.688897
929,4,0.04,Bacteria_a,0.82529,0.793413,0.806254
1363,5,0.03,Slime_a,0.888426,0.847324,0.864273
4443,12,0.02,Pathological cylinders_a,0.652291,0.854299,0.726754
5827,15,0.07,Leukocyte_a.1,0.720262,0.759239,0.735845
6635,17,0.01,Hyaline cylinders_a,0.936189,0.679402,0.781306
7111,18,0.04,Leukocyte_a,0.728951,0.736013,0.72756
9309,23,0.03,Blood_a,0.663548,0.741297,0.694077


### Для всех аномалий

In [374]:
BestHyperParams = pd.concat(
    [Cat1Test_BestHyperParams, Cat2Test_BestHyperParams, Cat3Test_BestHyperParams],
    axis=0,
)

In [375]:
BestHyperParams

Unnamed: 0,N_Comp,Condition,Anomaly,Mean_Sens,Mean_Spec,Mean_Harm
262,5,0.3,Density_a,0.637999,0.775508,0.696446
1769,28,0.01,pH_a,0.497969,0.686016,0.552139
4,2,0.01,Nitrite_a,0.98,0.746831,0.837707
207,3,0.01,Ketones_a,0.62177,0.681534,0.618298
414,4,0.02,Protein_a,0.712015,0.839791,0.759406
821,6,0.01,Urobilinogen_a,0.7939,0.73768,0.740234
1838,11,0.01,Glucose_a,0.577915,0.647676,0.586858
483,3,0.04,Erythrocyte_a,0.671399,0.812607,0.729886
906,4,0.02,Small cells_a,0.636521,0.879549,0.725514
917,4,0.03,Crystals_a,0.737503,0.659348,0.688897


In [400]:
BestHyperParams.to_csv("./Saved_Models/BestHyperParams.csv", index=0)

In [511]:
Category_1_Metrics_Train.to_csv("./Saved_Models/Category_1_Metrics_Train.csv", index=0)
Category_1_Metrics_Test.to_csv("./Saved_Models/Category_1_Metrics_Test.csv", index=0)
Category_1_Average_Metrics_Train.to_csv(
    "./Saved_Models/Category_1_Average_Metrics_Train.csv", index=0
)
Category_1_Average_Metrics_Test.to_csv(
    "./Saved_Models/Category_1_Average_Metrics_Test.csv", index=0
)

Category_2_Metrics_Train.to_csv("./Saved_Models/Category_2_Metrics_Train.csv", index=0)
Category_2_Metrics_Test.to_csv("./Saved_Models/Category_2_Metrics_Test.csv", index=0)
Category_2_Average_Metrics_Train.to_csv(
    "./Saved_Models/Category_2_Average_Metrics_Train.csv", index=0
)
Category_2_Average_Metrics_Test.to_csv(
    "./Saved_Models/Category_2_Average_Metrics_Test.csv", index=0
)

Category_3_Metrics_Train.to_csv("./Saved_Models/Category_3_Metrics_Train.csv", index=0)
Category_3_Metrics_Test.to_csv("./Saved_Models/Category_3_Metrics_Test.csv", index=0)
Category_3_Average_Metrics_Train.to_csv(
    "./Saved_Models/Category_3_Average_Metrics_Train.csv", index=0
)
Category_3_Average_Metrics_Test.to_csv(
    "./Saved_Models/Category_3_Average_Metrics_Test.csv", index=0
)


## Обучение финальной модели

[Сохранение ML модели](https://www.geeksforgeeks.org/saving-a-machine-learning-model/)

[Создание кастомного классификатора](http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/)

[Пример стекинга от Дьяконова](https://github.com/Dyakonov/ml_hacks/blob/master/dj_stacking.ipynb)

In [506]:
Category_1 = set(y1.columns[:-1])
Category_2 = set(y2.columns[:-1])
Category_3 = set(y3.columns[:-1])

In [432]:
# %%pycodestyle
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cross_decomposition import PLSRegression

Params = pd.read_csv("./Saved_Models/BestHyperParams.csv")

class UrineAnomalyClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, Params = Params):
#         self.n_components = n_components
#         self.clf_thrs = clf_thrs
        self.Params = Params
        self
    
    def fit(self, X, y):
        raise NotImplementedError
        
    def predict(self, X):
        raise NotImplementedError
        
    def 

In [433]:
clf = UrineAnomalyClassifier()

In [396]:
STD_df = Category_3_Metrics_Test.groupby(["Anomaly", "n_comp", "condition"]).std()

In [397]:
STD_df.groupby("Anomaly").mean()

Unnamed: 0_level_0,iteration,sens,spec,harm_avg
Anomaly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bacteria_a,14.57738,0.093508,0.031414,0.079306
Blood_a,14.57738,0.101459,0.026967,0.109347
Crystals_a,14.57738,0.078073,0.034582,0.093053
Erythrocyte_a,14.57738,0.101084,0.032918,0.098115
Ferment_a,14.57738,0.007464,0.094509,0.007251
Hyaline cylinders_a,14.57738,0.178322,0.009587,0.228062
Leukocyte_a,14.57738,0.085563,0.034814,0.08058
Leukocyte_a.1,14.57738,0.079069,0.038748,0.059821
Pathological cylinders_a,14.57738,0.164283,0.01296,0.192152
Slime_a,14.57738,0.113221,0.021746,0.124701
