In [1]:
from copy import deepcopy

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score,\
                            recall_score, roc_auc_score, roc_curve, cohen_kappa_score, fbeta_score
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.cross_decomposition import PLSRegression, PLSCanonical
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# Widen pandas' display limits so large frames are printed without truncation:
# up to 500 rows, up to 500 columns, and 1000 characters per output line.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [2]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [3]:
# Load the sample list; header=2 makes the third row of the sheet the column names.
samples = (
    pd.read_excel(u'20.01.23 список образцов Моча.xlsx', header=2)
    .dropna(subset=['Gender'])   # discard rows that have no Gender value
    .fillna(0)                   # every remaining missing cell becomes 0
    .reset_index(drop=True)      # renumber rows 0..n-1 after the drop
)

In [4]:
# Read both halves of the spectra export and stack them under a fresh 0..n-1 index.
spectra_init = pd.concat(
    [pd.read_csv(csv_name) for csv_name in ('urine_chm001-250.csv', 'urine_chm251-500.csv')],
    ignore_index=True,
)

# MinMaxScaler rescales column by column, so the frame is transposed before and
# after: each row of the "Ch*" columns is scaled independently to (-1, 1).
# The resulting frame has integer column labels 0..k-1.
mmscaler = MinMaxScaler(feature_range=(-1,1))
spectra = pd.DataFrame(mmscaler.fit_transform(spectra_init.filter(regex='^Ch').T).T)

# 'Dataset' plus every column of `samples` whose name ends in "_a".
frame_lol = samples[['Dataset']+list(samples.filter(regex=r'_a$').columns)]

# Attach the "_a" columns to each spectrum via its 'Dataset' key, then drop the
# spectra for which any of those columns is missing after the left join.
# The index is intentionally not reset here (same as before).
spectra = (
    spectra
    .assign(Dataset=spectra_init['Dataset'])
    .merge(frame_lol, how='left', on='Dataset')
    .dropna(subset=samples.filter(regex=r'_a$').columns)
)

## PLS (LDA) + Differentiation by 2 diodes + LogReg

* Используем LDA для снижения размерности пространства до одного признака для каждого диода в отдельности, получаем двумерное признаковое описание
* На новых признаках используем LogReg

### Проблема
__Жесткая несбалансированность классов__

In [33]:
@interact
def to_show(selected_target = ['Density_a', 'pH_a', 'Protein_a',
                               'Glucose_a', 'Ketones_a', 'Leukocyte_a', 'Nitrite_a',
                               'Urobilinogen_a', 'Blood_a', 'Erythrocyte_a',
                               'Squamous cells_a', 'Hyaline cylinders_a', 'Bacteria_a',
                               'Crystals_a', 'Ferment_a', 'Small cells_a', 'Pathological cylinders_a',
                               'Slime_a', 'Spermatozoon_a']):
    """Plot how well one-component LDA projections separate classes 0 and 1.

    Left panel: scatter of two LDA scores, one fitted on the even-position
    channel columns and one on the odd-position channel columns of `spectra`.
    Right panel: distribution of a single LDA score fitted on all channel
    columns together.

    Every model is fitted on the whole `spectra` frame (no train/test split),
    so the figure shows in-sample separation only. Only rows whose target
    equals 0 or 1 are drawn.

    Parameters
    ----------
    selected_target : str
        Name of the `spectra` column used as the class label. `@interact`
        turns the default list into a dropdown and passes the chosen entry.
    """
    # Number of leading channel columns in `spectra`.
    # NOTE(review): assumes exactly 36 "Ch*" columns precede 'Dataset' - confirm.
    n_channels = 36

    X = spectra.iloc[:, :n_channels]
    # Even / odd column positions; presumably the two diodes mentioned in the
    # notebook text - verify against the acquisition layout.
    X1 = spectra.iloc[:, 0:n_channels:2]
    X2 = spectra.iloc[:, 1:n_channels:2]
    y = spectra[selected_target]

    # Single LDA score computed from all channels at once.
    LDA = LinearDiscriminantAnalysis(n_components=1)
    LDA_Data = pd.DataFrame(LDA.fit_transform(X, y))
    # .values: `spectra` keeps its pre-dropna index, so avoid index alignment.
    LDA_Data['target'] = y.values

    # One LDA score per channel subset -> two-dimensional description.
    LDA_1 = LinearDiscriminantAnalysis(n_components=1)
    LDA_spectra = pd.DataFrame(LDA_1.fit_transform(X1, y))
    LDA_2 = LinearDiscriminantAnalysis(n_components=1)
    LDA_spectra[1] = LDA_2.fit_transform(X2, y)
    LDA_spectra['target'] = y.values

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    for class_label in (0, 1):
        class_points = LDA_spectra[LDA_spectra.target == class_label]
        ax1.scatter(class_points[0], class_points[1], label=str(class_label))

        # Draw on ax2 explicitly instead of relying on pyplot's current axes.
        # NOTE: sns.distplot is deprecated in recent seaborn releases; it is
        # kept here because the rest of the notebook is written against it.
        class_scores = LDA_Data[LDA_Data.target == class_label][0]
        sns.distplot(class_scores, label=str(class_label), ax=ax2)

    ax1.set(title='LDA per channel subset',
            xlabel='LDA score (even-position channels)',
            ylabel='LDA score (odd-position channels)')
    ax1.legend(title=selected_target)

    ax2.set(title='LDA on all channels', xlabel='LDA score')
    ax2.legend(title=selected_target)

interactive(children=(Dropdown(description='selected_target', options=('Density_a', 'pH_a', 'Protein_a', 'Gluc…

# Распределения стали хуже, продолжать смысла нет