In [7]:
import pandas as pd
import os
import numpy as np

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score, classification_report

import matplotlib.pyplot as plt

In [18]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

In [46]:
PCA_Data = pd.read_csv('PCA_Data_Chms_1_223.csv')
AE_Data = pd.read_csv('AE_Data_Chms_1_223.csv')

In [47]:
# Add_Features = pd.read_csv('Urine_Chms_1_223_Dataset.csv')[['Пол', 'Возраст']]

# PCA_Data = pd.concat([Add_Features, PCA_Data], axis=1)
# AE_Data = pd.concat([Add_Features, AE_Data], axis=1)

PCA_Data['sex'] = [1 if x=='m' else 0 for x in pd.read_csv('Urine_Chms_1_223_Dataset.csv')['Пол']]
AE_Data['sex'] = [1 if x=='m' else 0 for x in pd.read_csv('Urine_Chms_1_223_Dataset.csv')['Пол']]

# train_test_split by Chms

In [49]:
chms = PCA_Data.Dataset.unique()

In [92]:
tr_val_chms, test_chms, _, _, = train_test_split(chms, chms, test_size=0.2, random_state=123)
train_chms, val_chms, _, _, = train_test_split(tr_val_chms, tr_val_chms, test_size=0.2, random_state=123)

train_inds = list(map(lambda x: x in train_chms, AE_Data.Dataset))
val_inds = list(map(lambda x: x in val_chms, AE_Data.Dataset))
test_inds = list(map(lambda x: x in test_chms, AE_Data.Dataset))

In [93]:
AE_Train_X = AE_Data[train_inds].drop(['TOTAL', 'Dataset'], axis=1)
AE_Train_y = AE_Data[train_inds]['TOTAL']

AE_Val_X = AE_Data[val_inds].drop(['TOTAL', 'Dataset'], axis=1)
AE_Val_y = AE_Data[val_inds]['TOTAL']

AE_Test_X = AE_Data[test_inds].drop(['TOTAL', 'Dataset'], axis=1)
AE_Test_y = AE_Data[test_inds]['TOTAL']


PCA_Train_X = PCA_Data[train_inds].drop(['TOTAL', 'Dataset'], axis=1)
PCA_Train_y = PCA_Data[train_inds]['TOTAL']

PCA_Val_X = PCA_Data[val_inds].drop(['TOTAL', 'Dataset'], axis=1)
PCA_Val_y = PCA_Data[val_inds]['TOTAL']

PCA_Test_X = PCA_Data[test_inds].drop(['TOTAL', 'Dataset'], axis=1)
PCA_Test_y = PCA_Data[test_inds]['TOTAL']

In [94]:
def to_bin(y):
    if y==0:
        return 0
    else:
        return 1

# SVC by PCA

In [95]:
for kern in kernels:
    model = SVC(kernel=kern, gamma='scale')
    model.fit(PCA_Train_X, PCA_Train_y.apply(to_bin))
    val_preds = model.predict(PCA_Val_X)
    print(kern, classification_report(PCA_Val_y.apply(to_bin), val_preds), sep='\n')

linear
              precision    recall  f1-score   support

           0       0.56      0.64      0.60       109
           1       0.80      0.74      0.77       212

    accuracy                           0.70       321
   macro avg       0.68      0.69      0.68       321
weighted avg       0.72      0.70      0.71       321

poly
              precision    recall  f1-score   support

           0       0.50      0.09      0.16       109
           1       0.67      0.95      0.79       212

    accuracy                           0.66       321
   macro avg       0.59      0.52      0.47       321
weighted avg       0.61      0.66      0.57       321

rbf
              precision    recall  f1-score   support

           0       0.58      0.66      0.62       109
           1       0.81      0.75      0.78       212

    accuracy                           0.72       321
   macro avg       0.69      0.71      0.70       321
weighted avg       0.73      0.72      0.72       321

sig

# SVC by AE

In [96]:
for kern in kernels:
    model = SVC(kernel=kern, gamma='scale')
    model.fit(AE_Train_X, AE_Train_y.apply(to_bin))
    val_preds = model.predict(AE_Val_X)
    print(kern, classification_report(AE_Val_y.apply(to_bin), val_preds), sep='\n')

linear
              precision    recall  f1-score   support

           0       0.59      0.65      0.62       109
           1       0.81      0.77      0.79       212

    accuracy                           0.73       321
   macro avg       0.70      0.71      0.70       321
weighted avg       0.74      0.73      0.73       321

poly
              precision    recall  f1-score   support

           0       0.58      0.63      0.61       109
           1       0.80      0.77      0.79       212

    accuracy                           0.72       321
   macro avg       0.69      0.70      0.70       321
weighted avg       0.73      0.72      0.73       321

rbf
              precision    recall  f1-score   support

           0       0.56      0.68      0.61       109
           1       0.81      0.72      0.77       212

    accuracy                           0.71       321
   macro avg       0.69      0.70      0.69       321
weighted avg       0.73      0.71      0.71       321

sig

In [97]:
Spec_Data

Unnamed: 0,Ch_0_0,Ch_0_1,Ch_0_2,Ch_0_3,Ch_0_4,Ch_0_5,Ch_0_6,Ch_0_7,Ch_0_8,Ch_0_9,...,Ch_1_10,Ch_1_11,Ch_1_12,Ch_1_13,Ch_1_14,Ch_1_15,Ch_1_16,Ch_1_17,TOTAL,sex
0,0.009548,0.006474,0.049685,0.202149,0.123593,0.387389,0.041195,0.104526,0.142864,0.021804,...,0.074371,0.075371,0.041148,0.060607,0.009314,0.018939,0.008189,0.012748,8.0,1
1,0.008944,0.006740,0.049408,0.213435,0.125622,0.417335,0.040500,0.107062,0.146123,0.023081,...,0.074040,0.071008,0.040300,0.061966,0.009789,0.019652,0.008480,0.013700,8.0,1
2,0.007399,0.005264,0.044529,0.188678,0.115589,0.379931,0.036340,0.098683,0.138609,0.021477,...,0.070242,0.069050,0.036341,0.059691,0.008832,0.019432,0.007714,0.010939,8.0,1
3,0.008285,0.005911,0.048244,0.200886,0.123995,0.399413,0.038485,0.102645,0.144699,0.022199,...,0.072781,0.069820,0.038771,0.060423,0.009440,0.019763,0.008134,0.013665,8.0,1
4,0.007441,0.005255,0.044043,0.185445,0.113481,0.372787,0.036393,0.096989,0.136853,0.021068,...,0.070491,0.069071,0.036398,0.052239,0.008726,0.019684,0.007567,0.010917,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,0.006937,0.005250,0.044873,0.191446,0.120930,0.345861,0.032769,0.092170,0.135271,0.019835,...,0.069793,0.064008,0.038168,0.050066,0.009761,0.018438,0.008192,0.010365,0.0,1
2063,0.006998,0.005314,0.044793,0.192635,0.120639,0.346309,0.032710,0.092381,0.135095,0.019994,...,0.069804,0.063961,0.038272,0.049805,0.009794,0.018406,0.008252,0.010422,0.0,1
2064,0.007037,0.005417,0.045411,0.194302,0.122463,0.347262,0.033045,0.092674,0.136700,0.019846,...,0.069453,0.063510,0.038487,0.054845,0.009849,0.017754,0.008298,0.012211,0.0,1
2065,0.007006,0.005247,0.044992,0.191405,0.121087,0.344439,0.032824,0.092340,0.135709,0.019823,...,0.069617,0.065314,0.038318,0.051846,0.009806,0.019781,0.008262,0.010884,0.0,1


In [98]:
pd.read_csv('Urine_Chms_1_223_Dataset.csv')

Unnamed: 0,Ch_0_0,Ch_0_1,Ch_0_2,Ch_0_3,Ch_0_4,Ch_0_5,Ch_0_6,Ch_0_7,Ch_0_8,Ch_0_9,...,Ch_1_15,Ch_1_16,Ch_1_17,Пол,Возраст,Цвет,Плотность,Порядковый номер,Description,TOTAL
0,116.2066,86.67870,501.8073,1966.517,1211.834,3746.119,420.2404,1028.663,1396.969,233.9503,...,206.4316,103.1565,146.9493,m,24.0,COLORLESS,1.006,Chm1,Spectrum1,8.0
1,110.4915,89.53624,495.1797,2054.570,1219.735,3993.029,410.4901,1043.288,1414.642,244.8904,...,212.2925,106.0760,155.7079,m,24.0,COLORLESS,1.006,Chm1,Spectrum2,8.0
2,103.8239,81.91614,484.7648,1963.676,1213.810,3925.869,400.7397,1040.363,1449.987,248.2566,...,227.2702,107.0492,140.1371,m,24.0,COLORLESS,1.006,Chm1,Spectrum3,8.0
3,109.5390,85.72619,510.3286,2041.314,1270.105,4032.534,412.4402,1055.964,1477.758,249.0982,...,224.6654,108.0223,163.4933,m,24.0,COLORLESS,1.006,Chm1,Spectrum4,8.0
4,105.7290,82.86865,488.5520,1967.463,1214.797,3926.857,408.5400,1042.313,1459.244,248.2566,...,233.7822,107.0492,142.0834,m,24.0,COLORLESS,1.006,Chm1,Spectrum5,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,122.8742,100.96640,615.4241,2518.505,1602.940,4523.391,458.2668,1229.520,1789.131,290.3340,...,272.2032,139.1639,167.3860,m,75.0,LIGHT YELLOW,1.013,Chm223,Spectrum6,0.0
2063,123.8267,101.91890,615.4241,2538.387,1601.952,4537.218,458.2668,1234.395,1789.972,292.8587,...,272.2032,140.1371,168.3591,m,75.0,LIGHT YELLOW,1.013,Chm223,Spectrum7,0.0
2064,123.8267,102.87140,620.1581,2545.962,1616.767,4524.379,460.2169,1231.470,1800.912,289.4925,...,262.4352,140.1371,190.7421,m,75.0,LIGHT YELLOW,1.013,Chm223,Spectrum8,0.0
2065,123.8267,100.96640,617.3177,2519.451,1605.903,4507.589,459.2419,1232.445,1795.863,290.3340,...,289.7857,140.1371,174.1982,m,75.0,LIGHT YELLOW,1.013,Chm223,Spectrum9,0.0


In [99]:
Spec_Data = pd.read_csv('Urine_Chms_1_223_Dataset.csv')
cols = ['Пол', 'Возраст', 'Цвет', 'Плотность', 'Порядковый номер', 'Description']
Spec_Data['sex'] = [1 if x=='m' else 0 for x in Spec_Data['Пол']]
Spec_Data.drop(cols, axis=1, inplace=True)

mms = MinMaxScaler()
specs = Spec_Data.filter(regex='^Ch')
Spec_Data.loc[:, specs.columns] = mms.fit_transform(specs.T).T


Spec_Train_X = Spec_Data[train_inds].drop('TOTAL', axis=1)
Spec_Val_X = Spec_Data[val_inds].drop('TOTAL', axis=1)
Spec_Test_X = Spec_Data[test_inds].drop('TOTAL', axis=1)


Spec_Train_y = Spec_Data[train_inds]['TOTAL']
Spec_Val_y = Spec_Data[val_inds]['TOTAL']
Spec_Test_y = Spec_Data[test_inds]['TOTAL']

In [100]:
for kern in kernels:
    model = SVC(kernel=kern, gamma='scale')
    model.fit(Spec_Train_X, Spec_Train_y.apply(to_bin))
    val_preds = model.predict(Spec_Val_X)
    print(kern, classification_report(Spec_Val_y.apply(to_bin), val_preds), sep='\n')

linear
              precision    recall  f1-score   support

           0       0.50      0.61      0.55       109
           1       0.77      0.69      0.73       212

    accuracy                           0.66       321
   macro avg       0.64      0.65      0.64       321
weighted avg       0.68      0.66      0.67       321

poly
              precision    recall  f1-score   support

           0       0.54      0.61      0.57       109
           1       0.78      0.74      0.76       212

    accuracy                           0.69       321
   macro avg       0.66      0.67      0.67       321
weighted avg       0.70      0.69      0.70       321

rbf
              precision    recall  f1-score   support

           0       0.58      0.61      0.59       109
           1       0.79      0.78      0.79       212

    accuracy                           0.72       321
   macro avg       0.69      0.69      0.69       321
weighted avg       0.72      0.72      0.72       321

sig

  'precision', 'predicted', average, warn_for)


In [101]:
model = SVC(kernel='poly', gamma='scale')
model.fit(Spec_Train_X, Spec_Train_y.apply(to_bin))
model.fit(Spec_Val_X, Spec_Val_y.apply(to_bin))

preds = model.predict(Spec_Test_X)
print(kern, classification_report(Spec_Test_y.apply(to_bin), preds), sep='\n')

sigmoid
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       208
           1       0.50      1.00      0.66       206

    accuracy                           0.50       414
   macro avg       0.25      0.50      0.33       414
weighted avg       0.25      0.50      0.33       414



  'precision', 'predicted', average, warn_for)


In [102]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(Spec_Train_X, Spec_Train_y.apply(to_bin))
model.fit(Spec_Val_X, Spec_Val_y.apply(to_bin))

preds = model.predict(Spec_Test_X)
print(kern, classification_report(Spec_Test_y.apply(to_bin), preds), sep='\n')

sigmoid
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       208
           1       0.50      1.00      0.66       206

    accuracy                           0.50       414
   macro avg       0.25      0.50      0.33       414
weighted avg       0.25      0.50      0.33       414



  'precision', 'predicted', average, warn_for)


In [103]:
model = LogisticRegression()
model.fit(PCA_Train_X, PCA_Train_y.apply(to_bin))
model.fit(PCA_Val_X, PCA_Val_y.apply(to_bin))

preds = model.predict(PCA_Test_X)
print(kern, classification_report(PCA_Test_y.apply(to_bin), preds), sep='\n')

sigmoid
              precision    recall  f1-score   support

           0       1.00      0.00      0.01       208
           1       0.50      1.00      0.67       206

    accuracy                           0.50       414
   macro avg       0.75      0.50      0.34       414
weighted avg       0.75      0.50      0.34       414





In [113]:
model = LogisticRegression(solver='lbfgs', C=1000)
model.fit(AE_Train_X, AE_Train_y.apply(to_bin))
model.fit(AE_Val_X, AE_Val_y.apply(to_bin))

preds = model.predict(AE_Test_X)
print(kern, classification_report(AE_Test_y.apply(to_bin), preds), sep='\n')

sigmoid
              precision    recall  f1-score   support

           0       0.59      0.30      0.40       208
           1       0.53      0.79      0.63       206

    accuracy                           0.54       414
   macro avg       0.56      0.54      0.51       414
weighted avg       0.56      0.54      0.51       414



