In [9]:
import matplotlib;matplotlib.rcParams['figure.figsize'] = (8,5)
import matplotlib;matplotlib.rcParams['text.usetex'] = True
import matplotlib;matplotlib.rcParams['font.size'] = 16
import matplotlib;matplotlib.rcParams['font.family'] = 'serif'
import GPy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

Generate structured, stationary time series data (**TO DO**)

Generate unstructured, nonstationary time series data (**TO DO**)

Analysis of [UCI Chronic Kidney Disease dataset](https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease) (GP Classification)

In [19]:
ckd_data = pd.read_csv('chronic_kidney_disease_for_DSA_9.csv',header=0)
print(ckd_data.shape)
print(ckd_data.loc[ckd_data['class'] == 1].shape)
print(ckd_data.loc[ckd_data['class'] == 0].shape)

(400, 10)
(249, 10)
(151, 10)


For the sake of this tutorial, we'll only deal with 'complete' data (rows with no missing entries). Data missingness is a common issue (esp. in the healthcare domain).

In [16]:
missing_row_idx = pd.Series.any(pd.isna(ckd_data),1)
ckd_data_no_missing = ckd_data.iloc[np.where(missing_row_idx.values.flatten() != True)[0],:]

There are 9 feature columns and 1 label column.

The feature columns are as follows:  
| **Feature**  | **Description**  |  
|---|---|  
| age  | Age of patient  |  
| bp  | Blood pressure  |  
| bgr  | Blood glucose (random) |  
| bu | Blood urea |  
| sc | Serum creatinine |  
| hemo | Hemoglobin |  
| pcv | Packed cell volume |  
| wbcc | White blood cell count |  
| rbcc | Red blood cell count |  
    
The label column is called 'class' and takes values '1' (for the presence of chronic kidney disease) or '0' (for its absence).

In [20]:
print(ckd_data_no_missing)
print(ckd_data_no_missing.shape)
print(ckd_data_no_missing.loc[ckd_data_no_missing['class'] == 1].shape)
print(ckd_data_no_missing.loc[ckd_data_no_missing['class'] == 0].shape)

      age     bp    bgr     bu     sc  hemo   pcv     wbcc  rbcc  class
0    48.0   80.0  121.0   36.0   1.20  15.4  44.0   7800.0   5.2      1
3    48.0   70.0  117.0   56.0   3.80  11.2  32.0   6700.0   3.9      1
4    51.0   80.0  106.0   26.0   1.40  11.6  35.0   7300.0   4.6      1
5    60.0   90.0   74.0   25.0   1.10  12.2  39.0   7800.0   4.4      1
8    52.0  100.0  138.0   60.0   1.90  10.8  33.0   9600.0   4.0      1
9    53.0   90.0   70.0  107.0   7.20   9.5  29.0  12100.0   3.7      1
11   63.0   70.0  380.0   60.0   2.70  10.8  32.0   4500.0   3.8      1
12   68.0   70.0  208.0   72.0   2.10   9.7  28.0  12200.0   3.4      1
14   68.0   80.0  157.0   90.0   4.10   5.6  16.0  11000.0   2.6      1
15   40.0   80.0   76.0  162.0   9.60   7.6  24.0   3800.0   2.8      1
18   60.0  100.0  263.0   27.0   1.30  12.7  37.0  11400.0   4.3      1
19   62.0   60.0  100.0   31.0   1.60  10.3  30.0   5300.0   3.7      1
20   61.0   80.0  173.0  148.0   3.90   7.7  24.0   9200.0   3.2

Now, we'll standardize the feature columns.

In [28]:
ckd_features = ckd_data_no_missing.iloc[:,0:9]
ckd_features = (ckd_features - ckd_features.mean())/ckd_features.std()
print(ckd_features.mean())
print(ckd_features.std())

age     2.314136e-16
bp     -8.265774e-16
bgr     3.864700e-18
bu     -1.311656e-17
sc      7.940203e-17
hemo   -1.822265e-16
pcv    -5.733810e-16
wbcc   -2.103334e-16
rbcc   -1.395977e-16
dtype: float64
age     1.0
bp      1.0
bgr     1.0
bu      1.0
sc      1.0
hemo    1.0
pcv     1.0
wbcc    1.0
rbcc    1.0
dtype: float64


In [33]:
ckd_labels = ckd_data_no_missing.iloc[:,9]
print(ckd_labels.values)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [34]:
kern = GPy.kern.RBF(1, variance=7., lengthscale=0.2)
lik = GPy.likelihoods.Bernoulli()
m = GPy.core.GP(X=ckd_features.values,
                Y=ckd_labels.values,
                kernel=kern,
                inference_method=GPy.inference.latent_function_inference.expectation_propagation.EP(),
                likelihood=lik)
print(m)

AssertionError: 

In [35]:
ckd_labels.values.ndim

1

In [None]:
ckd_labels.values.expand_dims()