In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from random import randint
import xlrd

Gołębiewska J, Olechowski A, Wysocka-Mincewicz M, Odrobina D, Baszyńska-Wilk M, Groszek A, et al. (2017) Optical coherence tomography angiography vessel density in children with type 1 diabetes. PLoS ONE 12(10): e0186479. https://doi.org/10.1371/journal.pone.0186479

Link to the dataset:  https://figshare.com/articles/dataset/Optical_coherence_tomography_angiography_vessel_density_in_children_with_type_1_diabetes/5520808


In [2]:
data = pd.read_excel('Gołębiewska2017.xlsx')
data.head()

Unnamed: 0,no,id,gr,group,gender,age,diuration of the disese,age on onset of the disease,HbA1c mean level,microalbuminuria,...,diastolic blood preeasure,faz,wsvd,fsvd,psvd,ft,pft,wdvd,fdvd,pdvd
0,1,107,Cases,0,F,11.726027,1.0,10.726027,7.0,10.4,...,,0.187,57.040001,36.259998,58.830002,263,317,63.970001,32.849998,67.209999
1,2,21,Cases,0,M,17.295891,1.0,16.29315,6.6,12.5,...,60.0,0.301,56.240002,28.780001,58.84,234,324,61.540001,27.57,64.980003
2,3,21,Cases,0,M,17.295891,1.0,16.29315,6.6,12.5,...,60.0,0.303,51.52,30.549999,53.580002,241,328,57.91,27.77,60.91
3,4,163,Cases,0,M,17.306849,1.07,16.232876,8.3,,...,61.0,0.047,55.799999,44.59,56.580002,301,342,60.529999,40.240002,62.41
4,5,163,Cases,0,M,17.306849,1.07,16.232876,8.3,,...,61.0,0.047,55.799999,44.59,56.580002,301,342,60.529999,40.240002,62.41


In [3]:
len(data)

168

In [4]:
data.columns

Index(['no', 'id', 'gr', 'group', 'gender', 'age', 'diuration of the disese',
       'age on onset of the disease', 'HbA1c mean level', 'microalbuminuria',
       'serum creatinine level',
       'creatinine in the daily collection of urine ',
       'systolic blood preasure', 'diastolic blood preeasure', 'faz', 'wsvd',
       'fsvd', 'psvd', 'ft', 'pft', 'wdvd', 'fdvd', 'pdvd'],
      dtype='object')

### There is duplicated values, we can check them in the IDs. 

In [5]:
data.drop_duplicates(subset=['id'], inplace = True)

In [6]:
data.tail()

Unnamed: 0,no,id,gr,group,gender,age,diuration of the disese,age on onset of the disease,HbA1c mean level,microalbuminuria,...,diastolic blood preeasure,faz,wsvd,fsvd,psvd,ft,pft,wdvd,fdvd,pdvd
160,161,224,Controls,1,F,13.0,,,,,...,,0.284,53.700001,36.400002,55.32,247,324,59.18,31.219999,62.91
162,163,225,Controls,1,F,12.0,,,,,...,,0.275,52.25,28.629999,55.189999,224,318,59.709999,31.379999,62.34
164,165,226,Controls,1,M,11.0,,,,,...,,0.054,57.349998,48.57,57.580002,268,322,61.360001,32.869999,63.459999
165,166,227,Controls,1,M,12.0,,,,,...,,0.261,49.27,28.690001,51.84,273,338,58.98,31.950001,62.889999
166,167,228,Controls,1,F,13.0,,,,,...,,0.204,56.400002,34.490002,58.099998,238,307,61.130001,33.119999,63.779999


There is a control group without diabetes. We should drop them. They have gr = cases, or group = 0. We are going to filter them based on group value. 

In [7]:
data1 = data[data.group < 1]

In [8]:
data1.tail()

Unnamed: 0,no,id,gr,group,gender,age,diuration of the disese,age on onset of the disease,HbA1c mean level,microalbuminuria,...,diastolic blood preeasure,faz,wsvd,fsvd,psvd,ft,pft,wdvd,fdvd,pdvd
116,117,71,Cases,0,M,18.002739,11.62,6.380822,8.01,2.44,...,83.0,0.033,50.669998,39.889999,51.380001,243,297,56.959999,41.650002,57.91
118,119,158,Cases,0,M,15.728767,11.75,3.978082,9.3,2.8,...,,,50.700001,43.240002,50.580002,279,315,59.889999,42.759998,61.939999
120,121,126,Cases,0,M,14.432877,12.05,2.380822,7.2,,...,72.0,0.075,56.27,42.669998,58.560001,283,323,61.869999,45.849998,63.73
122,123,112,Cases,0,M,16.301371,13.95,2.353425,7.85,6.52,...,78.0,0.3,53.48,31.35,55.080002,252,333,57.57,27.35,60.610001
124,125,128,Cases,0,F,17.975342,14.36,3.616438,11.6,,...,58.0,0.26,50.060001,27.610001,51.330002,253,316,57.099998,29.01,59.130001


We have to map the gender Female = 0, Male 1

In [9]:
data1.replace({'F' : 0, 'M': 1}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [10]:
data1.drop('gr', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [11]:
len(data1)

64

## Now we can fill the NAN values using the iterative imputer

In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [13]:
iterative_imp = IterativeImputer()
iterative_imp.fit(data1)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [14]:
data2 = iterative_imp.fit_transform(data1)
data2 = pd.DataFrame(data=data2, columns = data1.columns,)

In [15]:
data2.dtypes

no                                              float64
id                                              float64
group                                           float64
gender                                          float64
age                                             float64
diuration of the disese                         float64
age on onset of the disease                     float64
HbA1c mean level                                float64
microalbuminuria                                float64
serum creatinine level                          float64
creatinine in the daily collection of urine     float64
systolic blood preasure                         float64
diastolic blood preeasure                       float64
faz                                             float64
wsvd                                            float64
fsvd                                            float64
psvd                                            float64
ft                                              

In [16]:
## Random Forest can only takes float32.

In [17]:
data2 = data2.astype(np.float32)

In [18]:
data2.isnull().values.any()

False

## I will have to drop age of onset, as it leaks data

In [19]:
data2.drop(['age on onset of the disease','no', 'id', 'group'], axis=1, inplace = True)

In [36]:
data2

Unnamed: 0,group,gender,age,diuration of the disese,HbA1c mean level,microalbuminuria,serum creatinine level,creatinine in the daily collection of urine,systolic blood preasure,diastolic blood preeasure,faz,wsvd,fsvd,psvd,ft,pft,wdvd,fdvd,pdvd
0,0.0,0.0,11.726027,1.00,7.00,10.400000,0.50,1.113491,110.698364,66.657333,0.187000,57.040001,36.259998,58.830002,263.0,317.0,63.970001,32.849998,67.209999
1,0.0,1.0,17.295891,1.00,6.60,12.500000,0.78,1.720000,125.000000,60.000000,0.301000,56.240002,28.780001,58.840000,234.0,324.0,61.540001,27.570000,64.980003
2,0.0,1.0,17.306849,1.07,8.30,8.913454,0.83,1.432596,111.000000,61.000000,0.047000,55.799999,44.590000,56.580002,301.0,342.0,60.529999,40.240002,62.410000
3,0.0,1.0,17.734247,1.21,6.65,2.800000,0.86,1.900000,126.000000,70.000000,0.130000,53.590000,38.669998,55.040001,266.0,308.0,58.320000,34.990002,60.509998
4,0.0,0.0,14.495891,1.38,9.65,14.600000,0.73,1.000000,110.937111,68.609322,0.180000,55.160000,33.389999,57.049999,254.0,326.0,59.990002,36.049999,62.480000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.0,1.0,18.002739,11.62,8.01,2.440000,1.12,1.710000,127.000000,83.000000,0.033000,50.669998,39.889999,51.380001,243.0,297.0,56.959999,41.650002,57.910000
60,0.0,1.0,15.728767,11.75,9.30,2.800000,0.77,1.430000,112.059776,69.879845,0.037795,50.700001,43.240002,50.580002,279.0,315.0,59.889999,42.759998,61.939999
61,0.0,1.0,14.432877,12.05,7.20,8.902466,0.74,1.584216,118.000000,72.000000,0.075000,56.270000,42.669998,58.560001,283.0,323.0,61.869999,45.849998,63.730000
62,0.0,1.0,16.301371,13.95,7.85,6.520000,0.88,1.570000,113.000000,78.000000,0.300000,53.480000,31.350000,55.080002,252.0,333.0,57.570000,27.350000,60.610001


# Let's predict duration of diabetes

In [20]:
X = data2.drop('diuration of the disese', axis = 1)
y = data2['diuration of the disese']

In [21]:
from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(X, y, test_size=0.30, random_state= 42)

In [22]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [23]:
#Sklearn does't have any function for SMAPE, so I wrote a function in python
#The function has 100%/n. I replaced 100% with 1, to have values between 0 and 1 in form of percentages.
#A is the real, while F is predicted.
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f)))

In [24]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [25]:
randomforest = RandomForestRegressor()

In [26]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  randomforest.fit(X_train, y_train)
  r2 = randomforest.score(X_test, y_test)
  y_pred = randomforest.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [27]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [28]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,-0.196587
1,MSE,14.990118
2,RMSE,3.86701
3,SMAPE,0.604632
4,MAE,2.995295


In [29]:
# XGBOOST

In [30]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor



In [31]:
xgb = XGBRegressor()

In [32]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [33]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.30)
  xgb.fit(X_train, y_train)
  r2 = xgb.score(X_test, y_test)
  y_pred = xgb.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [34]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [35]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,-0.353176
1,MSE,16.951759
2,RMSE,4.101086
3,SMAPE,0.588523
4,MAE,3.113131
