In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from random import randint

In [2]:
import xlrd

Link to the dataset: https://figshare.com/articles/dataset/Prevalence_of_cardiovascular_autonomic_neuropathy_and_gastroparesis_symptoms_among_patients_with_type_2_diabetes_who_attend_a_primary_health_care_center/7499969

"AlOlaiwi LA, AlHarbi TJ, Tourkmani AM (2018) Prevalence of
cardiovascular autonomic neuropathy and gastroparesis symptoms among
patients with type 2 diabetes who attend a primary health care center " 

PLoS ONE 13(12): e0209500. https://doi.org/10.1371/journal.pone.0209500

In [3]:
data = pd.read_excel('AlOlaiwi2018_dataset.xlsx')

In [4]:
data.head()

Unnamed: 0,Patient's code,Age,Gender,Duration of DM,Smoking,HTN,Anti HTN,DR,Insulin,Sulfonylurea,...,Nausea,Retching,Vomiting,Stomach fullness,Not able to finish a meal,Excessive fullness after meals,Loss of appetitie,Bloating,stomach or belly visibly larger,presenceofanysymptom
0,1,61,F,3.0,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,2,44,M,5.0,No,No,No,No,Yes,Yes,...,No,No,No,No,No,No,No,Yes,Yes,Yes
2,3,61,M,9.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No
3,4,53,M,5.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,Yes,Yes,No,No,No,Yes
4,5,56,F,7.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No


In [5]:
data.columns

Index(['Patient's code ', 'Age', 'Gender', 'Duration of DM', 'Smoking', 'HTN',
       'Anti HTN', 'DR', 'Insulin', 'Sulfonylurea', 'Metformin',
       'DDP-4 inhibitor', 'TZD', 'Meglitinides', 'None', 'BMI', 'FBS', 'TC',
       'TG', 'HDL', 'LDL', 'HbA1c', 'Urine ACR', 'UACR new ', 'Albuminuria',
       'eGFR MDRD equation', 'SBP', 'DBP', 'PSBP', 'PDBP', 'PHR',
       'orthostatic hypotension', 'resting tachycardia', 'QTc',
       'QTc prolonged ? ', 'CAN', 'GCSI score', 'GCSI new', 'GCSI present ?',
       'GCSI category', 'Nausea ', 'Retching ', 'Vomiting ',
       'Stomach fullness ', 'Not able to finish a meal',
       'Excessive fullness after meals', 'Loss of appetitie', 'Bloating ',
       'stomach or belly visibly larger ', 'presenceofanysymptom'],
      dtype='object')

In [6]:
data.drop(["Patient's code "], axis = 1, inplace = True)

In [7]:
data.isnull().values.any()

True

## Mapping the strings

In [8]:
data.replace({'No': 0, 'Yes': 1, 'F': 0, 'M': 1}, inplace = True)

In [9]:
data['GCSI present ?'].unique()

array(['absent  ', 'present ', 'present'], dtype=object)

In [10]:
data['GCSI category'].unique()

array(['none', 'mild', 'severe'], dtype=object)

In [11]:
data['QTc prolonged ? '].unique()

array(['borderline', 0, 1], dtype=object)

In [12]:
data['Albuminuria'].unique()

array(['normoalbuminuria', 'macroalbuminuria', 'microalbuminuria'],
      dtype=object)

In [13]:
data.replace({'absent  ': 0, 'present ': 1, 'present': 1, 'none':0, 'mild': 1, 'severe': 2, 'normoalbuminuria': 0, 'macroalbuminuria': 2, 'microalbuminuria': 1, 'borderline': 0.5}, inplace = True)

In [14]:
data['DR'].unique()

array([0, 1, nan, 'has appo', 'No 2012 has appoint'], dtype=object)

### Couldn't find what DR meant

In [15]:
data.replace({'has appo': 2, 'No 2012 has appoint': 3}, inplace = True)

## Filling Missing Values

In [16]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [17]:
iterative_imp = IterativeImputer()

In [18]:
iterative_imp.fit(data)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [19]:
data1 = iterative_imp.fit_transform(data)

In [20]:
data1 = pd.DataFrame(data=data1, columns = data.columns,)

# Let's predict duration of diabetes

In [21]:
X = data1.drop('Duration of DM', axis = 1)
y = data1['Duration of DM']

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X, X_test, y, y_test = train_test_split(X, y, test_size=0.30, random_state= 42)

In [24]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [25]:
#Sklearn does't have any function for SMAPE, so I wrote a function in python
#The function has 100%/n. I replaced 100% with 1, to have values between 0 and 1 in form of percentages.
#A is the real, while F is predicted.
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f)))

In [26]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [27]:
randomforest = RandomForestRegressor()

In [28]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  randomforest.fit(X_train, y_train)
  r2 = randomforest.score(X_test, y_test)
  y_pred = randomforest.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [32]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)],
           'Values-STD': [np.std(r2mean), np.std(MSEm),np.std(RMSEm), np.std(SMAPEm), np.std(MAEm)]
           }

In [33]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values,Values-STD
0,R2,0.35212,0.026499
1,MSE,31.854831,1.302917
2,RMSE,5.642836,0.115016
3,SMAPE,0.47064,0.010009
4,MAE,4.602,0.108074


# XGBOOST

In [34]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor



In [35]:
xgb = XGBRegressor()

In [36]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [37]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.30)
  xgb.fit(X_train, y_train)
  r2 = xgb.score(X_test, y_test)
  y_pred = xgb.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [39]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)],
           'Values-STD': [np.std(r2mean), np.std(MSEm),np.std(RMSEm), np.std(SMAPEm), np.std(MAEm)]
           }

In [40]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values,Values-STD
0,R2,0.249253,0.060764
1,MSE,36.91254,2.987645
2,RMSE,6.070616,0.245273
3,SMAPE,0.495755,0.021228
4,MAE,4.6694,0.211758


## Linear Regression

In [41]:
from sklearn.linear_model import LinearRegression
lreg = LinearRegression()

In [42]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [43]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  lreg.fit(X_train, y_train)
  r2 = lreg.score(X_test, y_test)
  y_pred = lreg.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [44]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAEm)],
           'Values-STD': [np.std(r2mean), np.std(MSEm),np.std(RMSEm), np.std(SMAPEm), np.std(MAEm)]
           }

In [45]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values,Values-STD
0,R2,0.09401,0.074619
1,MSE,44.5455,3.668852
2,RMSE,6.668633,0.273553
3,SMAPE,0.526762,0.023276
4,MAE,5.185859,0.209313


## Decision Trees

In [46]:
from sklearn.tree import DecisionTreeRegressor
DTReg = DecisionTreeRegressor()


In [47]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [48]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  DTReg.fit(X_train, y_train)
  r2 = DTReg.score(X_test, y_test)
  y_pred = DTReg.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [49]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAEm)],
           'Values-STD': [np.std(r2mean), np.std(MSEm),np.std(RMSEm), np.std(SMAPEm), np.std(MAEm)]
           }

In [50]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values,Values-STD
0,R2,-0.214182,0.148877
1,MSE,59.698641,7.319946
2,RMSE,7.712133,0.470792
3,SMAPE,0.61228,0.037617
4,MAE,5.98264,0.396737


## Feature Rankings

In [51]:
import gabrielrfe as rfe

In [52]:
ranking = rfe.RankingRE(X, y, 1000)

In [53]:
rank = ranking.ranking_borda()
rank

Unnamed: 0,Categories,Borda-Score,STD,Borda-Average,ranking
5,DR,3907.0,8.45188446442565,3.907,1.0
0,Age,6133.0,10.186231442491364,6.133,2.0
6,Insulin,6843.0,9.626544083937969,6.843,3.0
13,BMI,17256.0,14.731478676629882,17.256,4.0
27,PDBP,18721.0,14.60469647065628,18.721,5.0
33,CAN,22524.0,13.719745770239308,22.524,6.0
7,Sulfonylurea,23417.0,14.038344311207066,23.417,7.0
17,HDL,23645.0,13.851822082311056,23.645,8.0
14,FBS,23927.0,13.91027213968151,23.927,9.0
18,LDL,24437.0,13.546218328374913,24.437,10.0
