In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from random import randint

In [25]:
import xlrd

Link to the dataset: https://figshare.com/articles/dataset/Prevalence_of_cardiovascular_autonomic_neuropathy_and_gastroparesis_symptoms_among_patients_with_type_2_diabetes_who_attend_a_primary_health_care_center/7499969

"AlOlaiwi LA, AlHarbi TJ, Tourkmani AM (2018) Prevalence of
cardiovascular autonomic neuropathy and gastroparesis symptoms among
patients with type 2 diabetes who attend a primary health care center " 

PLoS ONE 13(12): e0209500. https://doi.org/10.1371/journal.pone.0209500

In [26]:
data = pd.read_excel('AlOlaiwi2018_dataset.xlsx')

In [27]:
data.head()

Unnamed: 0,Patient's code,Age,Gender,Duration of DM,Smoking,HTN,Anti HTN,DR,Insulin,Sulfonylurea,...,Nausea,Retching,Vomiting,Stomach fullness,Not able to finish a meal,Excessive fullness after meals,Loss of appetitie,Bloating,stomach or belly visibly larger,presenceofanysymptom
0,1,61,F,3.0,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,2,44,M,5.0,No,No,No,No,Yes,Yes,...,No,No,No,No,No,No,No,Yes,Yes,Yes
2,3,61,M,9.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No
3,4,53,M,5.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,Yes,Yes,No,No,No,Yes
4,5,56,F,7.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No


In [28]:
data.columns

Index(['Patient's code ', 'Age', 'Gender', 'Duration of DM', 'Smoking', 'HTN',
       'Anti HTN', 'DR', 'Insulin', 'Sulfonylurea', 'Metformin',
       'DDP-4 inhibitor', 'TZD', 'Meglitinides', 'None', 'BMI', 'FBS', 'TC',
       'TG', 'HDL', 'LDL', 'HbA1c', 'Urine ACR', 'UACR new ', 'Albuminuria',
       'eGFR MDRD equation', 'SBP', 'DBP', 'PSBP', 'PDBP', 'PHR',
       'orthostatic hypotension', 'resting tachycardia', 'QTc',
       'QTc prolonged ? ', 'CAN', 'GCSI score', 'GCSI new', 'GCSI present ?',
       'GCSI category', 'Nausea ', 'Retching ', 'Vomiting ',
       'Stomach fullness ', 'Not able to finish a meal',
       'Excessive fullness after meals', 'Loss of appetitie', 'Bloating ',
       'stomach or belly visibly larger ', 'presenceofanysymptom'],
      dtype='object')

In [29]:
data.drop(["Patient's code "], axis = 1, inplace = True)

In [30]:
data.isnull().values.any()

True

## Mapping the strings

In [31]:
data.replace({'No': 0, 'Yes': 1, 'F': 0, 'M': 1}, inplace = True)

In [32]:
data['GCSI present ?'].unique()

array(['absent  ', 'present ', 'present'], dtype=object)

In [33]:
data['GCSI category'].unique()

array(['none', 'mild', 'severe'], dtype=object)

In [34]:
data['QTc prolonged ? '].unique()

array(['borderline', 0, 1], dtype=object)

In [35]:
data['Albuminuria'].unique()

array(['normoalbuminuria', 'macroalbuminuria', 'microalbuminuria'],
      dtype=object)

In [36]:
data.replace({'absent  ': 0, 'present ': 1, 'present': 1, 'none':0, 'mild': 1, 'severe': 2, 'normoalbuminuria': 0, 'macroalbuminuria': 2, 'microalbuminuria': 1, 'borderline': 0.5}, inplace = True)

In [37]:
data['DR'].unique()

array([0, 1, nan, 'has appo', 'No 2012 has appoint'], dtype=object)

### Couldn't find what DR meant

In [38]:
data.replace({'has appo': 2, 'No 2012 has appoint': 3}, inplace = True)

## Filling Missing Values

In [39]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [40]:
iterative_imp = IterativeImputer()

In [41]:
iterative_imp.fit(data)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [42]:
data1 = iterative_imp.fit_transform(data)

In [43]:
data1 = pd.DataFrame(data=data1, columns = data.columns,)

# Let's predict duration of diabetes

In [44]:
X = data1.drop('Duration of DM', axis = 1)
y = data1['Duration of DM']

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
X, X_test, y, y_test = train_test_split(X, y, test_size=0.30, random_state= 42)

In [47]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [48]:
#Sklearn does't have any function for SMAPE, so I wrote a function in python
#The function has 100%/n. I replaced 100% with 1, to have values between 0 and 1 in form of percentages.
#A is the real, while F is predicted.
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f)))

In [49]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [50]:
randomforest = RandomForestRegressor()

In [51]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  randomforest.fit(X_train, y_train)
  r2 = randomforest.score(X_test, y_test)
  y_pred = randomforest.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [52]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [53]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.352158
1,MSE,31.852927
2,RMSE,5.642659
3,SMAPE,0.470641
4,MAE,4.481392


# XGBOOST

In [54]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor



In [55]:
xgb = XGBRegressor()

In [56]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [57]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.30)
  xgb.fit(X_train, y_train)
  r2 = xgb.score(X_test, y_test)
  y_pred = xgb.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [58]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [59]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.248419
1,MSE,36.953563
2,RMSE,6.074621
3,SMAPE,0.495713
4,MAE,5.374773


## Feature Rankings

In [60]:
import gabrielrfe as rfe

In [61]:
ranking = rfe.RankingRE(X, y, 100)

In [64]:
rank = ranking.ranking_borda()
rank

Unnamed: 0,Categories,Borda-Score,STD,Borda-Average,ranking
6,Insulin,491.0,7.169511838333209,4.91,1.5
5,DR,491.0,10.438481690360907,4.91,1.5
0,Age,928.0,14.307396688426584,9.28,3.0
13,BMI,1607.0,13.651560350377537,16.07,4.0
27,PDBP,1827.0,15.043174531992909,18.27,5.0
33,CAN,2132.0,14.05622993551258,21.32,6.0
25,DBP,2216.0,13.662884029369494,22.16,7.0
30,resting tachycardia,2309.0,11.154456508499193,23.09,8.0
17,HDL,2338.0,14.453221094275143,23.38,9.0
28,PHR,2351.0,13.380205529064192,23.51,10.0
