In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from random import randint

In [2]:
import xlrd

In [3]:
data = pd.read_excel('pone.0216416.s003.xlsx')

In [4]:
data = data.drop(['no'], axis=1)

Filling missing values

In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [6]:
iterative_imp = IterativeImputer()

In [7]:
iterative_imp.fit(data)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [8]:
data1 = iterative_imp.fit_transform(data)

In [9]:
data1 = pd.DataFrame(data=data1, columns= data.columns,)

In [10]:
data1.columns = ['sex', 'age', 'duration_of_diabetes', 'BMI',
       'insulin_regimen', 'TDD', 'TDD/kg', 'basal', 'basal/kg',
       'bolus', 'bolus/kg', 'HbA1c', 'eGFR', 'bodyfat', 'adiponectin',
       'free-test', 'SMI', 'grip_strength', 'knee_extension_strength',
       'gait_speed', 'ucOC', 'OC']

# ADDED_WEIGHT


In [11]:
data1['TDDkg'] = data1['TDD/kg']

In [12]:
data1['ADDED_WEIGHT'] = data1.TDD / data1.TDDkg

In [13]:
data1 = data1.drop(['basal/kg', 'bolus/kg', 'TDD/kg', 'TDDkg'], axis = 1)

# Feature Ranking


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X = data1.drop('duration_of_diabetes', axis = 1)
y = data1['duration_of_diabetes']

In [16]:
X, X_test, y, y_test = train_test_split(X, y, test_size=0.30, random_state= 42)

In [17]:
import gabrielrfe as rfe

In [18]:
ranking = rfe.RankingRE(X, y, 1000)

In [19]:
rank = ranking.ranking_borda()

In [20]:
rank

Unnamed: 0,Categories,Borda-Score,STD,Borda-Average,ranking
1,age,1275.0,1.719120414630682,1.275,1.0
6,bolus,6955.0,5.488986700657982,6.955,2.0
15,gait_speed,7537.0,5.657086794455263,7.537,3.0
8,eGFR,8244.0,5.683701610746287,8.244,4.0
4,TDD,9683.0,5.467770203657051,9.683,5.0
13,grip_strength,10517.0,5.116220382274403,10.517,6.0
2,BMI,10578.0,5.319766536230698,10.578,7.0
10,adiponectin,10991.0,5.019852487872504,10.991,8.0
5,basal,11025.0,4.959473258320873,11.025,9.0
7,HbA1c,11063.0,4.906019873583884,11.063,10.0


In [20]:
r2pred = rfe.RankingRE(X, y, 100)

In [21]:
r2pred.ranking_by_r2_punishment()

Unnamed: 0,Categories,average-r2-punishment,ranking,STD_of_r2_punishment
1,age,0.5913682258962148,1.0,0.2743791485378115
6,bolus,0.0157185042402042,2.0,0.0371061719889058
15,gait_speed,0.0100444762327077,3.0,0.0409915180861675
8,eGFR,0.0031923912806516,4.0,0.0266351773723709
4,TDD,0.0027777445372786,5.0,0.0264665119561789
2,BMI,0.0014018804183062,6.0,0.0276494843382332
10,adiponectin,-0.0148415554164112,7.0,0.0305465039625678
7,HbA1c,-0.0085550305245958,8.0,0.0277467620353374
9,bodyfat,-0.0067423488907343,9.0,0.0252964976057463
14,knee_extension_strength,-0.0055477683091331,10.0,0.0225808822297864


# Predictions using all columns

We are going to use the X and Y we splitted from before, so by making a rank classification we don't actually scoop into our data.

In [26]:
# X = X[['age', 'bolus', 'gait_speed', 'eGFR', 'TDD']]

In [27]:
# X_test = X_test[['age', 'bolus', 'gait_speed', 'eGFR', 'TDD']]

In [23]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [24]:
#Sklearn does't have any function for SMAPE, so I wrote a function in python
#The function has 100%/n. I replaced 100% with 1, to have values between 0 and 1 in form of percentages.
#A is the real, while F is predicted.
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f)))

# Random Forest

In [55]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [56]:
rf = RandomForestRegressor()

In [57]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  rf.fit(X_train, y_train)
  r2 = rf.score(X_test, y_test)
  y_pred = rf.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [58]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [59]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [60]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.41624
1,MSE,35.740843
2,RMSE,5.971627
3,SMAPE,0.219638
4,MAE,5.230455


# XGboost for regression:

In [34]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor



In [35]:
xgb = XGBRegressor()

In [36]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [37]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.30)
  xgb.fit(X_train, y_train)
  r2 = xgb.score(X_test, y_test)
  y_pred = xgb.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [38]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [39]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [40]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.394306
1,MSE,37.083711
2,RMSE,6.051147
3,SMAPE,0.214097
4,MAE,5.060748


# Linear Regression

In [41]:
from sklearn.linear_model import LinearRegression

In [42]:
lreg = LinearRegression()

In [43]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [44]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  lreg.fit(X_train, y_train)
  r2 = lreg.score(X_test, y_test)
  y_pred = lreg.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [45]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [46]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [47]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.117369
1,MSE,54.039236
2,RMSE,7.097074
3,SMAPE,0.281684
4,MAE,4.325678


# Decision Trees

In [48]:
from sklearn.tree import DecisionTreeRegressor

In [49]:
DTReg = DecisionTreeRegressor()

In [50]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [51]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  DTReg.fit(X_train, y_train)
  r2 = DTReg.score(X_test, y_test)
  y_pred = DTReg.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [52]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [53]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [54]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.047813
1,MSE,58.297864
2,RMSE,7.56916
3,SMAPE,0.269048
4,MAE,5.681818
