In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from random import randint

In [5]:
import xlrd

In [6]:
data = pd.read_excel('pone.0216416.s003.xlsx')

In [7]:
data = data.drop(['no'], axis=1)

Filling missing values

In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [9]:
iterative_imp = IterativeImputer()

In [10]:
iterative_imp.fit(data)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [11]:
data1 = iterative_imp.fit_transform(data)

In [12]:
data1 = pd.DataFrame(data=data1, columns= data.columns,)

In [13]:
data1.columns = ['sex', 'age', 'duration_of_diabetes', 'BMI',
       'insulin_regimen', 'TDD', 'TDD/kg', 'basal', 'basal/kg',
       'bolus', 'bolus/kg', 'HbA1c', 'eGFR', 'bodyfat', 'adiponectin',
       'free-test', 'SMI', 'grip_strength', 'knee_extension_strength',
       'gait_speed', 'ucOC', 'OC']

# ADDED_WEIGHT


In [14]:
data1['TDDkg'] = data1['TDD/kg']

In [15]:
data1['ADDED_WEIGHT'] = data1.TDD / data1.TDDkg

In [16]:
data1 = data1.drop(['basal/kg', 'bolus/kg', 'TDD/kg', 'TDDkg'], axis = 1)

# Feature Ranking


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = data1.drop('duration_of_diabetes', axis = 1)
y = data1['duration_of_diabetes']

In [19]:
X, X_test, y, y_test = train_test_split(X, y, test_size=0.30, random_state= 42)

In [17]:
import gabrielrfe as rfe

In [18]:
ranking = rfe.RankingRE(X, y, 1000)

In [19]:
rank = ranking.ranking_borda()

In [20]:
rank

Unnamed: 0,Categories,Borda-Score,STD,Borda-Average,ranking
1,age,1275.0,1.719120414630682,1.275,1.0
6,bolus,6955.0,5.488986700657982,6.955,2.0
15,gait_speed,7537.0,5.657086794455263,7.537,3.0
8,eGFR,8244.0,5.683701610746287,8.244,4.0
4,TDD,9683.0,5.467770203657051,9.683,5.0
13,grip_strength,10517.0,5.116220382274403,10.517,6.0
2,BMI,10578.0,5.319766536230698,10.578,7.0
10,adiponectin,10991.0,5.019852487872504,10.991,8.0
5,basal,11025.0,4.959473258320873,11.025,9.0
7,HbA1c,11063.0,4.906019873583884,11.063,10.0


In [21]:
r2pred = rfe.RankingRE(X, y, 1000)

In [22]:
r2pred.ranking_by_r2_punishment()

Unnamed: 0,Categories,average-r2-punishment,ranking
1,age,0.6050498831738087,1.0
15,gait_speed,0.0122568422841816,2.0
6,bolus,0.0116008741090541,3.0
8,eGFR,0.007450566629927,4.0
4,TDD,0.0024093836800293,5.0
0,sex,0.0014564142022818,6.0
2,BMI,0.0007668596046008,7.0
10,adiponectin,-0.0133263877168076,8.0
7,HbA1c,-0.0082472716757027,9.0
14,knee_extension_strength,-0.0060693493757142,10.0


# Predictions using top 5 columns

We are going to use the X and Y we splitted from before, so by making a rank classification we don't actually scoop into our data.

In [26]:
X = X[['age', 'bolus', 'gait_speed', 'eGFR', 'TDD']]

In [27]:
X_test = X_test[['age', 'bolus', 'gait_speed', 'eGFR', 'TDD']]

In [25]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [24]:
#Sklearn does't have any function for SMAPE, so I wrote a function in python
#The function has 100%/n. I replaced 100% with 1, to have values between 0 and 1 in form of percentages.
#A is the real, while F is predicted.
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f)))

In [52]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [53]:
rf = RandomForestRegressor()

In [54]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  randomforest.fit(X_train, y_train)
  r2 = randomforest.score(X_test, y_test)
  y_pred = randomforest.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [55]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [56]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [57]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.402624
1,MSE,36.574498
2,RMSE,6.041002
3,SMAPE,0.220553
4,MAE,5.005909


# XGboost for regression:

In [38]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor



In [39]:
xgb = XGBRegressor()

In [40]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [41]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.30)
  xgb.fit(X_train, y_train)
  r2 = xgb.score(X_test, y_test)
  y_pred = xgb.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [42]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [43]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [44]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.334962
1,MSE,40.717077
2,RMSE,6.335997
3,SMAPE,0.221515
4,MAE,4.94931


# Linear Regression

In [45]:
from sklearn.linear_model import LinearRegression

In [46]:
lreg = LinearRegression()

In [47]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [48]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  lreg.fit(X_train, y_train)
  r2 = lreg.score(X_test, y_test)
  y_pred = lreg.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [49]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [50]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [51]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.4462
1,MSE,33.906502
2,RMSE,5.78601
3,SMAPE,0.245772
4,MAE,4.71787


# Decision Trees

In [1]:
from sklearn.tree import DecisionTreeRegressor

In [3]:
DTReg = DecisionTreeRegressor()

In [20]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [28]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  DTReg.fit(X_train, y_train)
  r2 = DTReg.score(X_test, y_test)
  y_pred = DTReg.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [29]:
R2mean=np.mean(r2mean)
MSEmean=np.mean(MSEm)
RMSEmean=np.mean(RMSEm)
SMAPEmean= np.mean(SMAPEm)
MAEmean= np.mean(MAE)

In [30]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [31]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.137045
1,MSE,52.839682
2,RMSE,7.214156
3,SMAPE,0.264754
4,MAE,5.863636
