In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from random import randint

In [20]:
import xlrd

In [21]:
data = pd.read_excel('dataset5.xlsx')

In [22]:
data.head()

Unnamed: 0,Patient's code,Age,Gender,Duration of DM,Smoking,HTN,Anti HTN,DR,Insulin,Sulfonylurea,...,Nausea,Retching,Vomiting,Stomach fullness,Not able to finish a meal,Excessive fullness after meals,Loss of appetitie,Bloating,stomach or belly visibly larger,presenceofanysymptom
0,1,61,F,3.0,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,2,44,M,5.0,No,No,No,No,Yes,Yes,...,No,No,No,No,No,No,No,Yes,Yes,Yes
2,3,61,M,9.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No
3,4,53,M,5.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,Yes,Yes,No,No,No,Yes
4,5,56,F,7.0,No,Yes,Yes,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No


In [24]:
data.columns

Index(['Patient's code ', 'Age', 'Gender', 'Duration of DM', 'Smoking', 'HTN',
       'Anti HTN', 'DR', 'Insulin', 'Sulfonylurea', 'Metformin',
       'DDP-4 inhibitor', 'TZD', 'Meglitinides', 'None', 'BMI', 'FBS', 'TC',
       'TG', 'HDL', 'LDL', 'HbA1c', 'Urine ACR', 'UACR new ', 'Albuminuria',
       'eGFR MDRD equation', 'SBP', 'DBP', 'PSBP', 'PDBP', 'PHR',
       'orthostatic hypotension', 'resting tachycardia', 'QTc',
       'QTc prolonged ? ', 'CAN', 'GCSI score', 'GCSI new', 'GCSI present ?',
       'GCSI category', 'Nausea ', 'Retching ', 'Vomiting ',
       'Stomach fullness ', 'Not able to finish a meal',
       'Excessive fullness after meals', 'Loss of appetitie', 'Bloating ',
       'stomach or belly visibly larger ', 'presenceofanysymptom'],
      dtype='object')

In [25]:
data.drop(["Patient's code "], axis = 1, inplace = True)

In [26]:
data.isnull().values.any()

True

## Mapping the strings

In [28]:
data.replace({'No': 0, 'Yes': 1, 'F': 0, 'M': 1}, inplace = True)

In [31]:
data['GCSI present ?'].unique()

array(['absent  ', 'present ', 'present'], dtype=object)

In [32]:
data['GCSI category'].unique()

array(['none', 'mild', 'severe'], dtype=object)

In [33]:
data['QTc prolonged ? '].unique()

array(['borderline', 0, 1], dtype=object)

In [34]:
data['Albuminuria'].unique()

array(['normoalbuminuria', 'macroalbuminuria', 'microalbuminuria'],
      dtype=object)

In [36]:
data.replace({'absent  ': 0, 'present ': 1, 'present': 1, 'none':0, 'mild': 1, 'severe': 2, 'normoalbuminuria': 0, 'macroalbuminuria': 2, 'microalbuminuria': 1, 'borderline': 0.5}, inplace = True)

In [42]:
data['DR'].unique()

array([0, 1, nan, 'has appo', 'No 2012 has appoint'], dtype=object)

### Couldn't find what DR meant

In [43]:
data.replace({'has appo': 1, 'No 2012 has appoint': 0}, inplace = True)

## Filling Missing Values

In [44]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [45]:
iterative_imp = IterativeImputer()

In [47]:
iterative_imp.fit(data)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=None,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [48]:
data1 = iterative_imp.fit_transform(data)

In [49]:
data1 = pd.DataFrame(data=data1, columns= data.columns,)

# Let's predict duration of diabetes

In [50]:
X = data1.drop('Duration of DM', axis = 1)
y = data1['Duration of DM']

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
X, X_test, y, y_test = train_test_split(X, y, test_size=0.30, random_state= 42)

In [53]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [54]:
#Sklearn does't have any function for SMAPE, so I wrote a function in python
#The function has 100%/n. I replaced 100% with 1, to have values between 0 and 1 in form of percentages.
#A is the real, while F is predicted.
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f)))

In [55]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [56]:
randomforest = RandomForestRegressor()

In [57]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.3)
  randomforest.fit(X_train, y_train)
  r2 = randomforest.score(X_test, y_test)
  y_pred = randomforest.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [58]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [59]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.352821
1,MSE,31.820339
2,RMSE,5.639898
3,SMAPE,0.47113
4,MAE,4.706183


# XGBOOST

In [60]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor



In [64]:
xgb = XGBRegressor()

In [65]:
r2mean = []
SMAPEm = []
MSEm = []
RMSEm = []
MAEm= []

In [66]:
for x in range(1000): 
  X_train, X_test2, y_train, y_test2 = train_test_split(X, y, test_size=0.30)
  xgb.fit(X_train, y_train)
  r2 = xgb.score(X_test, y_test)
  y_pred = xgb.predict(X_test)
  r2mean.append(r2)
  MAE = mean_absolute_error(y_test, y_pred)
  MAEm.append(MAE)
  MSE = mean_squared_error(y_test, y_pred)
  MSEm.append(MSE)
  RMSE = sqrt(mean_squared_error(y_test, y_pred))
  RMSEm.append(RMSE)
  SMAPE = smape(y_test, y_pred)
  SMAPEm.append(SMAPE)

In [67]:
Metrics = {'Metrics Means': ['R2', 'MSE', 'RMSE', 'SMAPE', 'MAE'],
           'Values': [np.mean(r2mean), np.mean(MSEm), np.mean(RMSEm), np.mean(SMAPEm), np.mean(MAE)]
           }

In [68]:
MetricsDF = pd.DataFrame.from_dict(Metrics)
MetricsDF

Unnamed: 0,Metrics Means,Values
0,R2,0.244817
1,MSE,37.130661
2,RMSE,6.088896
3,SMAPE,0.497702
4,MAE,5.087803


## Feature Rankings

In [74]:
import gabrielrfe as rfe

In [75]:
ranking = rfe.RankingRE(X, y, 100)

In [76]:
r2pred = rfe.RankingRE(X, y, 100)

In [77]:
r2pred.ranking_by_r2_punishment()

Unnamed: 0,Categories,average-r2-punishment,ranking,STD_of_r2_punishment
5,DR,0.1207051276829077,1.0,0.0548971832471699
0,Age,0.0391259789525596,2.0,0.0405999619253869
6,Insulin,0.023384661331495,3.0,0.0189564926230986
13,BMI,0.0090937990708833,4.0,0.0156126687635924
27,PDBP,0.0044144826327559,5.0,0.0185387828335946
33,CAN,0.0039376065142643,6.0,0.0121487501309905
7,Sulfonylurea,0.0017610865857618,7.0,0.0117799376529768
35,GCSI new,0.0015384712263278,8.0,0.0096294810427069
9,DDP-4 inhibitor,0.0015084762301908,9.0,0.0109697320157735
37,GCSI category,0.0014491355761156,10.0,0.0106516027164371
