In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.style as style
import warnings
warnings.filterwarnings('ignore')

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

In [3]:
# Load the dataset from CSV into `data`.
data = pd.read_csv("data_diabetes1.csv")
#data.drop(['id'], axis=1, inplace=True)
# Independent copy for exploration; note that `df` is rebuilt from `data`
# again further down when the modelling columns are selected.
df = data.copy()

In [4]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,case1_risk_diabet,arthritis,gout,thyroid,chronic_bronchitis,abdominal_pain,gallstones,diabetic_relative,overweight,high_bp,...,creatinine_mg/dL,globulin_g/dL,glucose_mg/dL,potassium_mmol/L,sodium_mmol/L,calcium_mg/dL,triglycerides_mg/dL,uricacid_mg/dL,vigorous_recreational_activities,minutes_sedentary_activity
0,risk_diabet,yes,no,no,no,no,no,yes,yes,yes,...,0.92,2.9,85.0,4.0,141.0,9.2,95.0,5.8,no,300.0
1,not_diabet,,,,,,,,no,no,...,0.81,2.7,94.0,4.4,144.0,9.6,92.0,8.0,no,240.0
2,risk_diabet,yes,no,no,no,no,no,yes,yes,yes,...,0.58,3.2,116.0,4.4,144.0,9.5,72.0,4.5,no,120.0
3,risk_diabet,yes,no,yes,no,no,yes,yes,yes,yes,...,1.32,3.3,96.0,4.1,141.0,9.9,132.0,6.2,no,600.0
4,risk_diabet,no,no,no,no,no,no,no,no,no,...,1.13,3.1,98.0,4.9,140.0,9.4,59.0,4.2,yes,420.0


In [5]:
# Column names, non-null counts and dtypes of the full dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5478 entries, 0 to 5477
Data columns (total 51 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   case1_risk_diabet                 5478 non-null   object 
 1   arthritis                         5195 non-null   object 
 2   gout                              5203 non-null   object 
 3   thyroid                           5195 non-null   object 
 4   chronic_bronchitis                5204 non-null   object 
 5   abdominal_pain                    5205 non-null   object 
 6   gallstones                        5197 non-null   object 
 7   diabetic_relative                 5219 non-null   object 
 8   overweight                        5477 non-null   object 
 9   high_bp                           5478 non-null   object 
 10  high_chol                         5472 non-null   object 
 11  pulse_60                          5218 non-null   float64
 12  systol

In [6]:
# Keep only the target and the nine predictors used for modelling.
selected_columns = [
    'case1_risk_diabet', 'arthritis',
    'diabetic_relative', 'overweight', 'high_bp', 'high_chol', 'systolic',
    'bmi', 'arm_circumference', 'WH_ratio',
]
# `.copy()` makes `df` an independent frame instead of a slice of `data`, so
# subsequent cleaning of `df` never writes into (or warns about) a view of
# the raw data.
df = data[selected_columns].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5478 entries, 0 to 5477
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   case1_risk_diabet  5478 non-null   object 
 1   arthritis          5195 non-null   object 
 2   diabetic_relative  5219 non-null   object 
 3   overweight         5477 non-null   object 
 4   high_bp            5478 non-null   object 
 5   high_chol          5472 non-null   object 
 6   systolic           4848 non-null   float64
 7   bmi                5379 non-null   float64
 8   arm_circumference  5200 non-null   float64
 9   WH_ratio           5128 non-null   float64
dtypes: float64(4), object(6)
memory usage: 428.1+ KB


In [7]:
# Drop every row that has a missing value in any selected column, then
# renumber the index from 0. Rebinding `df` (rather than using inplace=True)
# avoids mutating a frame that was derived from `data`.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4318 entries, 0 to 4317
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   case1_risk_diabet  4318 non-null   object 
 1   arthritis          4318 non-null   object 
 2   diabetic_relative  4318 non-null   object 
 3   overweight         4318 non-null   object 
 4   high_bp            4318 non-null   object 
 5   high_chol          4318 non-null   object 
 6   systolic           4318 non-null   float64
 7   bmi                4318 non-null   float64
 8   arm_circumference  4318 non-null   float64
 9   WH_ratio           4318 non-null   float64
dtypes: float64(4), object(6)
memory usage: 337.5+ KB


#### Setting target and features

In [8]:
# Seed passed to the train/test split and to the first SVC below.
random_state=101

In [9]:
# Encode the target as integers: 0 = not_diabet, 1 = risk_diabet.
# The result is assigned back to the column: calling
# `replace(..., inplace=True)` on `df[...]` is chained assignment, which
# pandas deprecates and which leaves `df` unchanged under Copy-on-Write.
label_codes = {'not_diabet': 0, 'risk_diabet': 1}
# astype('int64') keeps the integer dtype explicit and fails loudly if an
# unexpected label is still present after the replacement.
df['case1_risk_diabet'] = df['case1_risk_diabet'].replace(label_codes).astype('int64')
df.case1_risk_diabet.value_counts()

1    2568
0    1750
Name: case1_risk_diabet, dtype: int64

In [10]:
# Predictors are every column except the label; the label is kept separately.
features = df.drop(columns=['case1_risk_diabet'])
target = df['case1_risk_diabet']

In [11]:
# Object-typed columns are the categorical predictors.
categorical = df.select_dtypes(include='object')
# Numeric predictors: the label is removed first so it is not counted.
numerical = df.drop(columns=['case1_risk_diabet']).select_dtypes(include='number')

In [12]:
# Names of the categorical columns that will be one-hot encoded.
categorical.columns.values

array(['arthritis', 'diabetic_relative', 'overweight', 'high_bp',
       'high_chol'], dtype=object)

In [13]:
# One-hot encode the categorical predictors. drop_first=True drops the first
# level of each column, leaving one indicator per two-level variable.
features = pd.get_dummies(
    data=features,
    columns=categorical.columns.tolist(),
    drop_first=True,
)

In [14]:
from sklearn.model_selection import train_test_split

# Hold-out split with the default test fraction, stratified on the label and
# seeded so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    random_state=random_state,
)

In [15]:
# Row counts of the four split pieces, in the order
# X_train, X_test, y_train, y_test.
print(*map(len, (X_train, X_test, y_train, y_test)))

3238 1080 3238 1080


#### Modelling

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score
from sklearn.metrics import roc_auc_score,roc_curve

In [17]:
from sklearn.svm import SVC

# Baseline: linear-kernel SVM with the default C, scored on the hold-out set.
svm_model = SVC(kernel="linear", random_state=random_state)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Confusion matrix, a separator line, then the per-class report.
print(
    confusion_matrix(y_test, y_pred),
    '----------------------------------------',
    classification_report(y_test, y_pred),
    sep='\n',
)

[[232 206]
 [100 542]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.70      0.53      0.60       438
           1       0.72      0.84      0.78       642

    accuracy                           0.72      1080
   macro avg       0.71      0.69      0.69      1080
weighted avg       0.71      0.72      0.71      1080



In [18]:
# Final model: linear SVM with a very small C (strong regularisation) and
# probability estimates enabled so predict_proba can be used for risk bands.
# - random_state seeds the internal shuffling scikit-learn uses to fit the
#   probability estimates, so predict_proba is repeatable between runs.
# - `gamma` is not passed: it only applies to the rbf/poly/sigmoid kernels and
#   has no effect on a linear kernel.
model = SVC(
    C=0.00005,
    kernel='linear',
    probability=True,
    random_state=random_state,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('----------------------------------------')
print(classification_report(y_test, y_pred))

[[169 269]
 [ 65 577]]
----------------------------------------
              precision    recall  f1-score   support

           0       0.72      0.39      0.50       438
           1       0.68      0.90      0.78       642

    accuracy                           0.69      1080
   macro avg       0.70      0.64      0.64      1080
weighted avg       0.70      0.69      0.66      1080



In [19]:
# Class-probability estimates for every hold-out row (one column per class).
prediction_proba = model.predict_proba(X_test)

In [20]:
# Inspect the probability matrix computed for the hold-out rows.
prediction_proba

array([[0.61720044, 0.38279956],
       [0.66609643, 0.33390357],
       [0.50974403, 0.49025597],
       ...,
       [0.63009916, 0.36990084],
       [0.46582858, 0.53417142],
       [0.5739531 , 0.4260469 ]])

In [21]:
# True labels of the hold-out rows.
y_test

2973    1
2170    0
1221    1
2470    0
3712    0
       ..
2053    1
1501    1
4187    1
3023    1
1810    1
Name: case1_risk_diabet, Length: 1080, dtype: int64

## Export Model / Deployment Model

In [22]:
# Re-check the cleaned modelling frame before exporting the model.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4318 entries, 0 to 4317
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   case1_risk_diabet  4318 non-null   int64  
 1   arthritis          4318 non-null   object 
 2   diabetic_relative  4318 non-null   object 
 3   overweight         4318 non-null   object 
 4   high_bp            4318 non-null   object 
 5   high_chol          4318 non-null   object 
 6   systolic           4318 non-null   float64
 7   bmi                4318 non-null   float64
 8   arm_circumference  4318 non-null   float64
 9   WH_ratio           4318 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 337.5+ KB


In [23]:
# Raw (pre-encoding) column names of the modelling frame.
df.columns

Index(['case1_risk_diabet', 'arthritis', 'diabetic_relative', 'overweight',
       'high_bp', 'high_chol', 'systolic', 'bmi', 'arm_circumference',
       'WH_ratio'],
      dtype='object')

In [24]:
# Select every column of `df` by name; `x_y` is used below to list the
# category values of each categorical predictor.
x_y = df[df.columns.values]

In [25]:
import pickle
import pandas as pd

In [26]:
# Persist the fitted model. The context manager guarantees the file is
# flushed and closed once the pickle has been written.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [27]:
# Persist the one-hot encoded column index so inference inputs can be aligned
# to the exact columns the model was trained on. The context manager closes
# the file once the pickle has been written.
filename = 'x_dummies_column.sav'
with open(filename, 'wb') as columns_file:
    pickle.dump(features.columns, columns_file)

In [28]:
# filename = 'real_colomn.sav'
# pickle.dump(x_y.drop('Price',axis=1).columns, open(filename , 'wb'))

In [29]:
# Category values seen for `arthritis` in the cleaned data.
x_y['arthritis'].unique()

array(['yes', 'no'], dtype=object)

In [30]:
# Category values seen for `diabetic_relative` in the cleaned data.
x_y['diabetic_relative'].unique()

array(['yes', 'no'], dtype=object)

In [31]:
# Category values seen for `overweight` in the cleaned data.
x_y['overweight'].unique()

array(['yes', 'no'], dtype=object)

In [32]:
# Category values seen for `high_bp` in the cleaned data.
x_y['high_bp'].unique()

array(['yes', 'no'], dtype=object)

In [33]:
# Category values seen for `high_chol` in the cleaned data.
x_y['high_chol'].unique()

array(['no', 'yes'], dtype=object)

In [34]:
# Reload the pickled model and training column index, closing each file once
# it has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('finalized_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)
# real_columns = pickle.load(open('real_colomn.sav','rb'))
with open('x_dummies_column.sav', 'rb') as columns_file:
    one_hot_columns = pickle.load(columns_file)

def prediction(data, estimator=None, columns=None):
    """Predict the class label for a single raw input record.

    Parameters
    ----------
    data : dict
        Mapping of raw column name to a scalar value (categorical values as
        strings, numeric values as numbers) describing one record.
    estimator : object with ``predict``, optional
        Fitted model to use. Defaults to the module-level ``model``.
    columns : pandas.Index or list of str, optional
        One-hot encoded column names the estimator was trained on. Defaults
        to the module-level ``one_hot_columns``.

    Returns
    -------
    int
        The predicted label of the record, rounded to an integer.

    Notes
    -----
    Columns the estimator expects but that are absent after encoding are
    filled with 0. This is required for indicator columns of categories not
    present in the record, but it also means a numeric feature missing from
    ``data`` is silently treated as 0. Input keys that are not training
    columns are dropped.
    """
    if estimator is None:
        estimator = model
    if columns is None:
        columns = one_hot_columns

    record = pd.DataFrame(data, index=[0])
    encoded = pd.get_dummies(record)
    # fill_value=0 is essential: without it every absent indicator column
    # becomes NaN and the estimator cannot score the row.
    encoded = encoded.reindex(columns=columns, fill_value=0)
    hasil = estimator.predict(encoded)
    return round(hasil[0])

In [35]:
# Show the row at position 1 of the modelling frame (all columns).
df.iloc[1,:]

case1_risk_diabet       1
arthritis              no
diabetic_relative      no
overweight             no
high_bp                no
high_chol             yes
systolic              108
bmi                  21.3
arm_circumference    30.8
WH_ratio             0.95
Name: 1, dtype: object

In [36]:
# Hand-written raw record: the five categorical answers followed by the four
# numeric measurements, using the same raw column names as the training data.
inputan = dict(
    arthritis='yes',
    diabetic_relative='yes',
    overweight='yes',
    high_bp='yes',
    high_chol='no',
    systolic=108,
    bmi=31.7,
    arm_circumference=32,
    WH_ratio=0.93,
)

In [37]:
# Build a one-row frame from the raw input. Dedicated names are used so the
# training DataFrame `df` is not overwritten by this inference step.
input_row = pd.DataFrame(inputan, index=[0])
# One-hot encoding: categorical items become numeric indicator columns.
input_encoded = pd.get_dummies(input_row)
# Align to the training columns: any model column absent from the input
# (numeric ones included) is filled with 0, and extra input columns are dropped.
input_encoded = input_encoded.reindex(columns=one_hot_columns, fill_value=0)
hasil_prediction = model.predict(input_encoded)
print(f'Hasil prediksi : {hasil_prediction[0]}')
# NOTE: for SVC the probability estimates come from a separate calibration
# step, so the most probable class can differ from the label returned by
# predict (see the scikit-learn SVC documentation).
probability = model.predict_proba(input_encoded)
print(f'Hasil proba : {probability}')

Hasil prediksi : 1
Hasil proba : [[0.5201378 0.4798622]]


In [38]:
# Second-column probability (class 1) for the single input row.
probability[0,1]

0.47986219513778594

In [39]:
# Map the class-1 probability onto three bands:
# below 0.3 -> low, below 0.6 -> moderate, otherwise high.
risk_score = probability[0, 1]
if risk_score < 0.3:
    risk_message = 'You are in Low Risk Diabetes'
elif risk_score < 0.6:
    risk_message = 'You are in Moderate Risk Diabetes'
else:
    risk_message = 'You are in High Risk Diabetes'
print(risk_message)

You are in Moderate Risk Diabetes


In [40]:
# Second hand-written record with a wider set of keys.
# NOTE(review): this record has no 'systolic' key, although 'systolic' is one
# of the selected training columns. The inference cell that follows reindexes
# with fill_value=0, so the model is given systolic = 0 for this record.
# Keys that are not training columns (gout, thyroid, chronic_bronchitis,
# abdominal_pain, gallstones, albumin, glucose_mg/dL, triglycerides_mg/dL,
# uricacid_mg/dL) are dropped by that reindex. Confirm whether this is intended.
inputan = {
    'arthritis': 'yes',
    'gout': 'yes',
    'thyroid': 'yes',
    'chronic_bronchitis': 'yes',
    'abdominal_pain': 'yes',
    'gallstones': 'yes',
    'diabetic_relative': 'yes',
    'overweight': 'yes',
    'high_bp': 'yes',
    'high_chol': 'yes',
    'bmi': 40,
    'arm_circumference': 32,
    'WH_ratio': 0.93,
    'albumin': 4,
    'glucose_mg/dL': 198,
    'triglycerides_mg/dL': 100,
    'uricacid_mg/dL': 4.2,
}

In [41]:
# Build a one-row frame from the raw input. Dedicated names are used so the
# training DataFrame `df` is not overwritten by this inference step.
input_row = pd.DataFrame(inputan, index=[0])
# One-hot encoding: categorical items become numeric indicator columns.
input_encoded = pd.get_dummies(input_row)
# Align to the training columns: any model column absent from the input
# (numeric ones included) is filled with 0, and extra input columns are dropped.
input_encoded = input_encoded.reindex(columns=one_hot_columns, fill_value=0)
hasil_prediction = model.predict(input_encoded)
print(f'Hasil prediksi : {hasil_prediction[0]}')
probability = model.predict_proba(input_encoded)
print(f'Hasil proba : {probability}')

Hasil prediksi : 0
Hasil proba : [[0.93601244 0.06398756]]


In [42]:
# Band the class-1 probability: < 0.3 low, < 0.6 moderate, otherwise high.
positive_share = probability[0, 1]
verdict = (
    'You are in Low Risk Diabetes' if positive_share < 0.3
    else 'You are in Moderate Risk Diabetes' if positive_share < 0.6
    else 'You are in High Risk Diabetes'
)
print(verdict)

You are in Low Risk Diabetes
