# Data Understanding dan Data Preparation

In [2]:
#import library
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#for encoding & scaling
from sklearn.preprocessing import OneHotEncoder

#for train test splitting & Gridsearch
from sklearn.model_selection import train_test_split, GridSearchCV

# for resampling
from imblearn.over_sampling import SMOTE

#random forest
from sklearn.ensemble import RandomForestClassifier

#for checking testing results
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix

## Membaca Dataset

In [3]:
# Read the raw spreadsheet into a DataFrame and display it for a first look.
# NOTE(review): the path is an absolute '/content/...' location; the notebook
# only runs where the file exists at exactly this path.
dataset_path = '/content/Dataset for People for their Blood Glucose Level with their Superficial body feature readings. (1).xlsx'
df = pd.read_excel(dataset_path)
df

Unnamed: 0,Age,Blood Glucose Level(BGL),Diastolic Blood Pressure,Systolic Blood Pressure,Heart Rate,Body Temperature,SPO2,Sweating (Y/N),Shivering (Y/N),Class
0,9,79,73,118,98,98.300707,99,0,0,N
1,9,80,73,119,102,98.300707,94,1,0,N
2,9,70,76,110,81,98.300707,98,1,0,N
3,9,70,78,115,96,98.300707,96,1,0,N
4,66,100,96,144,92,97.807052,98,0,0,N
...,...,...,...,...,...,...,...,...,...,...
16964,9,83,87,127,90,96.842657,97,0,0,D
16965,9,83,79,117,80,97.869454,98,0,0,D
16966,9,73,82,116,93,96.766282,98,0,0,D
16967,9,74,86,128,91,98.941036,98,0,0,D


## Data Preprocessing: Missing Values

### Handling Data Duplikat

In [4]:
# Keep only the first occurrence of every row; later exact repeats are dropped.
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

# Sanity check: the number of remaining duplicate rows (expected to be 0).
df.duplicated().sum()

0

### Imbalance Data

In [5]:
# Separate the label column from the predictors.
y = df['Class']                  # target labels
X = df.drop(columns=['Class'])   # every remaining column is used as a feature

In [6]:
# Balance the classes by oversampling with SMOTE (seeded for reproducibility).
# SMOTE is already imported in the notebook's import cell, so it is not
# re-imported here.
# NOTE(review): resampling is done on the full dataset, before the
# train/validation/test split performed later in the notebook. Synthetic rows
# can therefore land in the evaluation sets; consider splitting first and
# resampling only the training portion.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

In [7]:
# Count the rows per class after resampling to confirm the classes are balanced.
y_smote.value_counts()

Class
N    16382
D    16382
Name: count, dtype: int64

In [8]:
# Put the resampled features and labels back side by side in one frame.
# This rebinds `df`, so from here on `df` refers to the resampled data.
df = pd.concat(objs=[X_smote, y_smote], axis='columns')
df

Unnamed: 0,Age,Blood Glucose Level(BGL),Diastolic Blood Pressure,Systolic Blood Pressure,Heart Rate,Body Temperature,SPO2,Sweating (Y/N),Shivering (Y/N),Class
0,9,79,73,118,98,98.300707,99,0,0,N
1,9,80,73,119,102,98.300707,94,1,0,N
2,9,70,76,110,81,98.300707,98,1,0,N
3,9,70,78,115,96,98.300707,96,1,0,N
4,66,100,96,144,92,97.807052,98,0,0,N
...,...,...,...,...,...,...,...,...,...,...
32759,9,76,70,119,84,98.285436,93,0,0,N
32760,31,77,79,125,88,97.905879,97,0,0,N
32761,9,72,76,117,78,98.168688,96,0,0,N
32762,30,81,82,141,91,98.020886,98,1,0,N


# Encoding

In [9]:
# One-hot encode the 'Class' label column.
# OneHotEncoder is already imported in the notebook's import cell, so it is
# not re-imported here.
onehotencoder = OneHotEncoder()

# fit_transform returns a SciPy sparse matrix with one column per class value.
encoded_data = onehotencoder.fit_transform(df[['Class']])

# Wrap the sparse matrix in a DataFrame whose columns carry the generated
# feature names (shown below as Class_D and Class_N).
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_data,
    columns=onehotencoder.get_feature_names_out(['Class']),
)

In [10]:
# Display the one-hot encoded label columns to check the encoding result.
encoded_df

Unnamed: 0,Class_D,Class_N
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
32759,0.0,1.0
32760,0.0,1.0
32761,0.0,1.0
32762,0.0,1.0


In [11]:
# Give both frames a fresh 0..n-1 index so the column-wise concat below lines
# rows up by position. Reassignment is used instead of inplace=True so the
# cell does not mutate objects that earlier cells created.
df = df.reset_index(drop=True)
encoded_df = encoded_df.reset_index(drop=True)

# Append the encoded label columns to the resampled data.
data_baru = pd.concat([df, encoded_df], axis=1)

In [12]:
# Remove the original string label now that its encoded columns are present.
# Reassigning with errors='ignore' keeps the cell idempotent: the in-place
# drop raised KeyError if the cell was executed a second time.
data_baru = data_baru.drop(columns=['Class'], errors='ignore')

In [13]:
# Display the frame after the 'Class' column was replaced by its encoded columns.
data_baru

Unnamed: 0,Age,Blood Glucose Level(BGL),Diastolic Blood Pressure,Systolic Blood Pressure,Heart Rate,Body Temperature,SPO2,Sweating (Y/N),Shivering (Y/N),Class_D,Class_N
0,9,79,73,118,98,98.300707,99,0,0,0.0,1.0
1,9,80,73,119,102,98.300707,94,1,0,0.0,1.0
2,9,70,76,110,81,98.300707,98,1,0,0.0,1.0
3,9,70,78,115,96,98.300707,96,1,0,0.0,1.0
4,66,100,96,144,92,97.807052,98,0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
32759,9,76,70,119,84,98.285436,93,0,0,0.0,1.0
32760,31,77,79,125,88,97.905879,97,0,0,0.0,1.0
32761,9,72,76,117,78,98.168688,96,0,0,0.0,1.0
32762,30,81,82,141,91,98.020886,98,1,0,0.0,1.0


# Feature Scaling

In [14]:
from sklearn.preprocessing import RobustScaler

# Continuous measurements to rescale; the binary Sweating/Shivering flags and
# the encoded label columns are left as they are.
columns_to_scale = [
    'Age',
    'Blood Glucose Level(BGL)',
    'Diastolic Blood Pressure',
    'Systolic Blood Pressure',
    'Heart Rate',
    'Body Temperature',
    'SPO2',
]

# Fit the scaler on the unscaled values held in `df` and write the scaled
# values into the matching columns of `data_baru`.
# NOTE(review): the scaler is fitted on all rows, before the
# train/validation/test split made later in the notebook, so its statistics
# include the evaluation rows.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])
data_baru[columns_to_scale] = scaled_values

In [15]:
# Display the frame again to inspect the scaled feature values.
data_baru

Unnamed: 0,Age,Blood Glucose Level(BGL),Diastolic Blood Pressure,Systolic Blood Pressure,Heart Rate,Body Temperature,SPO2,Sweating (Y/N),Shivering (Y/N),Class_D,Class_N
0,-0.567568,-0.037037,-0.307692,-0.230769,0.615385,0.488889,1.0,0,0,0.0,1.0
1,-0.567568,0.000000,-0.307692,-0.153846,0.923077,0.488889,-1.5,1,0,0.0,1.0
2,-0.567568,-0.370370,-0.076923,-0.846154,-0.692308,0.488889,0.5,1,0,0.0,1.0
3,-0.567568,-0.370370,0.076923,-0.461538,0.461538,0.488889,-0.5,1,0,0.0,1.0
4,0.972973,0.740741,1.461538,1.769231,0.153846,-0.070339,0.5,0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
32759,-0.567568,-0.148148,-0.538462,-0.153846,-0.461538,0.471590,-2.0,0,0,0.0,1.0
32760,0.027027,-0.111111,0.153846,0.307692,-0.153846,0.041616,0.0,0,0,0.0,1.0
32761,-0.567568,-0.296296,-0.076923,-0.307692,-0.923077,0.339334,-0.5,0,0,0.0,1.0
32762,0.000000,0.037037,0.384615,1.538462,0.076923,0.171899,0.5,1,0,0.0,1.0


In [16]:
import pickle

# Persist the fitted scaler so the same transformation can be reloaded later.
scaler_path = 'robust(rf).pkl'
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### Splitting Data

In [17]:
# The encoded label columns are sparse floats; convert each one to a dense
# int64 column.
target_columns = ['Class_D', 'Class_N']
for target_col in target_columns:
    data_baru[target_col] = data_baru[target_col].to_numpy(dtype='int64')

# Features are everything except the encoded labels; the target keeps both
# indicator columns.
# NOTE(review): a two-column target makes the downstream classifier a
# multi-output problem; a single label column would carry the same information.
X = data_baru.drop(columns=target_columns)
y = data_baru[target_columns]

# Row counts for each (Class_D, Class_N) combination.
y.value_counts()

Class_D  Class_N
0        1          16382
1        0          16382
Name: count, dtype: int64

In [18]:
# Both splits use the same settings: hold out 10% with a fixed seed.
split_options = dict(test_size=0.1, random_state=0)

# First carve the test set out of the full data ...
x_train_val, x_test, y_train_val, y_test = train_test_split(X, y, **split_options)

# ... then carve the validation set out of what remains.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, **split_options
)

In [19]:
# Report the class proportions of every split, in train / validation / test order.
proportion_reports = (
    ('Persentase target di training:', y_train),
    ('Persentase target di validation:', y_val),
    ('Persentase target di testing:', y_test),
)
for heading, split_target in proportion_reports:
    print(heading)
    print(split_target.value_counts(normalize=True))

Persentase target di training:
Class_D  Class_N
1        0          0.500603
0        1          0.499397
Name: proportion, dtype: float64
Persentase target di validation:
Class_D  Class_N
0        1          0.500509
1        0          0.499491
Name: proportion, dtype: float64
Persentase target di testing:
Class_D  Class_N
0        1          0.504425
1        0          0.495575
Name: proportion, dtype: float64


In [20]:
# Report the number of rows in each split.
size_reports = (
    ("Jumlah data training:", x_train),
    ("Jumlah data validasi:", x_val),
    ("Jumlah data test:", x_test),
)
for caption, split_features in size_reports:
    print(caption, len(split_features))

Jumlah data training: 26538
Jumlah data validasi: 2949
Jumlah data test: 3277


### Klasifikasi

### Membangun klasifikasi model dengan Random Forest

In [21]:
# Build a 100-tree random forest with a fixed seed and train it on the
# training split; fit() returns the estimator itself, so it can be chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

# Show the fitted estimator as the cell output.
rf_classifier

In [22]:
# Predict on both held-out splits first, then score each one.
y_val_pred = rf_classifier.predict(x_val)
y_test_pred = rf_classifier.predict(x_test)

# NOTE(review): the targets are two indicator columns, so accuracy_score
# presumably reports subset accuracy (a row counts as correct only when both
# columns match) - confirm against the scikit-learn documentation.
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Test Accuracy: {test_accuracy}')

Validation Accuracy: 0.9986436080027128
Test Accuracy: 0.9978638999084528


In [24]:
import pickle

In [27]:
# Persist the trained random forest to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# once the dump finishes; the previous bare open() call never closed it.
filename = 'diabetes_model(rf).sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)