<a href="https://colab.research.google.com/github/fahmi-nugroho/Proyek-Capstone-CSD-101/blob/main/Diabetes_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Library

In [32]:
import pandas as pd
import numpy as np
import joblib
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Data Understanding

## Data Loading

In [2]:
#Download dataset from Kaggle
! pip install kaggle

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download ishandutta/early-stage-diabetes-risk-prediction-dataset

Downloading early-stage-diabetes-risk-prediction-dataset.zip to /content
  0% 0.00/2.52k [00:00<?, ?B/s]
100% 2.52k/2.52k [00:00<00:00, 4.28MB/s]


In [3]:
!unzip /content/early-stage-diabetes-risk-prediction-dataset.zip

Archive:  /content/early-stage-diabetes-risk-prediction-dataset.zip
  inflating: diabetes_data_upload.csv  


In [4]:
diabet_data = pd.read_csv('/content/diabetes_data_upload.csv')
diabet_data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


## Deskripsi Variabel



1. Age : Usia pasien
2. Gender : Jenis kelamin pasien
3. Polyuria : Keseringan buang air kecil
4. Polydipsia : Banyak minum
5. Sudden weight loss : Penurunan berat badan secara drastis
6. weakness : Lemas
7. Polyphagia : Nafsu makan berlebih
8. Genital thrush : Infeksi kelamin
9. Visual blurring : Pandangan kabur
10. Itching : Rasa gatal berlebih
11. Irritability : Perubahan mood
12. Delayed healing : Proses penyembuhan yang lama
13. Partial paresis : Melemahnya gerakan badan
14. Muscle stiffness : Kakunya otot
15. Alopecia : Kerontokan rambut
16. Obesity : Obesitas
17. Class : Kategori pasien

In [5]:
diabet_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

## Mengecek Missing Value

In [6]:
diabet_data.isna().sum()

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64

# Data Preparation

## Encoding Fitur

In [None]:
diabet_data['Gender'] = diabet_data ['Gender'].replace({'Female':0,'Male':1 })
diabet_data['class'] = diabet_data ['class'].replace({'Negative':0,'Positive':1 })

for column in diabet_data.columns.drop(['Age','Gender','class']):
  diabet_data[column]= diabet_data[column].replace({'No':0 , 'Yes': 1})

diabet_data.head()

## Pembagian Dataset

In [8]:
X = diabet_data.drop(["class"],axis =1)
y = diabet_data["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Standarisasi Data

In [9]:
scaler_train = StandardScaler()
scaler_train.fit(X_train)
X_train = scaler_train.transform(X_train)

scaler_val = StandardScaler()
scaler_val.fit(X_test)
X_test = scaler_val.transform(X_test)

# Modelling dan Training

## Logistic Regression

In [10]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

## K-NearstNeighbors

In [11]:
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

## Randomforest

In [12]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Testing dan Evaluasi Model

In [16]:
print('==================== Logistic Regression ====================')
print('=> Confusion Matrix')
print(confusion_matrix(y_test,y_pred_lr))
print('=> Classification Report')
print(classification_report(y_test,y_pred_lr))
print('=> Accuracy Score')
print(accuracy_score(y_test, y_pred_lr))

print('\n\n\n==================== K-Nearest Neighbor ====================')

print('=> Confusion Matrix')
print(confusion_matrix(y_test,y_pred_knn))
print('=> Classification Report')
print(classification_report(y_test,y_pred_knn))
print('=> Accuracy Score')
print(accuracy_score(y_test, y_pred_knn))

print('\n\n\n==================== Random Forest ====================')
print('=> Confusion Matrix')
print(confusion_matrix(y_test,y_pred_rf))
print('=> Classification Report')
print(classification_report(y_test,y_pred_rf))
print('=> Accuracy Score')
print(accuracy_score(y_test, y_pred_rf))




=> Confusion Matrix
[[40  1]
 [ 4 59]]
=> Classification Report
              precision    recall  f1-score   support

           0       0.91      0.98      0.94        41
           1       0.98      0.94      0.96        63

    accuracy                           0.95       104
   macro avg       0.95      0.96      0.95       104
weighted avg       0.95      0.95      0.95       104

=> Accuracy Score
0.9519230769230769
=> Confusion Matrix
[[40  1]
 [ 8 55]]
=> Classification Report
              precision    recall  f1-score   support

           0       0.83      0.98      0.90        41
           1       0.98      0.87      0.92        63

    accuracy                           0.91       104
   macro avg       0.91      0.92      0.91       104
weighted avg       0.92      0.91      0.91       104

=> Accuracy Score
0.9134615384615384



=> Confusion Matrix
[[40  1]
 [ 4 59]]
=> Classification Report
              precision    recall  f1-score   support

           0       

# Test input data dan Save Model

## Save Model

Karena model random forest menunjukan akurasi yang tinggi, maka kami memilih model tersebut untuk disimpan

In [33]:
joblib.dump(rf, 'randomforest_model.pkl')

['randomforest_model.pkl']

## Test input data

In [34]:
#declare path where you saved your model
filePath = '/content/randomforest_model.pkl'

#open file
file = open(filePath, "rb")

#load the trained model
trained_model = joblib.load(file)

In [35]:
test_data = [21,	1,	0,	0,	0,	0,	0,	0,	0,	0,	0,	0,	0,	0,	0,	0]
test_data = np.array(test_data)
test_data = test_data.reshape(1,-1)

prediksi = trained_model.predict(test_data)
if (prediksi[0] == 0) :
  print('Negative')
else :
  print('Positive')

Negative
