In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import joblib

In [20]:
# Location of the raw diabetes CSV in the project's GitHub repository.
url = 'https://raw.githubusercontent.com/dandia14/project-capstone-team-csd-134/ml/dataset/diabetes.csv'
# Download and parse the CSV into a DataFrame.
diabetes = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows to sanity-check the parsed columns.
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
# Columns in which this notebook treats a recorded 0 as a missing measurement.
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Swap the 0 placeholders for NaN so the imputation step can find them.
# Assign the result back instead of calling `.replace(..., inplace=True)` on
# `diabetes[col]`: that chained in-place form is deprecated and silently does
# nothing under pandas Copy-on-Write. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0.
diabetes[cols] = diabetes[cols].replace(0, np.nan)

In [22]:
# Replace NaN values with the median of the rows that share the same Outcome.
# NOTE(review): this imputation uses the target label and runs before the
# train/test split, so it cannot be reproduced for unlabeled inputs and lets
# test-row labels influence the features -- confirm this is intended.
for outcome_class in (0, 1):
    # Rows belonging to the current Outcome class.
    in_class = diabetes["Outcome"] == outcome_class
    for col in diabetes.columns:
        # Missing entries of this column within the current class.
        missing = in_class & diabetes[col].isnull()
        # Series.median() skips NaN, so this is the median of the observed values.
        diabetes.loc[missing, col] = diabetes.loc[in_class, col].median()

In [23]:
# Target vector: the Outcome column.
y = diabetes['Outcome']
# Feature matrix: every remaining column.
X = diabetes.drop(columns=['Outcome'])

In [24]:
# Standardize every feature column; X is overwritten with the scaled values.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test rows contribute to the scaling statistics.
# NOTE(review): re-running this cell on its own refits the scaler on the
# already-scaled X; rerun the cell that defines X first.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

In [25]:
# Hold out 20% of the rows for testing. With shuffle=False the split follows
# the original row order, and random_state has no effect on the result.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

In [26]:
# Seed the forest so that Restart & Run All reproduces the same trees and the
# same reported metrics; an unseeded forest changes on every run.
rfc = RandomForestClassifier(random_state=42)
# fit() returns the fitted estimator itself, so `model` and `rfc` refer to the
# same object.
model = rfc.fit(X_train, y_train)

In [40]:
# Predict once per split, then score.
# With 0/1 labels the mean squared error equals the misclassification rate.
pred_train = rfc.predict(X_train)
pred_test = rfc.predict(X_test)

mse_train = mean_squared_error(y_train, pred_train)
print('MSE Train :', mse_train)
mse_test = mean_squared_error(y_test, pred_test)
print('MSE Test :', mse_test)

MSE Train : 0.0
MSE Test : 0.09740259740259741


In [38]:
# Fraction of correctly classified rows on the training and held-out splits.
pred_train = rfc.predict(X_train)
pred_test = rfc.predict(X_test)

accuracy_train = accuracy_score(y_train, pred_train)
print('Accuracy Score Train :', accuracy_train)
accuracy_test = accuracy_score(y_test, pred_test)
print('Accuracy Score Test :', accuracy_test)

Accuracy Score Train : 1.0
Accuracy Score Test : 0.9025974025974026


In [29]:
# Persist the fitted classifier to disk; the cell output is the list of files written.
# NOTE(review): only the classifier is saved -- the fitted scaler is not, so a
# consumer of this file cannot reproduce the feature scaling from it alone.
joblib.dump(value=model, filename='model_rfc.pkl')

['model_rfc.pkl']

In [30]:
# Reload the classifier from disk to check that the saved artifact works.
# The context manager closes the file handle, which the bare open() call
# never did.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load files from a trusted source (this one was written by the cell above).
with open('./model_rfc.pkl', 'rb') as file:
    model = joblib.load(file)

In [31]:
# Copy the first five (already scaled) test rows to use as a small smoke-test batch.
prediksi = X_test[:5].copy()

In [32]:
# Run the reloaded model on the five sample rows.
predict = model.predict(prediksi)

In [33]:
# Show the predicted class for each of the five sample rows.
print(predict)

[1 0 0 0 1]


In [34]:
# Two hand-written, unscaled samples with eight values each.
# Presumably in the training feature order (Pregnancies, Glucose, BloodPressure,
# SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age) -- TODO confirm.
test1 = np.array([11, 138, 74, 26, 144.5, 36.1, 0.557, 50])
test2 = np.array([2, 68, 62, 13, 15, 20.1, 0.257, 23])

In [35]:
# Scale each sample with the statistics the scaler learned on the data set.
# Use transform(), not fit_transform(): refitting on reshape(-1, 1) would
# standardize the eight feature values of one sample against each other and
# overwrite the fitted scaler, so the model would see inputs on a different
# scale from the one it was trained on.
# reshape(1, -1) presents the sample as one row of n_features. The result is
# transposed back to a column because the following cell transposes it again
# before calling predict().
ss_test1 = scaler.transform(test1.reshape(1, -1)).transpose()
ss_test2 = scaler.transform(test2.reshape(1, -1)).transpose()

In [36]:
# Each scaled sample is stored as a single column; lay it out as one row of
# features before handing it to the classifier.
result1 = model.predict(ss_test1.reshape(1, -1))
result2 = model.predict(ss_test2.reshape(1, -1))
# One prediction per line.
print(result1, result2, sep='\n')

[1]
[0]
