In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch the UCI dataset with id 529 ("Early Stage Diabetes Risk Prediction"
# according to the metadata printed below).
# NOTE(review): presumably this downloads from the UCI repository and needs
# network access — confirm before running offline.
early_stage_diabetes_risk_prediction = fetch_ucirepo(id=529)

# Split the fetched data into the feature frame (X) and the target frame (y).
# Both names are read by the cells that follow.
X = early_stage_diabetes_risk_prediction.data.features
y = early_stage_diabetes_risk_prediction.data.targets

# Dataset-level metadata (name, size, target column, citation, ...).
print(early_stage_diabetes_risk_prediction.metadata)

# Per-variable information for the dataset's columns.
print(early_stage_diabetes_risk_prediction.variables)


{'uci_id': 529, 'name': 'Early Stage Diabetes Risk Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/529/early+stage+diabetes+risk+prediction+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/529/data.csv', 'abstract': 'This dataset contains the sign and symptpom data of newly diabetic or would be diabetic patient. ', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 520, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Gender'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5VG8H', 'creators': [], 'intro_paper': {'title': 'Likelihood Prediction of Diabetes at Early Stage Using Data Mining Techniques', 'authors': 'M. M. F. Islam, Rahatara Ferdousi, Sadikur Rahman, Humayra Yasmin Bushra', 'published_in': 

In [2]:
# Preview the first rows of the raw feature frame (age, gender and the
# Yes/No symptom columns) before any encoding is applied.
X.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes


In [3]:
import pandas as pd

# Encode each Yes/No symptom column as a single 0/1 indicator named
# "<column>_Yes". 'age' and 'gender' are left untouched here.
#
# Only columns whose values are all 'Yes'/'No' are selected, so running this
# cell a second time (when X already holds the encoded frame) is a no-op
# rather than a KeyError on a missing "<column>_No" dummy.
yes_no_columns = [
    column
    for column in X.columns
    if column not in ('age', 'gender') and X[column].isin(['Yes', 'No']).all()
]

# A direct comparison against 'Yes' always produces integer 0/1 values,
# whatever dtype pd.get_dummies defaults to in the installed pandas version,
# and it does not depend on a 'No' value being present in every column.
yes_flags = X[yes_no_columns].eq('Yes').astype(int).add_suffix('_Yes')

# Untouched columns come first, followed by the indicators in their original
# column order. X itself is not mutated; a new frame is built in one concat.
X_copy = pd.concat([X.drop(columns=yes_no_columns), yes_flags], axis='columns')

# Rebind X because the following cells read the encoded features through it.
X = X_copy

X.head()

Unnamed: 0,age,gender,polyuria_Yes,polydipsia_Yes,sudden_weight_loss_Yes,weakness_Yes,polyphagia_Yes,genital_thrush_Yes,visual_blurring_Yes,itching_Yes,irritability_Yes,delayed_healing_Yes,partial_paresis_Yes,muscle_stiffness_Yes,alopecia_Yes,obesity_Yes
0,40,Male,0,1,0,1,0,0,0,1,0,1,0,1,1,1
1,58,Male,0,0,0,1,0,0,1,0,0,0,1,0,1,0
2,41,Male,1,0,0,1,1,0,0,1,0,1,0,1,1,0
3,45,Male,0,0,1,1,1,1,0,1,0,1,0,0,0,0
4,60,Male,1,1,1,1,1,0,1,1,1,1,1,1,1,1


In [4]:
# Replace the text 'gender' column with a single 0/1 'Female' indicator,
# appended as the last column. Any value other than 'Female' is treated as
# the baseline (0).
#
# A direct comparison is used instead of get_dummies + drop('Male') so that
# the column is integer 0/1 under every pandas version and the cell does not
# raise KeyError (or omit 'Female') when only one gender value is present.
# X is left unmodified, so this cell can be re-run safely.
updated_X_Gender = (
    X
    .drop(columns='gender')
    .assign(Female=X['gender'].eq('Female').astype(int))
)

updated_X_Gender.head()

Unnamed: 0,age,polyuria_Yes,polydipsia_Yes,sudden_weight_loss_Yes,weakness_Yes,polyphagia_Yes,genital_thrush_Yes,visual_blurring_Yes,itching_Yes,irritability_Yes,delayed_healing_Yes,partial_paresis_Yes,muscle_stiffness_Yes,alopecia_Yes,obesity_Yes,Female
0,40,0,1,0,1,0,0,0,1,0,1,0,1,1,1,0
1,58,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0
2,41,1,0,0,1,1,0,0,1,0,1,0,1,1,0,0
3,45,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0
4,60,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0


In [5]:
# Preview the first rows of the target frame (single 'class' column).
y.head()

Unnamed: 0,class
0,Positive
1,Positive
2,Positive
3,Positive
4,Positive


## Train a random forest classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Flatten the target into a 1-D array of labels for scikit-learn.
y_flattened = np.asarray(y).ravel()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    updated_X_Gender,
    y_flattened,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split (seeded as well).
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
classifier

In [None]:
from pathlib import Path

from joblib import dump

# Location of the serialized model, relative to the notebook's working
# directory.
model_path = Path('../models/diabetes_risk_prediction.joblib')

# Create the target directory first: dump() opens the file for writing and
# fails with FileNotFoundError if '../models' does not exist yet.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier; the cell displays the list of written files.
dump(classifier, model_path)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict labels for the held-out rows.
y_pred = classifier.predict(X_test)

# Compute both metrics up front, then print them together.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(report)