In [1]:
import pandas as pd 

In [2]:
df = pd.read_excel("diabetes_data.xlsx")
df.head()


Unnamed: 0,Age,Gender,ExcessUrination,Polydipsia,WeightLossSudden,Fatigue,Polyphagia,GenitalThrush,BlurredVision,Itching,Irritability,DelayHealing,PartialPsoriasis,MuscleStiffness,Alopecia,Obesity,DiabeticClass
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [3]:
# from ydata_profiling import ProfileReport
# profile = ProfileReport(df, explorative= True)
# profile.to_file("diabetes_data_practice.html")

In [4]:
y = df[['DiabeticClass', 'Obesity']]
x = df.drop(['DiabeticClass', 'Obesity'],  axis=1)
x.head()
# result_map = {
#     'Positive' : 1,
#     'Negative': 0
# }
# y = y.map(result_map)
y

Unnamed: 0,DiabeticClass,Obesity
0,Positive,Yes
1,Positive,No
2,Positive,No
3,Positive,No
4,Positive,Yes
...,...,...
515,Positive,No
516,Positive,No
517,Positive,Yes
518,Negative,No


In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state= 42)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [6]:
from lazypredict.Supervised import LazyClassifier
reg = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric= None)
models,predictions = reg.fit(x_train,x_test, y_train, y_test)
print(models)

100%|██████████| 29/29 [00:00<00:00, 63.71it/s]

Empty DataFrame
Columns: [Accuracy, Balanced Accuracy, ROC AUC, F1 Score, Time Taken]
Index: []





In [7]:
from sklearn.pipeline import Pipeline
num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy= 'mean')),
    ('scaler', StandardScaler())
])
#Treating boolean by ordinal
ord_categories = []
ord_categories_values = []
cols = x_train.columns
for i in cols:
    if x_train[i].dtypes == "object":
        ord_categories.append(i)
        ord_categories_values.append(x_train[i].unique())
ord_transform = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'most_frequent')),
    ('encoder', OrdinalEncoder(categories= ord_categories_values))
])

In [8]:
#Preprocessing
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num_feature', num_transform, ['Age']),
    ('ordinal_feature', ord_transform, ord_categories)
])

In [9]:
#Train model
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MultiOutputClassifier(RandomForestClassifier()))
])
clf.fit(x_train,y_train)

In [10]:
#Prediction
y_predict = clf.predict(x_test)
for predicted, actual in zip(y_predict, y_test.values):
    print(f'Predicted: {predicted} ---> Actual: {actual}')

Predicted: ['Negative' 'No'] ---> Actual: ['Negative' 'No']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Positive' 'Yes'] ---> Actual: ['Positive' 'Yes']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Positive' 'Yes'] ---> Actual: ['Positive' 'Yes']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Negative' 'No'] ---> Actual: ['Negative' 'No']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Negative' 'No'] ---> Actual: ['Negative' 'No']
Predicted: ['Negative' 'No'] ---> Actual: ['Negative' 'No']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Negative' 'No'] ---> Actual: ['Negative' 'No']
Predicted: ['Negative' 'No'] ---> Actual: ['Negative' 'No']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Positive' 'No'] ---> Actual: ['Positive' 'No']
Predicted: ['Negative' 'No'] ---> Ac

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_classification_report(y_test, y_pred):
    num_classes = y_test.shape[1]
    report = {}

    for i in range(num_classes):
        class_name = f'Class {i}'  # Thay thế tên lớp phù hợp
        precision = precision_score(y_test[:, i], y_pred[:, i]) #Trong ngữ cảnh này, [:, i] có nghĩa là bạn đang truy cập vào tất cả các hàng (dấu : đại diện cho tất cả các hàng) và chỉ cụ thể cột thứ i của mảng hoặc ma trận.
        recall = recall_score(y_test[:, i], y_pred[:, i])
        f1 = f1_score(y_test[:, i], y_pred[:, i])

        report[class_name] = {'precision': precision, 'recall': recall, 'f1-score': f1}

    return report

In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score

def custom_classification_report(y_true, y_pred):
    num_classes = y_true.shape[1]
    report = {}

    for i in range(num_classes):
        class_name = f'Class {i}'  # Thay thế tên lớp phù hợp
        precision = precision_score(y_true[:, i], y_pred[:, i], average='binary')
        recall = recall_score(y_true[:, i], y_pred[:, i], average='binary')
        f1 = f1_score(y_true[:, i], y_pred[:, i], average='binary')

        report[class_name] = {'precision': precision, 'recall': recall, 'f1-score': f1}

    return report
y_test_np = y_test.to_numpy()

# Sử dụng hàm custom_classification_report để đánh giá hiệu suất của mô hình
report = custom_classification_report(y_test_np, y_predict)
print(report)


ValueError: pos_label=1 is not a valid label. It should be one of ['Negative', 'Positive']

In [20]:
from sklearn.metrics import classification_report
y_test_np = y_test.to_numpy()
print(custom_classification_report(y_test_np, y_predict))

ValueError: pos_label=1 is not a valid label. It should be one of ['Negative', 'Positive']

In [13]:
y_test.shape[1]

2