# Advanced Classification

### Import necessary libraries

In [1]:
import pandas as pd
import seaborn as sns

### Import and display the dataset

We'll use diabetes dataset with missing value from GitHub

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/YBI-Foundation/Dataset/refs/heads/main/Diabetes%20Missing%20Data.csv")
df.head()

Unnamed: 0,Pregnant,Glucose,Diastolic_BP,Skin_Fold,Serum_Insulin,BMI,Diabetes_Pedigree,Age,Class
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [3]:
df.shape

(768, 9)

#### Check total missing value

In [4]:
df.isnull().sum()

Pregnant               0
Glucose                5
Diastolic_BP          35
Skin_Fold            227
Serum_Insulin        374
BMI                   11
Diabetes_Pedigree      0
Age                    0
Class                  0
dtype: int64

##### Method 1: Drop rows of missing value

In [5]:
df_drop_na = df.dropna()

In [6]:
df_drop_na

Unnamed: 0,Pregnant,Glucose,Diastolic_BP,Skin_Fold,Serum_Insulin,BMI,Diabetes_Pedigree,Age,Class
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
753,0,181.0,88.0,44.0,510.0,43.3,0.222,26,1
755,1,128.0,88.0,39.0,110.0,36.5,1.057,37,1
760,2,88.0,58.0,26.0,16.0,28.4,0.766,22,0
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0


In [7]:
df_drop_na.isnull().sum()

Pregnant             0
Glucose              0
Diastolic_BP         0
Skin_Fold            0
Serum_Insulin        0
BMI                  0
Diabetes_Pedigree    0
Age                  0
Class                0
dtype: int64

In [8]:
df_drop_na.shape

(392, 9)

#### Separate the features with the target

In [9]:
X = df_drop_na.iloc[:,0:-1] # features
y = df_drop_na.iloc[:,-1] # targets

#### Split the dataset into training and testing data

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#### Import DecisionTreeClassifier and train the model

In [12]:
from sklearn import tree

model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)

#### Evaluate the performance

In [16]:
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

result_acc = accuracy_score(y_test, y_pred)
print(f'Result: {result_acc}')

result_conf = confusion_matrix(y_test, y_pred)
print(f'Result: \n {result_conf}')

Result: 0.7711864406779662
Result: 
 [[67  8]
 [19 24]]


##### Method 2: Data Imputation

In [17]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5, weights = 'uniform')
df_impute = pd.DataFrame(imputer.fit_transform(df))

In [18]:
df_impute.shape

(768, 9)

In [19]:
df_impute.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

In [20]:
X = df_impute.iloc[:,0:-1] # features
y = df_impute.iloc[:,-1] # targets

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [22]:
model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)

In [23]:
y_pred = model.predict(X_test)

In [24]:
result_acc = accuracy_score(y_test, y_pred)
print(f'Result: {result_acc}')

result_conf = confusion_matrix(y_test, y_pred)
print(f'Result: \n {result_conf}')

Result: 0.6320346320346321
Result: 
 [[106  49]
 [ 36  40]]


#### Handling imbalanced classes to further improve the accuracy

In [25]:
df['Class'].value_counts()

Class
0    500
1    268
Name: count, dtype: int64

In [28]:
from imblearn.over_sampling import SMOTE

In [29]:
oversample = SMOTE()
X, y = oversample.fit_resample(X,y)

In [30]:
y.value_counts()

8
1.0    500
0.0    500
Name: count, dtype: int64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [32]:
model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)

In [33]:
y_pred = model.predict(X_test)

In [34]:
result_acc = accuracy_score(y_test, y_pred)
print(f'Result: {result_acc}')

result_conf = confusion_matrix(y_test, y_pred)
print(f'Result: \n {result_conf}')

Result: 0.7633333333333333
Result: 
 [[119  28]
 [ 43 110]]
