<a href="https://colab.research.google.com/github/frm1789/100DaysOfPython/blob/main/DT_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CART vs Random Forest vs Ensemble models

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset.

## CART

How does CART (Classification and Regression Trees) work?

1. Feature Selection: Finds the best feature to split the dataset.
2. Dataset Splitting: Divides the dataset into subsets based on features.
3. Recursion: Continues splitting subsets until stopping criteria are met.
4. Tree Construction: Constructs a decision tree with nodes and leaves.
5. Tree Pruning (optional): Removes subtrees to prevent overfitting.
6. Prediction: Uses the tree to predict outcomes for new samples.


In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Load the diabetes dataset from a CSV file in the current working directory.
df = pd.read_csv('diabetes.csv')


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, validation_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

# Display settings: never elide columns and allow up to 500 characters per printed row.
pd.options.display.max_columns = None
pd.options.display.width = 500

In [3]:
def check_data_quality(data):
    """Count missing values and 3-sigma outliers in ``data``.

    Parameters
    ----------
    data : array-like of numbers
        Values to inspect. Passed straight to ``np.isnan``, ``np.mean`` and
        ``np.std``, so it must be something those functions accept.

    Returns
    -------
    tuple
        ``(nan_count, outliers_count)``: the number of NaN entries and the
        number of entries whose absolute distance from the mean is larger
        than three standard deviations.

    Notes
    -----
    For a plain NumPy array containing NaN, the mean and standard deviation
    are NaN as well, so no entry compares as an outlier.
    """
    missing_total = np.sum(np.isnan(data))

    # Outlier rule: |value - mean| > 3 * standard deviation.
    center = np.mean(data)
    cutoff = 3 * np.std(data)
    distance = np.abs(data - center)

    return missing_total, np.sum(distance > cutoff)


In [4]:
def preprocess_data(df, columns=None, n_neighbors=5):
    """Treat zeros as missing values and fill them in with KNN imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric input frame. It is NOT modified; the work is done on a copy.
    columns : str or iterable of str, optional
        Columns in which a zero means "missing". Defaults to every column,
        which matches the original behaviour. Pass an explicit list to keep
        legitimate zeros (for example a 0/1 target column) untouched.
    n_neighbors : int, default 5
        Number of neighbours handed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df`` in which the
        NaN values (including the replaced zeros) have been imputed.
    """
    # Work on a copy so the caller's DataFrame keeps its original zeros.
    cleaned = df.copy()

    if columns is None:
        zero_as_missing = list(cleaned.columns)
    elif isinstance(columns, str):
        zero_as_missing = [columns]
    else:
        zero_as_missing = list(columns)

    # Replace zeros with NaN so the imputer recognises them as missing.
    if zero_as_missing:
        cleaned[zero_as_missing] = cleaned[zero_as_missing].replace(0, np.nan)

    # Impute NaN values using KNN Imputer
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_values = imputer.fit_transform(cleaned)

    # Keep the input index so the result stays row-aligned with the source frame.
    return pd.DataFrame(imputed_values, columns=cleaned.columns, index=cleaned.index)



## Out of the Box: DT, DT with max_depth and RF

In [5]:
# Split the frame into predictors (X) and the 'Outcome' target (y).
X = df.drop(columns='Outcome')
y = df['Outcome']

In [6]:
# Class balance of the target: number of rows per 'Outcome' value.
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [7]:
# Baseline: an unconstrained decision tree on the raw features.
# 20% of the rows are held out for testing; both seeds are fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# accuracy_score comes from the import cell at the top; no re-import needed here.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Recorded result (out of the box): 0.7467532467532467

Accuracy: 0.7467532467532467


In [8]:
# Confusion matrix for the baseline tree (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[75, 24],
       [15, 40]])

In [9]:
# Same train/test split as the baseline, but with the tree depth capped at 10.
clf = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
# Recorded result (max_depth=10): 0.7532467532467533

Accuracy: 0.7532467532467533


In [10]:
# Confusion matrix for the depth-limited tree (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[77, 22],
       [16, 39]])

In [11]:
# Random forest with scikit-learn defaults; only the seed is pinned.
# Defaults recorded from an earlier rf_model.get_params() call, for reference:
#   bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini',
#   max_depth=None, max_features='sqrt', max_leaf_nodes=None, max_samples=None,
#   min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
#   min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
#   oob_score=False, verbose=0, warm_start=False
rf_model = RandomForestClassifier(random_state=17)

# Accuracy before GridSearchCV: 5-fold cross-validation on the full X, y.
# Scores recorded from a previous run:
#   test_accuracy : [0.74675325, 0.72077922, 0.78571429, 0.81699346, 0.75816993]
#   test_precision: [0.65306122, 0.6       , 0.78378378, 0.80487805, 0.69047619]
#   test_recall   : [0.59259259, 0.61111111, 0.53703704, 0.62264151, 0.54716981]
#   test_f1       : [0.62135922, 0.60550459, 0.63736264, 0.70212766, 0.61052632]
#   test_roc_auc  : [0.80907407, 0.78509259, 0.83601852, 0.87971698, 0.83141509]
cv_results = cross_validate(rf_model, X, y, cv=5, scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])
cv_results




{'fit_time': array([0.92464256, 0.40524197, 0.36967802, 0.49381351, 0.56689739]),
 'score_time': array([0.11081028, 0.03660917, 0.06742048, 0.04082465, 0.10150886]),
 'test_accuracy': array([0.74675325, 0.72077922, 0.78571429, 0.81699346, 0.75816993]),
 'test_precision': array([0.65306122, 0.6       , 0.78378378, 0.80487805, 0.69047619]),
 'test_recall': array([0.59259259, 0.61111111, 0.53703704, 0.62264151, 0.54716981]),
 'test_f1': array([0.62135922, 0.60550459, 0.63736264, 0.70212766, 0.61052632]),
 'test_roc_auc': array([0.80907407, 0.78509259, 0.83601852, 0.87971698, 0.83141509])}

## Pre-Processing

In [12]:
# Count the zeros in every column except the last one.
feature_columns = df.iloc[:, :-1]
zero_counts = feature_columns.eq(0).sum()
print(zero_counts)

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64


In [13]:
# Replace zeros (treated here as missing readings) with the column mean.
# The result is assigned back to df[col] instead of calling
# df[col].replace(..., inplace=True): the chained in-place form operates on an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when copy-on-write is enabled.
# NOTE(review): the mean is computed with the zeros still included, which pulls
# the fill value down; kept as-is so the recorded results stay comparable.
for col in ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]:
    df[col] = df[col].replace(0, df[col].mean())

In [14]:
# Rebuild predictors and target from the frame whose zeros were just replaced.
X = df.drop(columns='Outcome')
y = df['Outcome']

In [15]:
# Standardization: StandardScaler centers each feature and scales it to unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)

## Re-running the models after pre-processing

In [16]:
# Repeat the unconstrained decision tree, this time on the standardized features.
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# accuracy_score comes from the import cell at the top; no re-import needed here.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Recorded results:
#   out of the box:       0.7467532467532467
#   after pre-processing: 0.7207792207792207

Accuracy: 0.7207792207792207


In [17]:
# Confusion matrix for the tree trained on pre-processed data (rows: true, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[73, 26],
       [17, 38]])

In [18]:
# Depth-limited tree (max_depth=10) on the pre-processed train/test split.
clf = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
# Recorded results with max_depth=10:
#   out of the box:       0.7532467532467533
#   after pre-processing: 0.7272727272727273


Accuracy: 0.7272727272727273


In [19]:
# Random forest with scikit-learn defaults; only the seed is pinned.
# Defaults recorded from an earlier rf_model.get_params() call, for reference:
#   bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini',
#   max_depth=None, max_features='sqrt', max_leaf_nodes=None, max_samples=None,
#   min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
#   min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
#   oob_score=False, verbose=0, warm_start=False
rf_model = RandomForestClassifier(random_state=17)

# Accuracy before GridSearchCV: 5-fold cross-validation on the full X, y.
# NOTE(review): this uses X (zeros replaced, not standardized), whereas the
# decision-tree cells in this section train on scaled_X.
# Scores recorded before pre-processing, kept for comparison:
#   test_accuracy : [0.74675325, 0.72077922, 0.78571429, 0.81699346, 0.75816993]
#   test_precision: [0.65306122, 0.6       , 0.78378378, 0.80487805, 0.69047619]
#   test_recall   : [0.59259259, 0.61111111, 0.53703704, 0.62264151, 0.54716981]
#   test_f1       : [0.62135922, 0.60550459, 0.63736264, 0.70212766, 0.61052632]
#   test_roc_auc  : [0.80907407, 0.78509259, 0.83601852, 0.87971698, 0.83141509]
# Accuracy recorded after pre-processing:
#   test_accuracy : [0.75974026, 0.74025974, 0.75974026, 0.82352941, 0.73202614]
cv_results = cross_validate(rf_model, X, y, cv=5, scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])
cv_results


{'fit_time': array([0.24629617, 0.22035623, 0.21037507, 0.20779586, 0.20407557]),
 'score_time': array([0.02680731, 0.03101563, 0.02951336, 0.02539897, 0.02453732]),
 'test_accuracy': array([0.75974026, 0.74025974, 0.75974026, 0.82352941, 0.73202614]),
 'test_precision': array([0.66666667, 0.64      , 0.72972973, 0.80952381, 0.63636364]),
 'test_recall': array([0.62962963, 0.59259259, 0.5       , 0.64150943, 0.52830189]),
 'test_f1': array([0.64761905, 0.61538462, 0.59340659, 0.71578947, 0.57731959]),
 'test_roc_auc': array([0.81555556, 0.77768519, 0.83981481, 0.89641509, 0.80924528])}

## SMOTE

In [28]:
# Re-load the CSV so this section starts again from the data as stored on disk.
df = pd.read_csv('diabetes.csv')

In [29]:
# Replace zeros (treated here as missing readings) with the column mean.
# The result is assigned back to df[col] instead of calling
# df[col].replace(..., inplace=True): the chained in-place form operates on an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when copy-on-write is enabled.
# NOTE(review): the mean is computed with the zeros still included, which pulls
# the fill value down; kept as-is so the recorded results stay comparable.
for col in ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]:
    df[col] = df[col].replace(0, df[col].mean())

In [30]:
# Predictors and target for the SMOTE experiment, taken from the re-loaded frame.
X = df.drop(columns='Outcome')
y = df['Outcome']

In [31]:
from imblearn.over_sampling import SMOTE

# Standardize the features of the re-loaded frame inside this cell, so the
# resampling does not depend on a `scaled_X` left over from an earlier section.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

# Oversample with SMOTE: synthetic rows are interpolated from the 5 nearest
# neighbours; the seed is fixed so the result is reproducible.
# NOTE(review): resampling is done before the train/test split, so synthetic
# rows end up in the test set and may be derived from rows used for training.
# Consider splitting first and resampling only the training part.
sm = SMOTE(random_state=42, k_neighbors=5)
X_res, y_res = sm.fit_resample(scaled_X, y)

In [32]:
# Display the resampled feature matrix returned by SMOTE.
X_res

array([[ 0.63994726,  0.86527574, -0.0210444 , ...,  0.16725546,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.20598931, -0.51658286, ..., -0.85153454,
        -0.36506078, -0.19067191],
       [ 1.23388019,  2.01597855, -0.68176235, ..., -1.33182125,
         0.60439732, -0.10558415],
       ...,
       [-0.12460603,  1.86277092,  0.95952058, ...,  0.44248846,
         1.9829989 ,  1.61427145],
       [-0.26693054,  0.19815313,  1.57742511, ...,  0.55362796,
         1.45659024, -0.11931871],
       [ 1.10398471,  1.01939681,  0.36611784, ...,  0.03080233,
         0.03432433,  1.09360178]])

In [33]:
# Class balance after resampling: number of rows per target value.
y_res.value_counts()

1    500
0    500
Name: Outcome, dtype: int64

In [None]:
# Standardization: StandardScaler centers each feature and scales it to unit variance.
# NOTE(review): scaled_X is already used by the SMOTE resampling cell earlier
# in this section, so this cell sits after the first use of its result.
scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)

## Re-running the models after SMOTE

In [34]:
# Unconstrained decision tree on the SMOTE-resampled data.
# NOTE(review): the split is taken from X_res / y_res, so the held-out 20%
# is drawn from the resampled data rather than from the original rows only.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# accuracy_score comes from the import cell at the top; no re-import needed here.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Recorded results:
#   out of the box: 0.7467532467532467

Accuracy: 0.785


In [58]:
# Depth-limited tree (max_depth=8) on the SMOTE-resampled split.
clf = DecisionTreeClassifier(max_depth=8, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
# Earlier out-of-the-box result, recorded with max_depth=10: 0.7532467532467533

Accuracy: 0.81


In [57]:
# Depth-limited decision tree, evaluated with 5-fold cross-validation.
clf_model = DecisionTreeClassifier(random_state=42, max_depth=8)

# Accuracy before GridSearchCV
# NOTE(review): cross-validation runs on X, y (the data before SMOTE), not on
# X_res, y_res; confirm this is the intended comparison.
cv_results = cross_validate(clf_model, X, y, cv=5, scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])

# Compute the mean accuracy across the folds
accuracy_mean = np.mean(cv_results['test_accuracy'])
print("Precisión promedio:", accuracy_mean)


Precisión promedio: 0.717519735166794


In [36]:
# Confusion matrix for the most recent y_pred (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[76, 23],
       [28, 73]])

In [54]:
# Random forest with scikit-learn defaults (seed pinned), evaluated with 5-fold cross-validation.
rf_model = RandomForestClassifier(random_state=17)

# Accuracy before GridSearchCV
# NOTE(review): cross-validation runs on X, y (the data before SMOTE), not on
# X_res, y_res; confirm this is the intended comparison.
cv_results = cross_validate(rf_model, X, y, cv=5, scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])

# Compute the mean accuracy across the folds
accuracy_mean = np.mean(cv_results['test_accuracy'])
print("Precisión promedio:", accuracy_mean)



Precisión promedio: 0.7630591630591631


In [None]:
# Confusion matrix for the most recent y_pred (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)