# Boosting Algorythm Project Tutorial

In [110]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from sklearn.metrics import mean_squared_error, r2_score

from pickle import dump

In [111]:
# A) Testing out performance using previous best performing -Optimized- Random Forest model 
# Recovering previously processed data without feature selection 
clean_train_raw = pd.read_csv('/workspaces/Boosting_Project_Tutorial_DianaM/data/processed/clean_train.csv')
clean_test_raw = pd.read_csv('/workspaces/Boosting_Project_Tutorial_DianaM/data/processed/clean_test.csv')

clean_train_raw.head()

Unnamed: 0,Glucose,BMI,BloodPressure,Age,has_diabetes
0,84.0,0.0,0.0,21.0,0
1,112.0,28.2,82.0,50.0,1
2,139.0,28.7,46.0,22.0,0
3,161.0,21.9,50.0,65.0,0
4,134.0,46.2,80.0,46.0,1


In [112]:
clean_test_raw.head()

Unnamed: 0,Glucose,BMI,BloodPressure,Age,has_diabetes
0,98,34.0,58,43,0
1,112,35.7,75,21,0
2,108,30.8,64,21,0
3,107,24.6,80,34,0
4,136,29.9,90,50,0


In [113]:
# A) Testing out performance using previous best performing -Optimized- Random Forest model 
# Resetting X_train and X_test Recovering without feature selection 

X_train = clean_train_raw.drop(columns = "has_diabetes")
X_test = clean_test_raw.drop(columns = "has_diabetes")
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')



X_train.head()

Unnamed: 0,Glucose,BMI,BloodPressure,Age
0,84.0,0.0,0.0,21.0
1,112.0,28.2,82.0,50.0
2,139.0,28.7,46.0,22.0
3,161.0,21.9,50.0,65.0
4,134.0,46.2,80.0,46.0


In [114]:
X_test.head()

Unnamed: 0,Glucose,BMI,BloodPressure,Age
0,98,34.0,58,43
1,112,35.7,75,21
2,108,30.8,64,21
3,107,24.6,80,34
4,136,29.9,90,50


In [115]:
# A) Testing out performance using previous best performing -Optimized- Random Forest model 
# Setting the best random forest model as the boosting input

random_forest_model = RandomForestClassifier(n_estimators = 1000, max_depth = 5, min_samples_split = 5,
                                             min_samples_leaf = 2, random_state = 42)
random_forest_model.fit(X_train, y_train)

# Get predictions for test set
y_pred_test = random_forest_model.predict(X_test)

# Get predictions for training set to evaluate training performance
y_pred_train = random_forest_model.predict(X_train) 
 

# Evaluation:

print(f"\n \n Random Forest Accuracy (test): {accuracy_score(y_test, y_pred_test)}")
print(f"\n Random Forest Accuracy (train): {accuracy_score(y_train, y_pred_train)}") 
print(f"\n \n Random Forest f1_score (test): {f1_score(y_test, y_pred_test, average='micro')}")
print(f"\n Random Forest f1_score (train): {f1_score(y_train, y_pred_train, average='micro')}") 
print(f"\n \n Random Forest precision (test): {precision_score(y_test, y_pred_test, average='micro')}") 
print(f"\n Random Forest precision (train): {precision_score(y_train, y_pred_train, average='micro')}") 
print(f"\n \n Random Forest recall (test): {recall_score(y_test, y_pred_test, average='micro')}")
print(f"\n Random Forest recall (train): {recall_score(y_train, y_pred_train, average='micro')}") 


  return fit_method(estimator, *args, **kwargs)



 
 Random Forest Accuracy (test): 0.7727272727272727

 Random Forest Accuracy (train): 0.8436482084690554

 
 Random Forest f1_score (test): 0.7727272727272727

 Random Forest f1_score (train): 0.8436482084690554

 
 Random Forest precision (test): 0.7727272727272727

 Random Forest precision (train): 0.8436482084690554

 
 Random Forest recall (test): 0.7727272727272727

 Random Forest recall (train): 0.8436482084690554


In [116]:
# A) Applying the boosting algorythm to the best performing previous random forest 

import xgboost as xgb
from sklearn.metrics import accuracy_score

# Convert predictions of random forest algorythm to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train, base_margin=random_forest_model.predict_proba(X_train)[:,1])
dtest = xgb.DMatrix(X_test, label=y_test, base_margin=random_forest_model.predict_proba(X_test)[:,1])

params = {
    'objective': 'binary:logistic',  # For binary classification
    'eval_metric': 'logloss', 
    'learning_rate': 0.03,
    'max_depth': 3,  
    'n_estimators': 500  
}

# Train the XGBoost model
prev_xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions on the test set
prev_y_pred_xgb_test = prev_xgb_model.predict(dtest)
y_pred_xgb_binary_test = [round(value) for value in prev_y_pred_xgb_test]

# Make predictions on the train set
prev_y_pred_xgb_train = prev_xgb_model.predict(dtrain)
y_pred_xgb_binary_train = [round(value) for value in prev_y_pred_xgb_train]



# Evaluation on test set
print(f"\nPrev. Random Forest to XGBClassifier Accuracy (test): {accuracy_score(y_test, y_pred_xgb_binary_test)}")
print(f"Prev. Random Forest to XGBClassifier f1_score (test): {f1_score(y_test, y_pred_xgb_binary_test, average='micro')}")
print(f"Prev. Random Forest to XGBClassifier precision (test): {precision_score(y_test, y_pred_xgb_binary_test, average='micro')}")
print(f"Prev. Random Forest to XGBClassifier recall (test): {recall_score(y_test, y_pred_xgb_binary_test, average='micro')}")

# Evaluation on train set
print(f"\nPrev. Random Forest to XGBClassifier Accuracy (train): {accuracy_score(y_train, y_pred_xgb_binary_train)}")
print(f"Prev. Random Forest to XGBClassifier f1_score (train): {f1_score(y_train, y_pred_xgb_binary_train, average='micro')}")
print(f"Prev. Random Forest to XGBClassifier precision (train): {precision_score(y_train, y_pred_xgb_binary_train, average='micro')}")
print(f"Prev. Random Forest to XGBClassifier recall (train): {recall_score(y_train, y_pred_xgb_binary_train, average='micro')}")




Prev. Random Forest to XGBClassifier Accuracy (test): 0.7857142857142857
Prev. Random Forest to XGBClassifier f1_score (test): 0.7857142857142857
Prev. Random Forest to XGBClassifier precision (test): 0.7857142857142857
Prev. Random Forest to XGBClassifier recall (test): 0.7857142857142857

Prev. Random Forest to XGBClassifier Accuracy (train): 0.8355048859934854
Prev. Random Forest to XGBClassifier f1_score (train): 0.8355048859934854
Prev. Random Forest to XGBClassifier precision (train): 0.8355048859934854
Prev. Random Forest to XGBClassifier recall (train): 0.8355048859934854


Parameters: { "n_estimators" } are not used.



In [117]:
dump(prev_xgb_model, open("../models/prev_random_forest_boosting_classifier_nestimators-500_learnrate-0.03_42.sav", "wb"))

In [118]:
# B) Applying the boost algorythm from scracth
# recovering the data

import pandas as pd

train_data = pd.read_csv("../data/processed/clean_train.csv")
test_data = pd.read_csv("../data/processed/clean_test.csv")

X_train = train_data.drop(["has_diabetes"], axis = 1)
y_train = train_data["has_diabetes"]
X_test = test_data.drop(["has_diabetes"], axis = 1)
y_test = test_data["has_diabetes"]

In [119]:
# B) Applying the boost algorythm from scratch
# Runingn the Boosting algorythm from scracth


from xgboost import XGBClassifier

scratch_XGBClassifier_model = XGBClassifier(n_estimators = 600, learning_rate = 0.001, random_state = 42)
scratch_XGBClassifier_model.fit(X_train, y_train)


# Get predictions for test set
y_pred_test_XGBClassifier = scratch_XGBClassifier_model.predict(X_test)

# Get predictions for training set to evaluate training performance
y_pred_train_XGBClassifier = scratch_XGBClassifier_model.predict(X_train)  


# Evaluation:

print(f"\n \n XGBClassifier Accuracy (test): {accuracy_score(y_test, y_pred_test_XGBClassifier)}")
print(f"\n XGBClassifier Accuracy (train): {accuracy_score(y_train, y_pred_train_XGBClassifier)}") 
print(f"\n \n XGBClassifier f1_score (test): {f1_score(y_test, y_pred_test_XGBClassifier, average='micro')}")
print(f"\n XGBClassifier f1_score (train): {f1_score(y_train, y_pred_train_XGBClassifier, average='micro')}") 
print(f"\n \n RXGBClassifier precision (test): {precision_score(y_test, y_pred_test_XGBClassifier, average='micro')}") 
print(f"\n XGBClassifier precision (train): {precision_score(y_train, y_pred_train_XGBClassifier, average='micro')}") 
print(f"\n \n XGBClassifier recall (test): {recall_score(y_test, y_pred_test_XGBClassifier, average='micro')}")
print(f"\n XGBClassifier recall (train): {recall_score(y_train, y_pred_train_XGBClassifier, average='micro')}")

from pickle import dump

dump(scratch_XGBClassifier_model, open("../models/scractch_boosting_classifier_nestimators-600_learnrate-0.001_42.sav", "wb"))



 
 XGBClassifier Accuracy (test): 0.7662337662337663

 XGBClassifier Accuracy (train): 0.7850162866449512

 
 XGBClassifier f1_score (test): 0.7662337662337663

 XGBClassifier f1_score (train): 0.7850162866449512

 
 RXGBClassifier precision (test): 0.7662337662337663

 XGBClassifier precision (train): 0.7850162866449512

 
 XGBClassifier recall (test): 0.7662337662337663

 XGBClassifier recall (train): 0.7850162866449512


In [120]:
# RUNNING ALL THE PREVIOUS ALOGRYTHMS FOR FINAL COMPARSION

# Traing again the Decision Tree just with the selected features:

dt_model = DecisionTreeClassifier(criterion =  "entropy", max_depth = 5, min_samples_leaf = 4, min_samples_split = 2, random_state=42)
dt_model.fit(X_train, y_train)

# Model prediction (test)
dt_y_pred_2_test = dt_model.predict(X_test)


# Model prediction (train) 
dt_y_pred_2_train = dt_model.predict(X_train)


print(f"\n \n Optimized Decision Tree (test): {accuracy_score(y_test, dt_y_pred_2_test)}")
print(f"\n Optimized Decision Tree Accuracy (train): {accuracy_score(y_train, dt_y_pred_2_train)}") 
print(f"\n Optimized Decision Tree Accuracy overfitting: {accuracy_score(y_train, dt_y_pred_2_train)-accuracy_score(y_test, dt_y_pred_2_test)}") 
print(f"\n \n Optimized Decision Tree f1_score (test): {f1_score(y_test, dt_y_pred_2_test, average='micro')}") 
print(f"\n Optimized Decision Tree f1_score (train): {f1_score(y_train, dt_y_pred_2_train, average='micro')}") 
print(f"\n Optimized Decision Tree f1_score overfitting: {f1_score(y_train, dt_y_pred_2_train)-f1_score(y_test, dt_y_pred_2_test)}") 
print(f"\n \n Optimized Decision Tree Precision (test): {precision_score(y_test, dt_y_pred_2_test, average='micro')}") 
print(f"\n Optimized Decision Tree Precision (train): {precision_score(y_train, dt_y_pred_2_train, average='micro')}") 
print(f"\n Optimized Decision Tree Precision overfitting: {precision_score(y_train, dt_y_pred_2_train)-precision_score(y_test, dt_y_pred_2_test)}") 
print(f"\n \n Optimized Decision Tree Recall (test): {recall_score(y_test, dt_y_pred_2_test, average='micro')}") 
print(f"\n Optimized Decision Tree Recall (train): {recall_score(y_train, dt_y_pred_2_train, average='micro')}") 
print(f"\n Optimized Decision Tree Recall overfitting: {recall_score(y_train, dt_y_pred_2_train)-recall_score(y_test, dt_y_pred_2_test)}") 

from pickle import dump

dump(dt_model , open("grid_search_optimized_decision_tree_all_features_42.sav", "wb"))





 
 Optimized Decision Tree (test): 0.7142857142857143

 Optimized Decision Tree Accuracy (train): 0.8078175895765473

 Optimized Decision Tree Accuracy overfitting: 0.09353187529083296

 
 Optimized Decision Tree f1_score (test): 0.7142857142857143

 Optimized Decision Tree f1_score (train): 0.8078175895765473

 Optimized Decision Tree f1_score overfitting: 0.10234442452015668

 
 Optimized Decision Tree Precision (test): 0.7142857142857143

 Optimized Decision Tree Precision (train): 0.8078175895765473

 Optimized Decision Tree Precision overfitting: 0.10178049428647351

 
 Optimized Decision Tree Recall (test): 0.7142857142857143

 Optimized Decision Tree Recall (train): 0.8078175895765473

 Optimized Decision Tree Recall overfitting: 0.09961587708066577


In [121]:
# 11) Optimized Random Forest Model -  Manually


optimized_random_forest_model = RandomForestClassifier(n_estimators = 500, max_depth = 5, min_samples_split = 5,
                                             min_samples_leaf = 2, random_state = 42)
optimized_random_forest_model.fit(X_train, y_train)

# Get predictions for test set
random_forest_model_y_pred_test = optimized_random_forest_model.predict(X_test)

# Get predictions for training set to evaluate training performance
random_forest_model_y_pred_train = optimized_random_forest_model.predict(X_train)  

# Evaluation:

print(f"\n \n Optimized Random Forest Accuracy (test): {accuracy_score(y_test, random_forest_model_y_pred_test)}")
print(f"\n Optimized Random Forest Accuracy (train): {accuracy_score(y_train, random_forest_model_y_pred_train)}") 
print(f"\n \n Optimized Random Forest f1_score (test): {f1_score(y_test, random_forest_model_y_pred_test, average='micro')}") 
print(f"\n Optimized Random Forest f1_score (train): {f1_score(y_train, random_forest_model_y_pred_train, average='micro')}") 
print(f"\n \n Optimized Random Forest precision (test): {precision_score(y_test, random_forest_model_y_pred_test, average='micro')}")
print(f"\n Optimized Random Forest precision (train): {precision_score(y_train, random_forest_model_y_pred_train, average='micro')}") 
print(f"\n \n Optimized Random Forest recall (test): {recall_score(y_test, random_forest_model_y_pred_test, average='micro')}") 
print(f"\n Optimized Random Forest recall (train): {recall_score(y_train, random_forest_model_y_pred_train, average='micro')}")

from pickle import dump

dump(optimized_random_forest_model , open("optimized_random_forest_model_1000estimators_max_depth5_min_samples_split5_min_samples_leaf2_42.sav", "wb"))




 
 Optimized Random Forest Accuracy (test): 0.7727272727272727

 Optimized Random Forest Accuracy (train): 0.8403908794788274

 
 Optimized Random Forest f1_score (test): 0.7727272727272727

 Optimized Random Forest f1_score (train): 0.8403908794788274

 
 Optimized Random Forest precision (test): 0.7727272727272727

 Optimized Random Forest precision (train): 0.8403908794788274

 
 Optimized Random Forest recall (test): 0.7727272727272727

 Optimized Random Forest recall (train): 0.8403908794788274


In [None]:
from sklearn.metrics import classification_report
import pandas as pd

def create_classification_report_df(y_true_train, y_pred_train, y_true_test, y_pred_test, target_names):
    """Creates a DataFrame for the classification report with overfitting."""

    report_train = classification_report(y_true_train, y_pred_train, target_names=target_names, output_dict=True)
    report_test = classification_report(y_true_test, y_pred_test, target_names=target_names, output_dict=True)

    df_report_train = pd.DataFrame(report_train).transpose()
    df_report_test = pd.DataFrame(report_test).transpose()

    # Rename columns to differentiate between train and test
    df_report_train = df_report_train.add_prefix('train_')
    df_report_test = df_report_test.add_prefix('test_')

    # Concatenate train and test DataFrames
    df_report = pd.concat([df_report_train, df_report_test], axis=1)

    # Add overfitting column (train - test) for f1-score
    df_report['overfitting'] = df_report['train_f1-score'] - df_report['test_f1-score']

    # Remove rows for individual target values (0 and 1)
    df_report = df_report[df_report.index.isin(['accuracy', 'macro avg', 'weighted avg'])]

    return df_report

# Assuming target names are 'no diabetes' and 'has diabetes'
target_names = ['no diabetes', 'has diabetes']

# Create DataFrames for each model
report_df_optimized_dt = create_classification_report_df(y_train, dt_y_pred_2_train, y_test, dt_y_pred_2_test, target_names)
report_df_optimized_rf = create_classification_report_df(y_train, random_forest_model_y_pred_train, y_test, random_forest_model_y_pred_test, target_names)
report_df_xgb = create_classification_report_df(y_train, y_pred_train_XGBClassifier, y_test, y_pred_test_XGBClassifier, target_names)
report_df_prev_xgb = create_classification_report_df(y_train, y_pred_xgb_binary_train, y_test, y_pred_xgb_binary_test, target_names) 


# Print the reports
print("\nClassification Report - Optimized Decision Tree:")
print(report_df_optimized_dt)

print("\nClassification Report - Optimized Random Forest:")
print(report_df_optimized_rf)

print("\nClassification Report - XGBClassifier:")
print(report_df_xgb)

print("\nClassification Report - Previous XGBoost Model:")  
print(report_df_prev_xgb)


Classification Report - Optimized Decision Tree:
              train_precision  train_recall  train_f1-score  train_support  \
accuracy             0.807818      0.807818        0.807818       0.807818   
macro avg            0.792345      0.816550        0.797902     614.000000   
weighted avg         0.826974      0.807818        0.811609     614.000000   

              test_precision  test_recall  test_f1-score  test_support  \
accuracy            0.714286     0.714286       0.714286      0.714286   
macro avg           0.704395     0.721212       0.704518    154.000000   
weighted avg        0.740661     0.714286       0.719867    154.000000   

              overfitting  
accuracy         0.093532  
macro avg        0.093385  
weighted avg     0.091741  

Classification Report - Optimized Random Forest:
              train_precision  train_recall  train_f1-score  train_support  \
accuracy             0.840391      0.840391        0.840391       0.840391   
macro avg            0

In [2]:
from pickle import load
import streamlit as st
import os

model_file = "optimized_random_forest_model_1000estimators_max_depth5_min_samples_split5_min_samples_leaf2_42.sav"

if not os.path.exists(model_file):
    st.error(f"The file of the model {model_file} is not found. Make sure it i in the system")
else:
    # Load the saved model
    with open(model_file, "rb") as file:
        model = load(file)
    # Class dictionary: 
    class_dict = {
    0: "Iris setosa",
    1: "Iris versicolor",
    2: "Iris virginica"
}

    # Set application title
    st.title("Iris - Model prediction")

    # Silders to collect user data entry

    val1 = st.slider("Petal width", min_value=0.1, max_value=2.5, step=0.1)
    val2 = st.slider("Petal length", min_value=1.0, max_value=7.0, step=0.1)
    val3 = st.slider("Sepal width", min_value=2.0, max_value=4.5, step=0.1)
    val4 = st.slider("Sepal length", min_value=4.0, max_value=8.0, step=0.1)

    # Precict button
    if st.button("Predict"):
        try: 
            prediction = model.predict([[val1, val2, val3, val4]])[0]
            pred_class = class_dict.get(prediction, "Unkown class")
            st.write("Prediction:", pred_class)
        except Exception as e:
            st.error(f"Failed predicting: {e}")

2024-12-22 17:36:13.193 
  command:

    streamlit run /home/vscode/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-12-22 17:36:13.197 Session state does not function when running a script without `streamlit run`
