# Exercise - Nursery Prediction

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
import statsmodels.api as sm
import pingouin as pg
import ppscore as pps
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from tqdm.auto import tqdm
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [None]:
# Load the nursery dataset from the working directory.
# The CSV is available at https://www.kaggle.com/datasets/nimapourmoradi/nursery/data
nursery_csv = "./nursery.csv"
data = pd.read_csv(nursery_csv)
data.head()

In [None]:
# Summary statistics of the loaded frame (which statistics are reported
# depends on the column dtypes).
data.describe()

In [None]:
# Remove leading/trailing whitespace from every column label, then show
# the cleaned labels.
clean_labels = data.columns.str.strip()
data.columns = clean_labels
data.columns

In [None]:
plt.style.use('ggplot')

# 4 x 2 grid of axes: one pie chart for each column except the last one.
fig, axs = plt.subplots(nrows = 4, ncols = 2, figsize = (18, 20))
axs = axs.flat

# Palette sized by the number of DataFrame columns; shared by all pies.
wedge_colors = list(sns.color_palette('bright', len(data.columns)))
wedge_style = dict(edgecolor = 'black', linewidth = 1.1)
text_style = dict(fontsize = 14, fontweight = 'bold', color = 'white')

for i, col in enumerate(data.columns[:-1]):
    # Count each distinct value once; keys and values share the same order.
    level_counts = Counter(data[col])
    panel = axs[i]

    panel.pie(x = list(level_counts.values()),
              labels = list(level_counts.keys()),
              colors = wedge_colors,
              autopct = "%.2f%%",
              shadow = True,
              wedgeprops = wedge_style,
              textprops = text_style)
    panel.set_title(col, fontsize = 18, fontweight = 'bold', color = 'black')
    panel.legend()

fig.tight_layout()
fig.show()

In [None]:
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize = (7, 3.5))

# Bar chart with one bar per distinct value of the 'Target' column.
ax = sns.countplot(data,
                   x = 'Target',
                   palette = 'bright',
                   edgecolor = 'black')

# Annotate every bar with its value.
bar_label_style = dict(fontsize = 8, fontweight = 'bold', color = 'black')
for container in ax.containers:
    ax.bar_label(container, **bar_label_style)

ax.set(xlabel = "")
ax.set_title('Target', fontsize = 14, fontweight = 'bold', color = 'darkblue')
fig.show()

In [None]:
# One parallel-categories diagram per predictor, each paired with 'Target'.
predictor_names = ('parents', 'has_nurs', 'form', 'children',
                   'housing', 'finance', 'social', 'health')
separator = '\n-------------------------------\n'

for dim_ in predictor_names:
    print(separator)
    print(f'Dimension: {dim_}')
    fig = px.parallel_categories(data_frame = data,
                                 dimensions = [dim_, 'Target'])
    fig.show()

In [None]:
# Compute the Predictive Power Score matrix in long form, keep the three
# columns of interest and pivot them into a 'y'-by-'x' grid of scores.
pps_long = pps.matrix(data)[['x', 'y', 'ppscore']]
corr_matrix_pps = pps_long.pivot_table(index = 'y',
                                       columns = 'x',
                                       values = 'ppscore')

# Annotated heatmap of the pivoted scores.
heatmap_options = dict(cmap = 'coolwarm',
                       annot = True,
                       linewidths = 1.1,
                       square = True)

plt.figure(figsize = (20, 10))
sns.heatmap(corr_matrix_pps, **heatmap_options)
plt.title("Predictive Power Score(PPS)", fontsize = 18, fontweight = 'bold', color = 'black')
plt.show()

In [None]:
# Separate the label column (y) from the remaining predictor columns (X).
target_variable = 'Target'
y = data[target_variable]
X = data.drop(columns = target_variable)

## Model Training

In [None]:
# Split the data set into training (70 %) and testing (30 %) parts,
# stratifying on y and fixing the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 42,
    stratify = y,
)

In [None]:
# Encode the target variable as integers. The encoder is fitted on the
# training labels only and the same mapping is reused for the test labels.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Collect the names of the object/category-typed predictors to encode.
categorical_frame = X_train.select_dtypes(include = ['object', 'category'])
categorical_predictors = list(categorical_frame.columns)

In [None]:
# Transformer specification for the ColumnTransformer: a dense one-hot
# encoding of the categorical predictors with the first level of each
# variable dropped.
one_hot_encoder = OneHotEncoder(drop = 'first',
                                handle_unknown = 'ignore',
                                sparse_output = False)

transformer = [('ohe', one_hot_encoder, categorical_predictors)]

In [None]:
# Build the preprocessor, fit it on the training set only, and apply the
# fitted transformation to both the training and the testing set.
preprocessor = ColumnTransformer(transformers = transformer,
                                 remainder = 'passthrough',
                                 n_jobs = -1,
                                 verbose_feature_names_out = False)
# Ask for pandas DataFrames (instead of arrays) as transform output.
preprocessor.set_output(transform = 'pandas')

X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

for frame_name, frame in (('X_train_prep', X_train_prep),
                          ('X_test_prep', X_test_prep)):
    print(f'{frame_name}: {frame.shape}')

In [None]:
# Show how many samples of each encoded class ended up in each split.
for split_name, labels in (('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name}: {Counter(labels)}')

In [None]:
# Candidate classifiers to compare. Seeds are fixed wherever the estimator
# accepts one, and class weighting is enabled where the estimator offers it.

# NOTE: `multi_class = 'multinomial'` is intentionally not passed. The
# parameter is deprecated since scikit-learn 1.5 and slated for removal, and
# the default already selects the multinomial formulation for multiclass
# targets.
clf1 = LogisticRegression(class_weight = 'balanced',
                          random_state = 42,
                          max_iter = 1500,
                          n_jobs = -1)

clf2 = KNeighborsClassifier(n_jobs = -1)

clf3 = GaussianNB()

clf4 = RandomForestClassifier(n_jobs = -1,
                              class_weight = 'balanced',
                              random_state = 42)

clf5 = ExtraTreesClassifier(n_jobs = -1,
                            class_weight = 'balanced',
                            bootstrap = True,
                            random_state = 42)

clf6 = XGBClassifier(random_state = 42,
                     n_jobs = -1)

clf7 = CatBoostClassifier(random_seed = 42,
                          auto_class_weights = 'Balanced',
                          verbose = 0)

# Order matters only for the order in which results are printed later.
MODELS = [clf1, clf2, clf3, clf4, clf5, clf6, clf7]

In [None]:
# Fit every candidate model on the preprocessed training data and report the
# balanced accuracy on both splits.

# The float32 arrays are identical for every model, so convert once here
# instead of three times per loop iteration.
X_train_arr = X_train_prep.to_numpy(dtype = np.float32)
X_test_arr = X_test_prep.to_numpy(dtype = np.float32)

for model in tqdm(MODELS):
    name = type(model).__name__
    model.fit(X_train_arr, y_train)

    y_pred_train = model.predict(X_train_arr)
    y_pred_test = model.predict(X_test_arr)

    bal_acc_train = balanced_accuracy_score(y_train, y_pred_train)
    bal_acc_test = balanced_accuracy_score(y_test, y_pred_test)

    print("==" * 30)
    # ANSI escape codes render the model name in bold yellow.
    print(f"\033[1;33m {name} \033[0;m :\n")
    print(f' Balanced ACC train: {bal_acc_train:.4f} | ',
          f' Balanced ACC test: {bal_acc_test:.4f}\n')
    print("==" * 30)

## Model Prediction

In [None]:
# Final predictions from the CatBoost model (clf7).
# The model was fitted on float32 NumPy arrays, so it is given the same
# representation here rather than the pandas frames.
y_pred_train_final = clf7.predict(X_train_prep.to_numpy(dtype = np.float32))
y_pred_test_final = clf7.predict(X_test_prep.to_numpy(dtype = np.float32))

In [None]:
# Print a per-class classification report for each split, using the original
# class names recovered from the label encoder.
banner = "##" * 40
report_inputs = (
    ("Classification Report Train", y_train, y_pred_train_final),
    ("Classification Report Test", y_test, y_pred_test_final),
)

for position, (heading, y_true, y_hat) in enumerate(report_inputs):
    if position:
        # Blank line separating consecutive reports.
        print("")
    print(banner)
    print(" " * 25, heading)
    print(banner)
    print(classification_report(y_true, y_hat, target_names = le.classes_))

In [None]:
# Confusion matrices for the final predictions on each split.
cf_mx_train = confusion_matrix(y_train, y_pred_train_final)
cf_mx_test = confusion_matrix(y_test, y_pred_test_final)

fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 7))
axs = axs.flat

# (matrix, colour map, title) for the left and the right panel.
panels = (
    (cf_mx_train, 'Reds', 'Confusion Matrix Train'),
    (cf_mx_test, 'Blues', 'Confusion Matrix Test'),
)
axis_label_style = dict(fontsize = 12, fontweight = "bold", color = "black")

for panel_ax, (matrix, cmap_name, panel_title) in zip(axs, panels):
    # Annotated heatmap with the original class names on both axes.
    sns.heatmap(matrix,
                cmap = cmap_name,
                annot = True,
                annot_kws = {'fontsize':11, 'fontweight':'bold'},
                linewidths = 1.5,
                fmt = '',
                xticklabels = le.classes_,
                yticklabels = le.classes_,
                cbar = False,
                square = True,
                ax = panel_ax)
    panel_ax.set_xlabel('Predicted', **axis_label_style)
    panel_ax.set_ylabel('True', **axis_label_style)
    panel_ax.set_title(panel_title, fontsize = 14, fontweight = "bold", color = "black")

fig.tight_layout()
fig.show()