# Package Installation 

In [1]:
# %pip install ucimlrepo
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install scikit-learn
# %pip install scipy
# %pip install joblib
# %pip install xgboost
# %pip install streamlit
# pickle is part of the Python standard library, so there is nothing to install for it

# Import Library

In [2]:
import os
import pickle
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.utils import shuffle
from ucimlrepo import fetch_ucirepo

%matplotlib inline

# Loading Dataset

- read csv file path from uci repo
- open csv file with pandas, to use pandas function 

In [3]:
# get dataset from uci repo 
creditApproval = fetch_ucirepo(id=27) 

# metadata - get csv file
ca_csv = creditApproval.metadata.data_url
print(ca_csv) 

https://archive.ics.uci.edu/static/public/27/data.csv


## Reading Data into DataFrames

In [None]:
# open csv file with pandas dataframe 
ca_df = pd.read_csv(ca_csv)

ca_df

# Data Overview and Initial Inspection

## Data Shape ( Total Rows and Columns )

In [None]:
# get the number of row and column by pandas ( row, col )
ca_df.shape

## Column Names 

In [None]:
# display all column name 
ca_df.columns.values

#### Online resource explain

The output may appear a bit confusing at its first sight, but let's try to figure out the most important features of a credit card application. The features of this dataset have been anonymized to protect the privacy, but **[this blog](https://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html)** gives us a pretty good overview of the probable features. The probable features in a typical credit card application are Gender, Age, Debt, Married, BankCustomer, EducationLevel, Ethnicity, YearsEmployed, PriorDefault, Employed, CreditScore, DriversLicense, Citizen, ZipCode, Income and finally the ApprovalStatus. This gives us a pretty good starting point, and we can map these features with respect to the columns in the output.

As we can see from our first glance at the data, the dataset has a mixture of numerical and non-numerical features. This can be fixed with some preprocessing, but before we do that, let's learn about the dataset a bit more to see if there are other dataset issues that need to be fixed.

## Head of the Dataset 

In [None]:
# get first few row data 
ca_df.head(20)

## Tail of the Dataset 

In [None]:
# get last few row data 
ca_df.tail(20)

## Data Types

In [None]:
# check each col datatype 
ca_df.dtypes

## Basic Structure Overview 

In [None]:
# get the summary by dataframes structure 
ca_df.info()

#### NOTE 
- All the data types are correct and match what was displayed just now
- 4 columns are float, 2 are integer and 10 are object

## Summary Statistics 

In [None]:
# view Categorical data columns statistics result
ca_df.describe(include='object')

In [None]:
# view continuous data columns statistics result including min, max, mean and count 
ca_df.describe(exclude='object')

##### NOTE:
- freq is the most common value’s frequency

## Check for Duplicate Rows

In [None]:
# get duplicated row 
print("Duplicated rows > ", ca_df.duplicated().sum())

- no duplicated data exist

## Count of Unique Values per Column

In [None]:
# to understand each Categorical data column have what unique data 
for col in ca_df.select_dtypes('object').columns :
    print("Column Name > ", col)
    unique = ca_df[col].unique()
    print("No of Unique Data > ", len(unique))
    print("Unique Data Exist > ", unique)
    print("\n")

In [None]:
ca_df['A13'].unique()

# Identify Missing Values

In [None]:
ca_df.isnull().sum()

#### NOTE 
- 7 of the 16 columns have missing values
  - A1, A2, A4, A5, A6, A7 and A14 ( MISSING VALUE )

## Visualize Missingness 

In [None]:
# ca_df['A1'].dtype
ca_df.dtypes.unique()

**[ How to change color for sns barplot ](https://www.statology.org/seaborn-barplot-color/)**

In [None]:
# number of missing entries in every column
missingData = ca_df.isna().sum()

continuous = ca_df.select_dtypes(exclude='object')
categorical = ca_df.select_dtypes(include='object')

# one colour per column, chosen from the column dtype so that it matches the legend below
mdColor = ['yellow' if ca_df[x].dtype == 'O' else 'lightgreen' if ca_df[x].dtype == 'int64' else 'orange' for x in missingData.index]

# plot graph
# Bars are drawn with matplotlib and an explicit colour per bar: passing a colour list through
# seaborn's `palette` without `hue` is deprecated in recent seaborn releases.
fig, ax = plt.subplots()
ax.bar(missingData.index, missingData.values, color=mdColor)

# hand-built legend entries, one per dtype colour
legend_handles = [plt.Line2D([0], [0], color='yellow', lw=10, label='Object'),
                  plt.Line2D([0], [0], color='lightgreen', lw=10, label='Int'),
                  plt.Line2D([0], [0], color='orange', lw=10, label='Float')]

ax.legend(title="Data Type", bbox_to_anchor=(1, 1), handles=legend_handles)
ax.set_title('Missing values count in each feature')
ax.set_xlabel('Feature')
ax.set_ylabel('Missing Values')

plt.show()

# Conclusion: total missing values per dtype group
print("NULL Object > ", ca_df.select_dtypes('object').isna().sum().sum())
print("NULL Int > ", ca_df.select_dtypes('int').isna().sum().sum())
print("NULL Float > ", ca_df.select_dtypes('float').isna().sum().sum())

# Data Visualization

## Data Distribution

### Pie Chart ( Categorical Data )

- categorical data
  - how many category
  - each category how many percentage

In [None]:
# show out all column with categorical data 
ca_df.describe(include='object')

In [None]:
# shared pool of hex colours used by the pie and bar charts
colorMap = [
    '#7695FF', '#9DBDFF', '#FF9874', '#FFD7C4', '#F6E96B', '#BEDC74', '#A2CA71',
    '#FF8C9E', '#FF8225', '#EF5A6F', '#6C946F', '#E68369', '#DCA47C', '#36BA98',
]


def colorShuffle():
    """Shuffle the shared colorMap list in place so each chart gets a different colour order."""
    random.shuffle(colorMap)


# number of colours available (shown as the cell output)
len(colorMap)

In [None]:
# label formatter for pie slices: percentage plus the absolute count behind it
def autopct_format(percentage, values):
    """Build the text shown on a pie slice.

    percentage -- share of the slice in percent, as supplied by matplotlib's autopct hook
    values     -- all slice sizes of the pie; their sum is used to turn the percentage back into a count

    Returns a string such as '25.00%  (1)': the percentage with two decimals, then the rounded count.
    """
    count = int(round(percentage * sum(values) / 100.0))
    return '{:.2f}%  ({:d})'.format(percentage, count)

def plotPieChart(col, figsize):
    """Draw a pie chart with the share of every category found in ca_df[col].

    col     -- name of the ca_df column to summarise
    figsize -- (width, height) handed to plt.figure
    """
    # new random colour order for this chart
    colorShuffle()

    # occurrences of each category
    counts = ca_df[col].value_counts()

    def _slice_label(pct):
        # matplotlib passes the slice percentage; add the absolute count to it
        return autopct_format(pct, counts)

    plt.figure(figsize=figsize)
    plt.pie(counts, labels=counts.index, autopct=_slice_label, colors=colorMap)
    plt.legend(title=f"Category of {col}")
    plt.title(f"Distribution of Category Data {col}")
    plt.show()

In [None]:
# create a pie chart 
plotPieChart('A1', (10,8))

In [None]:
# create a pie chart 
plotPieChart('A4', (10,8))

In [None]:
# create a pie chart 
plotPieChart('A5', (10,8))

In [None]:
# create a pie chart 
plotPieChart('A6', (11,10))

In [None]:
# category counts of column A7 (value_counts lists the most frequent category first)
aseven_dis = ca_df['A7'].value_counts()

colorShuffle()

# set explode value
# Pull every slice after the four largest away from the centre so the small categories stay readable.
# The list is built from the number of categories so plt.pie never receives an `explode`
# of the wrong length.
explode = [0 if rank < 4 else 0.2 for rank in range(len(aseven_dis))]

# create a pie chart
plt.figure(figsize=(11,10))
plt.pie(aseven_dis, labels=aseven_dis.index, autopct=lambda percentage: autopct_format(percentage, aseven_dis), colors=colorMap, explode=explode)
plt.title("Distribution of Category Data A7")
plt.legend(title="Category of A7")
plt.show()

In [None]:
# create a pie chart 
plotPieChart('A9', (10,8))

In [None]:
# create a pie chart 
plotPieChart('A10', (10,8))

In [None]:
# create a pie chart 
plotPieChart('A12', (10,8))

In [None]:
# create a pie chart 
plotPieChart('A13', (10,8))

In [None]:
# create a pie chart 
plotPieChart('A16', (10,8))

### Heatmap ( Categorical Data )
- A9, 10, 12

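> Note: counting through **A1** would leave out the rows where A1 is missing, so only 678 rows would be shown. <br>
> To avoid this, the pivot table can be built on A16 instead, which is what the cell below does.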

In [None]:
# work on a copy so the helper column is not added to ca_df
cpy_df = ca_df.copy()

# join A9, A10 and A12 (as strings) into one combination key per row
cpy_df['A9_A10_A12'] = cpy_df[['A9', 'A10', 'A12']].astype(str).agg('_'.join, axis=1)

# count the rows for every (A16, combination) pair; pairs that never occur are filled with 0
pivot_table = cpy_df.pivot_table(index='A16', columns='A9_A10_A12', aggfunc='size', fill_value=0)

# draw the counts as an annotated heatmap
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot_table, cmap="YlGnBu", annot=True, fmt='d', ax=heat_ax)
heat_ax.set_title("Heatmap of A16 by A9, A10, A12 Combination")
heat_ax.set_xlabel("A9_A10_A12 Combination")
heat_ax.set_ylabel("A16")
plt.show()

### Histogram with Box Plot ( Continuous data )
- To show both the distribution (via histogram) and summary statistics (via box plot) together for a comprehensive view
- Histogram ( shape of distribution )
- Box Plot ( outlier and spread )

In [None]:
# get all columns with continuous data 
ca_df.describe(exclude=["object"])

In [None]:
sns.set_theme(style="darkgrid")

def plotHistBP(col) :
    """Show one column of ca_df as a slim box plot stacked above a histogram.

    col -- name of the ca_df column to plot
    """
    series = ca_df[col]

    # two stacked axes sharing the x axis: 15% of the height for the box plot, 85% for the histogram
    layout = {"height_ratios": (.15, .85)}
    fig, (box_axis, hist_axis) = plt.subplots(2, sharex=True, gridspec_kw=layout)

    # box plot on top (outliers and spread), histogram with KDE curve below (shape of the distribution)
    sns.boxplot(x=series, ax=box_axis, color='lightblue')
    sns.histplot(x=series, bins=20, kde=True, ax=hist_axis, color='blue', edgecolor='black')

    # the box plot needs neither y ticks nor a left spine
    box_axis.set(yticks=[])
    sns.despine(ax=box_axis, left=True)
    sns.despine(ax=hist_axis)

    plt.title(f"Distribution of Continuous data {col}")
    plt.show()

In [None]:
# create histogram with box plot graph
plotHistBP('A2')

In [None]:
# create histogram with box plot graph
plotHistBP('A3')

In [None]:
# create histogram with box plot graph
plotHistBP('A8')

In [None]:
# create histogram with box plot graph
plotHistBP('A11')

In [None]:
# create histogram with box plot graph
plotHistBP('A14')

In [None]:
# create histogram with box plot graph
plotHistBP('A15')

## Bivariate Analysis 

### Box Plots for Continuous vs Categorical Variables

In [None]:
def plotBoxPlot(y) :
    """Compare the column `y` of ca_df between the A16 classes.

    Draws one box per A16 class and overlays every observation as a red point.

    y -- name of the ca_df column to put on the y axis
    """
    plt.figure(figsize=(8, 6))

    # Box plot
    sns.boxplot(x='A16', y=y, data=ca_df)

    # Overlay strip plot for individual data points (alpha = opacity, 1 = fully opaque)
    sns.stripplot(x='A16', y=y, data=ca_df, color='red', jitter=True, size=3, alpha=1)

    plt.title(f'Box Plot of {y} VS A16 with Data Points')
    # A16 still holds the raw '+' / '-' labels when these plots are drawn (it is label-encoded
    # later in the notebook), so the axis label explains those symbols instead of 0 / 1
    plt.xlabel('A16 (Credit Approval: + = Yes, - = No)')
    plt.ylabel(f'{y} (Continuous Feature)')
    plt.show()

- y = A2 ( continuous )
- x = A16 ( categorical ) ( target ) 

In [None]:
plotBoxPlot('A2')

In [None]:
plotBoxPlot('A3')

In [None]:
plotBoxPlot('A8')

In [None]:
plotBoxPlot('A14')

### Bar Plot For Categorical vs Categorical

In [None]:
def plotBarPlot(y, figsize) :
    """Grouped count plot of the categorical column `y` of ca_df for each A16 class.

    y       -- name of the categorical ca_df column used as hue
    figsize -- (width, height) handed to plt.figure

    Raises ValueError when `y` has more categories than colorMap has colours.
    """
    # set x-axis and hue order (sorted, so bars and legend entries keep a stable order)
    x_order = ca_df['A16'].value_counts().sort_index().index
    hue_order = ca_df[y].value_counts().sort_index().index

    # one distinct, randomly chosen colour per category; colorMap itself is left untouched
    palette = random.sample(colorMap, len(hue_order))

    # create graph
    plt.figure(figsize=figsize)
    sns.set_theme(style="darkgrid")
    ax = sns.countplot(data=ca_df, x="A16", hue=y, order=x_order, hue_order=hue_order, palette=palette)
    ax.legend(title=f"Category of {y}")
    plt.title(f'Bar Plot of {y} VS A16')
    # A16 still holds the raw '+' / '-' labels at this point, so describe those symbols
    plt.xlabel('A16 (Credit Approval: + = Yes, - = No)')
    plt.ylabel(f'Number of {y} (Categorical Feature)')

    # Add the exact number of instances on each bar
    for container in ax.containers:
        ax.bar_label(container, fmt='%d', label_type='edge')

    plt.show()

In [None]:
plotBarPlot('A1', (10,10))

In [None]:
plotBarPlot('A4', (10,10))

In [None]:
plotBarPlot('A5', (10,10))

In [None]:
plotBarPlot('A6', (10,10))

In [None]:
plotBarPlot('A9', (10,10))

------------------------------------------------------------------------
Data Preparation Start 

# Data Cleaning

## Handling Missing Value 

### Imputing Missing Continuous Variable

In [None]:
# view the distribution of A2 and A14 
plotHistBP('A2')
plotHistBP('A14')

- both A2 and A14 are right-skewed, so to avoid being affected by outliers we impute with the MEDIAN

In [None]:
# helper functions for the imputing process
def getNullRowByCol(col) :
    """Return a boolean Series that is True on the rows of ca_df where `col` (e.g. 'A2') is NaN."""
    return ca_df[col].isna()

def replaceByMedian(col) :
    """Fill the missing values of ca_df[col] with that column's median (writes back into ca_df)."""
    ca_df[col] = ca_df[col].fillna(ca_df[col].median())

def replaceByLOCF(col) :
    """Fill the missing values of ca_df[col] by Last Observation Carried Forward (writes back into ca_df).

    Every missing value takes the value of the closest earlier row. Missing values at the very
    start of the column have no earlier row, so they fall back to the next available value;
    without that fallback they would stay NaN.
    """
    # forward fill first (LOCF), then backward fill only what is still missing at the top
    ca_df[col] = ca_df[col].ffill().bfill()

#### A2

In [None]:
bo = getNullRowByCol('A2')
    
# only display True
ca_df[bo]

In [None]:
# CALL function to replace
replaceByMedian('A2')

# take a look on the result after imputing 
ca_df[bo]

#### A14

In [None]:
bo = getNullRowByCol('A14')

# only display True
ca_df[bo]

In [None]:
# CALL function to replace
replaceByMedian('A14')

# take a look on the result after imputing 
ca_df[bo]

### Imputing Missing Categorical Variable

#### A1

In [None]:
bo = getNullRowByCol('A1')

# only display true
ca_df[bo]

In [None]:
replaceByLOCF('A1')

ca_df[bo]

#### A4

In [None]:
bo = getNullRowByCol('A4')

# only display true
ca_df[bo]

In [None]:
replaceByLOCF('A4')

ca_df[bo]

#### A5

In [None]:
bo = getNullRowByCol('A5')

# only display true
ca_df[bo]

In [None]:
replaceByLOCF('A5')

ca_df[bo]

#### A6

In [None]:
bo = getNullRowByCol('A6')

# only display true
ca_df[bo]

In [None]:
replaceByLOCF('A6')

ca_df[bo]

#### A7

In [None]:
bo = getNullRowByCol('A7')

# only display true
ca_df[bo]

In [None]:
replaceByLOCF('A7')

ca_df[bo]

### Result after Handling Missing Value

In [None]:
print("NULL Object > ", ca_df.select_dtypes('object').isna().sum().sum())
print("NULL Int > ", ca_df.select_dtypes('int').isna().sum().sum())
print("NULL Float > ", ca_df.select_dtypes('float').isna().sum().sum())

# Data Preprocessing

## Encode Categorical Data

In [None]:
# keep a copy of the frame as it is now (missing values handled, nothing encoded yet);
# the re-encoding step later in the notebook reads the original category labels from it
backup_df = ca_df.copy()

def getCopySet() : 
    """Return a fresh copy of backup_df, so the backup itself is not modified by the caller."""
    return backup_df.copy()

In [None]:
# # reset the ca_df for testing purpose 
# ca_df = getCopySet()
# ca_df

In [None]:
# select all data type in object 
encodeList = ca_df.select_dtypes(include="object")

featureList = encodeList.drop(columns='A16')
targetList = encodeList['A16']

encodeList

### Encoding Categorical Feature ( Ordinal Encoder )

In [None]:
# encode categorical data with using OrdinalEncoder
oencoder = OrdinalEncoder()

categorical_columns = featureList.columns

# encode in onces to prevent overwrite the status of encoder 
ca_df[categorical_columns] = oencoder.fit_transform(ca_df[categorical_columns])

ca_df

In [None]:
# take a look on the encoder feature label for ensure all the feature at inside  
print(oencoder.feature_names_in_)

In [None]:
# For every encoded column, record which integer code each original label received.
# The code is the label's position in the encoder's category list for that column;
# oencoder.categories_ follows the order of categorical_columns used for fitting.
mappings = {
    col: {label: position for position, label in enumerate(labels)}
    for col, labels in zip(categorical_columns, oencoder.categories_)
}

### Encoding Categorical Target ( Label Encoder )

In [None]:
targetList.name

In [None]:
lencoder = LabelEncoder()

ca_df[targetList.name] = lencoder.fit_transform(targetList)

In [None]:
# show the updated result 
ca_df

### Label for Each Encode data

In [None]:
# Display the mappings 
# each value have been encode to what value 
for col, mapping in mappings.items():
    print(f"Column: {col}")
    for original, encoded in mapping.items():
        print(f"  {original} -> {encoded}")

In [None]:
print(f"Column: A16")
for original, encoded in zip(lencoder.classes_, range(len(lencoder.classes_))):
    print(f"  {original} -> {encoded}")

### Export Encoder For Further Use

In [None]:
# joblib.dump does not create missing folders, so make sure the target folder exists first
os.makedirs('pklFolder', exist_ok=True)

print(f"{joblib.dump(oencoder, 'pklFolder/featureEncoder.pkl')}")

print(f"{joblib.dump(lencoder, 'pklFolder/targetEncoder.pkl')}")

## Data Discovery and Profiling

### Correlation Analysis with Pairplots

https://study.com/academy/lesson/scatter-plot-and-correlation-definition-example-analysis.html#:~:text=A%20scatterplot%20with%20no%20correlation,pattern%2C%20neither%20positive%20nor%20negative.&text=The%20scatterplot%20shows%20haphazard%20points%20that%20follow%20no%20direction. 

In [None]:
# sns.pairplot builds its own figure, so a preceding plt.figure(...) only adds an empty extra
# figure to the output and its figsize is never applied to the pair plot
sns.pairplot(ca_df)
plt.show()

### Correlation Matrix with Heatmap

- +1: Perfect positive correlation (when one variable increases, the other increases proportionally).
- 0: No correlation (no relationship between variables).
- -1: Perfect negative correlation (when one variable increases, the other decreases proportionally).

In [None]:
correlation_matrix = ca_df.corr()

In [None]:
# use heatmap to know the correlation between x
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.show()

### Remove weak relationship feature 

In [None]:
# keep only the correlations whose absolute value is above 0.04; everything else becomes NaN
strong_corr = correlation_matrix.where(correlation_matrix.abs() > 0.04)

# True marks a weak (masked-out) relationship
checkWeak = strong_corr.isna()

# keep only the rows (features) whose relationship with the target A16 is weak
checkWeak = checkWeak.loc[checkWeak['A16']]

# view result
checkWeak

In [None]:
# drop columns
ca_df = ca_df.drop(columns=checkWeak.index)

ca_df

## Re-encode Categorical Data 
- re-encode due to drop the columns

In [None]:
# use backup to get the dataframe that consist value havent be encode 
new_df = backup_df[ca_df.columns].copy()

# select all data type in object 
encodeList = new_df.select_dtypes(include="object")

featureList = encodeList.drop(columns='A16')
targetList = encodeList['A16']

encodeList

### Encoding Categorical Feature ( Ordinal Encoder )
- ordinal encoder is to encode the categorical feature from A1 - A15

In [None]:
# encode categorical data with using OrdinalEncoder
oencoder = OrdinalEncoder()

categorical_columns = featureList.columns

# encode in onces to prevent overwrite the status of encoder 
new_df[categorical_columns] = oencoder.fit_transform(new_df[categorical_columns])

new_df

In [None]:
# take a look on the encoder feature label for ensure all the feature at inside  
print(oencoder.feature_names_in_)

In [None]:
# For every re-encoded column, record which integer code each original label received.
# The code is the label's position in the encoder's category list for that column;
# oencoder.categories_ follows the order of categorical_columns used for fitting.
mappings = {
    col: {label: position for position, label in enumerate(labels)}
    for col, labels in zip(categorical_columns, oencoder.categories_)
}

### Encoding Categorical Target ( Label Encoder )

In [None]:
targetList.name

In [None]:
# no need to refit again
new_df[targetList.name] = lencoder.transform(targetList)

In [None]:
# show the updated result 
new_df

### Overwrite the old dataframe to ensure consistent

In [None]:
# overwrite the ca_df become new_df
ca_df = new_df

print(f"The new ordinal encoder feature name consist is > {oencoder.feature_names_in_} \n")
print(f"New after drop weak feature and re-encode the data, the dataframe show as > ")
display(ca_df)

### Label for Each Encode data

In [None]:
# Display the mappings 
# each value have been encode to what value 
for col, mapping in mappings.items():
    print(f"Column: {col}")
    for original, encoded in mapping.items():
        print(f"  {original} -> {encoded}")

In [None]:
print(f"Column: A16")
for original, encoded in zip(lencoder.classes_, range(len(lencoder.classes_))):
    print(f"  {original} -> {encoded}")

### Export Encoder For Further Use

In [None]:
# joblib.dump does not create missing folders, so make sure the target folder exists first
os.makedirs('pklFolder', exist_ok=True)

print(f"{joblib.dump(oencoder, 'pklFolder/featureEncoder.pkl')}")

print(f"{joblib.dump(lencoder, 'pklFolder/targetEncoder.pkl')}")

## Outlier Detection and Treatment

### Detecting Outlier

In [None]:
# Compute Z-scores for numerical columns
z_scores = np.abs(zscore(ca_df.select_dtypes(include=['float64', 'int64'])))

# set threshold become 3.5 due to 3 have too much of outlier
threshold = 3.5
outliers_zscore = (z_scores > threshold)

# np.where returns (row positions, column positions) with one entry per outlying cell
outlier_indices = np.where(outliers_zscore)

# A row that is an outlier in several columns appears several times in outlier_indices[0];
# keep every row position once so the table and the count below describe distinct rows.
outlier_rows = np.unique(outlier_indices[0])

print("Outlier data points based on Z-score method:")
process_df = ca_df.iloc[outlier_rows]

display(process_df)
print(f"Total Number of Row of Outlier > {process_df.index.size}")

Check that dropping the outliers does not remove the categories that only have a few values:
- A13 > p ( 1 )
- A7 > o ( 6 ), n ( 5 ), dd ( 1 ), z ( 8 ), j ( 4 ) ( drop )
- A6 > r ( 11 ), j ( 7 )
- A5 > gg ( 1 )
- A4 > l ( 0 )

In [None]:
# [column, encoded values] pairs for the rare categories that must not be dropped as outliers
# (encoded values are used because the data has already been encoded)
notInclude = [["A13", [ 1 ]],
              ["A6", [11, 7]],
              ["A5", [1]],
              ["A4", [0]]]

# start again from every flagged row, taking each row once even when several of its
# columns exceed the threshold
process_df = ca_df.iloc[np.unique(outlier_indices[0])]

for column, values_to_exclude in notInclude:
    # exclude rows where the column value is in the values_to_exclude list
    process_df = process_df[~process_df[column].isin(values_to_exclude)]

display(process_df)
print(f"Total Number of Row of Outlier after Update > {process_df.index.size}")

### Handling Outlier

In [None]:
# drop all 32 rows 
ca_df = ca_df.drop(index=process_df.index)

ca_df

## Splitting Data by using Train_Test Split
- Stratified Sampling included 

In [None]:
# shuffle the rows of the whole dataframe (features and A16 stay together);
# the fixed random_state keeps the exported order identical on every run
ca_df = shuffle(ca_df, random_state=0)

In [None]:
# export as csv file for further use
# to_csv does not create missing folders, so make sure the target folder exists first
os.makedirs('csvDataFile', exist_ok=True)
ca_df.to_csv('csvDataFile/cleanData.csv', index=False)

------------------------------------------------------------------
# Test Model start from here 

In [None]:
# read the cleaned data back from the path it was exported to ('csvDataFile/cleanData.csv'),
# so the modelling part works when the notebook is run from top to bottom
clean_df = pd.read_csv('csvDataFile/cleanData.csv')

clean_df.shape

In [None]:
# drop target columns
x = clean_df.drop(columns=['A16']) 
y = clean_df['A16']

In [None]:
x.head()

In [None]:
x = x.values

x

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.20, random_state=0)
# training using x_train and y_train

In [None]:
print(f"Size of x_train > {len(x_train)}")
print(f"Size of x_test > {len(x_test)}")

## Data Scaling by using MinMaxScaler

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))

rescaledX_train = scaler.fit_transform(x_train)
rescaledX_test = scaler.transform(x_test)

print(rescaledX_train)

### Export Scaler For Future Use

In [None]:
# joblib.dump does not create missing folders, so make sure the target folder exists first
os.makedirs('pklFolder', exist_ok=True)
joblib.dump(scaler, 'pklFolder/scaler.pkl')

# Modeling

In [None]:
sns.set_theme(style="white", palette=None)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import *
import joblib

In [None]:
# create instance 
model = LogisticRegression()

# fit set into model - train
model.fit(rescaledX_train, y_train)

In [None]:
# use test set and get the predicted result as return 
yPredTrain = model.predict(rescaledX_train)
yPredTest = model.predict(rescaledX_test)

### Check accuracy ( Score, RMSE, Confusion Matrix )

In [None]:
# display the RMSE and accuracy for both test and train 
print(f"Acurracy of Train Data > {accuracy_score(y_train, yPredTrain)}")
print(f"RMSE for Train Data    > {np.sqrt(mean_squared_error(y_train, yPredTrain))}")
print("\n")
print(f"Acurracy of Test Data  > {accuracy_score(y_test, yPredTest)}")
print(f"RMSE for Test Data     > {np.sqrt(mean_squared_error(y_test, yPredTest))}")

In [None]:
# show confusion matrix 
# get confusion matrix
confusionMatrix = confusion_matrix(y_test, yPredTest)

plt.figure()
diagram = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=['+', '-'])
diagram.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix',fontsize=17)
plt.show()

In [None]:
print(classification_report(y_test, yPredTest))

### GridSearchCV

In [None]:
# define the parameter grid
# The grid search built from this dict tries every combination of the lists below.
# NOTE(review): not every solver/penalty pair is accepted by LogisticRegression, and l1_ratio
# presumably only matters for the 'elasticnet' penalty - confirm against the installed
# scikit-learn version; unsupported combinations would fail to fit and only cost time.
# NOTE(review): the string 'none' for penalty may not be accepted by newer scikit-learn releases - TODO confirm.
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Different values for the regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Different types of penalties
    'solver': ['saga', 'newton-cg', 'liblinear', 'sag', 'lbfgs'],  
    'max_iter': [100, 150, 200, 500, 1000], 
    'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
    'tol' : [0.01, 0.001, 0.0001]
}

# liblinear is suitable for small datasets, while sag and saga are suitable for larger ones and are faster
# liblinear is not suitable for multiclass problems; only newton-cg, lbfgs (default), sag and saga are suitable for that case

In [None]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=StratifiedKFold(n_splits=5), n_jobs=-1)
# cv = StratifiedKFold(n_splits=5) > 5-fold cross-validation using the stratified splitter
# n_jobs = -1 > use all available cores
# verbose is not passed here, so the search runs with GridSearchCV's default verbosity

In [None]:
# fit in the data to find best comparision
grid_model_result = grid_search.fit(rescaledX_train, y_train)

In [None]:
# display the best parameter
print(f"Best C        > {grid_model_result.best_estimator_.C}")
print(f"Best penalty  > {grid_model_result.best_estimator_.penalty}")
print(f"Best solver   > {grid_model_result.best_estimator_.solver}")
print(f"Best max_iter > {grid_model_result.best_estimator_.max_iter}")
print(f"Best l1_ratio > {grid_model_result.best_estimator_.l1_ratio}")
print(f"Best tol      > {grid_model_result.best_estimator_.tol}")

print(f"The best score get from grid > {grid_model_result.best_score_}")

In [None]:
# show testing performance 
yGridPredTrain = grid_model_result.predict(rescaledX_train)
yGridPredTest = grid_model_result.predict(rescaledX_test)

In [None]:
# display the RMSE and accuracy for both test and train
# (every metric uses the tuned model's predictions, yGridPredTrain / yGridPredTest)
print(f"Acurracy of Train Data after Grid > {accuracy_score(y_train, yGridPredTrain)}")
print(f"RMSE for Train Data after Grid    > {np.sqrt(mean_squared_error(y_train, yGridPredTrain))}")
print("\n")
print(f"Acurracy of Test Data after Grid  > {accuracy_score(y_test, yGridPredTest)}")
print(f"RMSE for Test Data after Grid     > {np.sqrt(mean_squared_error(y_test, yGridPredTest))}")

In [None]:
# show confusion matrix 
# get confusion matrix
confusionMatrix = confusion_matrix(y_test, yGridPredTest)

plt.figure()
diagram = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=['+', '-'])
diagram.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
print(classification_report(y_test, yGridPredTest))

### Export For Future Use

In [None]:
# joblib.dump does not create missing folders, so make sure the target folder exists first
os.makedirs('pklFolder/model', exist_ok=True)
joblib.dump(grid_model_result, 'pklFolder/model/LogisticRegression.pkl')

## Support Vector Classification

- A small value of C will choose a larger-margin hyperplane.
- A large value of C will choose a smaller-margin hyperplane.
- However, the smaller the margin of the hyperplane, the higher the possibility of getting a misclassified result (low accuracy).

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [None]:
clf = SVC()
clf.fit(rescaledX_train, y_train)

In [None]:
y_predTrain = clf.predict(rescaledX_train)
y_predTest = clf.predict(rescaledX_test)

### Check accuracy ( Score, RMSE, Confusion Matrix )

In [None]:
# accuracy and RMSE of the default SVC on the train and test sets
# (the metric function is mean_squared_error; the misspelled name raised a NameError)
print(f"Accuracy of Train Data > {accuracy_score(y_train, y_predTrain)}")
print(f"RMSE for Train Data    > {np.sqrt(mean_squared_error(y_train, y_predTrain))}")
print("\n")
print(f"Accuracy of Test Data  > {accuracy_score(y_test, y_predTest)}")
print(f"RMSE for Test Data     > {np.sqrt(mean_squared_error(y_test, y_predTest))}")

In [None]:
# confusion matrix of the default SVC on the test set
cm = confusion_matrix(y_test, y_predTest)

plt.figure()
# the two labels are separated by a comma; the original '.' made this cell a syntax error
cm_diagram = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['+', '-'])
cm_diagram.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
print(classification_report(y_test, y_predTest))

### GridSearchCV

In [None]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 45, 100, 1000],
             'gamma': [1, 10, 45, 100, 1000],
             'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, n_jobs=-1)

grid.fit(rescaledX_train, y_train)

In [None]:
print(f"Best C          > {grid.best_estimator_.C}")
print(f"Best gamma      > {grid.best_estimator_.gamma}")
print(f"Best kernel     > {grid.best_estimator_.kernel}")

print(f"The best score get from grid > {grid.best_score_}")

In [None]:
# predictions of the tuned SVC; the names use a lower-case "pred" because that is the
# spelling the evaluation cells read (grid_predTrain / grid_predTest)
grid_predTrain = grid.predict(rescaledX_train)
grid_predTest = grid.predict(rescaledX_test)

In [None]:
# accuracy and RMSE of the tuned SVC on the train and test sets
# (the metric function is mean_squared_error; the misspelled name raised a NameError)
print(f"Accuracy of Train Data > {accuracy_score(y_train, grid_predTrain)}")
print(f"RMSE for Train Data    > {np.sqrt(mean_squared_error(y_train, grid_predTrain))}")
print("\n")
print(f"Accuracy of Test Data  > {accuracy_score(y_test, grid_predTest)}")
print(f"RMSE for Test Data     > {np.sqrt(mean_squared_error(y_test, grid_predTest))}")

In [None]:
# confusion matrix of the tuned SVC on the test set
cm = confusion_matrix(y_test, grid_predTest)

plt.figure()
# the two labels are separated by a comma; the original '.' made this cell a syntax error
cm_diagram = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['+', '-'])
cm_diagram.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
print(classification_report(y_test, grid_predTest))

### Export For Future Use

In [None]:
# open() does not create missing folders, so make sure the target folder exists first
os.makedirs("pklFolder/model", exist_ok=True)

# the with-block closes the file even when pickle.dump raises
with open("pklFolder/model/svc_classifier.pkl", "wb") as svc_pickle_out:
    pickle.dump(grid, svc_pickle_out)

## Random Forest Classification

In [None]:
# Import RandomForestClassifier and RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
import pickle

In [None]:
# baseline forest with default hyperparameters; the fixed random_state makes the trees,
# and therefore the scores reported below, identical on every run
random_forest_model_default = RandomForestClassifier(random_state=0)
random_forest_model_default = random_forest_model_default.fit(rescaledX_train, y_train)

# predictions on both sets so train and test performance can be compared
y_pred_rf_train = random_forest_model_default.predict(rescaledX_train)
y_pred_rf_test = random_forest_model_default.predict(rescaledX_test)

### Check accuracy ( Score, RMSE, Confusion Matrix )

In [None]:
# display the RMSE and accuracy for both test and train 
print("Acurracy of Train Data with default parameters  > ", accuracy_score(y_train, y_pred_rf_train))
print("RMSE for Train Data with default parameters     > ", np.sqrt(mean_squared_error(y_train, y_pred_rf_train)))
print("\n")
print("Acurracy of Test Data with default parameters   > ", accuracy_score(y_test, y_pred_rf_test))
print("RMSE for Test Data with default parameters      > ", np.sqrt(mean_squared_error(y_test, y_pred_rf_test)))

In [None]:
confusion_rf_default = confusion_matrix(y_test, y_pred_rf_test)

diagram = ConfusionMatrixDisplay(confusion_matrix=confusion_rf_default, display_labels=['+', '-'])
diagram.plot(cmap=plt.cm.YlGnBu)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
print(classification_report(y_test, y_pred_rf_test))

### Parameter Distributions

In [None]:
# Search space for the random forest hyperparameter tuning

# number of trees in the forest: 50, 100, ..., 250
n_estimators = list(range(50, 300, 50))

# minimum number of samples a node needs before it may be split
min_samples_split = [2, 4, 8]

# maximum number of levels in a tree
max_depth = [None, 10, 20, 30, 40]

# function that measures the quality of each split
criterion = ['gini', 'entropy']

# same search space, keyed by the RandomForestClassifier parameter names
param_grid = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    max_depth=max_depth,
    criterion=criterion,
)

### RandomSearchCV

In [None]:
# fixed seeds for both the forest and the sampling of parameter combinations,
# so the search picks the same candidates and reports the same scores on every run
random_forest_model_RS = RandomForestClassifier(random_state=0)

random_search = RandomizedSearchCV(estimator = random_forest_model_RS, param_distributions = param_grid, cv = 10, scoring = 'accuracy', verbose = 1, n_jobs = -1, random_state = 0)
random_forest_model_RS = random_search.fit(rescaledX_train, y_train)

In [None]:
# Get the best parameters and the best score
print("Best n_estimators        > ", random_forest_model_RS.best_params_['n_estimators'])
print("Best min_samples_split   > ", random_forest_model_RS.best_params_['min_samples_split'])
print("Best max_depth           > ", random_forest_model_RS.best_params_['max_depth'])
print("Best criterion           > ", random_forest_model_RS.best_params_['criterion'])

print("The best score get from random search > ", random_forest_model_RS.best_score_)

In [None]:
# Predict on both splits through the fitted random-search object
y_pred_rfrs_train = random_forest_model_RS.predict(rescaledX_train)
y_pred_rfrs_test = random_forest_model_RS.predict(rescaledX_test)

In [None]:
# Accuracy and RMSE on the train and test splits for the random-search model;
# RMSE is computed on the predicted class labels.
print("Acurracy of Train Data after Random Search  > ", accuracy_score(y_train, y_pred_rfrs_train))
print("RMSE for Train Data after Random Search     > ", np.sqrt(mean_squared_error(y_train, y_pred_rfrs_train)))
print("\n")
print("Acurracy of Test Data after Random Search   > ", accuracy_score(y_test, y_pred_rfrs_test))
print("RMSE for Test Data after Random Search      > ", np.sqrt(mean_squared_error(y_test, y_pred_rfrs_test)))

In [None]:
# Confusion matrix of the random-search model on the test split.
# It must be built from y_pred_rfrs_test (the random-search predictions); using the
# default model's y_pred_rf_test would just redraw the default model's matrix.
confusion_rf_RS = confusion_matrix(y_test, y_pred_rfrs_test)

# NOTE(review): display_labels assumes class 0 is '+' and class 1 is '-' — confirm against the target encoding.
diagram = ConfusionMatrixDisplay(confusion_matrix=confusion_rf_RS, display_labels=['+', '-'])
diagram.plot(cmap=plt.cm.YlGnBu)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the random-search model on the test split.
print(classification_report(y_test, y_pred_rfrs_test))

### GridSearchCV

In [None]:
# Exhaustive grid search over param_grid (5 x 3 x 5 x 2 = 150 combinations) with
# 5-fold cross-validation, scored by accuracy and run on all available cores.
# NOTE(review): the forest has no random_state, so results can differ between runs.
random_forest_model_GS = RandomForestClassifier()

grid_search = GridSearchCV(estimator = random_forest_model_GS, param_grid = param_grid, cv = 5, scoring='accuracy', n_jobs=-1)
# From here on random_forest_model_GS is the fitted search object, not the bare classifier.
random_forest_model_GS = grid_search.fit(rescaledX_train, y_train)

In [None]:
# Report the winning hyperparameters of the grid search and its best cross-validated score
print("Best n_estimators        > ", random_forest_model_GS.best_params_['n_estimators'])
print("Best min_samples_split   > ", random_forest_model_GS.best_params_['min_samples_split'])
print("Best max_depth           > ", random_forest_model_GS.best_params_['max_depth'])
print("Best criterion           > ", random_forest_model_GS.best_params_['criterion'])

# The score must come from the grid-search object; reading random_forest_model_RS here
# would repeat the random-search score under the grid-search heading.
print("The best score get from grid search > ", random_forest_model_GS.best_score_)

In [None]:
# Predict on both splits through the fitted grid-search object
y_pred_rfgs_train = random_forest_model_GS.predict(rescaledX_train)
y_pred_rfgs_test = random_forest_model_GS.predict(rescaledX_test)

In [None]:
# Accuracy and RMSE on the train and test splits for the grid-search model;
# RMSE is computed on the predicted class labels.
print("Acurracy of Train Data after Grid Search  > ", accuracy_score(y_train, y_pred_rfgs_train))
print("RMSE for Train Data after Grid Search     > ", np.sqrt(mean_squared_error(y_train, y_pred_rfgs_train)))
print("\n")
print("Acurracy of Test Data after Grid Search   > ", accuracy_score(y_test, y_pred_rfgs_test))
print("RMSE for Test Data after Grid Search      > ", np.sqrt(mean_squared_error(y_test, y_pred_rfgs_test)))

In [None]:
# Confusion matrix of the grid-search model on the test split.
confusion_rf_GS = confusion_matrix(y_test, y_pred_rfgs_test)

# NOTE(review): display_labels assumes class 0 is '+' and class 1 is '-' — confirm against the target encoding.
diagram = ConfusionMatrixDisplay(confusion_matrix=confusion_rf_GS, display_labels=['+', '-'])
diagram.plot(cmap=plt.cm.YlGnBu)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the grid-search model on the test split.
print(classification_report(y_test, y_pred_rfgs_test))

### Export For Future Use

In [None]:
# Persist the default-parameter random forest for later reuse.
# NOTE(review): the tuned random-search / grid-search models are not the ones exported
# here — confirm that is intended.
# The context manager guarantees the file handle is flushed and closed even if pickling fails.
with open("pklFolder/model/RandomForestClassifier.pkl", 'wb') as model_file:
    pickle.dump(random_forest_model_default, model_file)

## K Nearest Neighbours (KNN)

In [None]:
#import libraries
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
# Base KNN classifier whose hyperparameters are tuned below
knn = KNeighborsClassifier()

# Grid for GridSearchCV: 5 x 2 x 4 x 5 x 2 x 3 = 1200 combinations, each fitted 5 times.
# NOTE(review): this rebinds param_grid, replacing the random-forest grid defined earlier.
# NOTE(review): several dimensions look redundant — minkowski with p=1 / p=2 is the
# manhattan / euclidean distance, and leaf_size should not matter for 'brute' — confirm
# against the scikit-learn docs before trimming.
param_grid = {
    'n_neighbors': range(1, 6),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}

# No scoring argument is given, so the search uses the estimator's default score
knn_grid = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1)
knn_grid.fit(rescaledX_train, y_train)

# Show the best hyperparameter combination found
print("Best parameters:")
print(knn_grid.best_params_)

# Take the best estimator found by the search
best_knn = knn_grid.best_estimator_

# Accuracy on the training split
y_train_pred = best_knn.predict(rescaledX_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Accuracy on the test split
y_test_pred = best_knn.predict(rescaledX_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

### Check accuracy ( Score, RMSE, Confusion Matrix )

In [None]:
# Train vs. test accuracy of the tuned KNN (as fractions, not percentages)
print("Accuracy Train Data:", train_accuracy)
print("Accuracy Test Data:", test_accuracy)

In [None]:
# Test accuracy as a percentage (precision, recall and F1 are shown by the
# classification report further down, not computed here)
accuracy_knn = accuracy_score(y_test, y_test_pred) * 100
print('Accuracy  : %.5f' % accuracy_knn)

In [None]:
# Root Mean Squared Error (RMSE) between the true and predicted test labels
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("RMSE:", rmse)

In [None]:
# Confusion matrix of the tuned KNN on the test split, drawn as an annotated heatmap
# (rows are the actual classes, columns the predicted classes)
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('Actual', fontsize=13)
plt.xlabel('Predicted', fontsize=13)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the tuned KNN on the test split
print(classification_report(y_test, y_test_pred))

### Export For Future Use

In [None]:
# Save the tuned KNN estimator to pklFolder/model/knn.pkl with joblib
joblib.dump(best_knn, 'pklFolder/model/knn.pkl')
print("Model saved to 'knn.pkl'")

## XGBOOST Classification

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *

In [None]:
# Baseline XGBoost classifier with hand-picked hyperparameters, fitted on the training split
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, max_depth=4, learning_rate=0.1)
model.fit(rescaledX_train, y_train)

In [None]:
# Class predictions of the baseline XGBoost model on the test split
y_pred = model.predict(rescaledX_test)

In [None]:
# Test accuracy of the baseline model, printed as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f"Initial Accuracy: {accuracy * 100:.2f}%")

### GridSearchCV

In [None]:
# Tune XGBoost with GridSearchCV.
# The grid has 3 x 4 x 2 x 2 x 3 x 3 x 2 x 2 x 2 = 3456 combinations; with cv=5 that is
# 17280 fits, so expect this cell to be slow.
# NOTE(review): this rebinds param_grid and grid_search, replacing the objects of the
# same name from the earlier model sections.
param_grid = {
    'n_estimators': [200, 250, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.02],
    'subsample': [0.75, 0.8],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 2],
    'reg_alpha': [0, 0.05],
    'reg_lambda': [1, 1.2]
}

grid_search = GridSearchCV(estimator=xgb.XGBClassifier(objective='binary:logistic'), 
                           param_grid=param_grid, n_jobs=-1,
                           scoring='accuracy', 
                           cv=5, 
                           verbose=1)

grid_search.fit(rescaledX_train, y_train)

In [None]:
# Best hyperparameter combination found by the XGBoost grid search
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

In [None]:
# Build a fresh classifier from the best parameters and fit it on the training split.
# NOTE(review): unlike the estimator searched above, this one does not pass
# objective='binary:logistic' explicitly; grid_search.best_estimator_ may already be an
# equivalent refitted model — confirm before removing the retrain.
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(rescaledX_train, y_train)

# Test accuracy of the tuned model, printed as a percentage
y_pred_best = best_model.predict(rescaledX_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Accuracy after GridSearchCV: {best_accuracy * 100:.2f}%")

### Check accuracy ( Score, RMSE, Confusion Matrix )

In [None]:
# Predicted probability of class 1 for every test row
y_pred_prob = best_model.predict_proba(rescaledX_test)[:, 1]

# NOTE(review): this RMSE is computed on probabilities, while the RMSE printed for the
# random forest and KNN sections is computed on predicted class labels, so the numbers
# are not directly comparable across models.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_prob))

print(f"Root Mean Squared Error (RMSE): {rmse}")

In [None]:
# Mean Absolute Error between the true labels and the predicted class-1 probabilities
# (relies on y_pred_prob from the RMSE cell)
mae = mean_absolute_error(y_test, y_pred_prob)
print(f'MAE: {mae}')

In [None]:
# Confusion matrix of the tuned XGBoost model on the test split, drawn as an annotated
# heatmap (rows are the actual classes, columns the predicted classes)
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### Export For Future Use

In [None]:
# Save the tuned XGBoost model to pklFolder/model/xgboost.pkl with joblib
joblib.dump(best_model, 'pklFolder/model/xgboost.pkl')
print("Model saved as 'xgboost.pkl'")

-----------------------------------------------------
Evaluation Start 

# Model Performance Comparison

In [None]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def print_scores(y, y_pred):
    """Compute accuracy, precision, recall and F1 for one set of predictions.

    Despite the name nothing is printed; the scores are returned.

    Args:
        y: true labels.
        y_pred: predicted labels.

    Returns:
        dict with keys 'ac', 'pr', 'rc' and 'f1', each score multiplied by 100.
    """
    scorers = {
        'ac': accuracy_score,
        'pr': precision_score,
        'rc': recall_score,
        'f1': f1_score,
    }

    # every scorer is called with the same (true, predicted) pair and scaled to a percentage
    return {key: scorer(y, y_pred) * 100 for key, scorer in scorers.items()}

In [None]:
# Load every exported model back from disk so they can be compared on the same test split.
# NOTE: joblib.load unpickles arbitrary objects — only load files you created yourself.

# LogisticRegression.pkl
logReg_model = joblib.load('pklFolder/model/LogisticRegression.pkl')

# svc_classifier.pkl
svc_model = joblib.load('pklFolder/model/svc_classifier.pkl')

# RandomForestClassifier.pkl (this file was written with pickle.dump, not joblib.dump)
rfc_model = joblib.load('pklFolder/model/RandomForestClassifier.pkl')

# knn.pkl
knn_model = joblib.load('pklFolder/model/knn.pkl')

# xgboost.pkl
xgb_model = joblib.load('pklFolder/model/xgboost.pkl')

# Order matters: it must match the order of the names in modelCompare['Model']
modelList = [logReg_model, svc_model, rfc_model, knn_model, xgb_model]

In [None]:
# Results table skeleton: one display name per model (same order as modelList) and one
# initially empty list per metric, filled in by the scoring loop.
modelCompare = {
    'Model': [
        'LogisticRegression',
        'Support Vector Classification',
        'Random Forest Classification',  # spelling fixed (was 'Classifaction')
        'K Nearest Neighbourhood',
        'XGBOOST Classification',
    ],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': []
}

In [None]:
# Score every loaded model on the test split (rescaledX_test, y_test).

# Maps each results column to the key print_scores uses for that metric.
metric_columns = {'Accuracy': 'ac', 'Precision': 'pr', 'Recall': 'rc', 'F1 Score': 'f1'}

# Reset the metric lists first so re-running this cell does not append a second set of
# scores (which would make the columns longer than 'Model' and break pd.DataFrame).
for column in metric_columns:
    modelCompare[column] = []

for model in modelList:
    predict_y = model.predict(rescaledX_test)

    score = print_scores(y_test, predict_y)

    for column, key in metric_columns.items():
        modelCompare[column].append(score[key])


results_df = pd.DataFrame(modelCompare)
results_df

## Comparing Actual and Predicted 

In [None]:
# Pick one row of the cleaned data to sanity-check the saved models against.
SAMPLE_ROW = 30  # index label of the row to inspect

# NOTE(review): the models were fitted on rescaledX_train; confirm that clean_df holds
# the features in the same encoded and scaled form before trusting these predictions.
dataTest = clean_df.drop(columns=['A16']).loc[SAMPLE_ROW].values

# Ground-truth label of that row (this is the actual value, not a prediction)
y = clean_df['A16'].loc[SAMPLE_ROW]

print(f"X feature > {dataTest}")
print(f"Y actual > {y}")

In [None]:
# logReg_model.predict([dataTest])

In [None]:
# Ask each loaded model for its prediction on the sample row and flag whether it
# matches the recorded label `y`.
for model_name, fitted_model in zip(modelCompare['Model'], modelList):
    result = fitted_model.predict([dataTest])
    verdict = 'false' if result != y else 'true'

    print(f"{model_name} > {result} ( {verdict} )")

-------------------------------------------------------------------------
Deployment Start

# End-to-End Pipeline Construction ( Pipeline )

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

# File stem (without .pkl) of the exported model the pipeline will load; set by hand
bestmodel = 'xgboost'

In [None]:
# The encoder and scaler must be the ones already fitted during training, so a custom
# pipeline is assembled from three saved artefacts instead of refitting anything here.
class ModelPipeline:
    """Inference-only pipeline: ordinal-encode, scale, then classify one record.

    NOTE: pickling an instance (e.g. with joblib.dump) stores a reference to this class
    rather than its code, so the process that loads the pickle must be able to resolve
    ``ModelPipeline`` under the same module path.
    """

    def __init__(self, encoder_path, scaler_path, model_path):
        """Load the fitted encoder, scaler and model from disk with joblib.

        Args:
            encoder_path: path to the saved encoder for the categorical columns.
            scaler_path: path to the saved feature scaler.
            model_path: path to the saved classifier.
        """
        self.encoder = joblib.load(encoder_path)
        self.scaler = joblib.load(scaler_path)
        self.model = joblib.load(model_path)

    def predict(self, input_data):
        """Predict the class of a single record.

        Args:
            input_data: dict mapping feature name to raw value; it must contain every
                column the encoder was fitted on.

        Returns:
            The predicted class as a plain ``int``.
        """
        # the input is a dict, so wrap it in a list to get a one-row DataFrame
        df = pd.DataFrame([input_data])

        # encode exactly the columns the encoder was fitted on, to stay consistent
        categorical_columns = list(self.encoder.feature_names_in_)
        df[categorical_columns] = self.encoder.transform(df[categorical_columns])

        # If the scaler recorded its training column names, put the columns in that order
        # so the result does not depend on the key order of input_data. A scaler fitted
        # on a plain array has no such attribute; the input order is then used as-is.
        feature_order = getattr(self.scaler, 'feature_names_in_', None)
        if feature_order is not None:
            df = df[list(feature_order)]

        # scale, then classify
        x_transform = self.scaler.transform(df.values)
        predictedY = self.model.predict(x_transform)

        return int(predictedY[0])

In [None]:
# Example user input: one raw record keyed by feature name
# (categorical values are still their original letters, not encoded)
data = {
        'A2' : 30.83,
        'A3' : 0.000,
        'A4' : 'u',
        'A5' : 'g',
        'A6' : 'w',
        'A8' : 1.250,
        'A9' : 't',
        'A10' : 't',
        'A11' : 1,
        'A13' : 'g',
        'A14' : 202.0,
        'A15' : 0
    }

# Build the pipeline from the saved encoder, the saved scaler and the model named by bestmodel
pipeline = ModelPipeline('pklFolder/featureEncoder.pkl', 'pklFolder/scaler.pkl', f'pklFolder/model/{bestmodel}.pkl')

In [None]:
# Run the example record through the pipeline
predictedY = pipeline.predict(data)

# Show the predicted class for target column A16
print(f"A16 > {predictedY}")

In [None]:
# Export the whole pipeline object as a single .pkl file.
# NOTE(review): ModelPipeline is defined in this notebook, so whatever loads this file
# presumably needs the same class definition available — verify in the consuming app.
joblib.dump(pipeline, 'pklFolder/pipeline.pkl')

In [None]:
# Round-trip check: reload the exported pipeline and predict the same record again
ePipeline = joblib.load('pklFolder/pipeline.pkl')

print(f"A16 > {ePipeline.predict(data)}")

# Testing the process of predict for UI

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Example record used to walk through the prediction steps one by one.
# It is a flat dict (one scalar per feature), not a dict of single-item lists.
data = {
        'A2' : 30.83,
        'A3' : 0.000,
        'A4' : 'u',
        'A5' : 'g',
        'A6' : 'w',
        'A8' : 1.250,
        'A9' : 't',
        'A10' : 't',
        'A11' : 1,
        'A13' : 'g',
        'A14' : 202.0,
        'A15' : 0
    }
# Source row for reference: b 30.83 0.000 u g w v 1.250 t t 1 f g 202.0 0 +
# Its label is '+', which the original note records as class 0.

# Wrap the dict in a list so pandas builds a one-row DataFrame
df = pd.DataFrame([data])

df

In [None]:
# Load the saved encoder and scaler, plus the logistic-regression model
# (note: the pipeline above loads the model named by bestmodel instead)
encoder = joblib.load('pklFolder/featureEncoder.pkl')
scaler = joblib.load('pklFolder/scaler.pkl')
model = joblib.load('pklFolder/model/LogisticRegression.pkl')

In [None]:
# Columns the encoder was fitted on
print(encoder.feature_names_in_)

In [None]:
# Encode exactly the columns the encoder was fitted on, overwriting them in df.
# Not idempotent: running this cell twice feeds already-encoded values to the encoder.
categorical_columns = encoder.feature_names_in_

df[categorical_columns] = encoder.transform(df[categorical_columns])

In [None]:
# Show the frame after encoding
df

In [None]:
# Convert to a plain array; column order follows the key order of the input dict
x = df.values

x

In [None]:
# Apply the saved scaler (transform only — nothing is refitted here)
x_transform = scaler.transform(x)

x_transform

In [None]:
# Predict with the loaded model and print the class as a plain int
predictedY = model.predict(x_transform)

print(f"{int(predictedY[0])}")