# DRY BEAN CLASS PREDICTION

importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ml specific imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# algorithm
# linear classification
from sklearn.linear_model import LogisticRegression
# tree classification
from sklearn.tree import DecisionTreeClassifier
# neighbour classification
from sklearn.neighbors import KNeighborsClassifier
# naive bayes classification
from sklearn.naive_bayes import GaussianNB
# support vector machine classification
from sklearn.svm import SVC
# ensemble
from sklearn.ensemble import RandomForestClassifier

load the dataset and explore it

In [None]:
# Read the raw CSV (path is relative to the current working directory)
# and preview the first three rows.
df = pd.read_csv(filepath_or_buffer='../data/Dry_Bean_Dataset.csv')
df.head(n=3)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

Encode the target column as numeric labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class names first, then map every row's class to its integer
# code. `enc` is kept around so integer predictions can be decoded later.
enc = LabelEncoder()
enc.fit(df['Class'])
y = enc.transform(df['Class'])

get all the features

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='Class')

pipelines for prediction

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler

k = 10  # number of features kept by the chi-squared filter


def _make_pipeline(estimator, scale=False):
    """Return a SelectKBest(chi2) -> [StandardScaler] -> estimator pipeline.

    estimator: unfitted classifier placed in the final 'classification' step.
    scale: when True, standardise the selected features before classification.
    """
    steps = [('feature_selection', SelectKBest(chi2, k=k))]
    if scale:
        # chi2 rejects negative inputs, so standardisation has to come after
        # the selection step rather than before it.
        steps.append(('scaling', StandardScaler()))
    steps.append(('classification', estimator))
    return Pipeline(steps)


# 1. Logistic Regression
# 'saga' only converges quickly on features of comparable scale, so the inputs
# are standardised and the iteration budget is raised above the default.
clf1 = _make_pipeline(LogisticRegression(solver='saga', max_iter=1000), scale=True)

# 2. Decision Tree (seeded so repeated runs give the same tree)
clf2 = _make_pipeline(DecisionTreeClassifier(random_state=42))

# 3. Nearest Neighbors (distance based, hence scaled)
clf3 = _make_pipeline(KNeighborsClassifier(), scale=True)

# 4. Gaussian Naive Bayes
clf4 = _make_pipeline(GaussianNB())

# 5. Support Vector Machine (kernel distances are scale sensitive, hence scaled)
clf5 = _make_pipeline(SVC(), scale=True)

# 6. Random Forest (seeded so the model that gets saved is reproducible)
clf6 = _make_pipeline(RandomForestClassifier(random_state=42))

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit every candidate pipeline on the same training partition.
for model in (clf1, clf2, clf3, clf4, clf5, clf6):
    model.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with each fitted pipeline, in clf1..clf6 order.
ypred1, ypred2, ypred3, ypred4, ypred5, ypred6 = (
    fitted.predict(X_test)
    for fitted in (clf1, clf2, clf3, clf4, clf5, clf6)
)

In [None]:
# Confusion matrix of the first pipeline's predictions (ypred1), with the
# original class names on both axes.
fig, ax = plt.subplots(figsize=(7, 7))
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, ypred1),
    display_labels=enc.classes_,
)
cm_display.plot(ax=ax, colorbar=False)

In [None]:
import warnings

# Suppress all warnings from this point on (every category, every module).
warnings.simplefilter('ignore')

In [None]:
# Header, separator rule and per-class metrics for ypred1, one item per line.
print(
    "LOGISTIC REGRESSION",
    '---'*20,
    classification_report(y_test, ypred1, target_names=enc.classes_),
    sep='\n',
)

In [None]:
# Header, separator rule and per-class metrics for ypred2, one item per line.
print(
    "DECISION TREE CLF",
    '---'*20,
    classification_report(y_test, ypred2, target_names=enc.classes_),
    sep='\n',
)

In [None]:
# Header, separator rule and per-class metrics for ypred5, one item per line.
print(
    "SUPPORT VECTOR CLF",
    '---'*20,
    classification_report(y_test, ypred5, target_names=enc.classes_),
    sep='\n',
)

In [None]:
# Confusion matrix of the sixth pipeline's predictions (ypred6), with the
# original class names on both axes.
fig, ax = plt.subplots(figsize=(7, 7))
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, ypred6),
    display_labels=enc.classes_,
)
cm_display.plot(ax=ax, colorbar=False)

In [None]:
# Persist the sixth pipeline (clf6) and the label encoder next to each other
# so they can be reloaded together at prediction time.
import joblib

for artifact, filename in ((clf6, 'saved_model.pkl'), (enc, 'saved_encoder.pkl')):
    joblib.dump(artifact, filename)

10 March 2024

In [None]:
import joblib
import gradio as gr

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_artifacts():
    """Load the saved pipeline and label encoder once and reuse them.

    Unpickling on every prediction is wasted work. If the files are re-saved
    during the session, call ``_load_artifacts.cache_clear()`` to pick up the
    new versions.
    """
    # NOTE: joblib files are pickles; only load artifacts from a trusted source.
    clf = joblib.load('saved_model.pkl')
    enc = joblib.load('saved_encoder.pkl')
    return clf, enc


def predict_input(
    area, perimeter, majoraxislength,
    minoraxislength, aspectration, eccentricity,
    convexarea, equivdiameter, extent, solidity,
    roundness, compactness, shapefactor1,
    shapefactor2, shapefactor3, shapefactor4
):
    """Predict the bean class name for one set of measurements.

    Each argument is a single value for the feature of the same name. The
    values are placed in a one-row DataFrame, passed to the saved pipeline,
    and the predicted label is decoded with the saved encoder.

    Returns:
        The decoded class name for the single input row.
    """
    # Column names are presumably the ones the pipeline was fitted with
    # (note the lower-case 'roundness'); confirm against the training CSV.
    data = {
        'Area': area,
        'Perimeter': perimeter,
        'MajorAxisLength': majoraxislength,
        'MinorAxisLength': minoraxislength,
        'AspectRation': aspectration,
        'Eccentricity': eccentricity,
        'ConvexArea': convexarea,
        'EquivDiameter': equivdiameter,
        'Extent': extent,
        'Solidity': solidity,
        'roundness': roundness,
        'Compactness': compactness,
        'ShapeFactor1': shapefactor1,
        'ShapeFactor2': shapefactor2,
        'ShapeFactor3': shapefactor3,
        'ShapeFactor4': shapefactor4,
    }
    X_inp = pd.DataFrame([data])             # one-row frame for the pipeline
    clf, enc = _load_artifacts()             # cached after the first call
    y_pred = clf.predict(X_inp)              # encoded class label
    return enc.inverse_transform(y_pred)[0]  # bean class name

Trick: print the lower-cased column names to get a ready-made parameter list for a function that takes many parameters

In [None]:
# Emit the column names, lower-cased and comma-separated, ready to paste
# into a function signature.
print(", ".join(column.lower() for column in df.columns))

In [None]:
# Sample row for testing the function, shaped like X.iloc[0].to_dict()
data = {
    'Area': 28395.0,
    'Perimeter': 1000.291,
    'MajorAxisLength': 608.1781167,
    'MinorAxisLength': 173.888747,
    'AspectRation': 1.197191424,
    'Eccentricity': 0.549812187,
    'ConvexArea': 28715.0,
    'EquivDiameter': 190.1410973,
    'Extent': 0.763922518,
    'Solidity': 0.988855999,
    'roundness': 0.858027126,
    'Compactness': 0.913357755,
    'ShapeFactor1': 0.007331506,
    'ShapeFactor2': 0.003147289,
    'ShapeFactor3': 0.834222388,
    'ShapeFactor4': 0.998723889,
}


# simplest version for Gradio ui

In [None]:
# Input labels, listed in the same order as predict_input's parameters.
feature_labels = [
    'Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
    'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter',
    'Extent', 'Solidity', 'roundness', 'Compactness',
    'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4',
]
# These two fields keep the widget's default step; all others step by 0.01.
default_step_labels = {'Area', 'ConvexArea'}

number_inputs = [
    gr.Number(label=name) if name in default_step_labels
    else gr.Number(label=name, step=.01)
    for name in feature_labels
]

# One pre-filled example row shown beneath the form.
example_row = [28395.0, 1000.291, 608.1781167, 173.888747, 1.197191424, 0.549812187, 28715.0, 190.1410973, 0.763922518, 0.988855999, 0.858027126, 0.913357755, 0.007331506, 0.003147289, 0.834222388, 0.998723889]

ui = gr.Interface(
    fn=predict_input,
    inputs=number_inputs,
    outputs='text',
    title="Dry Bean Classification",
    examples=[example_row],
)
ui.launch()