In [49]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier
from tune_sklearn import TuneSearchCV,TuneGridSearchCV


from data_prepare_func import convert_to_array


In [50]:
import cv2
import os
import numpy as np

def detect_and_crop_handwriting(image):
    _, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    height, width = image.shape[:2]
    center_x = width // 2
    center_y = height // 2
    max_offset = -1
    max_offset_contour = None

    for contour in contours:
        M = cv2.moments(contour)
        if M["m00"] == 0:
            cX = 0
            cY = 0
        else:
            cX = int(M["m10"] / M["m00"])
            cY = int(M["m01"] / M["m00"])

        offset = np.sqrt((center_x - cX) ** 2 + (center_y - cY) ** 2)

        if offset > max_offset:
            max_offset = offset
            max_offset_contour = contour

    if max_offset_contour is not None:
        x, y, w, h = cv2.boundingRect(max_offset_contour)

        aspect_ratio = float(w) / h

        if aspect_ratio > 1:
            y_padding = int((w - h) / 2)
            x_padding = 0
        else:
            x_padding = int((h - w) / 2)
            y_padding = 0

        x -= x_padding
        w += 2 * x_padding
        y -= y_padding
        h += 2 * y_padding

        x = max(x, 0)
        w = min(w, width)
        y = max(y, 0)
        h = min(h, height)

        cropped_image = image[y:y + h, x:x + w]

        # resized_image = cv2.resize(cropped_image, (300, 300))

        # resized_gray = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

        # resized_gray = resized_gray.reshape((28, 28, 1))

        return cropped_image

    else:
        print('No handwriting detected in the image.')
        return None

def convert_to_array(data_path,size):
    folders = ['0','1','2','3','4','5','6','7','8','9']
    X, y = [], []
    kernel = np.ones((5, 5), np.uint8)
    for folder in folders:
        folder_path = os.path.join(data_path, folder)
        images = os.listdir(folder_path)
        for image_name in images:
            image_path = os.path.join(folder_path, image_name)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            image = cv2.dilate(image, kernel, iterations=1)
            image = detect_and_crop_handwriting(image)
            image = cv2.resize(image, (size, size))  # Resize the image to 28x28 pixels
            X.append(image.flatten())  # Flatten the image and add it to the feature matrix
            y.append(int(folder))  # Add the corresponding label

    X_data = np.array(X)
    y_data = np.array(y)
    return X_data,y_data


# data = pickle.load(open("thainumber_{}.pkl".format(size), "rb"))
# X = data['X']
# Y = data['Y']

In [51]:
x_train,y_train = convert_to_array("data_train/",size=28)
x_test,y_test = convert_to_array('data_test/',size=28)

In [52]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [53]:
c = np.cov(x_train ,x_test )
c

array([[ 1.62124166,  0.37952436, -0.31049217, ..., -0.05845332,
        -0.05008589, -0.03133927],
       [ 0.37952436,  1.01841765, -0.10105859, ..., -0.01027454,
        -0.00817466,  0.00324883],
       [-0.31049217, -0.10105859,  1.27935792, ...,  0.002136  ,
        -0.00642512, -0.00390119],
       ...,
       [-0.05845332, -0.01027454,  0.002136  , ...,  0.26859247,
         0.04267642,  0.04830039],
       [-0.05008589, -0.00817466, -0.00642512, ...,  0.04267642,
         0.22346822,  0.05440044],
       [-0.03133927,  0.00324883, -0.00390119, ...,  0.04830039,
         0.05440044,  0.18291335]])

In [54]:
s = setup(x_train, target = y_train, session_id = 10,fold=5,preprocess=False)

In [55]:
best = compare_models()

In [56]:
pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8321,0.9816,0.8321,0.8371,0.8313,0.8135,0.8142,0.084
lightgbm,Light Gradient Boosting Machine,0.8313,0.982,0.8313,0.8353,0.8292,0.8125,0.8134,0.21
rf,Random Forest Classifier,0.8305,0.9781,0.8305,0.8355,0.8279,0.8116,0.8127,0.084
lr,Logistic Regression,0.8272,0.9812,0.8272,0.8346,0.8271,0.808,0.8089,0.544
svm,SVM - Linear Kernel,0.815,0.0,0.815,0.823,0.8151,0.7944,0.7954,0.06
gbc,Gradient Boosting Classifier,0.7783,0.9731,0.7783,0.7852,0.7776,0.7537,0.7545,0.104
knn,K Neighbors Classifier,0.7229,0.9403,0.7229,0.7791,0.7164,0.6921,0.7005,0.178
nb,Naive Bayes,0.661,0.8963,0.661,0.6829,0.6388,0.6232,0.6312,0.062
dt,Decision Tree Classifier,0.6145,0.7859,0.6145,0.6296,0.6171,0.5717,0.5728,0.19
lda,Linear Discriminant Analysis,0.498,0.8489,0.498,0.5097,0.4976,0.4422,0.4432,0.058


In [57]:
best.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 10,
 'verbose': 0,
 'warm_start': False}

In [58]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

#train the neural network model
model = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', random_state=42 , alpha=0.01)
model.fit(x_train, y_train)

train_yhat = model.predict(x_train)
train_score = classification_report(y_train, train_yhat)
print("Train Score:\n", train_score)

# Test Score
test_yhat = model.predict(x_test)
test_score = classification_report(y_test, test_yhat)
print("Test Score:\n", test_score)

# F1-Score
train_f1 = f1_score(y_train, train_yhat, average='weighted')
print(f"AVG F1-Score Train: {train_f1}")

#predictions
y_pred = model.predict(x_test)


print("Accuracy_nn")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Train Score:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       173
           1       1.00      1.00      1.00       175
           2       1.00      1.00      1.00       177
           3       1.00      1.00      1.00       178
           4       1.00      1.00      1.00       174
           5       1.00      1.00      1.00       175
           6       1.00      1.00      1.00       174
           7       1.00      1.00      1.00       175
           8       1.00      1.00      1.00       174
           9       1.00      1.00      1.00       178

    accuracy                           1.00      1753
   macro avg       1.00      1.00      1.00      1753
weighted avg       1.00      1.00      1.00      1753

Test Score:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82        91
           1       0.94      0.66      0.77        90
           2       0.81      0.61      0.70        

In [59]:
# Train a neural network model
nn_model = create_model('mlp')

# Tune the neural network model
tuned_nn = tune_model(nn_model)

# Evaluate the tuned model
evaluate_model(tuned_nn)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8496,0.9881,0.8496,0.8475,0.8469,0.8329,0.8332
1,0.8577,0.9846,0.8577,0.8619,0.8591,0.8419,0.8421
2,0.8327,0.9795,0.8327,0.8468,0.8302,0.8141,0.8161
3,0.8367,0.9779,0.8367,0.8427,0.8336,0.8186,0.8199
4,0.8612,0.9835,0.8612,0.8645,0.8602,0.8458,0.8464
Mean,0.8476,0.9827,0.8476,0.8527,0.846,0.8306,0.8315
Std,0.0113,0.0036,0.0113,0.0088,0.0125,0.0125,0.0119


Fitting 5 folds for each of 10 candidates, totalling 50 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [65]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neural_network import MLPClassifier
from tune_sklearn import TuneSearchCV


# Split the data into training and testing sets
#x_train, x_test, y_train, y_test = train_test_split(y_train, test_size=0.3, random_state=42,stratify=y_train)


# parameter
param_grid = {
   'hidden_layer_sizes': [(20,), (50,), (100,)],
   'activation': ['relu', 'tanh'],
   'solver': ['adam', 'sgd'],
}

# Create the scikit-learn model
model = MLPClassifier(random_state=42)

# Perform hyperparameter tuning using TuneSearchCV
tuned = GridSearchCV(model, param_grid,cv=5,n_jobs=-1,scoring='accuracy')

# Fit the tuned model on the training data
tuned.fit(x_train, y_train)

# Evaluate the tuned model on the testing data
test_score = tuned.score(x_test, y_test)
print("Test Score:", test_score)




Test Score: 0.6095132743362832


In [61]:
print(tuned.best_params_)

{'activation': 'relu', 'hidden_layer_sizes': (200,), 'solver': 'adam'}


In [62]:
tuned.best_score_

0.863106227106227