In [4]:
import os
import cv2
import numpy as np
import dask
import dask.array as da
import pandas as pd

In [2]:
# Root folder of the Ekush dataset; it contains the sub-folders of images
direc_path = '/home/sysadm/z/ML_P1/Ekush_Dataset/'


Read the whole Ekush dataset into a NumPy array and save it in .npy format.

Do not re-run this section unnecessarily; instead, load the saved Ekush .npy file with np.load().

The Ekush .npy file is saved at: '/home/sysadm/z/ML_P1/Data_npy_files/ekush_data.npy'

In [3]:
# Function to load and preprocess an image
def load_and_preprocess_image(sub_folder, img_name, size=(28, 28)):
    """Load one image as a flattened, normalised grayscale vector plus its label.

    Parameters
    ----------
    sub_folder : str
        Name of the sub-folder under ``direc_path`` that holds the image.
        It is also converted with ``int`` to produce the label.
    img_name : str
        File name of the image inside ``sub_folder``.
    size : tuple of int, optional
        Target size handed to ``cv2.resize``. Defaults to ``(28, 28)``.

    Returns
    -------
    img : numpy.ndarray
        1-D array of pixel values divided by 255.0.
    label : int
        ``int(sub_folder)``.

    Raises
    ------
    ValueError
        If the image cannot be read, or if ``sub_folder`` is not an integer
        string.
    """
    img_path = os.path.join(direc_path, sub_folder, img_name)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Read image in grayscale
    if img is None:
        # cv2.imread signals an unreadable/missing/non-image file by returning
        # None instead of raising; fail here with the offending path rather
        # than with an opaque error inside cv2.resize.
        raise ValueError(f"Could not read image file: {img_path}")
    img = cv2.resize(img, size)  # Resize image to a uniform size
    img = img.flatten() / 255.0  # Flatten pixel values and normalize to range [0, 1]
    label = int(sub_folder)  # Get the label from the sub-folder name
    return img, label

# Function to process images in a sub-folder
def process_sub_folder(sub_folder):
    """Load every entry of one sub-folder.

    Each name returned by ``os.listdir`` for the sub-folder is passed to
    ``load_and_preprocess_image``.

    Returns
    -------
    tuple of (list, list)
        The loaded images and, in the same order, their labels.
    """
    folder_path = os.path.join(direc_path, sub_folder)

    # One (image, label) pair per directory entry, in listing order
    loaded = [
        load_and_preprocess_image(sub_folder, entry)
        for entry in os.listdir(folder_path)
    ]

    images = [pair[0] for pair in loaded]
    labels = [pair[1] for pair in loaded]
    return images, labels

# List sub-folders (classes).
# os.listdir returns every entry, so stray files or non-numeric names
# (hidden files, checkpoint folders, ...) would crash the int() conversions
# used for sorting and labelling. Keep only integer-named directories.
def _is_class_folder(name):
    """Return True if ``name`` is a directory under ``direc_path`` whose name parses as an int."""
    if not os.path.isdir(os.path.join(direc_path, name)):
        return False
    try:
        int(name)
    except ValueError:
        return False
    return True


# Sort sub-folders numerically
sub_folders = sorted(filter(_is_class_folder, os.listdir(direc_path)), key=int)

# Create Dask delayed objects for image processing
delayed_results = [dask.delayed(process_sub_folder)(sub_folder) for sub_folder in sub_folders]

# Compute delayed objects in parallel
results = dask.compute(*delayed_results)


In [4]:

# Flatten the per-folder (images, labels) results into two flat lists
images_flat = [img for folder_images, _ in results for img in folder_images]
labels_flat = [label for _, folder_labels in results for label in folder_labels]

# Convert lists to NumPy arrays
images_array = np.array(images_flat)
labels_array = np.array(labels_flat)

# Every image needs exactly one label before the two are merged
assert images_array.shape[0] == labels_array.shape[0], "image/label count mismatch"

# Merge the images and labels: the label becomes the last column of each row
whole_data = np.hstack([images_array, labels_array.reshape(-1, 1)])

# Save the whole Ekush dataset in .npy format
np.save('/home/sysadm/z/ML_P1/Data_npy_files/ekush_data.npy', whole_data)

In [6]:
# Only the last expression of a cell is echoed, so the shape is printed
# explicitly; the label column (last column) is then shown as the cell output.
print(whole_data.shape)
whole_data[:, -1]

array([ 1.,  1.,  1., ..., 50., 50., 50.])

Loading the whole Ekush dataset from the saved .npy file.

In [5]:
# Location of the saved Ekush array, followed by the load itself
ekush_data_path_np = '/home/sysadm/z/ML_P1/Data_npy_files/ekush_data.npy'
ekush = np.load(ekush_data_path_np)

In [3]:
# Show the data converted to float16. NOTE: astype returns a new array and the
# result is not assigned here, so `ekush` itself keeps its original dtype.
ekush.astype('float16')

array([[3.9215e-03, 0.0000e+00, 3.9215e-03, ..., 3.9215e-03, 1.1765e-02,
        1.0000e+00],
       [7.8430e-03, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        5.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        5.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        5.0000e+01]], dtype=float16)

In [6]:
# Shape of the loaded Ekush array
ekush.shape

(149341, 785)

Training a naive Bayes classifier on 80% of the Ekush dataset.

In [10]:
#Importing the necessary modules
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score


In [8]:
# 80/20 split of Ekush: the first 784 columns are the features,
# the last column is the target.
X_train, X_test, y_train, y_test = train_test_split(
    ekush[:, :784],
    ekush[:, -1],
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_test.shape, sep='\n')

(119472, 784)
(29869, 784)


In [11]:
# Multinomial naive Bayes with smoothing parameter alpha=30, fitted on the training split
bayes_classifier = MultinomialNB(alpha=30)
bayes_classifier.fit(X_train, y_train)

In [12]:
# Predict the held-out split and score the predictions
bayes_prediction = bayes_classifier.predict(X_test)
bayes_accuracy = accuracy_score(y_test, bayes_prediction)
print("Bayes_accuracy:", bayes_accuracy)

Bayes_accuracy: 0.39703371388395997


Bayes_accuracy: 0.24912116240918677

Training an SVM with a nonlinear kernel: here we use the RBF (Radial Basis Function) kernel.

SVM with kernel=rbf took 67 mins 47 secs to fit on 80% of Ekush dataset.

In [15]:
# RBF-kernel support vector classifier, fitted on the training split
svm_rbf_classifier = SVC(kernel='rbf')
svm_rbf_classifier.fit(X_train, y_train)

The SVM with RBF kernel took 26 mins 53.6 secs to predict on 20% of the Ekush dataset.

It gave an accuracy of 0.7816800026783621.

In [16]:
# Predict the held-out split with the fitted SVM and score the predictions
svm_rbf_prediction = svm_rbf_classifier.predict(X_test)
svm_rbf_accuracy = accuracy_score(y_test, svm_rbf_prediction)
print("SVM RBF Accuracy:", svm_rbf_accuracy)

SVM RBF Accuracy: 0.7816800026783621


Now we train the SVM (kernel=RBF) on the whole Ekush dataset and then test it on the whole Bangla Isolated dataset.

In [17]:
# Shape of the Ekush array that is used as the training set below
ekush.shape

(149341, 785)

In [25]:
# Load the Bangla Isolated array, converted to float16.
# Ekush has already been loaded, so only this file is read here.
bangla_isolated_path = '/home/sysadm/z/ML_P1/Data_npy_files/bangla_lekha_data.npy'
bangla_isolated = np.load(bangla_isolated_path).astype(np.float16)
bangla_isolated.shape

(98950, 785)

In [27]:
# Train on all of Ekush, test on all of Bangla Isolated.
# In both arrays the first 784 columns are features and the last column is the target.
X_train = ekush[:, :784]
y_train = ekush[:, -1]
X_test = bangla_isolated[:, :784]
y_test = bangla_isolated[:, -1]
print(f'Training data set shape are: {X_train.shape} and {y_train.shape}\n Test dataset shapes are: {X_test.shape} and {y_test.shape} ')

Training data set shape are: (149341, 784) and (149341,)
 Test dataset shapes are: (98950, 784) and (98950,) 


Time taken to fit SVC (RBF kernel) on the whole Ekush dataset: 84 minutes.

In [11]:
# Training on Ekush
svm_rbf_classifier = SVC(kernel='rbf')
svm_rbf_classifier.fit(X_train, y_train)

# Testing on Bangla Lekha
svm_rbf_prediction = svm_rbf_classifier.predict(X_test)
svm_rbf_accuracy = accuracy_score(y_test, svm_rbf_prediction)
print(f"svm_rbf_accuracy only on bangla_lekha data set is: {svm_rbf_accuracy}")

svm_rbf_accuracy only on bangla_lekha data set is: 0.6892572006063669


In [12]:

# Confusion matrix of the RBF SVM predictions against the true test labels,
# written to CSV through a DataFrame.
cf = confusion_matrix(y_test, svm_rbf_prediction)
df_cf = pd.DataFrame(cf)
df_cf.to_csv('/home/sysadm/z/ML_P1/Confusion_matrix_folder/svm_rbf_conf_mat.csv')

# F1 and precision for the same predictions. With more than two classes both
# metrics need an explicit averaging strategy; the default average='binary'
# raises an error. 'macro' weights every class equally.
svm_rbf_f1 = f1_score(y_test, svm_rbf_prediction, average='macro')
svm_rbf_precision = precision_score(y_test, svm_rbf_prediction, average='macro')
print(f"svm_rbf macro F1: {svm_rbf_f1}, macro precision: {svm_rbf_precision}")

In [13]:
# Confusion matrix for the RBF-kernel SVM predictions (true labels vs. predicted labels)
print(cf)

[[1386  150    0 ...    1    0    8]
 [  93 1544    0 ...    1    0    3]
 [   0    1 1274 ...   18    0   10]
 ...
 [   0    0   11 ... 1465   29   33]
 [   0    0    0 ...   47 1759   23]
 [   2    3    1 ...   13    4 1734]]


# Tuning the hyperparameter C of the RBF-kernel SVC on a held-out validation split

In [23]:
# Split Ekush 80/20 into training and validation sets for hyperparameter tuning.
# NOTE: this re-binds X_train / y_train to the 80% training portion.
X_train, X_valid, y_train, y_valid = train_test_split(ekush[:, :784], ekush[:, -1], test_size=0.2, random_state=42)

# A bare expression is only echoed when it is the last line of a cell, so both
# shapes are printed explicitly.
print(X_train.shape)
print(X_valid.shape)

(29869, 784)

SVM with kernal rbf and c=0.1 gives validation accuracy: 0.5223475844521075

SVM with kernal rbf and c=0.3 gives validation accuracy: 0.5885700893903378

SVM with kernal rbf and c=0.5 gives validation accuracy: 0.6092939167698952

SVM with kernal rbf and c=0.7 gives validation accuracy: 0.6218487394957983 

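############# Among the C values tried here, accuracy increases with C and is highest at C=0.7; the earlier SVC with default settings (C=1) reached 0.7817 on the same 80/20 split ##########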

In [17]:
# Candidate values for the SVC regularisation parameter C
c = [0.1, 0.3, 0.5, 0.7]
predict_list = []   # validation predictions, one entry per C
accuracy_ls = []    # validation accuracies, one entry per C

for i in c:
    # Fit an RBF SVM (gamma='auto') with the current C on the training split
    svm_rbf_classifier = SVC(kernel='rbf', C=i, gamma='auto').fit(X_train, y_train)

    # Predict the validation split and score it
    svm_rvf_predict = svm_rbf_classifier.predict(X_valid)
    svm_rbf_accuracy = accuracy_score(y_valid, svm_rvf_predict)

    predict_list.append(svm_rvf_predict)
    accuracy_ls.append(svm_rbf_accuracy)

    print(f'SVM with kernal rbf and c={i} gives validation accuracy: {svm_rbf_accuracy}')

    

SVM with kernal rbf and c=0.1 gives validation accuracy: 0.5223475844521075
SVM with kernal rbf and c=0.3 gives validation accuracy: 0.5885700893903378
SVM with kernal rbf and c=0.5 gives validation accuracy: 0.6092939167698952
SVM with kernal rbf and c=0.7 gives validation accuracy: 0.6218487394957983


Applying logistic regression with hyperparameter tuning: experiment with different hyperparameters of the logistic regression model, such as the regularization strength (C parameter), penalty type (l1 or l2), and solver algorithm. Grid search or randomized search can help in finding the optimal set of hyperparameters.

In [18]:
# Hyperparameter grid for the logistic-regression experiments.
# Values passed as the regularisation parameter C:
c = [0.3, 0.5, 0.8, 1, 1.3]
# Solver note: lbfgs failed to converge, so only newton-cg is tried
# (candidates were ['lbfgs', 'newton-cg']).
multiclass = ['ovr', 'multinomial']
iterations = [200, 500, 800, 1000]

# Collected validation predictions and accuracies
logistic_predict_ls = []
logistic_accuracy_ls = []


Logistic with multiclass=ovr,regularization strength:0.3,max_iter:200,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902

Logistic with multiclass=ovr,regularization strength:0.3,max_iter:500,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902

Logistic with multiclass=ovr,regularization strength:0.3,max_iter:800,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902

Logistic with multiclass=ovr,regularization strength:0.3,max_iter:1000,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902

Logistic with multiclass=ovr,regularization strength:0.5,max_iter:200,multi class(ovr) gives validation accuracy on ekush as:0.52298369547022

Logistic with multiclass=ovr,regularization strength:0.5,max_iter:500,multi class(ovr) gives validation accuracy on ekush as:0.52298369547022

Logistic with multiclass=ovr,regularization strength:0.5,max_iter:800,multi class(ovr) gives validation accuracy on ekush as:0.52298369547022

Logistic with multiclass=ovr,regularization strength:0.5,max_iter:1000,multi class(ovr) gives validation accuracy on ekush as:0.52298369547022

Logistic with multiclass=ovr,regularization strength:0.8,max_iter:200,multi class(ovr) gives validation accuracy on ekush as:0.5209749238340755

Logistic with multiclass=ovr,regularization strength:0.8,max_iter:500,multi class(ovr) gives validation accuracy on ekush as:0.5209749238340755

Logistic with multiclass=ovr,regularization strength:0.8,max_iter:800,multi class(ovr) gives validation accuracy on ekush as:0.5209749238340755

Logistic with multiclass=ovr,regularization strength:0.8,max_iter:1000,multi class(ovr) gives validation accuracy on ekush as:0.5209749238340755

Logistic with multiclass=ovr,regularization strength:1,max_iter:200,multi class(ovr) gives validation accuracy on ekush as:0.5203722923432321

Logistic with multiclass=ovr,regularization strength:1,max_iter:500,multi class(ovr) gives validation accuracy on ekush as:0.5203722923432321

Logistic with multiclass=ovr,regularization strength:1,max_iter:800,multi class(ovr) gives validation accuracy on ekush as:0.5203722923432321

Logistic with multiclass=ovr,regularization strength:1,max_iter:1000,multi class(ovr) gives validation accuracy on ekush as:0.5203722923432321

Logistic with multiclass=ovr,regularization strength:1.3,max_iter:200,multi class(ovr) gives validation accuracy on ekush as:0.5199705380160032

Logistic with multiclass=ovr,regularization strength:1.3,max_iter:500,multi class(ovr) gives validation accuracy on ekush as:0.5199705380160032

Logistic with multiclass=ovr,regularization strength:1.3,max_iter:800,multi class(ovr) gives validation accuracy on ekush as:0.5199705380160032

Logistic with multiclass=ovr,regularization strength:1.3,max_iter:1000,multi class(ovr) gives validation accuracy on ekush as:0.5199705380160032

In [22]:
# Grid over every multi-class strategy, regularisation value and iteration cap.
# Each model is fitted on the training split and scored on the validation split.
for i in multiclass:
    for strength in c:
        for j in iterations:
            logistic_clf = LogisticRegression(penalty="l2", C=strength, solver='newton-cg', max_iter=j, multi_class=i)
            logistic_clf.fit(X_train, y_train)
            # Predicting on validation set.
            predict = logistic_clf.predict(X_valid)
            logistic_predict_ls.append(predict)
            # Checking accuracy on validation set.
            accuracy = accuracy_score(y_valid, predict)
            logistic_accuracy_ls.append(accuracy)

            # The strategy in parentheses is taken from the loop variable; it was
            # previously hard-coded to "ovr", which mislabelled multinomial runs.
            print(f'Logistic with multiclass={i},regularization strength:{strength},max_iter:{j},multi class({i}) gives validation accuracy on ekush as:{accuracy}')

Logistic with multiclass=ovr,regularization strength:0.3,max_iter:200,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902
Logistic with multiclass=ovr,regularization strength:0.3,max_iter:500,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902
Logistic with multiclass=ovr,regularization strength:0.3,max_iter:800,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902
Logistic with multiclass=ovr,regularization strength:0.3,max_iter:1000,multi class(ovr) gives validation accuracy on ekush as:0.5244233151427902
Logistic with multiclass=ovr,regularization strength:0.5,max_iter:200,multi class(ovr) gives validation accuracy on ekush as:0.52298369547022
Logistic with multiclass=ovr,regularization strength:0.5,max_iter:500,multi class(ovr) gives validation accuracy on ekush as:0.52298369547022
Logistic with multiclass=ovr,regularization strength:0.5,max_iter:800,multi class(ovr) gives validation accuracy on ekush as:0.522983695470

KeyboardInterrupt: 

In [24]:
# Single multinomial logistic regression (C=1, newton-cg, at most 400 iterations),
# fitted on the training split and scored on the validation split.
logistic_clf = LogisticRegression(
    penalty="l2",
    C=1,
    solver='newton-cg',
    max_iter=400,
    multi_class='multinomial',
)
logistic_clf.fit(X_train, y_train)

# Validation predictions and their accuracy
predict = logistic_clf.predict(X_valid)
accuracy = accuracy_score(y_valid, predict)
print(f'Logistic with multiclass=multinomial,regularization strength:1,max_iter:400,gives validation accuracy on ekush as:{accuracy}')

Logistic with multiclass=multinomial,regularization strength:1,max_iter:400,gives validation accuracy on ekush as:0.5317553316147176


In [28]:
# Train on the whole Ekush data and evaluate on Bangla Isolated.
# The arrays are sliced explicitly here: X_train / y_train are re-bound to the
# 80% validation split by the hyperparameter-tuning cell, so relying on them
# would make this result depend on the order in which cells were executed.
logistic_clf = LogisticRegression(penalty="l2", C=1, solver='newton-cg', max_iter=400, multi_class='multinomial')
logistic_clf.fit(ekush[:, :784], ekush[:, -1])
bangla_predict = logistic_clf.predict(bangla_isolated[:, :784])

accuracy = accuracy_score(bangla_isolated[:, -1], bangla_predict)
print(f"Accuracy for Bangla isolated test data set by Logistic with multiclass is {accuracy}")

Accuracy for Bangla isolated test data set by Logistic with multiclass is 0.3927134916624558
