#           HANDWRITTEN CHARACTER RECOGNITION PROJECT

## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Creating Dataset

In [None]:
# Dictionary for defining characters into numeric labels as machine can work on numeric data
classes = {}

# Adding digits to classes with their respective labels
for i in range(48, 58):
    classes[chr(i)] = i-48
    
# Adding Upper Case English Alphabets to classes with their respective labels
for i in range(65, 91):
    classes[chr(i)] = i-55

# Adding Lower Case English Alphabets to classes with their respective labels
for i in range(97, 123):
    classes[chr(i)] = i-61

# Printing all labels with their respective character
print(classes)

In [None]:
# os is used to browse the dataset directory tree
import os

# Root folder of the image dataset; it holds one sub-folder per character
path = 'D:/Minor Project/Dataset/Characters'

# Names of the per-character sub-folders.
# os.listdir() returns entries in arbitrary order, so the listing is sorted: the sample
# order feeds the seeded train/test split later on, and must be stable for the split
# to be reproducible across machines and runs.
characters = sorted(os.listdir(path))

In [None]:
# cv2 (OpenCV) reads the image files as arrays of pixel values
import cv2

# CH collects one 32x32 grayscale pixel array per image
CH = []
# LABEL collects the numeric class label of the image stored at the same position in CH
LABEL = []

# Walk every character folder and load all of its images
for file in characters:
    # Folder names are the decimal ASCII code of the character they contain
    label = classes[chr(int(file))]
    folder = os.path.join(path, file)
    # Sorted so that the sample order (and therefore the seeded split) is reproducible
    for img in sorted(os.listdir(folder)):
        image_path = os.path.join(folder, img)
        # Flag 0 loads the image as single-channel grayscale
        pixels = cv2.imread(image_path, 0)
        if pixels is None:
            # imread returns None instead of raising when a file cannot be decoded;
            # skip it explicitly rather than failing inside cv2.resize
            print('Skipping unreadable image :', image_path)
            continue
        # Store the image resized to 32x32 together with its label
        CH.append(cv2.resize(pixels, (32, 32)))
        LABEL.append(label)

Dataset CREATED!

In [None]:
# Count how many images were loaded for each class label
label_series = pd.Series(LABEL)
charImages = label_series.value_counts()

In [None]:
# Show the number of images available for every label
print(charImages)

In [None]:
#LABELDATA = pd.DataFrame(LABEL).replace([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61],['0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'])

In [None]:
# Checking size of character images (shape of the first loaded image array)
CH[0].shape

We have images of size 32*32 pixels

In [None]:
# Turn the Python lists into NumPy arrays so they can be reshaped and indexed in bulk
LABEL = np.array(LABEL)
# ORIGINAL is bound to the very same array object as CH (no copy is made here)
CH = ORIGINAL = np.array(CH)

Let's check a sample with its label.

In [None]:
# Display one arbitrary sample image together with its numeric label
sample_index = 111
plt.imshow(CH[sample_index], cmap='gray')
print('Label : ', LABEL[sample_index])

## Preparing Data

In [None]:
# Checking shape of CH array before flattening the images
print('Shape of CH array : ', CH.shape)

Our character array is 3-dimensional (images × height × width).
But the scikit-learn models used below expect 2-dimensional data (samples × features).
So let's flatten each image into a single row to make the array 2-dimensional.

In [None]:
# Flatten every image into a single row: the first axis (one entry per image) is kept,
# and all remaining axes are merged into one feature axis
CH = CH.reshape(CH.shape[0], -1)

In [None]:
# Again checking shape of CH array, now after the reshape
print('Shape of CH array : ', CH.shape)

Now it is 2D.

## Splitting data into training and testing

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the shuffle
# repeatable for a given input order
chtrain, chtest, labeltrain, labeltest = train_test_split(CH, LABEL, test_size = 0.3, random_state = 10)

In [None]:
# SHAPE OF TRAINING AND TESTING DATA
# For each split, print the shape of the feature array and of the matching label array
print('Training Data Shape : ', '\n\tCharacter Array Shape : ', chtrain.shape, '\n\tLabel Array Shape :\t', labeltrain.shape)
print()
print('Testing Data Shape : ', '\n\tCharacter Array Shape : ', chtest.shape, '\n\tLabel Array Shape :\t', labeltest.shape)

## Feature Scaling

In [None]:
# Finding maximum value in the training character data (before scaling)
print(chtrain.max())

In [None]:
# Scale both splits by the same constant (255, presumably the 8-bit grayscale maximum).
# NOTE: this cell overwrites its own inputs, so running it twice divides the data twice.
chtrain, chtest = chtrain / 255, chtest / 255

In [None]:
# Now checking maximum value of the training data after scaling
print(chtrain.max())

As we can see, the image pixel values have been scaled down so that the maximum value is 1. Because the values were divided, they have been converted to floats.

## Dimensionality Reduction using PCA (Principal Component Analysis)

In [None]:
# Importing PCA from sci-kit learn library
from sklearn.decomposition import PCA

In [None]:
# Shape of image data before applying PCA (for comparison with the shapes after PCA)
print('Image Training Data Shape :\t', chtrain.shape)
print('Image Testing Data Shape :\t', chtest.shape)

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to retain 96% of the variance ("details") of the data
pca = PCA(n_components=0.96)

# Learn the projection from the training data only and project the training data
# in the same step (fit_transform = fit + transform)
chtrain = pca.fit_transform(chtrain)
# Project the testing data with the parameters learned from the training data
chtest = pca.transform(chtest)

In [None]:
# Let's check how many components PCA kept and how many input features it was fitted on.
# n_components_ (trailing underscore) is the fitted number of components; the plain
# n_components attribute would only echo the 0.96 passed to the constructor.
print('PCA Components :', pca.n_components_)
# n_features_in_ replaces n_features_, which was deprecated and then removed from scikit-learn
print('PCA Features :\t', pca.n_features_in_)

In [None]:
# Shape of image data after applying PCA (the feature axis is now the PCA components)
print('Image Training Data Shape :\t', chtrain.shape)
print('Image Testing Data Shape :\t', chtest.shape)

## Decoding Class

In [None]:
# Dictionary for defining characters into numeric labels as machine can work on numeric data
decode = {}

# Adding digits to classes with their respective labels
for i in range(48, 58):
    decode[i-48] = chr(i)
    
# Adding Upper Case English Alphabets to classes with their respective labels
for i in range(65, 91):
    decode[i-55] = chr(i)

# Adding Lower Case English Alphabets to classes with their respective labels
for i in range(97, 123):
    decode[i-61] = chr(i)

# Printing all labels with their respective character
print(decode)

## Copying and Merging training and testing data for calculating overall accuracy

In [None]:
# Stack the training rows followed by the testing rows (features and labels in the same order)
chardata = np.concatenate([chtrain, chtest], axis=0)
labeldata = np.concatenate([labeltrain, labeltest], axis=0)

# APPLYING MODELS

In [None]:
import joblib

## 1. Logistic Regression

In [None]:
# Importing Logistic Regression Model
from sklearn.linear_model import LogisticRegression

### i. Fitting Model

In [None]:
# Training logistic regression with the saga solver is slow, so a previously saved
# model is reused when one exists; otherwise the model is trained from scratch.
# Without this fallback the notebook fails on a fresh run, because the model file
# is only written by the save cell at the very end.
LOG_MODEL_FILE = 'Log_model.joblib'

if os.path.exists(LOG_MODEL_FILE):
    # NOTE: joblib.load unpickles the file, so only load model files you created yourself.
    # NOTE(review): a cached model is only valid if it was trained on the same PCA
    # features as the current run; delete the file to force retraining.
    log_model = joblib.load(LOG_MODEL_FILE)
else:
    # Creating an object of model
    log_model = LogisticRegression(solver='saga', max_iter=5000)
    # Fitting model on training data
    log_model.fit(chtrain, labeltrain)

### ii. Predicting data

In [None]:
# Logistic regression predictions for the training split and the testing split
labeltrain_pred, labeltest_pred = log_model.predict(chtrain), log_model.predict(chtest)

### iii. Evaluation

In [None]:
# Fraction of correctly predicted labels on each split
train_accuracy = accuracy_score(labeltrain, labeltrain_pred)
test_accuracy = accuracy_score(labeltest, labeltest_pred)

# Report training accuracy first, then testing accuracy
print('Training Accuracy =\t', train_accuracy)
print('Testing Accuracy =\t', test_accuracy)

## 2. Random Forest

In [None]:
# Importing Random Forest Model
from sklearn.ensemble import RandomForestClassifier

### i. Fitting Model

In [None]:
# Random forest with 60 trees; random_state is fixed so repeated fits give the same forest
random_forest = RandomForestClassifier(n_estimators=60, random_state=42)
# Fitting model on training data
random_forest.fit(chtrain, labeltrain)

### ii. Predicting data

In [None]:
# Random forest predictions for the training split and the testing split
labeltrainpred, labeltestpred = random_forest.predict(chtrain), random_forest.predict(chtest)

### iii. Evaluation

In [None]:
# score() returns the mean accuracy of the forest on the given data and labels
rf_train_score = random_forest.score(chtrain, labeltrain)
rf_test_score = random_forest.score(chtest, labeltest)

# Report training accuracy first, then testing accuracy
print('Random Forest Training Accuracy Score:\t', rf_train_score)
print('Random Forest Testing Accuracy Score:\t', rf_test_score)

# MODELS COMPARISON

In [None]:
# Accuracy of each model on the merged train+test data
log_overall = log_model.score(chardata, labeldata)
rf_overall = random_forest.score(chardata, labeldata)

# Logistic regression first, then random forest
print('Logistic Regression Overall Accuracy =\t', log_overall)
print('Random Forest Overall Accuracy =\t', rf_overall)

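Here, we can clearly see that RANDOM FOREST scores far BETTER than LOGISTIC REGRESSION on this overall accuracy, possibly because our 62 classes are handled better by RANDOM FOREST.
Note that the overall accuracy is computed on the merged training and testing data, so it includes samples the models were trained on; the testing accuracies reported above are the fairer basis for comparing the two models.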

# PREDICT LABEL OF TEST IMAGES USING MODEL

In [None]:
# Use small 2x2 inch figures for the single-character previews below
plt.rcParams.update({'figure.figsize': [2, 2]})

In [None]:
# List the image files available in the separate Test folder
os.listdir('D:/Minor Project/Dataset/Test')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img001-054.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : 0')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img004-041.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : 3')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img008-048.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : 7')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img011-024.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : A')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img014-012.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : D')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img015-017.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : E')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img016-049.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : F')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img019-004.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : I')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img019-005.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : I')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img022-011.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : L')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img026-038.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : P')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img027-055.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : Q')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img030-031.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : T')
plt.imshow(testimage, cmap='gray')

In [None]:
# Load one test image as grayscale and resize it to the 32x32 size used for training
testimage = cv2.resize(cv2.imread('D:/Minor Project/Dataset/Test/img032-039.png', 0), (32, 32))
# Apply the same preprocessing as the training data: scale by 255, flatten, project with the fitted PCA
testSample = pca.transform([(testimage/255).flatten()])
# predict() returns a one-element array; take element 0 explicitly, because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
predicted = decode[int(random_forest.predict(testSample)[0])]
plt.title('Predicted : '+predicted+'\nActual : V')
plt.imshow(testimage, cmap='gray')

In [None]:
# Random forest predictions for the merged train+test data
labeldatapred = random_forest.predict(chardata)

In [None]:
# Per-class report for the random forest predictions on the merged train+test data
from sklearn.metrics import classification_report
print(classification_report(labeldata, labeldatapred))

In [None]:
import joblib

In [None]:
# Target files for the two trained models
file, file1 = "RF_model.joblib", 'Log_model.joblib'

# Persist the random forest first, then the logistic regression model
joblib.dump(random_forest, file)
joblib.dump(log_model, file1)

In [None]:
# Sanity check: reload the saved random forest from disk and score it on the merged data
loaded_model = joblib.load('RF_model.joblib')
# score() is evaluated on the merged train+test data
result = loaded_model.score(chardata, labeldata)
print(result)