# Baybayin Dataset Processing

After data gathering, there are multiple steps we need to take to make the dataset ready for machine learning.

## Importaning Libraries

In [None]:
import numpy as np
import pandas as pd
from PIL import Image
import os
import cv2

## Resizing Images

In [2]:
# This code converts the sample photos to 28x28
# Sample photos are saved in different folders

input_folder = "Baybayin Output"
output_folder = "Handwritten Baybayin Symbols Dataset/folder2"
target_size = (28, 28)

# Loop through each folder in the input folder
for folder_name in os.listdir(input_folder):
    if not os.path.isdir(os.path.join(input_folder, folder_name)):
        continue
    
    # Create the output folder if it doesn't exist
    output_folder_path = os.path.join(output_folder, folder_name)
    os.makedirs(output_folder_path, exist_ok=True)
    
    # Loop through each image in the folder
    for filename in os.listdir(os.path.join(input_folder, folder_name)):
        if not filename.endswith(".jpg") and not filename.endswith(".png"):
            continue
        
        # Load the image and resize it
        image_path = os.path.join(input_folder, folder_name, filename)
        image = Image.open(image_path)
        resized_image = image.resize(target_size)
        
        # Save the resized image to the output folder
        output_path = os.path.join(output_folder_path, filename)
        resized_image.save(output_path)


In [3]:
# This code goes through each letter's folders and lists down the letters and assign numerical values for the dataset

path = "Handwritten Baybayin Symbols Dataset/folder2/"
files = os.listdir(path)
print(files)

classes = {'a':0, 'ba':1, 'ka':2, 'da-ra':3, 'e-i':4, 'ga':5, 'ha':6, 'la':7, 'ma':8, 'na':9, 'nga':10, 'o-u':11, 
           'pa':12, 'sa':13, 'ta':14, 'wa':15, 'ya':16}    

['e-i', 'o-u', 'ga', 'ma', 'na', 'a', 'ya', 'sa', 'pa', 'ba', 'ha', 'ka', 'nga', 'da-ra', 'la', 'ta', 'wa']


In [5]:
# Creates the X and y dataset from classes list

X = []
y = []

for cl in classes:
    pth = path + cl
    for img_name in os.listdir(pth):
        img = cv2.imread(pth + "/" + img_name,0)
        X.append(img)
        y.append(classes[cl])
print("dataset created successfully")

dataset created successfully


In [7]:
# Converts them into a numpy array
print(type(X))
X = np.array(X)
y = np.array(y)
print(type(X))

<class 'list'>
<class 'numpy.ndarray'>


In [8]:
X.shape

(17000, 28, 28)

In [9]:
# Converts X to 2 dimension
X = X.reshape(len(X), -1)
print(X.shape)
print(y.shape)

(17000, 784)
(17000,)


In [10]:
# Confirms the dimension 
print(X.shape)
print(X.ndim)

(17000, 784)
2


In [11]:
# Saves X and y csv with header starting from 1
header_row = ",".join([str(i+1) for i in range(X.shape[1])])
np.savetxt("X.csv", X, delimiter=",", header=header_row, comments="")
np.savetxt("y.csv", y, delimiter=",", header="target", comments="")

In [13]:
# Load csv
X = pd.read_csv('X.csv', dtype=int)
y = pd.read_csv('y.csv', dtype=int)

In [14]:
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,775,776,777,778,779,780,781,782,783,784
0,252,255,250,255,255,250,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
1,254,255,249,248,255,251,255,252,248,255,...,255,255,255,255,255,255,255,255,255,255
2,250,255,254,243,251,255,247,255,255,85,...,255,255,255,255,255,255,255,255,255,255
3,249,255,245,246,255,245,252,254,255,255,...,255,255,255,255,255,255,255,255,255,255
4,255,252,240,255,136,1,0,202,255,243,...,255,255,255,255,255,255,255,255,255,255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,255,255,241,247,255,244,247,255,255,255,...,255,255,255,255,255,255,255,255,255,255
16996,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
16997,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
16998,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255


In [15]:
y

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
16995,16
16996,16
16997,16
16998,16


In [16]:
# This code goes through the folders and gets the file names and adds them to the ID column 
# This also merges the X and y dataset

# Step 1: Create an empty list to store the file names of each sample image
file_names = []

# Step 2: Loop through each folder in your dataset and extract the file names of each sample image, and append them to the list created in step 1
root_dir = "Handwritten Baybayin Symbols Dataset/folder2/"
for folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith(".jpg"): # assuming the images are in JPEG format
                file_path = os.path.join(folder_path, file_name)
                with Image.open(file_path) as img:
                    img_id = os.path.basename(file_path)[:-4] # remove the ".jpg" extension and the folder name
                file_names.append(img_id)

# Step 3: Create a DataFrame from the list of file names with the "ID" column as the first column
column_names = ["ID"] + [str(i) for i in range(1, 785)] # assuming 784 features in X
df = pd.DataFrame(columns=column_names)
df["ID"] = file_names

# Step 4: Load your existing X and y CSV files into pandas DataFrames
X = pd.read_csv("X.csv")
y = pd.read_csv("y.csv")

# Step 5: Merge the "ID" column with your X DataFrame
X = pd.concat([df["ID"], X], axis=1)

# Step 6: Merge the X and y DataFrames into a single DataFrame
df = pd.concat([X, y], axis=1)

# Step 7: Save the updated DataFrame to a new CSV file
df.to_csv("Baybayin Dataset.csv", index=False)


In [55]:
# Loads the dataset, changes the datatypes to integet, and used the ID column as index
df4 = pd.read_csv('Baybayin Dataset.csv', dtype=int, index_col='ID')

In [56]:
df4

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,776,777,778,779,780,781,782,783,784,target
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5109,252,255,250,255,255,250,255,255,255,255,...,255,255,255,255,255,255,255,255,255,0
5135,254,255,249,248,255,251,255,252,248,255,...,255,255,255,255,255,255,255,255,255,0
5653,250,255,254,243,251,255,247,255,255,85,...,255,255,255,255,255,255,255,255,255,0
5647,249,255,245,246,255,245,252,254,255,255,...,255,255,255,255,255,255,255,255,255,0
5121,255,252,240,255,136,1,0,202,255,243,...,255,255,255,255,255,255,255,255,255,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16761,255,255,241,247,255,244,247,255,255,255,...,255,255,255,255,255,255,255,255,255,16
16775,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,16
16985,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,16
16991,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,16


In [57]:
# Split the dataset into X and y
X = df4.drop(columns=["target"])
y = df4["target"]

In [58]:
# Save X to CSV file
X.to_csv("X_1.csv", index=True, header=True)

# Save y to CSV file
y.to_csv("y_1.csv", index=True, header=True)

In [63]:
from sklearn.model_selection import train_test_split

# Split X and y into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save X_test and y_test to CSV files
test_df = pd.concat([X_test, y_test], axis=1)
test_df.to_csv("test_1.csv", index=True, header=True)


In [59]:
X = pd.read_csv('X_1.csv', dtype=int)

In [60]:
X

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,5109,252,255,250,255,255,250,255,255,255,...,255,255,255,255,255,255,255,255,255,255
1,5135,254,255,249,248,255,251,255,252,248,...,255,255,255,255,255,255,255,255,255,255
2,5653,250,255,254,243,251,255,247,255,255,...,255,255,255,255,255,255,255,255,255,255
3,5647,249,255,245,246,255,245,252,254,255,...,255,255,255,255,255,255,255,255,255,255
4,5121,255,252,240,255,136,1,0,202,255,...,255,255,255,255,255,255,255,255,255,255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,16761,255,255,241,247,255,244,247,255,255,...,255,255,255,255,255,255,255,255,255,255
16996,16775,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
16997,16985,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
16998,16991,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255


In [61]:
y = pd.read_csv('y_1.csv', dtype=int)

In [62]:
y

Unnamed: 0,ID,target
0,5109,0
1,5135,0
2,5653,0
3,5647,0
4,5121,0
...,...,...
16995,16761,16
16996,16775,16
16997,16985,16
16998,16991,16


In [81]:
# This code gets random samples from the X dataset to make a testing dataset

data = X.sample(frac=1, random_state=42)

# Split the data into training and test sets (80% for training, 20% for testing)
train_data = X.iloc[:int(0.8*len(data)), :]
test_data = X.iloc[int(0.8*len(data)):, :]

# Save the test set to a new file (test.csv)
test_data.to_csv('test_1.csv', index=False)

In [82]:
test = pd.read_csv('test_1.csv', dtype=int)

In [83]:
test

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,481,255,252,255,243,255,253,249,252,255,...,255,255,255,255,255,255,255,255,255,255
1,4723,249,249,254,255,247,255,247,254,241,...,247,217,48,24,76,250,255,250,255,242
2,4737,255,235,255,250,244,255,247,255,247,...,242,5,95,68,77,83,226,255,245,252
3,4904,250,255,254,255,248,241,255,251,255,...,254,255,247,255,255,229,239,255,253,247
4,456,252,255,255,249,255,254,252,255,255,...,4,0,49,117,248,247,246,234,255,255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395,16761,255,255,241,247,255,244,247,255,255,...,255,255,255,255,255,255,255,255,255,255
3396,16775,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
3397,16985,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
3398,16991,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
