# KaggleDownload

In [74]:
import kagglehub

# Download the dataset through kagglehub and show the value it returns,
# which is printed below as the path to the dataset files.
dataset_handle = "tmdb/tmdb-movie-metadata"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tmdb/tmdb-movie-metadata?dataset_version_number=2...


100%|██████████| 8.89M/8.89M [00:01<00:00, 7.33MB/s]

Extracting files...





Path to dataset files: C:\Users\EnderEfe\.cache\kagglehub\datasets\tmdb\tmdb-movie-metadata\versions\2


# Custom Dataset Processing

## Importing Required Libraries and Definitions

In [None]:
import os
import cv2
import numpy as np

# Root folder of the custom handwritten dataset (one sub-folder per scan set).
# A raw string keeps backslashes literal, so each separator is written once;
# doubling them inside r"..." would embed two backslashes per path level.
dataset_path = r"E:\Python_Projeler\ComputerVisionProjects\FinalProject\dataset"

## Character Mapping

In [None]:
# Class name written on each scanned page, listed in page order.  Page numbers
# are the zero-padded, four-digit keys '0001', '0002', ... generated below.
# Two class names occur twice ('vertical_line' and 'sqrt'), so several pages
# feed the same class.
_page_classes = (
    'a', 'b', 'c', 'd', 'e', 'f', 'h', 'vertical_line', 'j', 'k', 'vertical_line',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'plus', 'horizontal_line', 'slash',
    'paranthesis_left', 'paranthesis_right', 'sqrt', 'sqrt',
)
letter_dict = {f"{page:04d}": cls for page, cls in enumerate(_page_classes, start=1)}

# One counter per distinct class name, all starting at zero.
label_dict = dict.fromkeys(set(letter_dict.values()), 0)

## Character Extraction Function

In [None]:
def ExtractSet(path, save_path, name=None):
    """Cut every symbol out of a scanned page and save each one as a 32x32 image.

    The page is blurred, binarised with an inverted Otsu threshold (dark pixels
    become 255) and dilated so that the strokes of one symbol merge into a single
    external contour.  Each contour is then masked out, centred on a square
    canvas, padded with a 5-pixel black border, resized to 32x32 and written to
    ``save_path`` as ``<index>.jpg``.

    Args:
        path: Path of the page image to read.
        save_path: Existing directory that receives the cropped symbols.
        name: Optional key of the module-level ``label_dict``.  When given, the
            file index is taken from ``label_dict[name]``, which is incremented
            for every contour; when ``None`` the contour's position in the
            contour list is used as the index.

    Raises:
        FileNotFoundError: If ``path`` is missing or cannot be decoded as an image.
        KeyError: If ``name`` is given but is not a key of ``label_dict``.
    """
    page = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if page is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {path}")

    blur = cv2.GaussianBlur(page, (3, 3), 1)
    _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Heavy dilation is used only to find one contour per symbol.
    dilation = cv2.dilate(thresh, np.ones((4, 4), np.uint8), iterations=2)
    # Pixels that are actually saved; a 1x1 kernel leaves the strokes as they are.
    dilation_2 = cv2.dilate(thresh, kernel=np.ones((1, 1), np.uint8))  # Optional, may vary per dataset

    contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for index, contour in enumerate(contours):
        # Keep only the pixels that lie inside this contour.
        mask = np.zeros_like(dilation)
        cv2.drawContours(mask, [contour], -1, 255, thickness=cv2.FILLED)
        MaskedLetter = cv2.bitwise_and(dilation_2, dilation_2, mask=mask)

        if name is not None:
            index = label_dict[name]
            label_dict[name] = index + 1

        x, y, w, h = cv2.boundingRect(contour)
        max_edge = max(h, w)
        blank = np.zeros((max_edge, max_edge), np.uint8)

        # Centre the h x w crop on the square canvas (rows first, then columns).
        top = (max_edge - h) // 2
        left = (max_edge - w) // 2
        blank[top:top + h, left:left + w] = MaskedLetter[y:y + h, x:x + w]
        blank = cv2.copyMakeBorder(blank, 5, 5, 5, 5, borderType=cv2.BORDER_CONSTANT, value=0)

        letter = cv2.resize(blank, (32, 32))

        # os.path.join instead of a hard-coded "\\" so the file always lands
        # inside save_path, whatever the platform's separator is.
        out_file = os.path.join(save_path, f"{index}.jpg")
        if not cv2.imwrite(out_file, letter):
            # cv2.imwrite reports failure through its return value.
            print(f"Could not write {out_file}")

## Process Dataset Image Pages

In [None]:
# Walk every scan-set folder under dataset_path and split each page image
# into single-symbol crops, stored in a sub-folder named after the class.
for dSet_folder in os.listdir(dataset_path):
    dSet_path = os.path.join(dataset_path, dSet_folder)

    # Test for a real directory instead of "name contains a dot": a file
    # without an extension would otherwise reach os.listdir and crash.
    if not os.path.isdir(dSet_path):
        continue

    for letter_page in os.listdir(dSet_path):
        if letter_page.split(".")[-1] != "jpg":
            continue

        letter_page_path = os.path.join(dSet_path, letter_page)
        # The page number is the text after the last "-" in the file stem.
        letter_number = letter_page.split(".")[0].split("-")[-1]
        letter_name = letter_dict.get(letter_number)
        if letter_name is None:
            # A page number with no mapping is reported and skipped instead of
            # stopping the whole run with a KeyError.
            print(f"Skipped {letter_page_path}: no class for page number {letter_number}")
            continue

        print(f"{letter_name}        {letter_number}")

        save_path = os.path.join(dSet_path, letter_name)
        os.makedirs(save_path, exist_ok=True)

        ExtractSet(letter_page_path, save_path, letter_name)

# EMNIST Extraction

In [1]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import gzip

EMNIST_type = "balanced"

# This mapping is valid for the "balanced" split ONLY!  For any other split,
# check the matching emnist-TYPE-mapping.txt file and correct the mapping.
# Labels 0-9 map to their own digit; of labels 10-46 only those listed in
# _named_labels get a name, every other label maps to None.
_named_labels = {
    12: "upper_c",
    19: "upper_j", 20: "upper_k", 21: "upper_l", 22: "upper_m",
    24: "upper_o", 25: "upper_p",
    28: "upper_s",
    30: "upper_u", 31: "upper_v", 32: "upper_w", 33: "upper_x",
    34: "upper_y", 35: "upper_z",
    36: "lower_a", 37: "lower_b", 38: "lower_d", 39: "lower_e",
    40: "lower_f", 41: "lower_g", 42: "lower_h", 43: "lower_n",
    44: "lower_q", 45: "lower_r", 46: "lower_t",
}
LabelMapping = {digit: str(digit) for digit in range(10)}
LabelMapping.update({code: _named_labels.get(code) for code in range(10, 47)})

# Characters that get no output folder.
notWanted = ["k", "j", "m", "q", "z", "v", "u", "1", "w"]

# Per-character counter, all starting at zero.  The key order below is kept
# exactly as it was, since it is the order in which the dict is printed.
LabelCounter = dict.fromkeys(
    [*"01234567", "9", "8", "z", *"abcdefghijklmnopqrstuvwxy"], 0
)

In [2]:
def ReadIdxImage(file):
    """Load a gzip-compressed IDX image file into a 3-D uint8 array.

    The first 16 bytes are taken as four big-endian 32-bit integers: magic
    number, item count, row count and column count.  Everything after them is
    interpreted as unsigned bytes and reshaped to (items, rows, cols).

    Args:
        file: Path of the ``.gz`` file to read.

    Returns:
        The pixel data as a NumPy array of shape (items, rows, cols).
    """
    with gzip.open(file, "rb") as stream:
        header = stream.read(16)
        pixels = stream.read()

    # Bytes 0-3 hold the magic number, which is not checked; the three
    # dimensions follow in 4-byte steps.
    count, height, width = (
        int.from_bytes(header[start:start + 4], 'big') for start in (4, 8, 12)
    )

    return np.frombuffer(pixels, dtype=np.uint8).reshape(count, height, width)


def ReadIdxLabel(file):
    """Return the label bytes of a gzip-compressed IDX label file.

    Args:
        file: Path of the ``.gz`` file to read.

    Returns:
        A ``bytes`` object holding everything after the first 8 bytes of the
        decompressed file; iterating over it yields one integer per byte.
    """
    with gzip.open(file, "rb") as stream:
        content = stream.read()

    # The 8 leading bytes are header data and are dropped.
    return content[8:]

In [None]:


# Export the EMNIST images of both splits as PNG files, one folder per character.
# NOTE: LabelCounter is not reset in this cell, so file numbering in "test"
# continues from the counts reached in "train", and re-running this cell
# without re-running the cell that defines LabelCounter keeps counting upward.
for train_or_test in ["train", "test"]:

    images_path = r"EMNIST/"+EMNIST_type+r"/emnist-"+EMNIST_type+r"-"+train_or_test+"-images-idx3-ubyte.gz"
    labels_path = r"EMNIST/"+EMNIST_type+r"/emnist-"+EMNIST_type+r"-"+train_or_test+"-labels-idx1-ubyte.gz"

    save_path = r"../../dataset/EMNIST/"+EMNIST_type + r"/" +train_or_test

    os.makedirs(save_path, exist_ok=True)

    images = ReadIdxImage(images_path)
    labels = ReadIdxLabel(labels_path)

    # Create one output folder per wanted character before writing images.
    for label in LabelMapping:

        char = LabelMapping[label]
        if char is None:    continue
        # Names look like "upper_c" / "lower_a" / "7"; the last character is
        # used as the folder name.
        char = char[-1]

        if char in notWanted:    continue

        path = save_path + r"/" + char
        os.makedirs(path, exist_ok=True )
        print(f"Folder {path} is created or already exists")

    error_text = ""

    for image, label in zip(images, labels):
        mapped_name = LabelMapping[label]
        if mapped_name is None:
            continue

        char = mapped_name[-1]

        if char in notWanted:
            # Unwanted characters are counted but never written to disk.
            LabelCounter[char] += 1
            continue

        char_index = LabelCounter[char]

        try:
            save = save_path + r"/" + char +r"/"

            image = cv2.resize(image,(32,32), interpolation=cv2.INTER_NEAREST)
            # The image is written transposed (image.T).
            # cv2.imwrite reports failure by returning False rather than
            # raising, so the result is checked explicitly; otherwise a failed
            # write would go unnoticed and still advance the counter.
            if not cv2.imwrite(save + f"{char_index}.png",image.T):
                raise OSError(f"cv2.imwrite could not write {save}{char_index}.png")
            LabelCounter[char] += 1

        except Exception as e:
            error_text+= f"There was an error on {char} - {EMNIST_type}  -    Error: {e}\n\n"

    # Log next to the notebook, like the other cells do; a leading "/" would
    # send the file to the filesystem root instead.
    if error_text:
        with open("datasetC_error_log.txt", "a+") as F:
            F.write(error_text)

    print(f"EMNIST Dataset Extraction is completed.")
    print(LabelCounter)



Folder ../../dataset/EMNIST/balanced/train/0 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/2 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/3 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/4 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/5 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/6 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/7 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/8 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/9 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/c is created or already exists
Folder ../../dataset/EMNIST/balanced/train/l is created or already exists
Folder ../../dataset/EMNIST/balanced/train/o is created or already exists
Folder ../../dataset/EMNIST/balanced/train/p is created or already exists
Folder ../../dataset/EMNIST/balanced/t

# Operators: Limit Each Class to 2800 Images

In [25]:
import os
import cv2
import numpy as np

path = r"../../dataset/Operators/"

# Upper bound on the number of images kept per operator folder.
MAX_IMAGES_PER_CLASS = 2800

# WARNING: this cell deletes files permanently.
for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        continue

    # os.listdir returns names in arbitrary order; sorting makes the choice of
    # kept/deleted files deterministic, and listing once avoids a second scan.
    images = sorted(os.listdir(folder_path))
    if len(images) <= MAX_IMAGES_PER_CLASS:     continue

    for image in images[MAX_IMAGES_PER_CLASS:]:
        os.remove(os.path.join(folder_path, image))
    print(f" {folder} is DONE!")


 + is DONE!
 - is DONE!
 sqrt is DONE!


# Operators Dilation

In [26]:
import os
import cv2
import numpy as np

path = r"../../dataset/Operators/"

# Binarise (inverted Otsu) and dilate every operator image, overwriting the
# source file.  NOTE: because the input file is replaced by its processed
# version, running this cell again processes the already-processed images.
for folder in os.listdir(path):

    if folder == "1":   continue

    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        continue

    for image in os.listdir(folder_path):
        image_path = os.path.join(folder_path, image)
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for files it cannot decode; skip them
            # instead of crashing inside GaussianBlur.
            print(f"Skipped unreadable file: {image_path}")
            continue

        blur = cv2.GaussianBlur(img, (3, 3), 1)
        _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        dilation = cv2.dilate(thresh, kernel=np.ones((2,2), np.uint8))

        cv2.imwrite(image_path, dilation)

    print(f"Folder {folder} is done")

# No cv2.destroyAllWindows() here: this cell never opens a window.

Folder ( is done
Folder ) is done
Folder + is done
Folder - is done
Folder slash is done
Folder sqrt is done
Folder [ is done
Folder ] is done


# Operators Resize

In [8]:
path = r"../../dataset/Operators/"
save_path = r"../../dataset/Operators_Resized/"

# Pad every operator image with a 5-pixel black border, resize it to 32x32
# and store the result under the same folder/file name in save_path.
for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        continue

    os.makedirs(save_path + folder,exist_ok=True)

    for image in os.listdir(folder_path):

        image_path = os.path.join(folder_path, image)
        image_save_path = os.path.join(save_path + folder, image)

        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for files it cannot decode; skip them
            # instead of crashing inside copyMakeBorder.
            print(f"Skipped unreadable file: {image_path}")
            continue

        img = cv2.copyMakeBorder(img, 5,5,5,5, borderType=cv2.BORDER_CONSTANT, value=0)
        img = cv2.resize(img, (32,32), interpolation=cv2.INTER_NEAREST)

        cv2.imwrite(image_save_path, img)

    print(f"Folder {folder} is done")


Folder ( is done
Folder ) is done
Folder + is done
Folder - is done
Folder slash is done
Folder sqrt is done
Folder [ is done
Folder ] is done


# Operators Blur

In [34]:
path = r"../../dataset/Operators/"

# Apply a 3x3 Gaussian blur to every operator image, overwriting the file.
# NOTE: the source file is replaced, so each additional run blurs the
# images once more.
for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        continue

    for image in os.listdir(folder_path):

        image_path = os.path.join(folder_path, image)

        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for files it cannot decode; skip them
            # instead of crashing inside GaussianBlur.
            print(f"Skipped unreadable file: {image_path}")
            continue

        img = cv2.GaussianBlur(img, (3,3), 1)

        cv2.imwrite(image_path, img)

    print(f"Folder {folder} is blurred")

Folder ( is blurred
Folder ) is blurred
Folder + is blurred
Folder - is blurred
Folder 1 is blurred
Folder slash is blurred
Folder sqrt is blurred
Folder [ is blurred
Folder ] is blurred


# EMNIST Renaming

Might need this later. Right now, it's not very useful.

In [None]:

EMNIST_balanced_path = r"../../dataset/EMNIST/balanced/"
# os.path.join avoids the doubled "/" that plain concatenation produced here.
train = os.path.join(EMNIST_balanced_path, "train")
test = os.path.join(EMNIST_balanced_path, "test")

# Prefix put in front of every image file name.
RENAME_PREFIX = "EMNIST_"

# Both splits get the same treatment, so one loop covers them.
for split_name, split_dir in (("train", train), ("test", test)):
    for char in os.listdir(split_dir):
        char_path = os.path.join(split_dir, char)

        for image_name in os.listdir(char_path):
            # Skip files that already carry the prefix so that re-running the
            # cell does not produce "EMNIST_EMNIST_..." names.
            if image_name.startswith(RENAME_PREFIX):
                continue

            try:
                path = os.path.join(char_path, image_name)

                os.rename(path, os.path.join(char_path, RENAME_PREFIX + image_name))

            except Exception as e:
                # Trailing newline keeps one log entry per line.
                text = f"Error at {split_name} rename: {e}\n"

                with open("datasetC_error_log.txt", "a+") as F:
                    F.write(text)

# All Sets Combination

In [2]:
import shutil
import os
import random
import cv2
import numpy as np

# The three source sets share one dataset root, relative to the notebook.
_dataset_root = "../../dataset/"
EMNIST_balanced_path = _dataset_root + "EMNIST/balanced/"
Operator_path = _dataset_root + "Operators/"
Custom_set = _dataset_root + "main2"

## EMNIST Transfer

In [None]:
main_set =r"../../dataset/mainDataset"

# os.makedirs also creates missing parents, so main_set itself is covered.
os.makedirs(main_set+r"/train", exist_ok=True)
os.makedirs(main_set+r"/test", exist_ok=True)

# Copy every EMNIST character folder of both splits into the main dataset.
for train_or_test in ["train","test"]:

    main_set_train_test_path = os.path.join(main_set,train_or_test)

    EMNIST_balanced_path_path = os.path.join(EMNIST_balanced_path, train_or_test)

    for char_folder in os.listdir(EMNIST_balanced_path_path):

        try:
            char_folder_path = os.path.join(EMNIST_balanced_path_path,char_folder)

            main_set_char_folder = os.path.join(main_set_train_test_path, char_folder)
            print(main_set_char_folder)

            os.makedirs( main_set_char_folder , exist_ok=True )

            for image_file in os.listdir(char_folder_path):

                src = os.path.join(char_folder_path , image_file)
                drc = os.path.join(main_set_char_folder, image_file)
                # copy2 also tries to preserve file metadata.
                shutil.copy2(src,drc)

            print(f"{train_or_test}, char: {char_folder} is Finished!")

        except Exception as e:
            # The message names this step and the folder that failed, and ends
            # with a newline so log entries do not run together.
            text = f"Error at EMNIST transfer ({train_or_test}, char: {char_folder}): {e}\n"

            with open("datasetC_error_log.txt", "a+") as F:
                F.write(text)

        

../../dataset/mainDataset\train\(
SRC../../dataset/main2_splitted/train\(\CUSTOM_10.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_10.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_100.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_100.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_102.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_102.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_103.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_103.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_104.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_104.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_105.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_105.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_106.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_106.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_107.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_107.jpg
SRC../../dataset/main2_splitted/train\(\CUSTOM_108.jpg
DRC:../../dataset/mainDataset\train\(\CUSTOM_108.

## Operators Separation

In [11]:
operator_path = r"../../dataset/Operators_Resized/"
save_path = r"../../dataset/Operators_splitted/"
train_ratio = 0.8
# Fixed seed: the same files always land in the same split.  Without it, a
# re-run draws a new split while the old copies are still on disk, so the
# same image can end up in both train and test.
SPLIT_SEED = 42


train_path = os.path.join(save_path, "train")
test_path = os.path.join(save_path, "test")

os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)


for operator_name in os.listdir(operator_path):
    operator_dir = os.path.join(operator_path, operator_name)
    if not os.path.isdir(operator_dir):
        continue

    # Sort first (os.listdir order is arbitrary), then shuffle with a fresh
    # seeded generator so the result depends only on the set of file names.
    all_images = sorted(
        f for f in os.listdir(operator_dir) if os.path.isfile(os.path.join(operator_dir, f))
    )
    random.Random(SPLIT_SEED).shuffle(all_images)

    split_idx = int(len(all_images) * train_ratio)
    train_images = all_images[:split_idx]
    test_images = all_images[split_idx:]

    train_operator_dir = os.path.join(train_path, operator_name)
    test_operator_dir = os.path.join(test_path, operator_name)
    os.makedirs(train_operator_dir, exist_ok=True)
    os.makedirs(test_operator_dir, exist_ok=True)
    try:

        for img_name in train_images:
            src = os.path.join(operator_dir, img_name)
            dst = os.path.join(train_operator_dir, img_name)
            shutil.copy(src, dst)

        for img_name in test_images:
            src = os.path.join(operator_dir, img_name)
            dst = os.path.join(test_operator_dir, img_name)
            shutil.copy(src, dst)

        print(f"Transfer is done for {operator_name}!")
    except Exception as e:
        # The message names this step and the operator that failed, and ends
        # with a newline so log entries do not run together.
        text = f"Error at operator split ({operator_name}): {e}\n"

        with open("datasetC_error_log.txt", "a+") as F:
            F.write(text)
        

Transfer is done for (!
Transfer is done for )!
Transfer is done for +!
Transfer is done for -!
Transfer is done for slash!
Transfer is done for sqrt!
Transfer is done for [!
Transfer is done for ]!


## Custom Set Separation 

In [None]:

operator_path = r"../../dataset/mainDataset_somewords_combined"
save_path = r"../../dataset/mainDataset_somewords_combined_splitted/"
train_ratio = 0.8  # not used by the fixed-count split below

# Fixed-size split per class after shuffling.
TRAIN_COUNT = 3000
# NOTE(review): the slice [3000:3601] keeps up to 601 test images; confirm
# whether 600 was intended.
TEST_END = 3601
# Fixed seed: the same files always land in the same split.  Without it, a
# re-run draws a new split while the old copies are still on disk, so the
# same image can end up in both train and test.
SPLIT_SEED = 42

train_path = os.path.join(save_path, "train")
test_path = os.path.join(save_path, "test")

os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

for operator_name in os.listdir(operator_path):
    if operator_name.startswith("NO_"):
        print(f"Skipped: {operator_name}")
        continue  # skip entries whose name starts with "NO_"

    operator_dir = os.path.join(operator_path, operator_name)
    if not os.path.isdir(operator_dir):
        continue

    # Sort first (os.listdir order is arbitrary), then shuffle with a fresh
    # seeded generator so the result depends only on the set of file names.
    all_images = sorted(
        f for f in os.listdir(operator_dir) if os.path.isfile(os.path.join(operator_dir, f))
    )
    random.Random(SPLIT_SEED).shuffle(all_images)

    train_images = all_images[:TRAIN_COUNT]
    test_images = all_images[TRAIN_COUNT:TEST_END]

    train_operator_dir = os.path.join(train_path, operator_name)
    test_operator_dir = os.path.join(test_path, operator_name)

    os.makedirs(train_operator_dir, exist_ok=True)
    os.makedirs(test_operator_dir, exist_ok=True)

    try:

        # Copies get an "ENDER_" prefix in front of the original file name.
        for img_name in train_images:
            image_path = os.path.join(operator_dir, img_name)
            save_image_path = os.path.join(train_operator_dir, "ENDER_"+img_name)

            shutil.copy(image_path, save_image_path)

        for img_name in test_images:
            image_path = os.path.join(operator_dir, img_name)
            save_image_path = os.path.join(test_operator_dir, "ENDER_"+img_name)

            shutil.copy(image_path, save_image_path)

        print(f"Transfer and processing done for {operator_name}!")

    except Exception as e:
        text = f"Error while processing {operator_name}: {e}\n"
        with open("datasetC_error_log.txt", "a+", encoding="utf-8") as F:
            F.write(text)


Transfer and processing done for 0o!
Transfer and processing done for 5s!
Transfer and processing done for 9g!


# Wanted Letters

In [17]:
import os
import shutil

# Class folders to move out of the main dataset.
notWanted = ["b", "d", "f", "h", "j", "k", "m", "q", "u", "v", "w", "y", "z"]

print(notWanted)

# Source split folders and, position by position, the folder that receives
# the unwanted classes of that split.
main_set = ["../../dataset/mainDataset/train/", "../../dataset/mainDataset/test/"]
main_set_notwanted = ["../../dataset/mainDataset_notWanted/train/", "../../dataset/mainDataset_notWanted/test/"]

for main, main_NW in zip(main_set, main_set_notwanted):
    os.makedirs(main_NW, exist_ok=True)
    print(main)

    for letter in os.listdir(main):
        if letter in notWanted:
            source_dir = os.path.join(main, letter)
            target_dir = os.path.join(main_NW, letter)
            os.makedirs(target_dir, exist_ok=True)

            # Move the files one by one; a failure is reported and the loop
            # carries on with the next file.
            for file_name in os.listdir(source_dir):
                try:
                    shutil.move(
                        os.path.join(source_dir, file_name),
                        os.path.join(target_dir, file_name),
                    )
                except Exception as e:
                    print(f"There Was an error on {file_name}.       Error: {e}")

            # Remove the source folder only once nothing is left in it.
            if not os.listdir(source_dir):
                os.rmdir(source_dir)

            print(f"{letter} DONE")


['b', 'd', 'f', 'h', 'j', 'k', 'm', 'q', 'u', 'v', 'w', 'y', 'z']
../../dataset/mainDataset/train/
../../dataset/mainDataset/test/
