# Kaggle Download

In [74]:
import kagglehub

# Fetch the "tmdb/tmdb-movie-metadata" dataset through kagglehub and report
# the location that dataset_download() returned.
path = kagglehub.dataset_download("tmdb/tmdb-movie-metadata")
print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/tmdb/tmdb-movie-metadata?dataset_version_number=2...


100%|██████████| 8.89M/8.89M [00:01<00:00, 7.33MB/s]

Extracting files...





Path to dataset files: C:\Users\EnderEfe\.cache\kagglehub\datasets\tmdb\tmdb-movie-metadata\versions\2


# Custom Dataset Processing

## Importing Required Libraries and Definitions

In [None]:
import os
import cv2
import numpy as np

# Root folder whose sub-folders are scanned by the page-processing loop.
# A raw string keeps backslashes literally, so they must NOT be doubled here:
# the previous r"E:\\..." spelling put two backslashes between every component.
# NOTE(review): this is a machine-specific absolute path; adjust it per machine.
dataset_path = r"E:\Python_Projeler\ComputerVisionProjects\FinalProject\dataset"

## Character Mapping

In [None]:
# Maps the zero-padded four-digit page number ("0001" .. "0042") to the label
# of that page; the N-th entry of the tuple below is the label of page N.
# Several pages intentionally share a label ('vertical_line', 'sqrt').
letter_dict = {
    f"{page_no:04d}": page_label
    for page_no, page_label in enumerate(
        (
            'a', 'b', 'c', 'd', 'e', 'f', 'h', 'vertical_line', 'j', 'k',
            'vertical_line', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
            'v', 'w', 'x', 'y', 'z',
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            'plus', 'horizontal_line', 'slash',
            'paranthesis_left', 'paranthesis_right', 'sqrt', 'sqrt',
        ),
        start=1,
    )
}

# One running counter per distinct label, all starting at zero.
label_dict = dict.fromkeys(set(letter_dict.values()), 0)

## Character Extraction Function

In [None]:
def ExtractSet(path, save_path, name=None):
    """Split a scanned page into individual symbols and save each as a 64x64 JPEG.

    The page is read as grayscale, blurred, binarised with an inverted Otsu
    threshold and dilated. Every external contour of the dilated image is
    treated as one symbol: it is masked out, centred on a square zero-filled
    canvas whose side is the longer edge of its bounding box, resized to
    64x64 and written into ``save_path``.

    Args:
        path: Path of the page image to read.
        save_path: Directory that receives the ``<index>.jpg`` files. It is
            not created here, so it has to exist already.
        name: Optional key of the module-level ``label_dict``. When given,
            files are numbered with the running counter stored in
            ``label_dict[name]`` (advanced once per symbol); otherwise the
            contour's position in the contour list is used.

    Raises:
        OSError: If the page cannot be read or a symbol cannot be written.
        KeyError: If ``name`` is given but is not a key of ``label_dict``.
    """
    page = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if page is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a readable message rather than inside GaussianBlur.
        raise OSError(f"cv2.imread could not read or decode: {path}")

    kernel = np.ones((4, 4), np.uint8)
    blur = cv2.GaussianBlur(page, (3, 3), 1)
    _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Heavily dilated copy: only used to find contours and build the masks.
    dilation = cv2.dilate(thresh, kernel, iterations=2)
    # Copy the pixels are actually taken from (1x1 kernel).
    dilation_2 = cv2.dilate(thresh, kernel=np.ones((1, 1), np.uint8))  # Optional, may vary per dataset

    contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for index, contour in enumerate(contours):
        mask = np.zeros_like(dilation)
        cv2.drawContours(mask, [contour], -1, 255, thickness=cv2.FILLED)
        MaskedLetter = cv2.bitwise_and(dilation_2, dilation_2, mask=mask)

        if name is not None:
            # Number the file with the per-label counter instead of the contour index.
            index = label_dict[name]
            label_dict[name] = index + 1

        x, y, w, h = cv2.boundingRect(contour)
        max_edge = max(h, w)
        blank = np.zeros((max_edge, max_edge), np.uint8)

        # Top-left corner that centres the h x w crop on the square canvas.
        top = (max_edge - h) // 2
        left = (max_edge - w) // 2

        blank[top:top + h, left:left + w] = MaskedLetter[y:y + h, x:x + w]
        letter = cv2.resize(blank, (64, 64))

        # os.path.join instead of a hard-coded "\\" keeps the path valid off Windows.
        target = os.path.join(save_path, f"{index}.jpg")
        if not cv2.imwrite(target, letter):
            # cv2.imwrite signals a failed write by returning False.
            raise OSError(f"cv2.imwrite could not write: {target}")

## Process Dataset Image Pages

In [None]:
# Walk every page-set folder under dataset_path and split each scanned .jpg
# page into single-symbol images stored in a sub-folder named after the label.
for dSet_folder in os.listdir(dataset_path):
    dSet_path = os.path.join(dataset_path, dSet_folder)

    # Names containing a dot were always skipped; additionally skip anything
    # that is not a directory, so an extension-less file no longer makes
    # os.listdir() below raise NotADirectoryError.
    if "." in dSet_folder or not os.path.isdir(dSet_path):
        continue

    for letter_page in os.listdir(dSet_path):
        if letter_page.split(".")[-1] != "jpg":
            continue

        letter_page_path = os.path.join(dSet_path, letter_page)
        # Page number = text after the last "-" in the part before the first ".".
        letter_number = letter_page.split(".")[0].split("-")[-1]

        letter_name = letter_dict.get(letter_number)
        if letter_name is None:
            # A stray .jpg whose number is not in letter_dict used to abort
            # the whole run with KeyError; report it and keep going instead.
            print(f"Skipping {letter_page_path}: no label for page number {letter_number!r}")
            continue

        print(f"{letter_name}        {letter_number}")

        save_path = os.path.join(dSet_path, letter_name)
        os.makedirs(save_path, exist_ok=True)

        ExtractSet(letter_page_path, save_path, letter_name)

# EMNIST Extraction

In [70]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import gzip

EMNIST_type = "balanced"

# NOTE: this mapping is valid for the "balanced" split ONLY. For any other
# split, rebuild it from the matching emnist-TYPE-mapping.txt file.
#
# Class ids 0-9 map to the digit itself, 10-35 walk the uppercase alphabet in
# order and 36-46 walk the lowercase letters listed below. An uppercase class
# that is not in the kept set maps to None, which marks it as skipped.
LabelMapping = {
    **{class_id: str(class_id) for class_id in range(10)},
    **{
        10 + offset: (f"upper_{letter.lower()}" if letter in "CJKLMOPSUVWXYZ" else None)
        for offset, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    },
    **{36 + offset: f"lower_{letter}" for offset, letter in enumerate("abdefghnqrt")},
}

# Characters that are counted but never written to disk.
notWanted = list("kjmqzvu")

# Running per-character counter; the key order of the original table is kept
# ("9" before "8", "z" ahead of the rest of the alphabet).
LabelCounter = dict.fromkeys("0123456798zabcdefghijklmnopqrstuvwxy", 0)

In [71]:
def ReadIdxImage(file):
    """Read a gzip-compressed IDX3 image file into a uint8 array.

    The 16-byte header holds four big-endian 32-bit integers: magic number,
    item count, row count and column count. Everything after the header is
    interpreted as unsigned bytes.

    Args:
        file: Path of the ``.gz`` file to read.

    Returns:
        numpy.ndarray of dtype uint8 with shape (num_items, num_rows, num_cols).
        The array comes from ``np.frombuffer`` and is therefore read-only.

    Raises:
        ValueError: If the magic number is not the IDX3/unsigned-byte value
            0x00000803, or the payload size does not match the header.
    """
    with gzip.open(file, "rb") as F:
        magic_number = int.from_bytes(F.read(4), 'big')
        # 0x08 = unsigned byte data, 0x03 = three dimensions. Rejecting anything
        # else stops e.g. a labels file from being silently misparsed as images.
        if magic_number != 0x00000803:
            raise ValueError(
                f"{file}: not an IDX3 unsigned-byte image file (magic number {magic_number:#010x})"
            )

        num_items = int.from_bytes(F.read(4), 'big')
        num_rows = int.from_bytes(F.read(4), 'big')
        num_cols = int.from_bytes(F.read(4), 'big')

        data = np.frombuffer(F.read(), dtype=np.uint8)

    expected_size = num_items * num_rows * num_cols
    if data.size != expected_size:
        raise ValueError(
            f"{file}: header announces {expected_size} pixel bytes but {data.size} were read"
        )

    return data.reshape(num_items, num_rows, num_cols)


def ReadIdxLabel(file):
    """Read a gzip-compressed IDX1 label file and return the raw label bytes.

    The 8-byte header holds two big-endian 32-bit integers: magic number and
    item count. Everything after the header is returned unchanged.

    Args:
        file: Path of the ``.gz`` file to read.

    Returns:
        bytes: One byte per label; iterating over it yields ints.

    Raises:
        ValueError: If the magic number is not the IDX1/unsigned-byte value
            0x00000801, or the number of label bytes differs from the header.
    """
    with gzip.open(file, "rb") as F:
        magic_number = int.from_bytes(F.read(4), 'big')
        # 0x08 = unsigned byte data, 0x01 = one dimension.
        if magic_number != 0x00000801:
            raise ValueError(
                f"{file}: not an IDX1 unsigned-byte label file (magic number {magic_number:#010x})"
            )

        num_items = int.from_bytes(F.read(4), 'big')
        labels = F.read()

    if len(labels) != num_items:
        raise ValueError(
            f"{file}: header announces {num_items} labels but {len(labels)} were read"
        )

    return labels

In [72]:


# Export both EMNIST splits as 48x48 PNG files, one folder per character.
# NOTE(review): LabelCounter is shared by both splits and is not reset here,
# so file numbering in "test" continues from "train" (and keeps growing when
# the cell is re-run) - confirm that this is intended.
for train_or_test in ["train", "test"]:

    images_path = f"EMNIST/{EMNIST_type}/emnist-{EMNIST_type}-{train_or_test}-images-idx3-ubyte.gz"
    labels_path = f"EMNIST/{EMNIST_type}/emnist-{EMNIST_type}-{train_or_test}-labels-idx1-ubyte.gz"

    save_path = f"../../dataset/EMNIST/{EMNIST_type}/{train_or_test}"

    os.makedirs(save_path, exist_ok=True)

    images = ReadIdxImage(images_path)
    labels = ReadIdxLabel(labels_path)

    # Create one output folder per mapped character (last character of the
    # mapping name, e.g. "upper_c" -> "c").
    for label in LabelMapping:
        char = LabelMapping[label]
        if char is None:
            continue
        char = char[-1]

        path = save_path + "/" + char
        os.makedirs(path, exist_ok=True)
        print(f"Folder {path} is created or already exists")

    error_text = ""

    for image, label in zip(images, labels):
        if LabelMapping[label] is None:
            continue

        char = LabelMapping[label][-1]

        # Unwanted characters are counted but never written to disk.
        if char in notWanted:
            LabelCounter[char] += 1
            continue

        char_index = LabelCounter[char]

        try:
            save = save_path + "/" + char + "/"

            image = cv2.resize(image, (48, 48), interpolation=cv2.INTER_NEAREST)
            # cv2.imwrite signals a failed write by returning False, not by
            # raising; turn that into an exception so it is logged below and
            # the counter is not advanced for a file that was never written.
            if not cv2.imwrite(save + f"{char_index}.png", image.T):
                raise OSError(f"cv2.imwrite could not write {save}{char_index}.png")
            LabelCounter[char] += 1

        except Exception as e:
            error_text += f"There was an error on {char} - {EMNIST_type}  -    Error: {e}\n\n"

    # Only touch the log when there is something to report, so a non-writable
    # location cannot abort an otherwise clean run.
    # NOTE(review): the leading "/" puts the log at the filesystem/drive root -
    # confirm that this location is intended.
    if error_text:
        with open("/datasetC_error_log.txt", "a+") as F:
            F.write(error_text)

    print("EMNIST Dataset Extraction is completed.")
    print(LabelCounter)



Folder ../../dataset/EMNIST/balanced/train/0 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/1 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/2 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/3 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/4 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/5 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/6 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/7 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/8 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/9 is created or already exists
Folder ../../dataset/EMNIST/balanced/train/c is created or already exists
Folder ../../dataset/EMNIST/balanced/train/j is created or already exists
Folder ../../dataset/EMNIST/balanced/train/k is created or already exists
Folder ../../dataset/EMNIST/balanced/t

# Operators Dilation

In [None]:
path = r"../../dataset/Operators/"

# NOTE(review): the loop body was never written, which made this cell a
# SyntaxError and broke "Restart & Run All". `pass` keeps the notebook
# runnable until the step announced by the section heading is implemented.
for folder in os.listdir(path):
    pass  # TODO: implement the per-folder operator dilation step

# All Sets Combination

In [None]:
# Locations of the individual datasets, relative to the notebook's working
# directory. The first two end with a trailing slash; the custom sets do not.
EMNIST_balanced_path = "../../dataset/EMNIST/balanced/"
Operator_path = "../../dataset/Operators/"
Custom_set_1 = "../../dataset/berna"
Custom_set_2 = "../../dataset/ender"


