### Import Statements

In [12]:
import numpy as np
import pandas as pd
import sklearn as sk
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as mpl
import json
from PIL import Image, ImageDraw
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [13]:
def read_and_label_files(folder_path):
    all_drawings = []
    for file_path in glob.glob(os.path.join(folder_path, '*.ndjson')):
        with open(file_path, 'r') as file:
            for line in file:
                drawing_data = json.loads(line)
                # Extract drawing and label
                drawing = drawing_data['drawing']
                label = drawing_data['word']
                # Append to the list as a tuple (drawing, label)
                all_drawings.append((drawing, label))
    return all_drawings

def shuffle_data(data, seed=42):
    # Shuffle the data to mix categories
    shuffled_data = shuffle(data, random_state=seed)
    return shuffled_data

def create_batches(data, batch_size=50):
    for i in range(0, len(data), batch_size):
        batch_data = data[i:i+batch_size]
        yield batch_data

In [14]:
def get_labels(folder_path):
    """Get all file paths and labels in the given folder."""
    file_paths = glob.glob(os.path.join(folder_path, '*.ndjson'))
    labels = [os.path.basename(path).split('.')[0] for path in file_paths] # Assumes the file name represents the category
    return labels

folder_path = "reduced_data/"
labels = get_labels(folder_path)
label_encoder = LabelEncoder()
label_encoder.fit(labels)  # Fit on all possible labels

data = shuffle_data(read_and_label_files(folder_path))

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [15]:
def strokes_to_image(strokes, image_size=(256, 256)):
    """Convert stroke data to an image."""
    image = Image.new("L", image_size, color=0)
    draw = ImageDraw.Draw(image)
    
    for stroke in strokes:
        for i in range(len(stroke[0]) - 1):
            draw.line([stroke[0][i], stroke[1][i], stroke[0][i + 1], stroke[1][i + 1]], fill=255, width=2)

    return np.array(image)


def load_data_in_batches(data, batch_size=50, image_size=(256, 256), num_classes=345):
    """Load and preprocess data directly from tuples in batches, with labels one-hot encoded."""
    batch_X, batch_y = [], []
    for drawing, label in data:
        image = strokes_to_image(drawing, image_size=image_size)
        batch_X.append(image)
        
        label_idx = label_encoder.transform([label])[0]  # Convert label to index
        label_one_hot = to_categorical(label_idx, num_classes=num_classes)
        batch_y.append(label_one_hot)

        if len(batch_X) == batch_size:
            yield (np.array(batch_X), np.array(batch_y))
            batch_X, batch_y = [], []

    if batch_X:  # Yield any remaining data as the last batch
        yield (np.array(batch_X), np.array(batch_y))


# Example usage


# Model definition
def create_model(input_shape, num_classes):
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.Flatten())
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    return model





In [16]:
# Assuming you have a way to determine the input shape and number of classes
model = create_model(input_shape=(256, 256, 1), num_classes=345)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['AUC'])

  super().__init__(


In [17]:
checkpoint_path = "training_1/epoch_{epoch:02d}-val_loss_{val_loss:.2f}.weights.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = ModelCheckpoint(filepath=checkpoint_path,
                              save_weights_only=True,
                              verbose=1,
                              save_best_only=False) # `period` is optional here as it defaults to 1


In [19]:
# Assuming `load_data_in_batches` is corrected to encode labels within it
train_generator = load_data_in_batches(train_data, batch_size=256)
test_generator = load_data_in_batches(test_data, batch_size=256)

# Calculate steps per epoch for training and validation
steps_per_epoch = len(train_data) // 256
validation_steps = len(test_data) // 256

# Training with validation
model.fit(train_generator, 
          epochs=10,  # Or however many epochs you wish to train for
          steps_per_epoch=steps_per_epoch,
          validation_data=test_generator,
          callbacks = [cp_callback],
          validation_steps=validation_steps)


Epoch 1/10
[1m   2/5390[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13:19:01[0m 9s/step - AUC: 0.6176 - loss: 32.5698 

In [None]:
model.save('my_model.h5')