In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import cv2
import os

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

DATASET_PATH = "dataset"
TRAIN_FILE = os.path.join(DATASET_PATH, "emnist-byclass-train.csv")
TEST_FILE = os.path.join(DATASET_PATH, "emnist-byclass-test.csv")
MAPPING_FILE = os.path.join(DATASET_PATH, "emnist-byclass-mapping.txt")

In [3]:
def load_emnist_byclass():
    # Загружаем тренировочные и тестовые данные
    train_data = pd.read_csv(TRAIN_FILE)
    test_data = pd.read_csv(TEST_FILE)
    
    # Маппинг классов (номера к символам)
    # Формат: номер_класса ASCII_код
    mapping = pd.read_csv(MAPPING_FILE, 
                          sep=' ', 
                          header=None, 
                          names=['class', 'ascii'])
    
    print(f"Тренировочные данные: {train_data.shape}")
    print(f"Тестовые данные: {test_data.shape}")
    print(f"Количество классов: {len(mapping)}")
    print(f"Пример маппинга:\n{mapping.head(10)}")
    
    return train_data, test_data, mapping

# Загрузка данных
train_data, test_data, mapping = load_emnist_byclass()

Тренировочные данные: (697931, 785)
Тестовые данные: (116322, 785)
Количество классов: 62
Пример маппинга:
   class  ascii
0      0     48
1      1     49
2      2     50
3      3     51
4      4     52
5      5     53
6      6     54
7      7     55
8      8     56
9      9     57


In [4]:
def prepare_emnist_data(train_data, test_data):
    X_train = train_data.iloc[:, 1:].values
    y_train = train_data.iloc[:, 0].values
    X_test = test_data.iloc[:, 1:].values
    y_test = test_data.iloc[:, 0].values
    print(f"   X_train shape: {X_train.shape}")
    print(f"   y_train shape: {y_train.shape}")

    X_train = X_train.reshape(-1, 28, 28, 1)  #(все строки, 28 на 28, 1 канал)
    X_test = X_test.reshape(-1, 28, 28, 1)

    X_train = np.transpose(X_train, (0, 2, 1, 3))
    X_test = np.transpose(X_test, (0, 2, 1, 3))

    X_train = X_train.astype('float32') / 255.0
    X_test = X_test.astype('float32') / 255.0
    
    num_classes = 62
    y_train_cat = keras.utils.to_categorical(y_train, num_classes)
    y_test_cat = keras.utils.to_categorical(y_test, num_classes)
    
    print(f"   После преобразования:")
    print(f"   X_train shape: {X_train.shape}")
    print(f"   y_train_cat shape: {y_train_cat.shape}")
    
    return X_train, X_test, y_train_cat, y_test_cat, y_train, y_test

X_train, X_test, y_train_cat, y_test_cat, y_train, y_test = prepare_emnist_data(train_data, test_data)

   X_train shape: (697931, 784)
   y_train shape: (697931,)
   После преобразования:
   X_train shape: (697931, 28, 28, 1)
   y_train_cat shape: (697931, 62)


In [8]:
def create_cnn_model_improved(input_shape=(28, 28, 1), num_classes=62):
    model = keras.Sequential([
        # Первый сверточный блок
        layers.Conv2D(20, (3, 3), activation='relu', input_shape=input_shape),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.2),
        
        # Второй сверточный блок  
        layers.Conv2D(40, (3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.3),
        
        # Полносвязный слой
        layers.Flatten(),
        layers.Dense(80, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(num_classes, activation='softmax')
    ])
    
    return model

# Создаем модель
model = create_cnn_model_improved()
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [10]:
from sklearn.model_selection import train_test_split

# Разделяем тренировочные данные на train/validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train_cat, 
    test_size=0.2, 
    random_state=42,
    stratify=y_train  # сохраняем пропорции классов
)

print(f"Размеры:")
print(f"  X_train_split: {X_train_split.shape}")
print(f"  X_val: {X_val.shape}")

history = model.fit(
    X_train_split, y_train_split,
    validation_data=(X_val, y_val),
    epochs=30,  # можно начать с 30 эпох
    batch_size=64,
    verbose=1
)

Размеры:
  X_train_split: (558344, 28, 28, 1)
  X_val: (139587, 28, 28, 1)
Epoch 1/30
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 37ms/step - accuracy: 0.7745 - loss: 0.7200 - val_accuracy: 0.8431 - val_loss: 0.4488
Epoch 2/30
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 30ms/step - accuracy: 0.8158 - loss: 0.5485 - val_accuracy: 0.8486 - val_loss: 0.4267
Epoch 3/30
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 32ms/step - accuracy: 0.8223 - loss: 0.5249 - val_accuracy: 0.8516 - val_loss: 0.4129
Epoch 4/30
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 39ms/step - accuracy: 0.8266 - loss: 0.5092 - val_accuracy: 0.8522 - val_loss: 0.4102
Epoch 5/30
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 40ms/step - accuracy: 0.8279 - loss: 0.5010 - val_accuracy: 0.8523 - val_loss: 0.4055
Epoch 6/30
[1m8725/8725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 42ms/step

In [11]:
# Сохранение модели в формате .keras
model.save('models/CNN_model_improver.keras')

# Сохранение маппинга для использования в приложении
mapping.to_csv('models/emnist_mapping.csv', index=False)
print("Маппинг сохранен как 'emnist_mapping.csv'")

Маппинг сохранен как 'emnist_mapping.csv'
