In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.linear_model import LogisticRegression
import warnings

In [None]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation and solver warnings
# that later cells may raise — confirm this is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load MNIST through Keras and unpack it into (images, labels) pairs for the
# train and test splits.
(X_train, y_train), (X_test, y_test) =tf.keras.datasets.mnist.load_data()

In [None]:
# Sanity check of the split sizes: image arrays first, label arrays second.
print(f'{X_train.shape} {X_test.shape}')
print(f'{y_train.shape} {y_test.shape}')

In [None]:
# Show the first training image with its label as the title.
fig, ax = plt.subplots()
ax.set_title(f'Digit: {y_train[0]}')
ax.imshow(X_train[0], cmap='gray')

In [None]:
def pad(arr: np.ndarray, h_pad: int = 0, v_pad: int = 0):
    """Zero-pad every image of a stack along its two trailing axes.

    Parameters
    ----------
    arr : np.ndarray
        3-D array indexed as (sample, row, column).
    h_pad : int
        Number of zero columns added on both the left and the right (axis 2).
    v_pad : int
        Number of zero rows added on both the top and the bottom (axis 1).

    Returns
    -------
    np.ndarray
        A new array of shape ``(n, rows + 2 * v_pad, cols + 2 * h_pad)``.
        The result is promoted to at least float64, so integer input comes
        back as floats.
    """
    # Promote up front so the output dtype is float64 (or wider) regardless
    # of the input dtype.
    promoted = arr.astype(np.result_type(arr.dtype, np.float64))

    # No padding on the sample axis; symmetric padding on rows and columns.
    widths = ((0, 0), (v_pad, v_pad), (h_pad, h_pad))
    return np.pad(promoted, widths, mode='constant', constant_values=0)


In [None]:
def sort_columns_hilbert(arr, cache=None):
    """Flatten square images into rows whose pixels follow a 2-D Hilbert curve.

    Parameters
    ----------
    arr : np.ndarray
        Image stack indexed as (sample, row, column). ``arr.shape[1]`` is
        taken as the side length and must be a power of two (>= 2).
        Not inspected at all when ``cache`` is given.
    cache : str or path-like, optional
        Path to a pickle holding a previously computed result. When given,
        the pickle is loaded and returned as-is.

    Returns
    -------
    np.ndarray
        Without ``cache``: a new array of shape ``(n_samples, side ** 2)``
        with dtype float64 (or wider); column ``d`` holds the pixel at the
        curve point for distance ``d``, using the point's first coordinate as
        the row index and its second as the column index.
        With ``cache``: whatever object the pickle contains.

    Raises
    ------
    ValueError
        If the side length is not a power of two >= 2.
    """
    if cache is not None:
        import pickle
        # SECURITY: unpickling can execute arbitrary code. Only point `cache`
        # at files produced by this project.
        with open(file=cache, mode='rb') as f:
            return pickle.load(f)

    # Imported lazily so the cached path works without the package installed.
    from hilbertcurve.hilbertcurve import HilbertCurve

    side = arr.shape[1]
    if side < 2 or side & (side - 1):
        raise ValueError(
            f'image side must be a power of two >= 2 for a Hilbert curve, got {side}'
        )

    # Exact integer log2 of a power of two (avoids passing a float order).
    order = side.bit_length() - 1
    curve = HilbertCurve(order, 2)
    points = np.asarray(curve.points_from_distances(range(side ** 2)))

    # One fancy-indexing gather picks all pixels in curve order at once,
    # instead of growing the result one column at a time.
    reordered = arr[:, points[:, 0], points[:, 1]]

    # Keep the float output dtype regardless of the input dtype.
    return reordered.astype(np.result_type(reordered.dtype, np.float64), copy=False)

In [None]:
def compress(arr, chunk_size=4, agg=np.mean):
    """Reduce groups of adjacent columns of a 2-D array to one value each.

    Parameters
    ----------
    arr : np.ndarray
        2-D array of shape (n_samples, n_columns).
    chunk_size : int
        Target number of adjacent columns per group. Values <= 1 disable
        compression.
    agg : callable
        Reduction called as ``agg(chunk, axis=1)``; defaults to ``np.mean``.

    Returns
    -------
    np.ndarray
        ``arr`` itself (not a copy) when ``chunk_size <= 1``. Otherwise a new
        float array of shape ``(n_samples, n_chunks)`` where
        ``n_chunks = max(1, int(n_columns / chunk_size))``. When
        ``chunk_size`` does not divide ``n_columns``, ``np.array_split``
        makes the leading chunks one column wider to absorb the remainder.
    """
    if chunk_size <= 1:
        return arr

    # Derive the number of chunks from the actual width so that inputs of any
    # column count are grouped by `chunk_size`.
    n_chunks = max(1, int(arr.shape[1] / chunk_size))

    # Reduce every chunk first and concatenate once at the end.
    reduced = [
        agg(chunk, axis=1).reshape((-1, 1))
        for chunk in np.array_split(ary=arr, indices_or_sections=n_chunks, axis=1)
    ]
    out = np.concatenate(reduced, axis=1)

    # Always hand back floats, whatever dtype `agg` produced.
    return out.astype(np.result_type(out.dtype, np.float64), copy=False)

In [None]:
def save_obj_to_binary(obj, filepath):
    """Pickle ``obj`` to ``filepath``, reporting the outcome as a status code.

    Parameters
    ----------
    obj : object
        Any picklable object.
    filepath : str or path-like
        Destination file; opened in binary write mode, so existing content
        is overwritten.

    Returns
    -------
    int
        0 on success, 1 if opening the file or pickling failed. On failure a
        ``RuntimeWarning`` describing the cause is issued.

    Notes
    -----
    The file is opened before pickling starts, so a pickling failure can
    leave an empty or partial file behind.
    """
    import pickle
    import warnings

    try:
        with open(file=filepath, mode='wb') as f:
            pickle.dump(obj=obj, file=f)
    except Exception as exc:
        # Best-effort save: ordinary errors become a status code, while
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        warnings.warn(f'could not save object to {filepath!r}: {exc!r}', RuntimeWarning)
        return 1
    return 0

In [None]:
def evaluate_model(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by
        ``sklearn.metrics.accuracy_score``.
    """
    from sklearn.metrics import accuracy_score

    model.fit(X_train, y_train)

    # Score the training split first, then the held-out split.
    splits = ((X_train, y_train), (X_test, y_test))
    return tuple(
        accuracy_score(labels, model.predict(features))
        for features, labels in splits
    )

In [None]:
cs = 4

# Baseline: flatten each image row-major into one feature row, then compress.
# len(...) and -1 let NumPy infer the sample and pixel counts, so the cell also
# works on a subset of the data or on images of another size.
X_train2 = compress(X_train.reshape((len(X_train), -1)), chunk_size=cs)
X_test2 = compress(X_test.reshape((len(X_test), -1)), chunk_size=cs)

evaluate_model(X_train2, y_train, X_test2, y_test, LogisticRegression(max_iter=20))

In [None]:
cs = 4

# Same pipeline for both splits: pad by 2 pixels per side, take the
# Hilbert-ordered pixels (loaded from the pickle given as `cache`), then
# compress. The train split is processed first, the test split second.
X_train3, X_test3 = (
    compress(arr=sort_columns_hilbert(pad(images, 2, 2), cache=cache_path), chunk_size=cs)
    for images, cache_path in (
        (X_train, '../models/reordered_train.p'),
        (X_test, '../models/reordered_test.p'),
    )
)

evaluate_model(X_train3, y_train, X_test3, y_test, LogisticRegression(max_iter=20))


In [None]:
def find_best_cs(X_train, X_test, y_train, y_test, start=2, end=32):
    """Score a logistic regression for every chunk size in ``[start, end]``.

    Parameters
    ----------
    X_train, X_test
        Image stacks; each is padded by 2 pixels per side and passed to
        ``sort_columns_hilbert`` together with the pickled cache path.
    y_train, y_test
        Labels for the two splits.
    start, end : int
        Inclusive range of chunk sizes passed to ``compress``.

    Returns
    -------
    np.ndarray
        One row per chunk size, in increasing order; column 0 is the training
        accuracy and column 1 the test accuracy.
    """
    # The Hilbert-ordered data does not depend on the chunk size, so prepare
    # it once instead of on every iteration.
    train_sorted = sort_columns_hilbert(pad(X_train, 2, 2), cache='../models/reordered_train.p')
    test_sorted = sort_columns_hilbert(pad(X_test, 2, 2), cache='../models/reordered_test.p')

    scores = []
    for i in range(start, end + 1):
        print(f'\rIteration {i} / {end}', end='')
        X_train_ = compress(arr=train_sorted, chunk_size=i)
        X_test_ = compress(arr=test_sorted, chunk_size=i)
        scores.append(list(evaluate_model(X_train_, y_train, X_test_, y_test,
                                          LogisticRegression(max_iter=10))))
    return np.array(scores)

grid_search_chunk_size = find_best_cs(X_train, X_test, y_train, y_test)

In [None]:
# Optional: uncomment the next line to persist the grid-search scores to disk.
# save_obj_to_binary(obj=grid_search_chunk_size, filepath='../models/grid_search_cs.p')

In [None]:
# Row k of the scores belongs to chunk size 2 + k, because find_best_cs was
# called with its default start of 2. Plot against the chunk size itself.
chunk_sizes = np.arange(2, 2 + grid_search_chunk_size.shape[0])

# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally. Column 0 is training accuracy, column 1 is test accuracy.
ax = sb.lineplot(x=chunk_sizes, y=grid_search_chunk_size[:, 0], label='train accuracy')
sb.lineplot(x=chunk_sizes, y=grid_search_chunk_size[:, 1], label='test accuracy', ax=ax)
ax.set(xlabel='chunk size', ylabel='accuracy')

In [None]:
# Divide the compressed Hilbert-ordered features by 255 and re-run the same
# model on the rescaled data.
# NOTE(review): presumably this maps 8-bit pixel intensities into [0, 1] —
# confirm the cached arrays hold raw 0..255 values.
X_train4 = X_train3 / 255.0
X_test4 = X_test3 / 255.0

evaluate_model(X_train4, y_train, X_test4, y_test, LogisticRegression(max_iter=20))