# ws 03 Denoise Text

- Dataset: https://www.kaggle.com/c/denoising-dirty-documents/data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras import Model, Sequential 
from tensorflow.keras.layers import (InputLayer, Conv2D, 
                                     Dropout, Conv2DTranspose) 
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Google Colab only: mount Google Drive under /content/drive so the dataset
# folder used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

# IPython shell magic: list the datasets folder to confirm the mount worked.
%ls '/content/drive/MyDrive/Colab Notebooks/datasets'

In [None]:
# Dataset root directory. Keep the trailing '/': later cells build file paths
# by plain string concatenation (e.g. path + 'train/' + name).
path = '/content/drive/MyDrive/Colab Notebooks/datasets/autoencoder/'
# Alternative root for a local (non-Colab) run:
# path = 'datasets/'

In [None]:
# Size (in pixels) every image is resized to when loaded.
# Alternative smaller setting: width 300, height 240.
imgW = 540
imgH = 420

In [None]:
# File names found in each dataset folder, sorted so every list has a
# deterministic order.
train_img, train_cleaned_img, test_img = (
    sorted(os.listdir(path + subdir))
    for subdir in ('/train', '/train_cleaned', '/test')
)

# Report how many noisy inputs, cleaned targets and test images were found.
print(f'X Train: {len(train_img)} , Y Train: {len(train_cleaned_img)}')
print('X Test:', len(test_img))

# Show the first few noisy-image file names.
train_img[:5]

In [None]:
# Load the first noisy training image as grayscale at the working size and
# print the shape of its array form.
img = load_img(
    path+'train/'+train_img[0],
    color_mode='grayscale',
    target_size=(imgH, imgW),
)
print(img_to_array(img).shape)

# Last expression of the cell: the notebook renders the image.
img

In [None]:
# Preview the first cleaned (target) image at the working size.
# The file name comes from the cleaned-folder listing itself rather than from
# the noisy listing, so this cell does not rely on both folders using
# identical file names.
img = load_img(path+'train_cleaned/'+train_cleaned_img[0], target_size=(imgH, imgW))
# Last expression of the cell: the notebook renders the image.
img

In [None]:
# Preview the first test image, resized to the working size.
img = load_img(
    path+'test/'+test_img[0],
    target_size=(imgH, imgW),
)
# Last expression of the cell: the notebook renders the image.
img

In [None]:
def process_image(file):
    """Load one image file as a model-ready array.

    The image is read in grayscale, resized to (imgH, imgW), converted to an
    array with `img_to_array`, and divided by 255.0.

    Args:
        file: Path of the image file to load.

    Returns:
        The scaled image array.
    """
    pixels = img_to_array(
        load_img(file, target_size=(imgH, imgW), color_mode='grayscale')
    )
    return pixels / 255.0

In [None]:
# Preprocess every file of each split into a list of arrays; tqdm shows a
# progress bar per split.
train = [process_image(path + 'train/' + name) for name in tqdm(train_img)]

train_cleaned = [
    process_image(path + 'train_cleaned/' + name)
    for name in tqdm(train_cleaned_img)
]

test = [process_image(path + 'test/' + name) for name in tqdm(test_img)]


In [None]:
# Sanity check: shape of the first preprocessed training image array.
print('Train0 shape:', train[0].shape)

In [None]:
# Show four noisy/clean training pairs side by side in a 4x2 grid.
# i takes the values 0, 2, 4, 6: it is both the image index and the offset of
# the subplot row, so every other image is displayed.
plt.figure(figsize=(15,22))
panels = (
    (train, train_img, 'Noise image: {}'),
    # The clean panel is titled from the cleaned-folder listing, so the title
    # names the file that is actually displayed.
    (train_cleaned, train_cleaned_img, 'Clean image: {}'),
)
for i in range(0, 8, 2):
    for column, (images, names, title) in enumerate(panels, start=1):
        plt.subplot(4, 2, i + column)
        plt.xticks([])
        plt.yticks([])
        # Drop the trailing channel axis so imshow receives a 2-D array.
        plt.imshow(images[i][:, :, 0], cmap='gray')
        plt.title(title.format(names[i]))

plt.show()

In [None]:
# Stack the per-image arrays of each split into single numpy arrays.
X_train, y_train, X_test = (
    np.asarray(images) for images in (train, train_cleaned, test)
)

# Hold out 15% of the noisy/clean pairs for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
)

# Last expression of the cell: show the resulting array shapes.
X_train.shape, X_val.shape

### Model

In [None]:
class Denoise(Model):
    """Convolutional autoencoder mapping a noisy image to a cleaned image.

    Encoder: two 3x3 stride-2 convolutions (64, then 128 filters, ReLU)
    followed by Dropout(0.4).
    Decoder: two 3x3 stride-2 transposed convolutions (128, then 64 filters,
    ReLU) and a final 3x3 single-filter convolution with a sigmoid activation.

    NOTE: per Keras' 'same'-padding shape rules, each stride-2 convolution
    halves the spatial size (rounded up) and each stride-2 transposed
    convolution doubles it, so the output should match the input size only
    when height and width are multiples of 4.
    """

    def __init__(self):
        super().__init__()

        # Downsampling path.
        encoder_layers = [
            Conv2D(64, kernel_size=3, strides=2, padding='same', activation='relu'),
            Conv2D(128, kernel_size=3, strides=2, padding='same', activation='relu'),
            Dropout(0.4),
        ]
        self.encoder = Sequential(encoder_layers)

        # Upsampling path ending in a one-channel sigmoid output.
        decoder_layers = [
            Conv2DTranspose(128, kernel_size=3, strides=2, padding='same', activation='relu'),
            Conv2DTranspose(64, kernel_size=3, strides=2, padding='same', activation='relu'),
            Conv2D(1, kernel_size=3, padding='same', activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, x):
        """Run `x` through the encoder and then the decoder."""
        return self.decoder(self.encoder(x))

# Build the model and configure training: Adam optimizer with a per-pixel
# binary cross-entropy loss.
autoencoder = Denoise()
autoencoder.compile(
    loss='binary_crossentropy',
    optimizer='adam',
)

In [None]:
# Stop training once the validation loss has not improved for 20 epochs, and
# roll the model back to the weights of its best epoch.
# Validation data is passed to fit(), so early stopping tracks 'val_loss'
# instead of the training loss; the training loss typically keeps decreasing
# even when the model has started to overfit.
callback = EarlyStopping(monitor='val_loss', patience=20,
                         restore_best_weights=True)
# `history` keeps the per-epoch 'loss' and 'val_loss' curves for plotting.
history = autoencoder.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=100,
                    batch_size=16, verbose=1, callbacks=[callback])

In [None]:
# Print the layer summary of the encoder, then of the decoder.
for part in (autoencoder.encoder, autoencoder.decoder):
    part.summary()

In [None]:
# Plot the training and validation loss recorded for each epoch.
curves = history.history

plt.figure(figsize=(5, 3.8))
plt.plot(curves['loss'], color='b', label="Training")
plt.plot(curves['val_loss'], 'r', lw=3, label="Validation")
plt.title('Loss')
plt.legend()
plt.show()

In [None]:
# Denoise all test images with the trained autoencoder, 16 images per batch.
Y_pred = autoencoder.predict(X_test, batch_size=16)  

In [None]:
# Show four test images next to their denoised predictions in a 4x2 grid.
# i takes the values 0, 2, 4, 6: it is both the image index and the offset of
# the subplot row.
plt.figure(figsize=(15,25))
panels = (
    (X_test, 'Noisy image: {}'),
    (Y_pred, 'Denoised by Autoencoder: {}'),
)
for i in range(0, 8, 2):
    for column, (images, title) in enumerate(panels, start=1):
        plt.subplot(4, 2, i + column)
        plt.xticks([])
        plt.yticks([])
        # Drop the trailing channel axis so imshow receives a 2-D array.
        plt.imshow(images[i][:, :, 0], cmap='gray')
        plt.title(title.format(test_img[i]))

plt.show()

## Clean Text

In [None]:
# Google Colab only: open a file picker to upload an image into the current
# working directory.
from google.colab import files

uploaded = files.upload()
# IPython shell magic: list the working directory to confirm the upload.
%ls

In [None]:
# Load the uploaded noisy text image in grayscale at the working size.
img = load_img(
    'text_noisy.png',
    color_mode='grayscale',
    target_size=(imgH, imgW),
)
# Last expression of the cell: the notebook renders the image.
img

In [None]:
# Convert the image to an array and apply the same /255 scaling used for the
# training data, then print its shape and value range.
img = img_to_array(img) / 255.0
print(img.shape)
print(img.max(), img.min())

In [None]:
# Prepend a batch axis so the single image can be passed to predict().
img = img[np.newaxis, ...]
# Last expression of the cell: show the new shape.
img.shape

In [None]:
# Denoise the uploaded image and drop the size-1 axes so it can be plotted.
Y_pred = np.squeeze(autoencoder.predict(img))
print(Y_pred.shape)

# Display the denoised result without axis ticks.
plt.figure(figsize=(10, 7))
plt.imshow(Y_pred, cmap='gray')
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# Display the original noisy input for comparison; the size-1 axes are
# squeezed away so imshow receives a 2-D array.
plt.figure(figsize=(12, 8))
plt.imshow(
    np.squeeze(img),
    cmap='gray',
)
plt.show()