In [1]:
from IPython import display

import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
import cv2
import numpy as np
import glob
import os
import tensorflow_probability as tfp
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
import time
import copy
import re
from sklearn import neighbors
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import sqlalchemy as db

pd.options.mode.chained_assignment = None
# https://stackoverflow.com/questions/58352326/running-the-tensorflow-2-0-code-gives-valueerror-tf-function-decorated-functio
tf.config.run_functions_eagerly(True)

In [3]:
directory = "/home/ggdhines/bear/"

In [4]:
def plot_character(character_df,index):
    row = character_df.loc[index]
    img = cv2.imread(directory+row["fname"])

    tile = img[row.top:row.bottom,row.left:row.right]

    _ = plt.imshow(tile)

In [36]:
tile_df = []
tile
master_list = [chr(ord('0')+i) for i in range(10)]

for fname in os.listdir(directory):
    match = re.search("(.*)-(\d+)-(\d+)\-(\d*)_ocr_ready.png",fname)
    if match is None:
        continue
                
    print(fname,end="\r")
        
    csv_file = fname[:-13] + "ocr.csv"
    img = cv2.imread(directory+fname,0)
    
    tiles_on_page = pd.read_csv(directory+csv_file,delimiter=" ",error_bad_lines=False, engine="python",quoting=3)
    
#     m1 = tiles["confidence"] > 95    
#     tiles = tiles[m1]
    
    max_darkness = []
    for _,row in tiles_on_page.iterrows():
        tile = img[row.top:row.bottom,row.left:row.right]
        resized_tile = cv2.resize(tile,(28,28))
        max_darkness.append(np.min(tile))
        tiles.append(resized_tile)
        
    tiles_on_page["max_darkness"] = max_darkness
#     df["fname"] = fname
    
    try:
        tiles_on_page["ship_name"] = match.groups()[0]
        tiles_on_page["year"] = int(match.groups()[1])
        tiles_on_page["month"] = int(match.groups()[2])
        tiles_on_page["page_number"] = int(match.groups()[3])
    except ValueError:
        print(match.groups())
        raise
    
    all_characters.append(df)
    
tile_df = pd.concat(tiles,ignore_index=True)
tile_df["area"] = (tile_df["right"] - tile_df["left"]) * (tile_df["bottom"] - tile_df["top"])
tiles = np.asarray(tiles)
s = tiles.shape
tiles = tiles.reshape((s[0],s[1],s[2],1))

Bear-AG-29-1940-01-49_ocr_ready.png

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [19]:
engine = db.create_engine('postgres://ghines:123456@127.0.0.1:5432/historical-transcriptions')

Enter in any newly added pages and get the page index for all pages

In [34]:
pages_df = tile_df[columns].drop_duplicates()

already_entered_pages = pd.read_sql("pages",engine)

display.display(already_entered_pages.head())

columns = ["ship_name","year","month","page_number"]
to_add = pages_df.merge(already_entered_pages,how="left",on=columns)
to_add = to_add[to_add["page_id"].isna()]

display.display(to_add)
to_add[columns].to_sql("pages",engine,if_exists="append",index=False)

all_pages = pd.read_sql("pages",engine)

Unnamed: 0,ship_name,year,month,page_number,page_id
0,Bear-AG-29,1940,1,11,32
1,Bear-AG-29,1940,1,21,33
2,Bear-AG-29,1940,1,63,34
3,Bear-AG-29,1940,1,25,35
4,Bear-AG-29,1940,1,13,36


KeyError: 'ship_name'

Some ink pixels are fairly bright (barely indistinguishable from paper pixels), and so we set our threshold for paper/ink fairly high. However, for a full character, there should be at least some dark pixels, i.e. the bright ink pixels are on the boundary between ink and paper. The interior of the character will be dark. So if Tesseract finds a character where every pixel is bright, we should be worried.

In [None]:
_ = plt.hist(all_characters["max_darkness"])

In [None]:
fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
all_characters.plot.scatter(x="max_darkness",y="area",ax=ax)
ax.set_yscale('log')
plt.show()

In [None]:
m = all_characters["max_darkness"] <= 170
all_characters[m].sort_values("max_darkness",ascending=False).head(5)

Note that all of these characters are completely suspect, but Tesseract still feels fairly confident. Also, for most the area seems far too small.

In [None]:
plot_character(all_characters,6771)

We can play around with the threshold and see that requiring max_darkness <= 170 seems reasonable. (Might be nice to create a more automated filter in the future.) From below, we see that this filters out about 7% of the characters.

In [None]:
darkness_mask = all_characters["max_darkness"] <= 170
s1 = all_characters[darkness_mask].shape[0]
s2 = all_characters.shape[0]
print(s1/s2)

Next, characters which are way too big or small need to be examined. Filtering out impossibly small characters is straightforward. However, it is a bit more complicated with bigger characters. These often include the correct character plus a whole bunch more. We will filter them out for now, since all of this is going to be fed into the autoencoder which benefits from characters being as similar as possible. However, we will want to feed these "characters" through the website to have the bounding boxes correct, whereas we can just drop the overly small characters completely.

In [None]:
filtered_characters = all_characters[darkness_mask]
filtered_characters.sort_values("area")

In [None]:
plot_character(all_characters,14674)

In [None]:
fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
bins = range(filtered_characters["area"].min(),filtered_characters["area"].max(),1000)
filtered_characters["area"].hist(bins=bins)
ax.set_xscale('log')
plt.show()

In [None]:
m = filtered_characters["area"] <= 10000
filtered_characters[m].sort_values("area",ascending=False).head(5)

In [None]:
plot_character(all_characters,24215)

An upper bound of 1000 for area seems good, but we could probably reduce it a bit too if we wanted to

In [None]:
m = filtered_characters["area"] > 2000
filtered_characters[m].sort_values("area").head(5)

The bounding box below is definitely too small, but the character is correct. So we may need to follow up on such bounding boxes in the website. (Also check to see if we have overlapping bounding boxes.)

In [None]:
plot_character(all_characters,12152)

In [None]:
m1 = filtered_characters["area"] <= 10000
m2 = filtered_characters["area"] >= 2000

double_filtered_characters = filtered_characters[m1&m2]
s1 = double_filtered_characters.shape[0]
s2 = all_characters.shape[0]
print(s1/s2)

In [None]:
def split(character_df,tiles):
    train_df,test_df = train_test_split(character_df, test_size=0.25, random_state=0)

    train_images = tiles[train_df.index]
    # we no longer need this index wrt to the original dataframe
    train_df = train_df.reset_index(drop=True)

    test_images = tiles[test_df.index]
    test_df = test_df.reset_index(drop=True)

    train_images = train_images / 255
    test_images = test_images / 255

    # sanity check
    assert np.max(train_images) == 1
    return train_images,test_images,train_df,test_df
    

In [None]:
train_images,test_images,train_df,test_df = split(double_filtered_characters,tiles)

In [None]:
display.display(train_df.iloc[0])
_ = plt.imshow(train_images[0])

The following code is taken from https://www.tensorflow.org/tutorials/generative/cvae

In [None]:
class CVAE(tf.keras.Model):
    """Convolutional variational autoencoder."""

    def __init__(self, latent_dim):
        super(CVAE, self).__init__()
        self.latent_dim = latent_dim
        self.encoder = tf.keras.Sequential(
            [
                tf.keras.layers.InputLayer(input_shape=(28, 28, 1)),
                tf.keras.layers.Conv2D(
                    filters=32, kernel_size=3, strides=(2, 2), activation='relu'),
                tf.keras.layers.Conv2D(
                    filters=64, kernel_size=3, strides=(2, 2), activation='relu'),
                tf.keras.layers.Flatten(),
                # No activation
                tf.keras.layers.Dense(latent_dim + latent_dim),
            ]
        )

        self.decoder = tf.keras.Sequential(
            [
                tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
                tf.keras.layers.Dense(units=7*7*32, activation=tf.nn.relu),
                tf.keras.layers.Reshape(target_shape=(7, 7, 32)),
                tf.keras.layers.Conv2DTranspose(
                    filters=64, kernel_size=3, strides=2, padding='same',
                    activation='relu'),
                tf.keras.layers.Conv2DTranspose(
                    filters=32, kernel_size=3, strides=2, padding='same',
                    activation='relu'),
                # No activation
                tf.keras.layers.Conv2DTranspose(
                    filters=1, kernel_size=3, strides=1, padding='same'),
            ]
        )

    @tf.function
    def sample(self, eps=None):
        if eps is None:
            eps = tf.random.normal(shape=(100, self.latent_dim))
        return self.decode(eps, apply_sigmoid=True)

    def encode(self, x):
        mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        eps = tf.random.normal(shape=mean.shape)
        return eps * tf.exp(logvar * .5) + mean

    def decode(self, z, apply_sigmoid=False):
        logits = self.decoder(z)
        if apply_sigmoid:
            probs = tf.sigmoid(logits)
            return probs
        return logits
    



def log_normal_pdf(sample, mean, logvar, raxis=1):
    log2pi = tf.math.log(2. * np.pi)
    return tf.reduce_sum(
      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),
      axis=raxis)


def compute_loss(model, x):
    mean, logvar = model.encode(x)
    z = model.reparameterize(mean, logvar)
    x_logit = model.decode(z)

    # not sure why the following had to be added in
    x = tf.cast(x, tf.float32)
    cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)
    logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])
    logpz = log_normal_pdf(z, 0., 0.)
    logqz_x = log_normal_pdf(z, mean, logvar)
    return -tf.reduce_mean(logpx_z + logpz - logqz_x)


@tf.function
def train_step(model, x, optimizer):
    """Executes one training step and returns the loss.

    This function computes the loss and gradients, and uses the latter to
    update the model's parameters.
    """
    with tf.GradientTape() as tape:
        loss = compute_loss(model, x)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
def generate_images(model, epoch, test_sample):
    mean, logvar = model.encode(test_sample)
    z = model.reparameterize(mean, logvar)
    predictions = model.sample(z)
    fig = plt.figure(figsize=(4, 4))

    for i in range(predictions.shape[0]):
        plt.subplot(4, 4, i + 1)
        plt.imshow(predictions[i, :, :, 0], cmap='gray')
        plt.axis('off')

    # tight_layout minimizes the overlap between 2 sub-plots
#     plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))
    plt.show()


In [None]:
def train_model(train_images,test_images,model_name,epochs):
    # set the dimensionality of the latent space to a plane for visualization later
    latent_dim = 2
    num_examples_to_generate = 16
    batch_size = 64
    
    train_size = train_images.shape[0]
    test_size = test_images.shape[0]


    train_dataset = (tf.data.Dataset.from_tensor_slices(train_images)
                     .shuffle(train_size).batch(batch_size))
    test_dataset = (tf.data.Dataset.from_tensor_slices(test_images)
                    .shuffle(test_size).batch(batch_size))

    assert batch_size >= num_examples_to_generate
    for test_batch in test_dataset.take(1):
        test_sample = test_batch[0:num_examples_to_generate, :, :, :]
    
    # keeping the random vector constant for generation (prediction) so
    # it will be easier to see the improvement.
    random_vector_for_generation = tf.random.normal(
        shape=[num_examples_to_generate, latent_dim])
    model = CVAE(latent_dim)

    generate_images(model, 0, test_sample)
    optimizer = tf.keras.optimizers.Adam(1e-4)
    
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        for train_x in train_dataset:
            train_step(model, train_x, optimizer)
        end_time = time.time()

        loss = tf.keras.metrics.Mean()
        for test_x in test_dataset:
            loss(compute_loss(model, test_x))
        elbo = -loss.result()


        display.clear_output(wait=False)
        print('Epoch: {}, Test set ELBO: {},time elapse for current epoch: {}'
            .format(epoch, elbo,end_time - start_time))
        generate_and_save_images(model, epoch, test_sample)
        
        
            
    return model

def load_or_train_model(train_images,test_images,model_name,epochs):
    """
    load a previously trained model. If no such model exists, train one
    """
    weights_file = f"{directory}weights_{model_name}"
    
    if not os.path.exists(weights_file+".index"):
        model = train_model(train_images,test_images,model_name,epochs)
        model.save_weights(weights_file)
    else:
        print("loading")
        latent_dim = 2
        num_examples_to_generate = 16
        batch_size = 64
        
        model = CVAE(latent_dim)
        model.load_weights(weights_file)
        
        # show how the loaded model is doing
        test_size = test_images.shape[0]
        test_dataset = (tf.data.Dataset.from_tensor_slices(test_images)
                    .shuffle(test_size).batch(batch_size))
        for test_batch in test_dataset.take(1):
            test_sample = test_batch[0:num_examples_to_generate, :, :, :]
            
        generate_images(model, 0, test_sample)
    return model

We have a very biased data set, so we'll check to see if reweighting the training and test set improves things.

In [None]:
base_model = load_or_train_model(train_images,test_images,"base",50)

To test our CVAE, we will take a test example and see how well the autoencoder corrects the image.

In [None]:
resampled = double_filtered_characters.groupby("character").sample(n=150,replace=True)
resampled_train_images,resampled_test_images,resampled_train_df,resampled_test_df = split(resampled,tiles)
resampled_model = load_or_train_model(resampled_train_images,resampled_test_images,"resampled",50)

We could always increase the sameple size for the reweighted train/test set, but the initial results shown below are not promising.

In [None]:
test_df[test_df["character"] == "8"].head()

In [None]:
test_df[test_df["character"] == "N"].head()

In [None]:
index = 24

print(test_df.loc[index,"character"])

fig = plt.figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(131)
ax.imshow(test_images[index,:,:,0])

ax = fig.add_subplot(132)
a,b = base_model.encode(test_images[index:index+1,:,:,:])
z = base_model.reparameterize(a,b)
x = base_model.decode(z,apply_sigmoid=True)
ax.imshow(x[0,:,:,0])

ax = fig.add_subplot(133)
a,b = resampled_model.encode(test_images[index:index+1,:,:,:])
z = resampled_model.reparameterize(a,b)
x = resampled_model.decode(z,apply_sigmoid=True)
ax.imshow(x[0,:,:,0])

plt.show()

# How well is our CVAE able to separate the characters?

In [None]:
def characters_to_latent(df,images,model):
    assert isinstance(df,pd.DataFrame)
    mu = []
    sigma = []

    for index in range(images.shape[0]):
        print(index,end="\r")

        original = images[index:index+1,:,:,:]

        a,b = model.encode(original)
        z = model.reparameterize(a,b)

        mu.append(float(z[0][0]))
        sigma.append(float(z[0][1]))

    df2 = pd.DataFrame({"mu":mu,"sigma":sigma})
    return pd.concat([df,df2],axis=1)

In [None]:
resampled_latent_df = characters_to_latent(resampled_test_df,resampled_test_images,resampled_model)
latent_df = characters_to_latent(test_df,test_images,base_model)

As a quick test, how well does our CVAE differentiate numbers?

In [None]:
of_interest = [chr(ord('0') + i) for i in range(10)]
m = latent_df["character"].isin(of_interest)

plt.figure(figsize=(10, 10))
sns.scatterplot(x='mu', y='sigma', hue='character', data=latent_df[m])
plt.show()

In [None]:
clf = neighbors.KNeighborsClassifier(10, weights='uniform')
clf.fit(latent_df[["mu","sigma"]], latent_df["character"])

In [None]:
ideal_df = latent_df.groupby("character")[["mu","sigma"]].median().reset_index()
ideal_df

The following code seems to be easier to follow if everything is in dense format (as opposed to sparse).

In [None]:
enc = OneHotEncoder(handle_unknown='ignore')
ideal_as_1hot = enc.fit_transform(ideal_df[["character"]]).todense()

In [None]:
probabilities = clf.predict_proba(ideal_df[["mu","sigma"]])

In [None]:
most_likely = np.multiply(ideal_as_1hot,probabilities)
ideal_df["likelyhood"] = np.amax(most_likely,1)

# Which characters are we most confident about?

In [None]:
ideal_df.sort_values("likelyhood").tail(15)

In [None]:
def generate_character(model,x,y):
    s = np.array([x,y])
    s = s.reshape(1,2)
    
    remapped = model.decode(s,apply_sigmoid=True)

    a = (np.array(remapped)*255).astype(np.uint8)
    b = a.reshape((a.shape[1],a.shape[2]))
    ret2,th2 = cv2.threshold(b,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
    th2 = 255 - th2
    return th2

def generate_ideal(model,ideal_df,ch):
    r = ideal_df[ideal_df["character"] == ch]
    return generate_character(model,r["mu"],r["sigma"])

Our CVAE handles '4' very well. We have identified some characters which according to our model are 4 but Tessearct thinks differently.

In [None]:
_ = plt.imshow(generate_ideal(base_model,ideal_df,"T"))

Now look at some of the raw characters. Start with characters we think are likely to be 4, but tesseract doesn't.

In [None]:
ideal_df[ideal_df["character"] == "4"]

In [None]:
actual = enc.transform(latent_df[["character"]]).todense()
probabilities = clf.predict_proba(latent_df[["mu","sigma"]])
latent_df["p"] = probabilities[:,15]

We see that when Tessearct thinks a character is 4, 77% of the time we agree. However, when Tessearct thinks a character is k, 60% we think the character is actaully 4.

In [None]:
latent_df.head()

In [None]:
latent_df.groupby("character")["p"].mean().to_frame().sort_values("p",ascending=False).head()

In [None]:
m = results_df["character"] == "k"
results_df[m].sort_values("p",ascending=False).head()

In [None]:
img = cv2.imread("/home/ggdhines/bear/Bear-AG-29-1940-01-39_ocr_ready.png",0)
img.shape

In [None]:
m1 = results_df["fname"] == "Bear-AG-29-1940-01-39_ocr_ready.png"
m2  = results_df["max prob"] >= 0.8
results_df[m1].sort_values("top").head()

We see that our classifcation is better than Tessearct's!

In [None]:
plot_character(results_df,4159)

We see that Tesseract is actually often mislabelling '4's as 'k's. Note that the above 4 is missing a fair bit since it overlaps with a grid line, yet we still estimate the probability of it being a '4' to be 83%/

# Size vs. likelyhood

In [None]:
df2 = results_df.groupby("character").size().reset_index()
df3 = ideal_df.merge(df2,on="character")

fig = plt.figure(num=None, figsize=(6, 6), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
df3.plot.scatter(x=0,y="likelyhood",ax=ax)
plt.show()

# There is surprisingly little correlation between Tessearct's confidence and ours
### (Remeber that we filtered to only includes tiles which Tesseract had a confidence of at least 95)

In [None]:
actual = enc.transform(latent_df[["character"]]).todense()
probabilities = clf.predict_proba(latent_df[["mu","sigma"]])
latent_df["max prob"] = np.amax(probabilities,axis=1)

_ = latent_df.plot.scatter(x="confidence",y="max prob")

# What Tiles are we least certain about?

In [None]:
most_likely = np.argmax(probabilities,axis=1)
inverse = [c[-1] for c in enc.get_feature_names()]
most_likely = [inverse[i] for i in most_likely]
latent_df["most_likely"] = most_likely

latent_df.sort_values("max prob").head(10)

# Here Tesseract is completely correct and we're not. So what happened?

In [None]:
latent_df.to_csv("/home/ggdhines/PycharmProjects/historical-transcriptions/dataframes/latent1.cvs")
ideal_df.to_csv("/home/ggdhines/PycharmProjects/historical-transcriptions/dataframes/ideal.cvs")

In [None]:
plot_character(latent_df,582)

In [None]:
ideal_df[ideal_df["character"].isin(["N","\""])]

In [None]:
of_interest = ["N","\""]
m = latent_df["character"].isin(of_interest)

plt.figure(figsize=(10, 10))
sns.scatterplot(x='mu', y='sigma', hue='character', data=latent_df[m])
plt.show()

In [None]:
u = ideal_df.loc[ideal_df["character"] == "2",["mu","sigma"]].values[0]

m = results_df["character"] == "2"
df2 = results_df[m]
d = df2[["mu","sigma"]].values

distance = np.sqrt((d[:,0]-u[0])**2 + (d[:,1]-u[1])**2)
df2["distance"] = distance

df2.sort_values("distance",ascending=False).head(10)

In [None]:
_ = plt.imshow(test_images[1614,:,:,:])

# So why is Z so bad? Relatively few tiles. How good are they?

* with less than 100 tiles, knn will underestimate the likelyhood of Z. But the CVAE seems to have a hard differentiating Z from E.

In [None]:
m = results_df["character"] == "Z"
results_df[m].head()

In [None]:
_ = plt.imshow(test_images[3627,:,:,:])

Suppose we have two tiles. Both have low max probability. Tesseract identifies the first as being for a character which we have a high confidence for

In [None]:
ideal_df.sort_values("likelyhood",ascending=False).tail()

In [None]:
m = latent_df["character"] == "%"
latent_df[m].sort_values("max prob").head()

In [None]:
_ = plt.imshow(test_images[3066,:,:,:])