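# EMNIST dirty-data exercise

This notebook loads the EMNIST `byclass` training and test splits into pandas DataFrames, previews the images, deliberately corrupts a small fraction of the rows (duplicated rows, zeroed images, out-of-range prediction scores), and then starts cleaning the result.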

In [None]:
# Install and import packages

# %conda install pandas numpy emnist matplotlib

# Import all the things
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import emnist
from hashlib import sha1


In [None]:
# Load EMNIST and wrap each split in a DataFrame with one row per sample

# Side length, in pixels, of each square image (28x28)
size = 28


def _split_to_frame(images, labels):
    """Return a DataFrame holding one row per sample: its label and its image array."""
    frame = pd.DataFrame()
    # The label column goes in first, so it establishes one row per sample
    frame['label'] = labels
    # list(...) breaks the image stack into one array per row; the pixels stay
    # together in a single 'image' column rather than one column per pixel
    frame['image'] = list(images)
    return frame


# Training split of the 'byclass' dataset
image, label = emnist.extract_training_samples('byclass')
raw_train = _split_to_frame(image, label)

# Test split, built the same way
image, label = emnist.extract_test_samples('byclass')
raw_test = _split_to_frame(image, label)




In [None]:
# Preview one sample: fetch the image stored under row label 0 and render it
first_image = raw_train.loc[0, 'image']
plt.imshow(first_image, cmap='gray')
plt.show()  # explicit show is optional when the cell draws a single image

In [None]:
# One representative row per label: the first sample found for each class
firsts = raw_train.groupby('label').first().reset_index()

# Size the grid from the number of labels actually present instead of a
# hard-coded count: 10 columns and as many rows as needed (62 labels -> 7 rows)
n_cols = 10
n_rows = max(1, -(-len(firsts) // n_cols))  # ceiling division, at least one row
fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols, n_rows), squeeze=False)

# Draw one image per panel, titled with its label
for panel, lbl, img in zip(ax.flat, firsts['label'], firsts['image']):
    panel.imshow(img, cmap='gray')
    panel.set_title(lbl)

# Turn the axes off on every panel, including the unused ones at the end of
# the grid, which would otherwise show up as empty framed boxes with ticks
for panel in ax.flat:
    panel.axis('off')

plt.show()

# To-do

- [x] Add numerical var
- [ ] Numerical: missing, outlier, out-of-bounds
- [ ] Labels: missing(Null, None, "", " "), name that number, double-struck
- [ ] Image: zeroed, null? dimensions?
- [x] Image: add noise
- [x] Image: flip horizontally
- [x] Duplicated rows

In [None]:
# Now let's mess up the data a bit

# Fraction of rows hit by each kind of dirt (0.01 == 1%). Every step below
# reads this value, so changing it here changes all of them together.
pct = 0.01

# Fixed seed so that Restart & Run All reproduces the same dirty data
SEED = 42
np.random.seed(SEED)


def _make_dirty(raw, pct):
    """Return a dirtied copy of `raw`; `raw` itself is left untouched.

    The copy gains two columns, 'predict' and 'image_hash', and then has
    random dirt applied: duplicated rows, zeroed images and out-of-range
    'predict' values.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with an 'image' column of arrays.
    pct : float
        Fraction of rows affected by each kind of dirt.

    Returns
    -------
    pandas.DataFrame
        The dirtied copy, with a fresh 0..n-1 index.
    """
    dirty = raw.copy()

    # Add a column for previous prediction score (uniform in [0, 1))
    dirty['predict'] = np.random.rand(dirty.shape[0])

    # Add a column for a hash of the images (should make it easier to compare them).
    # Taken here, before any image is altered below.
    dirty['image_hash'] = dirty['image'].apply(lambda x: sha1(x.tobytes()).hexdigest())

    # Duplicate a random pct of the rows. ignore_index gives the appended copies
    # fresh index labels; without it a label such as 0 could match two rows and
    # dirty['image'][0] would return a Series instead of a single image.
    dirty = pd.concat([dirty, dirty.sample(frac=pct, random_state=SEED)], ignore_index=True)

    # For each row, pct of the time, zero out the image array
    # (`size` is the image side length defined when the data was loaded)
    dirty['image'] = dirty['image'].apply(lambda x: np.zeros((size, size)) if np.random.rand() < pct else x)

    # For each row, pct of the time in total, push 'predict' out of bounds:
    # half of that budget adds 1, the other half subtracts 1
    dirty['predict'] = dirty['predict'].apply(lambda x: x + 1 if np.random.rand() < pct / 2 else x)
    dirty['predict'] = dirty['predict'].apply(lambda x: x - 1 if np.random.rand() < pct / 2 else x)

    return dirty


# TEMPLATE: For each row, pct of the time, randomly apply method()
# df['column'] = dirty_train['column'].apply(lambda x: method(x) if np.random.rand() < pct else x)

# Build the dirty versions of both splits
dirty_train = _make_dirty(raw_train, pct)
dirty_test  = _make_dirty(raw_test, pct)

# For each row, randomly decide whether to apply a random noise
# dirty_train['image'] = dirty_train['image'].apply(lambda x: x + np.random.rand(size, size) if np.random.rand() < 0.1 else x)

# For each row, randomly decide whether to flip the image horizontally
#dirty_train['image'] = dirty_train['image'].apply(lambda x: np.flip(x, axis=1) if np.random.rand() < 0.1 else x)


In [None]:
# Scratch cell: check whether dirty_test has changed since this cell first ran


def _frame_fingerprint(df):
    """Return a SHA-1 hex digest of the 'label', 'predict' and 'image' contents of `df`.

    The columns are hashed one by one. Hashing `df.values.tobytes()` directly
    is not a content hash here: the frame mixes numbers, strings and arrays,
    so `.values` is an object-dtype array whose raw buffer holds object
    references rather than the values themselves.
    """
    digest = sha1()
    digest.update(df['label'].to_numpy().tobytes())
    digest.update(df['predict'].to_numpy().tobytes())
    for img in df['image']:
        digest.update(img.tobytes())
    return digest.hexdigest()


current_hash = _frame_fingerprint(dirty_test)

# Capture the baseline the first time the cell runs, so a fresh kernel does not
# hit a NameError on `last_hash`; later runs compare against that baseline.
if 'last_hash' not in globals():
    last_hash = current_hash
    # Capture an old copy of dirty test as well, if needed for the dupe check below
    # dirty_last = dirty_test.copy()

print(last_hash, current_hash)

# Check dupes
# NOTE(review): merging on every shared column includes 'image' (arrays), which
# likely needs to be excluded from the merge keys - confirm before enabling
# merged = dirty_last.merge(dirty_test, indicator=True, how='outer')
# merged.loc[merged['_merge'] != 'both']

In [None]:
# Plot the first image. Select it by position: after rows are duplicated, an
# index label such as 0 may match more than one row, and a label lookup would
# then return a Series instead of a single image.
plt.imshow(dirty_train['image'].iloc[0], cmap='gray')
plt.show()

# Cleaning

The changes above were applied randomly, so we'll need to find them and make a plan to fix them.

- [ ] Create a column to identify whether each row came from *train* or *test*
- [ ] (optional) Merge the data into a single DataFrame
- [ ] Explore the data to understand what's in it
- [ ] List potential data issues to fix
- [ ] Create a friendlier column for image labels

In [None]:
# Let's start cleaning!

# Character lookup left here in case it is useful: digits first, then
# upper-case letters, then lower-case letters (62 entries).
# Presumably meant to be indexed by the numeric label - TODO confirm.
LABELS = list('0123456789'
              'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
              'abcdefghijklmnopqrstuvwxyz')


def _image_digest(img):
    """Return the SHA-1 hex digest of an image array's raw bytes."""
    return sha1(img.tobytes()).hexdigest()


# Tag each row with the split (train vs test) it came from.
# NOTE(review): this writes to the raw frames, not the dirty ones - confirm
# that is the intended target.
for _split_name, _frame in (('train', raw_train), ('test', raw_test)):
    _frame['split'] = _split_name

# Hash every image so rows can be compared cheaply; this replaces any
# 'image_hash' column the dirty frames already carry
dirty_train['image_hash'] = dirty_train['image'].apply(_image_digest)
dirty_test['image_hash'] = dirty_test['image'].apply(_image_digest)