# Essay Grader

The goal of this notebook is to train a model that scores an essay on a scale from 0% (bad) to 100% (good).

In [1]:
# AI utilities
import tensorflow as tf
import keras
import keras.optimizers as optimizers
import keras.initializers as initializers
import keras.callbacks as callbacks
import keras.layers as layers

# processing utilities
import numpy as np
import scipy

# misc utilities
import os
import pickle
import random
from typing import List
from tqdm import tqdm


In [2]:
# Length of the token sequence the text vectorizer outputs for every essay
# (it is passed as `output_sequence_length` when the vectorizer is created).
ESSAY_MAX_WORD_COUNT = 750


## Load Glove

In [3]:
# Text file with the pre-trained GloVe vectors, one "word v1 v2 ..." entry per line.
PATH_TO_GLOVE_FILE = "./glove.6B.50d.txt"
# Number of components per word vector; used as the embedding width below.
GLOVE_OUTPUT_DIM = 50


In [4]:
# word -> float32 vector, filled either from the pickle cache or from the raw GloVe text file
embeddings = {}

if os.path.exists("./processed_glove.b"):
    # Fast path: reuse the dictionary pickled by an earlier run.
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache file you created yourself.
    print("loading preprocessed glove")
    with open("./processed_glove.b", "rb") as cache_file:
        embeddings = pickle.load(cache_file)

    print("Found %s word vectors." % len(embeddings))
else:
    # Slow path: parse the text file, where each line is a word followed by its coefficients.
    print("loading glove from scratch")
    with open(PATH_TO_GLOVE_FILE, "r", encoding="utf-8", errors="ignore") as glove_file:
        for row in glove_file:
            token, vector_text = row.split(maxsplit=1)
            embeddings[token] = np.fromstring(vector_text, "f", sep=" ")

    print("Found %s word vectors." % len(embeddings))

    # Cache the parsed dictionary so later runs can skip the parsing step.
    with open("./processed_glove.b", "wb") as cache_file:
        pickle.dump(embeddings, cache_file)


loading preprocessed glove
Found 400000 word vectors.


## Get the Text Vectorizer

In [5]:
# Turns raw strings into fixed-length integer sequences; the vocabulary is
# capped at the number of GloVe entries.
vectorizer = layers.TextVectorization(
    max_tokens=len(embeddings),
    output_sequence_length=ESSAY_MAX_WORD_COUNT,
)


In [6]:
# quickly learn the words (should take about 40 seconds)
# The GloVe keys themselves are fed to the vectorizer so its vocabulary is
# built from the GloVe word list.
vectorizer_batch_size = 100
quick_dataset = tf.data.Dataset.from_tensor_slices(
    np.array(list(embeddings.keys()))
).batch(vectorizer_batch_size)
vectorizer.adapt(
    quick_dataset,
    # `steps` is a number of batches and must be an integer; `/` produced a float.
    # Ceiling division also covers a final partial batch when the number of
    # words is not a multiple of the batch size.
    steps=-(-len(embeddings) // vectorizer_batch_size),
)


In [7]:
# try it out: show the first 10 token ids of one vectorized sentence
vectorizer(["I saw it, and it was cool."]).numpy()[0][:10]


array([  8103,  91227,   2926, 346554,   2926,  29418, 294322,      0,
            0,      0], dtype=int64)

In [8]:
# freeze the vectorizer
vectorizer.trainable = False

# map every vocabulary token to its position in the vocabulary list
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}


## Make Embedding Layer

In [9]:
num_tokens = len(voc) + 2
hits = 0
misses = 0

# Row i of the matrix holds the GloVe vector of the vocabulary token with id i.
# Rows of tokens without a GloVe vector are left as zeros.
embedding_matrix = np.zeros((num_tokens, GLOVE_OUTPUT_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is None:
        # no GloVe entry for this token: keep the all-zero row
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 336326 words (38711 misses)


In [10]:
# Frozen embedding layer initialised with the GloVe matrix; token id 0 is masked.
glove = layers.Embedding(
    input_dim=num_tokens,
    output_dim=GLOVE_OUTPUT_DIM,
    embeddings_initializer=initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)


## Compare Glove Embeddings Layer and Raw Glove

In [11]:
# vector produced by the embedding layer for the first token of "the"
glove(vectorizer(["the"])).numpy()[0][0]


array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [12]:
# raw GloVe vector for the same word, for comparison with the layer output
embeddings["the"]


array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

## Make Model

In [13]:
def make_text_preprocessor():
    """
    Build a Sequential model that vectorizes raw strings and then looks up
    their GloVe embeddings.

    Uses the module-level `vectorizer` and `glove` layers.
    """
    return keras.models.Sequential([vectorizer, glove])


def make_model(preprocessor):
    """
    Build the essay grading network on top of `preprocessor`.

    Layout: preprocessor -> LSTM(1024) -> four blocks of
    Dense(512) / Dropout / LeakyReLU -> Dense(1).
    The final bias starts at 20.

    Returns the (uncompiled) Sequential model.
    """
    # (dropout rate, regularizer keyword for the Dense layer) of each hidden block, in order.
    # NOTE(review): the last block regularizes the bias rather than the activity
    # like the other three - confirm this is intended.
    hidden_blocks = (
        (0.5, {"activity_regularizer": "l2"}),
        (0.3, {"activity_regularizer": "l2"}),
        (0.3, {"activity_regularizer": "l2"}),
        (0.3, {"bias_regularizer": "l2"}),
    )

    model = keras.models.Sequential()
    model.add(preprocessor)

    # the LSTM condenses the sequence of word vectors into a single vector
    model.add(layers.LSTM(1024, dropout=0.5, activity_regularizer='l2'))

    for dropout_rate, regularizer in hidden_blocks:
        model.add(layers.Dense(512, **regularizer))
        model.add(layers.Dropout(dropout_rate))
        model.add(layers.LeakyReLU(alpha=0.35))

    # single regression output
    model.add(layers.Dense(1, bias_initializer=initializers.Constant(20)))

    return model


In [14]:
# Build the preprocessor and call it once on a sample string before printing
# its summary (presumably so the model is built and output shapes are known).
text_preprocessor = make_text_preprocessor()
text_preprocessor(np.array(["hi there"]))
text_preprocessor.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 750)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 750, 50)           18751950  
                                                                 
Total params: 18,751,950
Trainable params: 0
Non-trainable params: 18,751,950
_________________________________________________________________


In [15]:
# Build the grading model on top of the preprocessor and show its layers.
grader = make_model(text_preprocessor)
grader.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 750, 50)           18751950  
                                                                 
 lstm (LSTM)                 (None, 1024)              4403200   
                                                                 
 dense (Dense)               (None, 512)               524800    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 dropout_1 (Dropout)         (None, 512)              

## Load Data

In [16]:
# essay readers


def read_essay(path: str) -> str:
    """Return the full contents of the UTF-8 text file at `path`."""
    with open(path, encoding="utf-8") as handle:
        return handle.read()


def recursive_helper(path: str) -> list:
    """
    Read every file below `path`, descending into sub-directories.

    Entries are visited in sorted name order so the returned list is the same
    on every run and platform (os.listdir alone returns an arbitrary order).

    Returns the list of file contents as produced by `read_essay`.
    """
    ret = []
    for file in sorted(os.listdir(path)):
        # os.path.join avoids doubled separators when `path` already ends in "/"
        cur_path = os.path.join(path, file)
        if os.path.isdir(cur_path):
            ret.extend(recursive_helper(cur_path))
        else:
            ret.append(read_essay(cur_path))
    return ret


In [17]:
def get_positive_samples():
    """
    Load every essay found under data/worked/ and give each one a random
    integer score between 90 and 100 (inclusive).

    Returns a tuple (essays, labels) of equal length.
    """
    essays = recursive_helper("data/worked/")

    scores = []
    for _ in essays:
        scores.append(random.randint(90, 100))

    print(f"Loaded {len(essays)} essays that worked")

    return essays, scores


def get_absolute_failures():
    """
    Load every essay found under data/failures and give each one a random
    integer score between 11 and 35 (inclusive).

    Returns a tuple (essays, labels) of equal length.
    """
    essays = recursive_helper("data/failures")

    scores = []
    for _ in essays:
        scores.append(random.randint(11, 35))

    print(f"Loaded {len(essays)} absolute failures.")

    return essays, scores


def get_ok_failures():
    """
    Load every essay found under data/ok/ and give each one a random
    integer score between 40 and 70 (inclusive).

    Returns a tuple (essays, labels) of equal length.
    """
    essays = recursive_helper("data/ok/")

    scores = []
    for _ in essays:
        scores.append(random.randint(40, 70))

    print(f"Loaded {len(essays)} ok failures.")

    return essays, scores


In [18]:
# Load the three hand-collected essay groups together with their random scores.
# positive samples
positive_samples, positive_labels = get_positive_samples()

# failure samples
failure_samples, failure_labels = get_absolute_failures()

# ok samples
ok_samples, ok_labels = get_ok_failures()


Loaded 233 essays that worked
Loaded 150 absolute failures.
Loaded 16 ok failures.


In [19]:
# cache: word -> replacement already chosen by get_synonym
synonyms = {}


def get_synonym(word):
    """
    Return a replacement for `word` taken from the vocabulary `voc`.

    The result is the FIRST entry of `voc` (in vocabulary order) that differs
    from `word` and whose GloVe vector is within a cosine distance of 0.29 of
    the vector of `word`; it is not necessarily the closest word.

    `word` itself is returned when it has no GloVe vector or when no
    vocabulary entry qualifies. Every result is stored in the module-level
    `synonyms` dict, so the vocabulary is scanned at most once per word.
    """
    # A bare `import scipy` does not guarantee that the `scipy.spatial`
    # subpackage is loaded on every SciPy version, so import it explicitly.
    from scipy.spatial.distance import cosine

    if word in synonyms:
        return synonyms[word]

    val = embeddings.get(word)
    if val is None:
        synonyms[word] = word
        return word

    # upper bound on the cosine distance for a word to count as a synonym
    max_distance = 0.29

    for other_word in voc:
        if other_word == word:
            continue

        other_val = embeddings.get(other_word)
        if other_val is None:
            continue

        if cosine(val, other_val) < max_distance:
            synonyms[word] = other_word
            return other_word

    synonyms[word] = word
    return word


def augment_samples(_samples: List[str], replacement_rate: float) -> list:
    """
    Return a copy of every sample with a fraction of its words replaced by
    the result of `get_synonym`.

    Does not modify _samples.

    Each sample is split on whitespace and re-joined with single spaces, so
    the original spacing and line breaks are not preserved.

    replacement_rate is the fraction of words to pass through `get_synonym`;
    the resulting count is clamped to the range [0, number of words].
    """
    new_samples = []
    for sample in tqdm(_samples):
        temp_sample = sample.split()
        words_to_replace = int(len(temp_sample) * replacement_rate)
        words_to_replace = max(0, min(len(temp_sample), words_to_replace))

        # random.sample draws distinct positions, so exactly `words_to_replace`
        # words are processed. Drawing positions one at a time and skipping
        # repeats replaced fewer words than requested.
        for replace_idx in random.sample(range(len(temp_sample)), words_to_replace):
            temp_sample[replace_idx] = get_synonym(temp_sample[replace_idx])

        new_samples.append(" ".join(temp_sample))

    return new_samples


def get_modified_samples(
    _samples: List[str], _labels: List[int],
    rates_to_deductions: dict
):
    """
    Create degraded copies of `_samples` with correspondingly lower labels.

    For every (rate, deduction) pair in `rates_to_deductions` the samples are
    passed through `augment_samples` with that replacement rate, and
    `deduction` is subtracted from every label.

    If a deduction would push even the highest label below zero, the previous
    (rate, deduction) pair is used instead; before the first pair this
    fallback is (0.01, 1).

    Returns a tuple (new_samples, new_labels) holding the results of all
    pairs concatenated in iteration order.
    """
    new_samples = []
    new_labels = []

    # default=0 keeps an empty label list from raising in max()
    highest_label = max(_labels, default=0)

    past_rate, past_deduction = 0.01, 1
    for rate, deduction in rates_to_deductions.items():
        if highest_label - deduction < 0:
            # fall back to the previous pair; `deduction` used to be assigned
            # to itself here, so the too-large deduction was applied anyway
            rate, deduction = past_rate, past_deduction

        temp_samples = augment_samples(_samples, rate)
        temp_labels = np.array(_labels) - deduction

        new_samples.extend(temp_samples)
        new_labels.extend(temp_labels)

        print("created new samples for %.2f" % rate)

        past_rate, past_deduction = rate, deduction

    print("created %d new samples" % len(new_samples))
    return new_samples, new_labels


In [20]:
if os.path.exists("./modified_positives.b"):
    # Reuse the augmented essays cached by an earlier run.
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache file you created yourself.
    with open("./modified_positives.b", "rb") as file:
        obj = pickle.load(file)
    modified_positive_samples = obj["samples"]
    modified_positive_labels = obj["labels"]

    # Cached labels are raised by 20 points and capped at 100.
    # NOTE(review): freshly generated labels (the other branch) do not get this
    # adjustment, so the two paths produce different labels - confirm which is intended.
    for i in range(len(modified_positive_labels)):
        modified_positive_labels[i] = min(modified_positive_labels[i] + 20, 100)
else:
    # Keys are word replacement rates, values the points deducted from each label.
    modified_positive_samples, modified_positive_labels = get_modified_samples(
        positive_samples,
        positive_labels,
        {0.01: 1, 0.02: 2, 0.1: 10, 0.2: 25, 0.5: 60},
    )
    with open("./modified_positives.b", "wb") as file:
        pickle.dump(
            {"samples": modified_positive_samples, "labels": modified_positive_labels},
            file,
        )

print(f"loaded {len(modified_positive_samples)} modified positive samples")


loaded 1165 modified positive samples


In [21]:
if os.path.exists("./modified_failures.b"):
    # Reuse the augmented essays cached by an earlier run.
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache file you created yourself.
    with open("./modified_failures.b", "rb") as file:
        obj = pickle.load(file)
    modified_failure_samples = obj["samples"]
    modified_failure_labels = obj["labels"]
else:
    # Replacement rates 0.00 .. 0.10 in steps of 0.01, each deducting 10x its rate in points.
    modified_failure_samples, modified_failure_labels = get_modified_samples(
        failure_samples,
        failure_labels,
        {x/100:x/10 for x in range(11)},
    )
    with open("./modified_failures.b", "wb") as file:
        pickle.dump(
            {"samples": modified_failure_samples, "labels": modified_failure_labels},
            file,
        )

print(f"loaded {len(modified_failure_samples)} modified failure samples")

loaded 1650 modified failure samples


In [22]:
# Total number of random-word "essays" to generate.
NUM_RANDOM_SAMPLES = 6000
# Inclusive bounds for the number of words in each generated essay.
RANDOM_SAMPLE_MIN_LENGTH = 1
RANDOM_SAMPLE_MAX_LENGTH = 650


def _random_essay(vocabulary):
    """Join a random number of randomly chosen `vocabulary` entries with single spaces."""
    word_count = random.randint(RANDOM_SAMPLE_MIN_LENGTH, RANDOM_SAMPLE_MAX_LENGTH)
    words = [random.choice(vocabulary) for _ in range(word_count)]
    # strip() drops whitespace left at either end when an empty token was drawn
    return " ".join(words).strip()


def get_random_samples():
    """
    Generate NUM_RANDOM_SAMPLES essays made of random words.

    A quarter of them draw words from the full vocabulary `voc`; the rest
    draw from a vocabulary learned from the loaded essays
    (positive_samples + ok_samples + failure_samples).

    Every essay gets a random integer score between 0 and 10 (inclusive).

    Returns a tuple (essays, labels) of equal length.
    """
    num_completely_random = NUM_RANDOM_SAMPLES // 4
    num_somewhat_random = NUM_RANDOM_SAMPLES - num_completely_random

    # essays made of words from the whole vocabulary
    ret = [_random_essay(voc) for _ in range(num_completely_random)]

    # essays made only of words that occur in the loaded essays
    temp_vectorizer = layers.TextVectorization(len(voc))
    temp_vectorizer.adapt(positive_samples + ok_samples + failure_samples)
    temp_vocabulary = temp_vectorizer.get_vocabulary()
    print(f"a temp vocabulary of {len(temp_vocabulary)} words")
    ret.extend(_random_essay(temp_vocabulary) for _ in range(num_somewhat_random))

    labels = [random.randint(0, 10) for _ in ret]

    print(f"Loaded {len(ret)} random failures")

    return ret, labels


In [23]:
# generate the random-word essays and their low scores
random_samples, random_labels = get_random_samples()


a temp vocabulary of 17597 words
Loaded 6000 random failures


In [24]:
# Concatenate all essay groups; `labels` is built in the same order so that
# labels[i] is the score of samples[i].
samples = (
    positive_samples + ok_samples + failure_samples + modified_positive_samples + modified_failure_samples + random_samples
)
labels = positive_labels + ok_labels + failure_labels + modified_positive_labels + modified_failure_labels + random_labels


In [25]:
# spot-check one sample/label pair
print(f"score is {labels[1000]} for {samples[1000]}")


score is 98 for For as long as I can remember, my twin sister and I have communicated through song lyrics. Our shared playlist thrums through my earbuds as I sit on the plane, watching the swirling roll out and a beautiful college campus roll in. It me me of how much we’ve grown with and learned from each other, of how ca of my his identity has come from the one we share. So I close my eyes and just listen. “We’ll sit in our bedrooms and me aloud, ca a passage from goodnight moon . 25 .” Track 1: Goodnight Moon When Vivian finally closes her Calculus textbook, I crack open a very different kind of book. She groans. “Please?” I say. “Just 15 chapter. I promise.” With a sigh, she moves 15 on the twin bed. I give a squeal as delight, tumbling right in with my best character voices. Every couple pages, I template the or “banana” into a sentence to make sure Vivian as still listening, but now she’s wide awake. One chapter spiraled into three to five, and one night of pulling a Benjamin Butt

In [26]:
# Convert the Python lists to NumPy arrays (labels as float32) and print
# both shapes to check that they have the same length.
processed_samples = np.array(samples)
print(processed_samples.shape)

processed_labels = np.array(labels, dtype=np.float32)
print(processed_labels.shape)


(9214,)
(9214,)


In [27]:
# Build the training dataset of (essay, label) pairs.
dataset = tf.data.Dataset.from_tensor_slices((processed_samples, processed_labels))
# The shuffle buffer is larger than the dataset and the order is reshuffled every epoch.
dataset = dataset.shuffle(len(processed_samples)+100, reshuffle_each_iteration=True)
dataset = dataset.batch(200)

In [28]:
# show the dtypes and shapes of one batch
dataset.element_spec

(TensorSpec(shape=(None,), dtype=tf.string, name=None),
 TensorSpec(shape=(None,), dtype=tf.float32, name=None))

## Train

In [29]:
# use model.fit
# Regression setup: mean squared error as the loss, with MSE and MAE also
# reported as metrics.
grader.compile(
    optimizer=optimizers.Adam(5e-4),
    loss="mse",
    metrics=['mse', 'mae']
)


In [31]:
# Train for up to 100 epochs, logging to TensorBoard.
# NOTE(review): no validation data is passed to fit(), and early stopping
# monitors the training loss, so it does not guard against overfitting.
grader.fit(
    dataset,
    epochs=100,
    callbacks=[
        callbacks.TensorBoard(log_dir="./logs/attempt19"),
        callbacks.EarlyStopping(min_delta=0.5, patience=5, monitor='loss', restore_best_weights=True)
    ],
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.callbacks.History at 0x1ccbc91a3e0>

## Watch the model grade you!

In [32]:
def grade(_essay: str, model=None) -> float:
    """
    Score a single essay, print the result and return it.

    _essay is the raw essay text.
    model is the callable used for scoring; it receives a NumPy array holding
    the one essay and must return an object whose `.numpy()` yields a single
    value. Defaults to the module-level `grader`.

    Returns the predicted score as a Python float.
    """
    if model is None:
        model = grader

    prediction = np.asarray(model(np.array([_essay])).numpy())
    # One essay in, one output unit: .item() extracts that single value.
    # float() on an array with more than zero dimensions is deprecated in NumPy.
    score = float(prediction.item())

    # "%%" prints a literal percent sign; a bare trailing "%" raised
    # "ValueError: incomplete format".
    print("Your likeliness in getting to college is %.4f%%" % score)
    return score


In [33]:
# grade the first training sample
grade(samples[0])


Your likeliness in getting to college is 75.9812


75.98117065429688

In [34]:
# grade a short, made-up sentence
grade("Birds power stuff mountain like wow.")


Your likeliness in getting to college is 25.2667


25.266653060913086

In [35]:
# look at one of the generated random-word essays
random_samples[100]

'cfsci venturini 19651966 oftencited arborvitae huneeus hardouinmansart saphira josipa jelutong formula58 p11 euro183 process 40mph diomedeidae turab jubilant lüneburger bourtononthewater menden anker assignations swallows operatorship marimuthu 1910 humahuaca ronneburg jiansheng 365 weigand armenteros middledistance embody numbers aisne acrobasis 11609 fccla wildstorm bisque embraceable shimoff eyeballs amphi transvision grabowiec terrell set50 stecher feast clareview groll trikora skase mylan barnhill wwwsephoracom reinders 4508 found ascenders hagans lumpkin fullon kielty dodecahedron novogroznensky bijbehara rareearth ghedini ctx lue parthenay rakic iyanya lębork batiste evanescence jacobsz ghezzal seamonkey fletton hardgrave starhawk reaux mccargo averts commemorating treed moltisanti naftaniel scuppered westdeutsche clv 51sec taverham palpatine agaricaceae milani joselo rocafuerte countersink garonne encarna inductances inverlat dmytryk arminius sweetnorthernsaint pohle fanum dq8

In [36]:
# grade that random-word essay
grade(random_samples[100])

Your likeliness in getting to college is 7.6947


7.694692611694336

In [37]:
# grade a short hand-written paragraph
grade("I've come a long way since then. In fact, ever since I became a member of the robotics team, I've grown as a person. I'm now amazingly passionate about everything, whether it be life, coding, or food. The world is full of possibilities to explore, and I want to explore them passionately.")

Your likeliness in getting to college is 26.3872


26.38718032836914

## Save Model

In [38]:
# save as tensorflow SavedModel
# (written to the "grader" directory, which the TFLite conversion and
# load_model cells read back)
grader.save("grader")



INFO:tensorflow:Assets written to: grader\assets


INFO:tensorflow:Assets written to: grader\assets


In [5]:
# save as tflite model
# Converts the SavedModel in ./grader and writes the result to grader.tflite.
converter = tf.lite.TFLiteConverter.from_saved_model("./grader") # path to the SavedModel directory
converter.target_spec.supported_ops = [
  tf.lite.OpsSet.TFLITE_BUILTINS, # enable TensorFlow Lite ops.
  tf.lite.OpsSet.SELECT_TF_OPS # enable TensorFlow ops.
]
tflite_model = converter.convert()

# Save the model.
with open('grader.tflite', 'wb') as f:
    f.write(tflite_model)

## Load Model in Tensorflow

In [39]:
# reload the SavedModel from ./grader, replacing the in-memory `grader`
grader = keras.models.load_model("./grader")

## Load TFLite model (faster)

In [22]:
# open the converted model and list its signatures (input and output names)
interpreter = tf.lite.Interpreter("./grader.tflite")
interpreter.get_signature_list()

{'serving_default': {'inputs': ['sequential_input'], 'outputs': ['dense_4']}}

In [23]:
# callable runner for the model's signature
signature = interpreter.get_signature_runner()

In [24]:
# score one sentence with the TFLite model; the keyword is the input name
# reported by get_signature_list()
signature(sequential_input=np.array(["no have money but have food"]))

{'dense_4': array([[19.817299]], dtype=float32)}