# Essay Grader

The goal of this is to have the AI output whether the essay is good or bad on a scale from 0% to 100%.

In [1]:
# AI utilities
import tensorflow as tf
import keras
import keras.layers as layers

# processing utilities
import numpy as np
from sklearn.model_selection import train_test_split

# misc utilities
import os
import pickle
import random


In [2]:
ESSAY_MAX_WORD_COUNT = 1250


## Load Glove

In [3]:
PATH_TO_GLOVE_FILE = "./glove.6B.100d.txt"
GLOVE_OUTPUT_DIM = 100


In [4]:
embeddings = {}

if not os.path.exists("./processed_glove.b"):
    print("loading glove from scratch")
    with open(PATH_TO_GLOVE_FILE) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings[word] = coefs

    print("Found %s word vectors." % len(embeddings))

    with open("./processed_glove.b", "wb") as file:
        pickle.dump(embeddings, file)
        file.close()
else:
    print("loading preprocessed glove")
    with open("./processed_glove.b", "rb") as file:
        embeddings = pickle.load(file)
        file.close()

    print("Found %s word vectors." % len(embeddings))


loading preprocessed glove
Found 400000 word vectors.


## Get the Text Vectorizer

In [5]:
vectorizer = layers.TextVectorization(
    len(embeddings), standardize="lower", output_sequence_length=ESSAY_MAX_WORD_COUNT
)


2022-10-13 14:39:34.465105: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-10-13 14:39:34.465244: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (esuhsd-Latitude-3410): /proc/driver/nvidia/version does not exist
2022-10-13 14:39:34.467531: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# quickly learn the words (should take about 40 seconds)
vectorizer_batch_size = 10
quick_dataset = tf.data.Dataset.from_tensor_slices(
    np.array(list(embeddings.keys()))
).batch(vectorizer_batch_size)
vectorizer.adapt(quick_dataset, steps=len(embeddings) / vectorizer_batch_size)


In [7]:
# try it out
vectorizer(["I saw it, and it was cool."])


<tf.Tensor: shape=(1, 1250), dtype=int64, numpy=array([[214545,  80444,      1, ...,      0,      0,      0]])>

In [8]:
# make it non-trainable
vectorizer.trainable = False

# get vocabulary
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))


## Make Embedding Layer

In [9]:
num_tokens = len(voc) + 2
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, GLOVE_OUTPUT_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 399998 words (2 misses)


In [10]:
glove = keras.layers.Embedding(
    num_tokens,
    GLOVE_OUTPUT_DIM,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)


## Compare Glove Embeddings Layer and Raw Glove

In [11]:
glove(vectorizer(["the"]))


<tf.Tensor: shape=(1, 1250, 100), dtype=float32, numpy=
array([[[-0.038194, -0.24487 ,  0.72812 , ..., -0.1459  ,  0.8278  ,
          0.27062 ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        ...,
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ]]], dtype=float32)>

In [12]:
embeddings["the"]


array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

## Make Model

In [13]:
def make_text_preprocessor():
    preprocessor = keras.models.Sequential()
    preprocessor.add(vectorizer)
    preprocessor.add(glove)
    return preprocessor


def make_model(preprocessor):
    model = keras.models.Sequential()
    model.add(layers.Input(preprocessor.output_shape[1:]))

    # slowly change from 100 to 25 numbers per word
    model.add(layers.Conv1D(75, 5, activation="relu"))
    model.add(layers.MaxPooling1D())
    model.add(layers.Conv1D(50, 5, activation="relu"))
    model.add(layers.MaxPooling1D())
    model.add(layers.Conv1D(25, 5, activation="relu"))
    model.add(layers.MaxPooling1D())

    # use lstm to get the meanings from sequences of words
    model.add(layers.LSTM(512))

    # get down to 1 output
    model.add(layers.Dense(256, activation="relu"))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, activation="relu"))
    model.add(layers.Dropout(0.3))

    model.add(layers.Dense(1, activation="sigmoid"))

    return model


In [14]:
text_preprocessor = make_text_preprocessor()
text_preprocessor.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 1250)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 1250, 100)         40000200  
                                                                 
Total params: 40,000,200
Trainable params: 0
Non-trainable params: 40,000,200
_________________________________________________________________


In [15]:
grader = make_model(text_preprocessor)
grader.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 1246, 75)          37575     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 623, 75)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 619, 50)           18800     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 309, 50)          0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 305, 25)           6275      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 152, 25)         

## Load Data

In [16]:
# essay readers


def read_essay(path: str) -> str:
    with open(path, "r") as essay:
        ret = essay.read()
    return ret


def recursive_helper(path: str) -> list:
    ret = []
    for file in os.listdir(path):
        cur_path = path + "/" + file
        if os.path.isdir(cur_path):
            ret.extend(recursive_helper(cur_path))
        else:
            ret.append(read_essay(cur_path))
    return ret


In [17]:
def get_positive_samples():
    """
    Returns the list of strings that have worked.
    Searches under data/worked
    """
    ret = recursive_helper("data/worked/")
    labels = [random.randint(85, 100) / 100 for i in range(len(ret))]

    print(f"Loaded {len(ret)} essays that worked")

    return ret, labels


In [18]:
samples, labels = get_positive_samples()


Loaded 2 essays that worked


In [19]:
NUM_RANDOM_SAMPLES = 500
RANDOM_SAMPLE_MIN_LENGTH = 10
RANDOM_SAMPLE_MAX_LENGTH = 1000


def get_random_samples():
    global voc

    # the list with all the essays
    ret = []

    # generate random essays
    for i in range(NUM_RANDOM_SAMPLES):
        essay = ""
        essay_length = random.randint(
            RANDOM_SAMPLE_MIN_LENGTH, RANDOM_SAMPLE_MAX_LENGTH
        )

        # add random words
        for x in range(essay_length):
            essay += random.choice(voc) + " "

        # remove trailing whitespace
        essay = essay.strip()

        # add it to the samples
        ret.append(essay)

    labels = [0 for i in range(len(ret))]

    print(f"Loaded {len(ret)} random failures")

    return ret, labels


def get_absolute_failures():
    """
    Assumes that there are absolute failures of essays
    underneath data/failures/
    """
    ret = recursive_helper("data/failures")
    labels = [random.randint(0, 10) / 100 for i in range(len(ret))]

    print(f"Loaded {len(ret)} absolute failures.")

    return ret, labels


def get_ok_failures():
    """
    Assumes that there are ok essays (somewhere between 0.3 to 0.5)
    under data/ok/
    """
    ret = recursive_helper("data/ok/")
    labels = [random.randint(30, 50) / 100 for x in range(len(ret))]

    print(f"Loaded {len(ret)} ok failures.")

    return ret, labels


In [20]:
# random samples
_temp_samples, _temp_labels = get_random_samples()
samples.extend(_temp_samples)
labels.extend(_temp_labels)

# failure samples
_temp_samples, _temp_labels = get_absolute_failures()
samples.extend(_temp_samples)
labels.extend(_temp_labels)

# ok samples
_temp_samples, _temp_labels = get_ok_failures()
samples.extend(_temp_samples)
labels.extend(_temp_labels)


Loaded 500 random failures
Loaded 1 absolute failures.
Loaded 1 ok failures.


In [21]:
processed_samples = text_preprocessor(tf.constant(samples))
processed_samples.shape


2022-10-13 14:40:20.707516: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 252000000 exceeds 10% of free system memory.


TensorShape([504, 1250, 100])

In [22]:
processed_labels = np.array(labels)
processed_labels.shape


(504,)

## Train

In [23]:
# use model.fit
grader.compile(
    optimizer="adam",
    loss=keras.losses.BinaryCrossentropy(),
)


In [24]:
grader.fit(
    processed_samples,
    processed_labels,
    epochs=3,
    callbacks=[keras.callbacks.EarlyStopping(monitor="val_loss", verbose=1)],
    validation_split=0.15,
)


Epoch 1/3
Epoch 2/3
Epoch 2: early stopping


<keras.callbacks.History at 0x7f083f6c9bd0>

## Watch the model grade you!

In [25]:
def grade(essay: str) -> float:
    ret = grader(text_preprocessor(np.array([essay])))
    return ret * 100


In [31]:
grade(samples[0])


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00200112]], dtype=float32)>