In [18]:
import spacy
import pandas as pd
import random

In [21]:
from spacy.tokens import DocBin

After importing our requirements, we'll ingest the competition's training and test data into dataframes. Then, we'll convert necessary columns in our training dataframe to lists for processing. 

In [11]:
# Read the competition-provided train/test CSV files into dataframes
training_data = pd.read_csv('learning-agency-lab-automated-essay-scoring-2/train.csv')
testing_data = pd.read_csv('learning-agency-lab-automated-essay-scoring-2/test.csv')

# Pull the essay text and score columns out of the training frame as plain
# Python lists, which is the form the later processing cell zips together.
essay, score = (
    training_data[column].tolist() for column in ('full_text', 'score')
)

Here, we'll load spacy's "large" pipeline.

In [12]:
# Load spaCy's en_core_web_lg pipeline. The resulting `nlp` object is what the
# later cells use to process the essays (nlp.pipe) and to attach the text
# categorizer component (nlp.add_pipe).
nlp = spacy.load("en_core_web_lg")

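Let's create an empty list to hold our **doc** objects. Then, we're going to iterate through every essay and score in our training data, and for each doc, we'll assign one-hot truth values to its cats attribute: 1.0 for the category matching the essay's score and 0.0 for every other category. We'll use the built-in zip() function to build (essay, score) tuples and pass them through nlp.pipe, which accepts either plain strings or, with as_tuples=True, (text, context) tuples, and yields each processed doc paired with its score. 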

This cell may take a while to run - remember, we're passing 24,000 full essays through the entire spaCy pipeline via nlp.pipe, producing one **doc** per essay. 

In [17]:
# Every category the text classifier should know about: essay scores 1 through 6.
SCORE_LABELS = range(1, 7)

# Run each essay through the pipeline and attach its gold label to doc.cats as
# a one-hot mapping: '1'..'6' -> 1.0 for the essay's own score, 0.0 otherwise.
#
# The loop variable is deliberately NOT called `score`: reusing that name
# would overwrite the module-level `score` list with a single number, so
# re-running this cell (or any later cell that reads `score`) would fail.
docs = []
for doc, essay_score in nlp.pipe(zip(essay, score), as_tuples=True):
    # Essays whose score falls outside 1-6 are still kept, just without
    # categories, instead of being given an all-zero label set.
    if essay_score in SCORE_LABELS:
        for label in SCORE_LABELS:
            doc.cats[str(label)] = 1.0 if label == essay_score else 0.0
    docs.append(doc)

In [19]:
# Shuffle with a fixed seed so that a fresh Restart & Run All reproduces the
# same train/eval split. A dedicated Random instance is used so the global
# random state is left untouched.
SPLIT_SEED = 42
TRAIN_PERCENT = 90  # the remaining 10% of docs is held out for evaluation

random.Random(SPLIT_SEED).shuffle(docs)  # shuffles `docs` in place

# Integer arithmetic keeps `split` a valid slice index.
split = len(docs) * TRAIN_PERCENT // 100
traindocs = docs[:split]
evaldocs = docs[split:]

In [22]:
# Wrap each split in a DocBin, then write both containers to disk as the
# .spacy files for the training and evaluation sets.
train_docbin, eval_docbin = (
    DocBin(docs=subset) for subset in (traindocs, evaldocs)
)
for _docbin, _target in ((train_docbin, './train.spacy'), (eval_docbin, './eval.spacy')):
    _docbin.to_disk(_target)

In [23]:
# Attach a text categorizer to the pipeline. Reuse the existing component if
# this cell has already been run, since add_pipe raises when a component named
# 'textcat' is already present.
if nlp.has_pipe('textcat'):
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.add_pipe('textcat')

# create_optimizer is a method: it must be *called* to obtain an optimizer.
# Without the parentheses `optimizer` would just be the bound method itself.
optimizer = textcat.create_optimizer()