# Compute GPT4 embeddings for essays

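This helper notebook pre-processes the essays: it pre-computes an embedding for each sentence of every essay using OpenAI's embeddings API (the code defaults to the `text-embedding-3-small` model; these are the "GPT-4 embeddings" referred to in the title). The dataset used for the experiments is from the Kaggle competition [The Hewlett Foundation: Automated Essay Scoring](https://www.kaggle.com/competitions/asap-aes/data).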

We don't distribute the data; it can be downloaded from the Kaggle competition page.


In [16]:
import numpy as np
import pandas as pd
import os
from textblob import TextBlob
from openai import OpenAI

# Which essay set this run processes: used below to filter the `essay_set` column
# and to name the output file.
ESSAY_SET = 2
# Shared OpenAI client. The API key is read from the OPEN_AI_API_KEY environment
# variable, so this line raises KeyError when that variable is not set.
client = OpenAI(api_key=os.environ["OPEN_AI_API_KEY"])


def get_embedding(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding for a single piece of text.

    Every newline in ``text`` is replaced by a space before the request is
    sent through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [17]:
# Load only the columns used below, then keep the rows that belong to the
# essay set selected by ESSAY_SET.
df = pd.read_csv(
    "./training_set_rel3.tsv",
    sep="\t",
    encoding="ISO-8859-1",
    usecols=["essay_id", "essay_set", "essay", "domain1_score", "domain2_score"],
).loc[lambda frame: frame["essay_set"] == ESSAY_SET]

In [18]:
# One plain Python list per column, all in the dataframe's row order.
essay_id, essay_set, essay, domain1_score, domain2_score = (
    df[column].tolist()
    for column in ("essay_id", "essay_set", "essay", "domain1_score", "domain2_score")
)

In [19]:
from nltk.tokenize import sent_tokenize

def get_features(essay_id, essay_set, essay, domain1_score, domain2_score):
    """Bundle one essay's metadata with an embedding for each of its sentences.

    The essay is split with ``sent_tokenize`` and every resulting sentence is
    passed to ``get_embedding`` (one call per sentence, in sentence order).

    Args:
        essay_id: Identifier of the essay, copied into the result unchanged.
        essay_set: Essay-set value, copied into the result unchanged.
        essay: Essay text to split into sentences and embed.
        domain1_score: First score, copied into the result unchanged.
        domain2_score: Second score, copied into the result unchanged.

    Returns:
        A dict with the five inputs under their own names plus ``"embs"``,
        the list of per-sentence embeddings.
    """
    record = {
        "essay_id": essay_id,
        "essay_set": essay_set,
        "essay": essay,
        "domain1_score": domain1_score,
        "domain2_score": domain2_score,
    }
    record["embs"] = list(map(get_embedding, sent_tokenize(essay)))
    return record

# Smoke test on the first essay only: the displayed value is the number of
# sentence embeddings computed for it.
args = (essay_id[0], essay_set[0], essay[0], domain1_score[0], domain2_score[0])
len(get_features(*args)["embs"])

19

In [20]:
from pqdm.processes import pqdm

# One argument tuple per essay, in dataframe row order.
args = list(zip(essay_id, essay_set, essay, domain1_score, domain2_score))
# Embed all essays using 8 worker processes; get_features issues one embedding
# request per sentence of each essay.
results = pqdm(args, get_features, n_jobs=8, argument_type="args")

# With pqdm's default exception handling, a task that raises does not abort the
# run: the exception object is returned in place of that task's result. Left
# unchecked, a failed API request would end up as a bogus row in the dataframe
# built from `results`. Fail loudly instead; `results` remains bound so the
# failures can be inspected or retried.
failures = [outcome for outcome in results if isinstance(outcome, Exception)]
if failures:
    raise RuntimeError(
        f"{len(failures)} of {len(results)} essays could not be embedded; "
        f"first failure: {failures[0]!r}"
    ) from failures[0]

QUEUEING TASKS | :   0%|          | 0/1800 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1800 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1800 [00:00<?, ?it/s]

In [21]:
# Build a dataframe from the per-essay records in `results`.
# Note that this rebinds `df`, replacing the essay frame loaded earlier.
df = pd.DataFrame(results)
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,domain2_score,embs
0,2978,2,Certain materials being removed from libraries...,4,4.0,"[[0.032874464988708496, 0.062107767909765244, ..."
1,2979,2,Write a persuasive essay to a newspaper reflec...,1,1.0,"[[0.02395753376185894, -0.033956337720155716, ..."
2,2980,2,Do you think that libraries should remove cert...,2,3.0,"[[0.006172113120555878, 0.005909167230129242, ..."
3,2981,2,"In @DATE1's world, there are many things found...",4,4.0,"[[0.027020549401640892, 0.01217526663094759, 0..."
4,2982,2,In life you have the 'offensive things'. The l...,4,4.0,"[[0.02435912750661373, 0.06014935299754143, 0...."


In [22]:
# Persist the essays together with their sentence embeddings as a TSV whose
# name contains the essay set number. `index=False` is not passed, so the row
# index is written too. NOTE(review): the `embs` lists are presumably written
# in their text form, as TSV has no list type — confirm how consumers parse it.
df.to_csv(
    f"./essay_set_{ESSAY_SET}_and_gpt_sent_embs.tsv",
    sep="\t",
    encoding="ISO-8859-1",
)

In [23]:
# Read the file written above back in as a sanity check; `usecols` restricts
# the frame to the six named columns.
# NOTE(review): `embs` was stored as text, so it presumably loads as a string
# and needs parsing (e.g. ast.literal_eval) before numeric use — confirm.
df = pd.read_csv(
    f"./essay_set_{ESSAY_SET}_and_gpt_sent_embs.tsv",
    sep="\t",
    encoding="ISO-8859-1",
    usecols=["essay_id", "essay_set", "essay", "domain1_score", "domain2_score", "embs"],
)

In [24]:
# Preview the first rows of the frame that was read back from disk.
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,domain2_score,embs
0,2978,2,Certain materials being removed from libraries...,4,4.0,"[[0.032874464988708496, 0.062107767909765244, ..."
1,2979,2,Write a persuasive essay to a newspaper reflec...,1,1.0,"[[0.02395753376185894, -0.033956337720155716, ..."
2,2980,2,Do you think that libraries should remove cert...,2,3.0,"[[0.006172113120555878, 0.005909167230129242, ..."
3,2981,2,"In @DATE1's world, there are many things found...",4,4.0,"[[0.027020549401640892, 0.01217526663094759, 0..."
4,2982,2,In life you have the 'offensive things'. The l...,4,4.0,"[[0.02435912750661373, 0.06014935299754143, 0...."
