In [7]:
from typing import List
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import syllables
from collections import defaultdict
from pqdm.processes import pqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
import re
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import pandas as pd
import os
import spacy
from numpy.linalg import norm
from tqdm import tqdm

import os
from openai import OpenAI


client = OpenAI(api_key=os.environ["OPEN_AI_API_KEY"])
nlp = spacy.load("en_core_web_sm")
ESSAY_SET = 4

# Define QWK Metric functions

In [8]:
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the quadratic weighted kappa
    quadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def linear_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the linear weighted kappa
    linear_weighted_kappa calculates the linear weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    linear_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    linear_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = abs(i - j) / float(num_ratings - 1)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the kappa
    kappa calculates the kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            if i == j:
                d = 0.0
            else:
                d = 1.0
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def mean_quadratic_weighted_kappa(kappas, weights=None):
    """
    Calculates the mean of the quadratic
    weighted kappas after applying Fisher's r-to-z transform, which is
    approximately a variance-stabilizing transformation.  This
    transformation is undefined if one of the kappas is 1.0, so all kappa
    values are capped in the range (-0.999, 0.999).  The reverse
    transformation is then applied before returning the result.

    mean_quadratic_weighted_kappa(kappas), where kappas is a vector of
    kappa values

    mean_quadratic_weighted_kappa(kappas, weights), where weights is a vector
    of weights that is the same size as kappas.  Weights are applied in the
    z-space
    """
    kappas = np.array(kappas, dtype=float)
    if weights is None:
        weights = np.ones(np.shape(kappas))
    else:
        weights = weights / np.mean(weights)

    # ensure that kappas are in the range [-.999, .999]
    kappas = np.array([min(x, .999) for x in kappas])
    kappas = np.array([max(x, -.999) for x in kappas])

    z = 0.5 * np.log((1 + kappas) / (1 - kappas)) * weights
    z = np.mean(z)
    return (np.exp(2 * z) - 1) / (np.exp(2 * z) + 1)


def weighted_mean_quadratic_weighted_kappa(solution, submission):
    predicted_score = submission[submission.columns[-1]].copy()
    predicted_score.name = "predicted_score"
    if predicted_score.index[0] == 0:
        predicted_score = predicted_score[:len(solution)]
        predicted_score.index = solution.index
    combined = solution.join(predicted_score, how="left")
    groups = combined.groupby(by="essay_set")
    kappas = [quadratic_weighted_kappa(group[1]["essay_score"], group[1]["predicted_score"]) for group in groups]
    weights = [group[1]["essay_weight"].irow(0) for group in groups]
    return mean_quadratic_weighted_kappa(kappas, weights=weights)

In [9]:
UNIVERSAL_NOUN_TAGS = set([u"NOUN", u"PRON", u"PROPN"])

ordered_transitions = [
    u"SS",
    u"SO",
    u"SX",
    u"S-",
    u"OS",
    u"OO",
    u"OX",
    u"O-",
    u"XS",
    u"XO",
    u"XX",
    u"X-",
    u"-S",
    u"-O",
    u"-X",
    u"--",
]


def dependency_mapping(dep: str) -> str:
    """Map dependency tag to entity grid tag.

    We consider the notation provided in :cite:`barzilay2008modeling`:

    +-----------+-----------------------------------+
    | EGrid Tag | Dependency Tag                    |
    +===========+===================================+
    | S         | nsub, csubj, csubjpass, dsubjpass |
    +-----------+-----------------------------------+
    | O         | iobj, obj, pobj, dobj             |
    +-----------+-----------------------------------+
    | X         | For any other dependency tag      |
    +-----------+-----------------------------------+

    :param dep: Dependency tag
    :type dep: string
    :return: EGrid tag
    :rtype: string
    """
    if dep in {u"nsubj", u"csubj", u"csubjpass", u"dsubjpass"}:
        return u"S"
    if dep in {u"iobj", u"obj", u"pobj", u"dobj"}:
        return u"O"

    return "X"


class EntityGrid(object):
    """Entity grid class.

    Class Entity Grid, creates an entity grid from a doc, which is output of
    applying spacy.nlp(text) to a text. Thus, this class depends on spacy
    module. It only supports 2-transitions entity grid.
    """

    def __init__(self, doc):
        """Construct EntityGrid object."""
        # Initialization
        entity_map = dict()
        entity_grid = dict()
        i = 1
        entity_map["s%d" % i] = []
        entity_features = {
            u"SS": 0,
            u"SO": 0,
            u"SX": 0,
            u"S-": 0,
            u"OS": 0,
            u"OO": 0,
            u"OX": 0,
            u"O-": 0,
            u"XS": 0,
            u"XO": 0,
            u"XX": 0,
            u"X-": 0,
            u"-S": 0,
            u"-O": 0,
            u"-X": 0,
            u"--": 0,
        }


        n_sent = len(list(doc.sents))

        # To get coherence measurements we need at least 2 sentences
        if n_sent < 2:
            raise RuntimeError(
                "Entity grid needs at least two sentences, found: {}".format(
                    n_sent
                )
            )

        # For each sentence, get dependencies and its grammatical role
        for sent in doc.sents:
            for token in sent:
                if token.pos_ in UNIVERSAL_NOUN_TAGS:
                    entity_map["s%d" % i].append(
                        (token.text.upper(), token.dep_)
                    )
                    if token.text.upper() not in entity_grid:
                        entity_grid[token.text.upper()] = [u"-"] * n_sent
            i += 1
            entity_map["s%d" % i] = []

        # Last iteration will create an extra element, so I remove it.
        entity_map.pop("s%d" % i)

        # Fill entity grid
        for i in range(n_sent):
            sentence = "s%d" % (i + 1)
            for entity, dep in entity_map[sentence]:
                if entity_grid[entity][i] == u"-":
                    entity_grid[entity][i] = dependency_mapping(dep)
                elif dependency_mapping(dep) == u"S":
                    entity_grid[entity][i] = dependency_mapping(dep)
                elif (
                    dependency_mapping(dep) == u"O"
                    and entity_grid[entity][i] == u"X"
                ):
                    entity_grid[entity][i] = dependency_mapping(dep)

        # Compute feature vector, we consider transitions of length 2
        total_transitions = (n_sent - 1) * len(entity_grid.keys())

        for entity in entity_grid:
            for i in range(n_sent - 1):
                # Transition type found (e.g. S-)
                transition = (
                    entity_grid[entity][i] + entity_grid[entity][i + 1]
                )

                # Adding 1 to transition count
                entity_features[transition] += 1

        for prob in entity_features:
            if total_transitions != 0:
                entity_features[prob] /= float(total_transitions)
            else:
                entity_features[prob] = 0.0

        self.__grid = entity_grid
        self.__n_sent = n_sent
        self.__prob = entity_features

    def get_ss_transitions(self) -> float:
        """Get SS transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"SS"]

    def get_so_transitions(self) -> float:
        """Get SO transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"SO"]

    def get_sx_transitions(self) -> float:
        """Get SX transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"SX"]

    def get_sn_transitions(self) -> float:
        """Get S- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"S-"]

    def get_os_transitions(self) -> float:
        """Get OS transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"OS"]

    def get_oo_transitions(self) -> float:
        """Get OO transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"OO"]

    def get_ox_transitions(self) -> float:
        """Get OX transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"OX"]

    def get_on_transitions(self) -> float:
        """Get O- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"O-"]

    def get_xs_transitions(self) -> float:
        """Get XS transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"XS"]

    def get_xo_transitions(self) -> float:
        """Get XO transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"XO"]

    def get_xx_transitions(self) -> float:
        """Get XX transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"XX"]

    def get_xn_transitions(self) -> float:
        """Get X- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"X-"]

    def get_ns_transitions(self) -> float:
        """Get -S transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"-S"]

    def get_no_transitions(self) -> float:
        """Get -O transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"-O"]

    def get_nx_transitions(self) -> float:
        """Get -X transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"-X"]

    def get_nn_transitions(self) -> float:
        """Get -- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"--"]

    def get_egrid(self) -> dict:
        """Return obtained entity grid (for debugging purposes).

        :return: entity grid represented as a dict
        :rtype: dict
        """
        return self.__grid

    def get_sentence_count(self) -> int:
        """Return sentence count obtained while processing.

        :return: Number of sentences
        :rtype: int
        """
        return self.__n_sent

In [10]:
def weighting_syntactic_role(entity_role: str) -> int:
    """Return weight given an entity grammatical role.

    Weighting scheme for syntactic role of an entity. This uses the heuristic
    from :cite:`guinaudeau2013graph`, which is:

    +-----------+--------+
    | EGrid Tag | Weight |
    +===========+========+
    | S         | 3      |
    +-----------+--------+
    | O         | 2      |
    +-----------+--------+
    | X         | 1      |
    +-----------+--------+
    | dash      | 0      |
    +-----------+--------+

    :param entity_role: Entity grammatical role (S, O, X, -)
    :type entity_role: string
    :return: Role weight
    :rtype: int
    """
    if entity_role == u"S":
        return 3
    elif entity_role == u"O":
        return 2
    elif entity_role == u"X":
        return 1

    return 0


def get_local_coherence(egrid: EntityGrid) -> [float, float, float, float]:
    """Get local coherence from entity grid.

    This method gets the coherence value using all the approaches described
    in :cite:`guinaudeau2013graph`. This include:

    * local_coherence_PU
    * local_coherence_PW
    * local_coherence_PACC
    * local_coherence_PU_dist
    * local_coherence_PW_dist
    * local_coherence_PACC_dist

    :param egrid: An EntityGrid object.
    :type egrid: EntityGrid
    :return: Local coherence based on different heuristics
    :rtype: tuple of floats
    """
    n_sent = egrid.get_sentence_count()

    # If entity grid is not valid
    if n_sent < 2:
        return (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

    PW = [[0] * n_sent for i in range(n_sent)]

    # Weight Matrix for PACC, syntactic information is accounted for by
    # integrating the edges of the bipartite graph
    W = [[0] * n_sent for i in range(n_sent)]

    grid = egrid.get_egrid()
    for entity in grid:
        for i in range(n_sent):
            for j in range(i + 1, n_sent):
                if grid[entity][i] != u"-" and grid[entity][j] != u"-":
                    PW[i][j] += 1
                    W[i][j] += weighting_syntactic_role(
                        grid[entity][i]
                    ) * weighting_syntactic_role(grid[entity][j])

    PU = [list(map(lambda x: x != 0, PWi)) for PWi in PW]

    local_coherence_PU = 0.0
    local_coherence_PW = 0.0
    local_coherence_PACC = 0.0
    for i in range(n_sent):
        local_coherence_PW += sum(PW[i])
        local_coherence_PU += sum(PU[i])
        local_coherence_PACC += sum(W[i])

    local_coherence_PW /= n_sent
    local_coherence_PU /= n_sent
    local_coherence_PACC /= n_sent

    # Weighting projection graphs
    PU_weighted = list(PU)
    PW_weighted = list(PW)
    PACC_weighted = list(W)
    for i in range(n_sent):
        for j in range(i + 1, n_sent):
            PU_weighted[i][j] = PU[i][j] / float(j - i)
            PW_weighted[i][j] = PW[i][j] / float(j - i)
            PACC_weighted[i][j] = W[i][j] / float(j - i)

    local_coherence_PU_dist = 0.0
    local_coherence_PW_dist = 0.0
    local_coherence_PACC_dist = 0.0
    for i in range(n_sent):
        local_coherence_PW_dist += sum(PW_weighted[i])
        local_coherence_PU_dist += sum(PU_weighted[i])
        local_coherence_PACC_dist += sum(PACC_weighted[i])

    local_coherence_PW_dist /= n_sent
    local_coherence_PU_dist /= n_sent
    local_coherence_PACC_dist /= n_sent
    return (
        local_coherence_PU,
        local_coherence_PW,
        local_coherence_PACC,
        local_coherence_PU_dist,
        local_coherence_PW_dist,
        local_coherence_PACC_dist,
    )

In [11]:
df = pd.read_csv(
    f"./training_set_rel3.tsv",
    sep="\t",
    encoding="ISO-8859-1",
    usecols=["essay_id", "essay_set", "essay", "domain1_score", "domain2_score"],
)

In [12]:
from collections import Counter

df = df[df.essay_set == ESSAY_SET]
Counter(df.domain1_score)

Counter({0: 311, 3: 253, 2: 570, 1: 636})

In [13]:
min_rating, max_rating = int(df.domain1_score.min()), int(df.domain1_score.max())

In [14]:
df_ft = pd.DataFrame({
    "prompt": df.essay.tolist(),
    "completion": df.domain1_score.tolist()})

df_ft.to_json(f"essay_set{ESSAY_SET}.jsonl", orient='records', lines=True)

In [15]:
df_ft.head()

Unnamed: 0,prompt,completion
0,The author concludes the story with this becau...,0
1,The narrater has that in with Paragraph becuse...,0
2,The author concludes the story with that passa...,3
3,The author ended the story with this paragraph...,2
4,The author concludes the story with this parag...,2


# Generate data for Fine Tuning

We run the following tool:

`openai tools fine_tunes.prepare_data -f essay_set4.jsonl -q`

This will generate two data splits for the fine-tuning. One for training and other for validation

In [16]:
train_file = client.files.create(file=open(f"essay_set{ESSAY_SET}_prepared_train.jsonl", "rb"), purpose="fine-tune")
valid_file = client.files.create(file=open(f"essay_set{ESSAY_SET}_prepared_valid.jsonl", "rb"), purpose="fine-tune")
fine_tuning_job = client.fine_tuning.jobs.create(training_file=train_file.id, validation_file=valid_file.id, model="babbage-002")
print(fine_tuning_job)

FineTuningJob(id='ftjob-AW8uX2evtzbjfcfIzkV1nUrJ', created_at=1721583771, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='babbage-002', object='fine_tuning.job', organization_id='org-JihYzTh2GjJjoPtZZ0kQdsbr', result_files=[], seed=137058166, status='validating_files', trained_tokens=None, training_file='file-uyQPENGAQX2VoPCNe8m8jsXV', validation_file='file-20SKzaqhTkQm8x6H5bL1OGS1', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [17]:
texts, scores = (
    df.essay.tolist(),
    df.domain1_score.tolist())

In [18]:
vectorizer = CountVectorizer(min_df=10)

doc_term = vectorizer.fit_transform(df.essay)

In [19]:
# Getting CVA features https://files.eric.ed.gov/fulltext/ED525309.pdf

content_vectors = {}
n = df.shape[0]
ni = doc_term.getnnz(axis=0)
for i in range(min_rating, max_rating + 1):
    essay_score_cat = df[df.domain1_score == i]
    freqs = vectorizer.transform(essay_score_cat.essay)
    
    # Get frequencies score categories
    fis = freqs.sum(axis=0)
    fis = np.asarray(fis).reshape(-1)
    try:
        max_fs = freqs.max()
    except:
        print(i)
        raise
    content_vectors[i] = fis/max_fs * np.log(n / ni)

In [20]:
content_vectors[3].shape

(844,)

In [21]:
source_text = """
It was like walking into another world. A hot, moist world exploding with
greenery. Huge flat leaves, delicate wisps of tendrils, ferns and fronds and vines of
all shades and shapes grew in seemingly random profusion.
“Over there, in the corner, the hibiscus. Is that what you mean?” The florist
pointed at a leafy potted plant by the corner.
There, in a shaft of the wan afternoon sunlight, was a single bloodred blossom,
its five petals splayed back to reveal a long stamen tipped with yellow pollen.
Saeng felt a shock of recognition so intense, it was almost visceral.
“Saebba,” Saeng whispered.
A saebba hedge, tall and lush, had surrounded their garden, its lush green leaves
dotted with vermilion flowers. And sometimes after a monsoon rain, a blossom or
two would have blown into the well, so that when she drew up the well water, she
would find a red blossom floating in the bucket.
Slowly, Saeng walked down the narrow aisle toward the hibiscus. Orchids,
lanna bushes, oleanders, elephant ear begonias, and bougainvillea vines
surrounded her. Plants that she had not even realized she had known but had
forgotten drew her back into her childhood world.
When she got to the hibiscus, she reached out and touched a petal gently. It felt
smooth and cool, with a hint of velvet toward the center—just as she had known it
would feel.
9
THE WINTER HIBISCUS
And beside it was yet another old friend, a small shrub with waxy leaves and
dainty flowers with purplish petals and white centers. “Madagascar periwinkle,”
its tag announced. How strange to see it in a pot, Saeng thought. Back home it just
grew wild, jutting out from the cracks in brick walls or between tiled roofs. There
had been a patch of it by the little spirit house where she used to help her mother
light the incense and candles to the spirit who guarded their home and their family.
Sometimes she would casually pick a flower or two to leave on the offerings of
fruit and rice left at the altar.
And that rich, sweet scent—that was familiar, too. Saeng scanned the greenery
around her and found a tall, gangly plant with exquisite little white blossoms on it.
“Dok Malik,” she said, savoring the feel of the word on her tongue, even as she
silently noted the English name on its tag, “jasmine.”
One of the blossoms had fallen off, and carefully Saeng picked it up and smelled
it. She closed her eyes and breathed in, deeply. The familiar fragrance filled her
lungs, and Saeng could almost feel the light strands of her grandmother’s long gray
hair, freshly washed, as she combed it out with the fine-toothed buffalo-horn
comb. And when the sun had dried it, Saeng would help the gnarled old fingers
knot the hair into a bun, then slip a dok Malik bud into it.
Saeng looked at the white bud in her hand now, small and fragile. Gently, she
closed her palm around it and held it tight. That, at least, she could hold on to. But
where was the finetoothed comb? The hibiscus hedge? The well? Her gentle
grandmother?
A wave of loss so deep and strong that it stung Saeng’s eyes now swept over her.
A blink, a channel switch, a boat ride in the night, and it was all gone.
Irretrievably, irrevocably gone.
And in the warm moist shelter of the greenhouse, Saeng broke down and wept.
It was already dusk when Saeng reached home. The wind was blowing harder,
tearing off the last remnants of green in the chicory weeds that were growing out
of the cracks in the sidewalk. As if oblivious to the cold, her mother was still out in
the vegetable garden, digging up the last of the onions with a rusty trowel. She did
not see Saeng until the girl had quietly knelt down next to her.
Her smile of welcome warmed Saeng. “Ghup ma laio le? You’re back?” she said
cheerfully. “Goodness, it’s past five. What took you so long? How did it go? Did
you—?” Then she noticed the potted plant that Saeng was holding, its leaves
quivering in the wind.
Mrs. Panouvong uttered a small cry of surprise and delight. “Dok faeng-noi!”
she said. “Where did you get it?”
“I bought it,” Saeng answered, dreading her mother’s next question.
“How much?”
For answer Saeng handed her mother some coins.
“That’s all?” Mrs. Panouvong said, appalled. “Oh, but I forgot! You and the
Lambert boy ate Bee-Maags. . . .”
10
THE WINTER HIBISCUS
“No, we didn’t, Mother,” Saeng said.
“Then what else—?”
“Nothing else. I paid over nineteen dollars for it.”
“You what?” Her mother stared at her incredulously. “But how could you? All
the seeds for this vegetable garden didn’t cost that much! You know how much
we—” She paused, as she noticed the tearstains on her daughter’s cheeks and her
puffy eyes.
“What happened?” she asked, more gently.
“I—I failed the test,” Saeng said.
For a long moment Mrs. Panouvong said nothing. Saeng did not dare to look
her mother in the eye. Instead, she stared at the hibiscus plant and nervously tore
off a leaf, shredding it to bits.
Her mother reached out and brushed the fragments of green off Saeng’s hands.
“It’s a beautiful plant, this dok faeng-noi,” she finally said. “I’m glad you got it.”
“It’s—it’s not a real one,” Saeng mumbled. “I mean, not like the kind we had
at—at—” She found that she was still too shaky to say the words at home, lest she
burst into tears again. “Not like the kind we had before,” she said.
“I know,” her mother said quietly. “I’ve seen this kind blooming along the lake.
Its flowers aren’t as pretty, but it’s strong enough to make it through the cold
months here, this winter hibiscus. That’s what matters.”
 She tipped the pot and deftly eased the ball of soil out, balancing the rest of the
plant in her other hand. “Look how rootbound it is, poor thing,” she said. “Let’s
plant it, right now.”
She went over to the corner of the vegetable patch and started to dig a hole in
the ground. The soil was cold and hard, and she had trouble thrusting the shovel
into it. Wisps of her gray hair trailed out in the breeze, and her slight frown
deepened the wrinkles around her eyes. There was a frail, wiry beauty to her that
touched Saeng deeply.
“Here, let me help, Mother,” she offered, getting up and taking the shovel away
from her.
Mrs. Panouvong made no resistance. “I’ll bring in the hot peppers and bitter
melons, then, and start dinner. How would you like an omelet with slices of the
bitter melon?”
“I’d love it,” Saeng said.
Left alone in the garden, Saeng dug out a hole and carefully lowered the “winter
hibiscus” into it. She could hear the sounds of cooking from the kitchen now, the
beating of the eggs against a bowl, the sizzle of hot oil in the pan. The pungent
smell of bitter melon wafted out, and Saeng’s mouth watered. It was a cultivated
taste, she had discovered—none of her classmates or friends, not even Mrs.
Lambert, liked it—this sharp, bitter melon that left a golden aftertaste on the
tongue. But she had grown up eating it and, she admitted to herself, much
preferred it to a Big Mac.
11
THE WINTER HIBISCUS
The “winter hibiscus” was in the ground now, and Saeng tamped down the soil
around it. Overhead, a flock of Canada geese flew by, their faint honks clear and—
yes—familiar to Saeng now. Almost reluctantly, she realized that many of the
things that she had thought of as strange before had become, through the quiet
repetition of season upon season, almost familiar to her now. Like the geese. She
lifted her head and watched as their distinctive V was etched against the evening
sky, slowly fading into the distance.
When they come back, Saeng vowed silently to herself, in the spring, when the
snows melt and the geese return and this hibiscus is budding, then I will take that
test again
""".replace("\n", " ")

In [22]:
source_text

' It was like walking into another world. A hot, moist world exploding with greenery. Huge flat leaves, delicate wisps of tendrils, ferns and fronds and vines of all shades and shapes grew in seemingly random profusion. “Over there, in the corner, the hibiscus. Is that what you mean?” The florist pointed at a leafy potted plant by the corner. There, in a shaft of the wan afternoon sunlight, was a single bloodred blossom, its five petals splayed back to reveal a long stamen tipped with yellow pollen. Saeng felt a shock of recognition so intense, it was almost visceral. “Saebba,” Saeng whispered. A saebba hedge, tall and lush, had surrounded their garden, its lush green leaves dotted with vermilion flowers. And sometimes after a monsoon rain, a blossom or two would have blown into the well, so that when she drew up the well water, she would find a red blossom floating in the bucket. Slowly, Saeng walked down the narrow aisle toward the hibiscus. Orchids, lanna bushes, oleanders, elephant

In [23]:
dalle_list = set()
with open("./dalle_chall.txt", "r") as fp:
    for line in fp:
        for word in word_tokenize(line.strip()):
            dalle_list.add(word.lower())

In [34]:
fine_tune_results = client.fine_tuning.jobs.retrieve(fine_tuning_job.id)
ft_model = fine_tune_results.fine_tuned_model

In [1]:
text = texts[1]


def get_features(text: str, score: int):
    stopwords_set = set(stopwords.words("english"))
    text_blob = TextBlob(text)
    prompt = text

    # Best effort correcting text
    corrected_text = str(text_blob.correct())
    text = re.sub(r"[^a-zA-Z\s.,\']", " ", text)
    corrected_text = re.sub(r"[^a-zA-Z\s.,\']", " ", corrected_text)
    tokens = word_tokenize(corrected_text)
    tokens_prev = word_tokenize(text)

    # Estimate errors
    num_errors = sum(1 for w1, w2 in zip(tokens, tokens_prev) if w1 != w2)

    sentences = [word_tokenize(sent) for sent in sent_tokenize(text)]
    text_blob = TextBlob(corrected_text)

    features = {"num_errors": num_errors}

    # Surface features
    num_characters = len(text)
    words = [word for word in tokens if len(word) > 1]
    word_count = len(words)
    average_word_length = sum(len(word) for word in words) / len(words)
    num_sentences = len(sentences)
    average_sentence_length = sum(len(sent) for sent in sentences) / len(sentences)
    num_different_words = len(set(words))
    num_of_stopwords = len([word for word in words if word in stopwords_set])

    features.update(
        {
            "num_characters": num_characters,
            "word_count": word_count,
            "average_word_length": average_word_length,
            "num_sentences": num_sentences,
            "average_sentence_length": average_sentence_length,
            "num_different_words": num_different_words,
            "num_of_stopwords": num_of_stopwords,
        }
    )

    syllable_estimates = [syllables.estimate(word) for word in words]
    syllable_count = sum(syllable_estimates)

    # Readability
    flesch_reading_ease = (
        206.835
        - 1.015 * (word_count / num_sentences)
        - 84.6 * (syllable_count / word_count)
    )
    flesch_kincaid_grade_level = (
        0.39 * (word_count / num_sentences)
        + 11.8 * (syllable_count / word_count)
        - 15.59
    )

    difficult_word_count = len([word for word in words if word not in dalle_list])
    dalle_chall_readability = 0.1579 * (
        difficult_word_count / word_count * 100
    ) + 0.0496 * (word_count / num_sentences)
    # Automated readability index
    ari = (
        4.71 * (num_characters / word_count)
        + 0.5 * (word_count / num_sentences)
        - 21.43
    )

    group_t = sentences[0:10]
    group_m = sentences[len(sentences) // 2 - 5 : len(sentences) // 2 + 5]
    group_b = sentences[-10:]
    nsw = 0
    for sent in group_t + group_m + group_b:
        for word in sent:
            if syllables.estimate(word) >= 3:
                nsw += 1

    smog = 1.043 * np.sqrt(nsw) * 30 / len(sentences) + 3.1291

    # LIX
    B = len([w for w in tokens if w[0].isupper() or len(w) == 1])
    C = len([w for w in words if len(w) > 6])
    lix = word_count / B + (C * 100) / word_count

    wvi = np.log(word_count) / np.log(
        2 - np.log(num_different_words) / np.log(word_count)
    )
    gunning_fog_index = 0.4 * ((word_count / num_sentences) + 100 * (nsw / word_count))
    

    features.update(
        {
            "flesch_reading_ease": flesch_reading_ease,
            "flesch_kincaid_grade_level": flesch_kincaid_grade_level,
            "dalle_chall_readability": dalle_chall_readability,
            "ari": ari,
            "smog": smog,
            "lix": lix,
            "wvi": wvi,
            "gunning_fog_index": gunning_fog_index,
        }
    )

    # Lexical diversity
    ttr = num_different_words / word_count
    rs, word_counts = defaultdict(int), defaultdict(int)
    for word in words:
        word_counts[word] += 1
    for _, r in word_counts.items():
        rs[r] += 1

    yule_k = 1e4 * (sum(r**2 * vr for r, vr in rs.items()) - word_count) / word_count**2

    min_range, max_range, trials = 35, 50, 5
    ns = np.arange(min_range, max_range + 1)
    ttrs = []
    for idx, sample_size in enumerate(ns):
        ttr = 0
        if sample_size <= len(words):
            for trial in range(trials):
                word_list = np.random.choice(words, sample_size, replace=False)
                ttr += len(set(word_list)) / len(word_list)
            ttrs.append(ttr / trials)
    ttrs = np.array(ttrs)
    A = np.vstack([2 * (1 - ttrs) / ns[0:len(ttrs)]]).T
    y = ttrs**2
    d = np.linalg.lstsq(A, y, rcond=None)[0]
    d_estimate = d[0]
    hapax_legomena = rs[1]

    guiraud = num_different_words / np.sqrt(word_count)
    advanced_guiraud = difficult_word_count / np.sqrt(word_count)
    features.update(
        {
            "ttr": ttr,
            "yule_k": yule_k,
            "d_estimate": d_estimate,
            "hapax_legomena": rs[1],
            "guiraud": guiraud,
            "advanced_guiraud": advanced_guiraud,
        }
    )

    # POS tags
    pos_tags = defaultdict(int)
    total_pos_tags = 0
    unique_pos_tags = 0
    for _, tag in text_blob.tags:
        if tag not in pos_tags:
            unique_pos_tags += 1
        pos_tags[tag] += 1
        total_pos_tags += 1

    pos_dist = {}
    for tag, count in pos_tags.items():
        pos_dist[tag] = count/total_pos_tags
        
    features["total_pos_tags"] = total_pos_tags
    features["unique_pos_tags"] = unique_pos_tags
    features.update(pos_dist)
    features.update(pos_tags)
    
    # Discourse patterns
    doc = nlp(corrected_text)
    egrid = EntityGrid(doc)
    (
        local_coherence_PU,
        local_coherence_PW,
        local_coherence_PACC,
        local_coherence_PU_dist,
        local_coherence_PW_dist,
        local_coherence_PACC_dist,
    ) = get_local_coherence(egrid)
    
    features.update({
        "ss_transitions": egrid.get_ss_transitions(),
        "so_transitions": egrid.get_so_transitions(),
        "sx_transitions": egrid.get_sx_transitions(),
        "sn_transitions": egrid.get_sn_transitions(),
        "os_transitions": egrid.get_os_transitions(),
        "oo_transitions": egrid.get_oo_transitions(),
        "ox_transitions": egrid.get_ox_transitions(),
        "on_transitions": egrid.get_on_transitions(),
        "xs_transitions": egrid.get_xs_transitions(),
        "xo_transitions": egrid.get_xo_transitions(),
        "xx_transitions": egrid.get_xx_transitions(),
        "xn_transitions": egrid.get_xn_transitions(),
        "ns_transitions": egrid.get_ns_transitions(),
        "no_transitions": egrid.get_no_transitions(),
        "nx_transitions": egrid.get_nx_transitions(),
        "nn_transitions": egrid.get_nn_transitions(),
        "local_coherence_PU": local_coherence_PU,
        "local_coherence_PW": local_coherence_PW,
        "local_coherence_PACC": local_coherence_PACC,
        "local_coherence_PU_dist": local_coherence_PU_dist,
        "local_coherence_PW_dist": local_coherence_PW_dist,
        "local_coherence_PACC_dist": local_coherence_PACC_dist,
    })

    # CVA Features
    
    # Compute weight scores for the essay
    fi = vectorizer.transform([text])
    max_f = fi.max()
    wi = (fi / max_f).toarray() * np.log(n / ni)


    # Maximum similarity to the best score category
    max_sim_best = np.dot(wi, content_vectors[max_rating])/(norm(wi)*norm(content_vectors[max_rating]))
    max_sim_best = max_sim_best[0]

    pattern_cosine = 0
    val_cos = 0
    max_cos = 0
    for i in range(min_rating, max_rating + 1):
        # Similarity between content vectors of category i and the essay vector
        cos = np.dot(wi, content_vectors[i])/(norm(wi)*norm(content_vectors[i]))
        pattern_cosine += i * cos[0]
        
        # This is for the val cosine, e.g. cos_4 + cos_3 - cos_2 - cos_1
        if i < (min(scores) + max(scores)) // 2:
            val_cos -= cos[0]
        else:
            val_cos += cos[0]

        # We are also looking for the score category closest to the essay
        if cos[0] >= max_cos:
            max_cos_val = i
            max_cos = cos[0]
            
    fsource = vectorizer.transform([source_text])
    max_fsource = fsource.max()
    wsource = (fi / max_fsource).toarray() * np.log(n / ni)
    cos_source = np.dot(wi.squeeze(), wsource.squeeze())/(norm(wi)*norm(wsource))
    
    features.update({
        "max_cos_val": max_cos_val,
        "max_sim_best": max_sim_best,
        "pattern_cosine": pattern_cosine,
        "val_cos": val_cos,
        "similarity_source_text": cos_source,
    })
    
    # GPT4 fine-tuned model as a feature
    prompt = prompt + '\n\n###\n\n'
    res = client.completions.create(
        model=ft_model,
        prompt=prompt,
        max_tokens=2, temperature=0)

    gpt_score = int(re.sub(r"[^0-9]", "", res.choices[0].text)) % 10
    features["gpt_score"] = gpt_score
    
# These are experimental, based on spatial measurements
#     dist = cosine_distances(embeddings)
#     nn_distances = np.min(dist + np.diag(np.diag(dist) + 10), axis=1)
#     avg_nn_distance = np.mean(nn_distances)
#     max_nn_distance = np.max(nn_distances)
#     min_nn_distance = np.min(nn_distances)
#     r_distance = 2*np.sqrt(dist.shape[0])*avg_nn_distance
#     cum_freq_dist_nn_dist = np.mean(nn_distances <= avg_nn_distance)
#     givenness = []
#     for i in range(2, len(embeddings)):
#         x = embeddings[0:i]
#         u, s, vh = np.linalg.svd(x)
#         orthonormal_vector = vh[-1]
#         givenness.append(np.dot(embeddings[i], orthonormal_vector))

#     avg_givenness = np.mean(givenness)
#     max_givenness = np.max(givenness)
#     min_givenness = np.min(givenness)
#     givenness_proj = []
#     for i in range(2, len(embeddings)):
#         x = np.array(embeddings[0:i])
#         A = x.T
#         b = embeddings[i]
#         c = np.linalg.lstsq(A, b, rcond=None)[0]
#         bw = A.dot(c)
#         bwo = b - bw
#         N = np.dot(b, bwo)
#         G = np.dot(b, bw)
#         givenness_proj.append(N / (G + N))

#     avg_givenness_proj = np.mean(givenness_proj)
#     min_givenness_proj = np.min(givenness_proj)
#     max_givenness_proj = np.max(givenness_proj)
#     max_min_giv_ratio = max_givenness / min_givenness
#     centroid = np.sum(embeddings, axis=0) / len(embeddings)
#     dist_to_centroid = []
#     for embedding in embeddings:
#         dist_to_centroid.append(np.dot(centroid, embedding))

#     avg_dist_to_centroid = np.mean(dist_to_centroid)
#     max_dist_to_centroid = np.max(dist_to_centroid)
#     min_dist_to_centroid = np.min(dist_to_centroid)
#     std_distance = np.sqrt(np.sum(np.sum(
#         (embeddings - centroid)**2, axis=1))/len(embeddings))
#     relative_distance = std_distance / max_dist_to_centroid
#     det_dist = np.linalg.det(dist)

#     features.update({
#         "avg_nn_distance": avg_nn_distance,
#         "max_nn_distance": max_nn_distance,
#         "min_nn_distance": min_nn_distance,
#         "r_distance": r_distance,
#         "cum_freq_dist_nn_dist": cum_freq_dist_nn_dist,
#         "avg_givenness": avg_givenness,
#         "max_givenness": max_givenness,
#         "min_givenness": min_givenness,
#         "max_min_giv_ratio": max_min_giv_ratio,
#         "avg_givenness_proj": avg_givenness_proj,
#         "min_givenness_proj": min_givenness_proj,
#         "max_givenness_proj": max_givenness_proj,
#         "max_min_giv_ratio": max_min_giv_ratio,
#         "avg_dist_to_centroid": avg_dist_to_centroid,
#         "max_dist_to_centroid": max_dist_to_centroid,
#         "min_dist_to_centroid": min_dist_to_centroid,
#         "std_distance": std_distance,
#         "relative_distance": relative_distance,
#         "det_dist": det_dist,
#     })
    features["score"] = score

    return features

# print(get_features(texts[0], scores[1]))
args = list(zip(texts, scores))
train_result = pqdm(args, get_features, n_jobs=8, argument_type="args")

NameError: name 'texts' is not defined

In [40]:
train_results = []
for idx, res in enumerate(train_result):
    if type(res) != dict:
        continue
    train_results.append(res)
        
essay_set_features = pd.DataFrame(train_results).fillna(0)

In [41]:
X, y = essay_set_features.drop(["score", "gpt_score"], axis=1), essay_set_features.score

In [42]:
clf = RandomForestRegressor(
    n_estimators=200, max_features=1/3, max_depth=12)

In [43]:
def essay_metrics(clf, X, y_true):
    y_pred = np.clip(np.round(clf.predict(X)), min_rating, max_rating)
    qwk = quadratic_weighted_kappa(y_true, y_pred, min_rating=min_rating, max_rating=max_rating)
    ea = np.sum(y_true == y_pred) / y_true.shape[0]
    aa = np.sum((y_true - y_pred) <= 1) / y_true.shape[0]
    return {
        "qwk": qwk,
        "ea": ea,
        "aa": aa
    }

In [44]:
assert min_rating == y.min()
assert max_rating == y.max()

# DP Model Alone

In [45]:
from sklearn.model_selection import cross_validate

results = cross_validate(clf, X, y, scoring=essay_metrics, cv=10)

In [46]:
np.max(results["test_qwk"]), np.max(results["test_ea"]), np.max(results["test_aa"])

(0.8406405146493252, 0.7610062893081762, 1.0)

# GPT Model Alone

In [47]:
y_pred = essay_set_features.gpt_score
y_true = essay_set_features.score
qwk = quadratic_weighted_kappa(y_true, y_pred, min_rating=min_rating, max_rating=max_rating)
ea = np.sum(y_true == y_pred) / y_true.shape[0]
aa = np.sum((y_true - y_pred) <= 1) / y_true.shape[0]
qwk, ea, aa

(0.7464776849742681, 0.6735849056603773, 0.9974842767295597)

# Hybrid Model

In [48]:
X, y = essay_set_features.drop(["score"], axis=1), essay_set_features.score
clf = RandomForestRegressor(
    n_estimators=200, max_features=1/3, max_depth=12)
results = cross_validate(clf, X, y, scoring=essay_metrics, cv=10)
np.max(results["test_qwk"]), np.max(results["test_ea"]), np.max(results["test_aa"])

(0.831983092638253, 0.7421383647798742, 1.0)