In [1]:
from typing import List
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import syllables
from collections import defaultdict
from pqdm.processes import pqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
import re
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import pandas as pd
import os
import spacy
from numpy.linalg import norm
from tqdm import tqdm

import os
from openai import OpenAI


client = OpenAI(api_key=os.environ["OPEN_AI_API_KEY"])
nlp = spacy.load("en_core_web_sm")
ESSAY_SET = 6

# Define QWK Metric functions

In [2]:
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the quadratic weighted kappa
    quadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def linear_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the linear weighted kappa
    linear_weighted_kappa calculates the linear weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    linear_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    linear_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = abs(i - j) / float(num_ratings - 1)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the kappa
    kappa calculates the kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            if i == j:
                d = 0.0
            else:
                d = 1.0
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def mean_quadratic_weighted_kappa(kappas, weights=None):
    """
    Calculates the mean of the quadratic
    weighted kappas after applying Fisher's r-to-z transform, which is
    approximately a variance-stabilizing transformation.  This
    transformation is undefined if one of the kappas is 1.0, so all kappa
    values are capped in the range (-0.999, 0.999).  The reverse
    transformation is then applied before returning the result.

    mean_quadratic_weighted_kappa(kappas), where kappas is a vector of
    kappa values

    mean_quadratic_weighted_kappa(kappas, weights), where weights is a vector
    of weights that is the same size as kappas.  Weights are applied in the
    z-space
    """
    kappas = np.array(kappas, dtype=float)
    if weights is None:
        weights = np.ones(np.shape(kappas))
    else:
        weights = weights / np.mean(weights)

    # ensure that kappas are in the range [-.999, .999]
    kappas = np.array([min(x, .999) for x in kappas])
    kappas = np.array([max(x, -.999) for x in kappas])

    z = 0.5 * np.log((1 + kappas) / (1 - kappas)) * weights
    z = np.mean(z)
    return (np.exp(2 * z) - 1) / (np.exp(2 * z) + 1)


def weighted_mean_quadratic_weighted_kappa(solution, submission):
    predicted_score = submission[submission.columns[-1]].copy()
    predicted_score.name = "predicted_score"
    if predicted_score.index[0] == 0:
        predicted_score = predicted_score[:len(solution)]
        predicted_score.index = solution.index
    combined = solution.join(predicted_score, how="left")
    groups = combined.groupby(by="essay_set")
    kappas = [quadratic_weighted_kappa(group[1]["essay_score"], group[1]["predicted_score"]) for group in groups]
    weights = [group[1]["essay_weight"].irow(0) for group in groups]
    return mean_quadratic_weighted_kappa(kappas, weights=weights)

In [3]:
UNIVERSAL_NOUN_TAGS = set([u"NOUN", u"PRON", u"PROPN"])

ordered_transitions = [
    u"SS",
    u"SO",
    u"SX",
    u"S-",
    u"OS",
    u"OO",
    u"OX",
    u"O-",
    u"XS",
    u"XO",
    u"XX",
    u"X-",
    u"-S",
    u"-O",
    u"-X",
    u"--",
]


def dependency_mapping(dep: str) -> str:
    """Map dependency tag to entity grid tag.

    We consider the notation provided in :cite:`barzilay2008modeling`:

    +-----------+-----------------------------------+
    | EGrid Tag | Dependency Tag                    |
    +===========+===================================+
    | S         | nsub, csubj, csubjpass, dsubjpass |
    +-----------+-----------------------------------+
    | O         | iobj, obj, pobj, dobj             |
    +-----------+-----------------------------------+
    | X         | For any other dependency tag      |
    +-----------+-----------------------------------+

    :param dep: Dependency tag
    :type dep: string
    :return: EGrid tag
    :rtype: string
    """
    if dep in {u"nsubj", u"csubj", u"csubjpass", u"dsubjpass"}:
        return u"S"
    if dep in {u"iobj", u"obj", u"pobj", u"dobj"}:
        return u"O"

    return "X"


class EntityGrid(object):
    """Entity grid class.

    Class Entity Grid, creates an entity grid from a doc, which is output of
    applying spacy.nlp(text) to a text. Thus, this class depends on spacy
    module. It only supports 2-transitions entity grid.
    """

    def __init__(self, doc):
        """Construct EntityGrid object."""
        # Initialization
        entity_map = dict()
        entity_grid = dict()
        i = 1
        entity_map["s%d" % i] = []
        entity_features = {
            u"SS": 0,
            u"SO": 0,
            u"SX": 0,
            u"S-": 0,
            u"OS": 0,
            u"OO": 0,
            u"OX": 0,
            u"O-": 0,
            u"XS": 0,
            u"XO": 0,
            u"XX": 0,
            u"X-": 0,
            u"-S": 0,
            u"-O": 0,
            u"-X": 0,
            u"--": 0,
        }


        n_sent = len(list(doc.sents))

        # To get coherence measurements we need at least 2 sentences
        if n_sent < 2:
            raise RuntimeError(
                "Entity grid needs at least two sentences, found: {}".format(
                    n_sent
                )
            )

        # For each sentence, get dependencies and its grammatical role
        for sent in doc.sents:
            for token in sent:
                if token.pos_ in UNIVERSAL_NOUN_TAGS:
                    entity_map["s%d" % i].append(
                        (token.text.upper(), token.dep_)
                    )
                    if token.text.upper() not in entity_grid:
                        entity_grid[token.text.upper()] = [u"-"] * n_sent
            i += 1
            entity_map["s%d" % i] = []

        # Last iteration will create an extra element, so I remove it.
        entity_map.pop("s%d" % i)

        # Fill entity grid
        for i in range(n_sent):
            sentence = "s%d" % (i + 1)
            for entity, dep in entity_map[sentence]:
                if entity_grid[entity][i] == u"-":
                    entity_grid[entity][i] = dependency_mapping(dep)
                elif dependency_mapping(dep) == u"S":
                    entity_grid[entity][i] = dependency_mapping(dep)
                elif (
                    dependency_mapping(dep) == u"O"
                    and entity_grid[entity][i] == u"X"
                ):
                    entity_grid[entity][i] = dependency_mapping(dep)

        # Compute feature vector, we consider transitions of length 2
        total_transitions = (n_sent - 1) * len(entity_grid.keys())

        for entity in entity_grid:
            for i in range(n_sent - 1):
                # Transition type found (e.g. S-)
                transition = (
                    entity_grid[entity][i] + entity_grid[entity][i + 1]
                )

                # Adding 1 to transition count
                entity_features[transition] += 1

        for prob in entity_features:
            if total_transitions != 0:
                entity_features[prob] /= float(total_transitions)
            else:
                entity_features[prob] = 0.0

        self.__grid = entity_grid
        self.__n_sent = n_sent
        self.__prob = entity_features

    def get_ss_transitions(self) -> float:
        """Get SS transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"SS"]

    def get_so_transitions(self) -> float:
        """Get SO transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"SO"]

    def get_sx_transitions(self) -> float:
        """Get SX transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"SX"]

    def get_sn_transitions(self) -> float:
        """Get S- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"S-"]

    def get_os_transitions(self) -> float:
        """Get OS transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"OS"]

    def get_oo_transitions(self) -> float:
        """Get OO transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"OO"]

    def get_ox_transitions(self) -> float:
        """Get OX transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"OX"]

    def get_on_transitions(self) -> float:
        """Get O- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"O-"]

    def get_xs_transitions(self) -> float:
        """Get XS transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"XS"]

    def get_xo_transitions(self) -> float:
        """Get XO transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"XO"]

    def get_xx_transitions(self) -> float:
        """Get XX transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"XX"]

    def get_xn_transitions(self) -> float:
        """Get X- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"X-"]

    def get_ns_transitions(self) -> float:
        """Get -S transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"-S"]

    def get_no_transitions(self) -> float:
        """Get -O transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"-O"]

    def get_nx_transitions(self) -> float:
        """Get -X transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"-X"]

    def get_nn_transitions(self) -> float:
        """Get -- transitions.

        :return: Ratio of transitions
        :rtype: float
        """
        return self.__prob[u"--"]

    def get_egrid(self) -> dict:
        """Return obtained entity grid (for debugging purposes).

        :return: entity grid represented as a dict
        :rtype: dict
        """
        return self.__grid

    def get_sentence_count(self) -> int:
        """Return sentence count obtained while processing.

        :return: Number of sentences
        :rtype: int
        """
        return self.__n_sent

In [4]:
def weighting_syntactic_role(entity_role: str) -> int:
    """Return weight given an entity grammatical role.

    Weighting scheme for syntactic role of an entity. This uses the heuristic
    from :cite:`guinaudeau2013graph`, which is:

    +-----------+--------+
    | EGrid Tag | Weight |
    +===========+========+
    | S         | 3      |
    +-----------+--------+
    | O         | 2      |
    +-----------+--------+
    | X         | 1      |
    +-----------+--------+
    | dash      | 0      |
    +-----------+--------+

    :param entity_role: Entity grammatical role (S, O, X, -)
    :type entity_role: string
    :return: Role weight
    :rtype: int
    """
    if entity_role == u"S":
        return 3
    elif entity_role == u"O":
        return 2
    elif entity_role == u"X":
        return 1

    return 0


def get_local_coherence(egrid: EntityGrid) -> [float, float, float, float]:
    """Get local coherence from entity grid.

    This method gets the coherence value using all the approaches described
    in :cite:`guinaudeau2013graph`. This include:

    * local_coherence_PU
    * local_coherence_PW
    * local_coherence_PACC
    * local_coherence_PU_dist
    * local_coherence_PW_dist
    * local_coherence_PACC_dist

    :param egrid: An EntityGrid object.
    :type egrid: EntityGrid
    :return: Local coherence based on different heuristics
    :rtype: tuple of floats
    """
    n_sent = egrid.get_sentence_count()

    # If entity grid is not valid
    if n_sent < 2:
        return (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

    PW = [[0] * n_sent for i in range(n_sent)]

    # Weight Matrix for PACC, syntactic information is accounted for by
    # integrating the edges of the bipartite graph
    W = [[0] * n_sent for i in range(n_sent)]

    grid = egrid.get_egrid()
    for entity in grid:
        for i in range(n_sent):
            for j in range(i + 1, n_sent):
                if grid[entity][i] != u"-" and grid[entity][j] != u"-":
                    PW[i][j] += 1
                    W[i][j] += weighting_syntactic_role(
                        grid[entity][i]
                    ) * weighting_syntactic_role(grid[entity][j])

    PU = [list(map(lambda x: x != 0, PWi)) for PWi in PW]

    local_coherence_PU = 0.0
    local_coherence_PW = 0.0
    local_coherence_PACC = 0.0
    for i in range(n_sent):
        local_coherence_PW += sum(PW[i])
        local_coherence_PU += sum(PU[i])
        local_coherence_PACC += sum(W[i])

    local_coherence_PW /= n_sent
    local_coherence_PU /= n_sent
    local_coherence_PACC /= n_sent

    # Weighting projection graphs
    PU_weighted = list(PU)
    PW_weighted = list(PW)
    PACC_weighted = list(W)
    for i in range(n_sent):
        for j in range(i + 1, n_sent):
            PU_weighted[i][j] = PU[i][j] / float(j - i)
            PW_weighted[i][j] = PW[i][j] / float(j - i)
            PACC_weighted[i][j] = W[i][j] / float(j - i)

    local_coherence_PU_dist = 0.0
    local_coherence_PW_dist = 0.0
    local_coherence_PACC_dist = 0.0
    for i in range(n_sent):
        local_coherence_PW_dist += sum(PW_weighted[i])
        local_coherence_PU_dist += sum(PU_weighted[i])
        local_coherence_PACC_dist += sum(PACC_weighted[i])

    local_coherence_PW_dist /= n_sent
    local_coherence_PU_dist /= n_sent
    local_coherence_PACC_dist /= n_sent
    return (
        local_coherence_PU,
        local_coherence_PW,
        local_coherence_PACC,
        local_coherence_PU_dist,
        local_coherence_PW_dist,
        local_coherence_PACC_dist,
    )

In [5]:
df = pd.read_csv(
    f"./training_set_rel3.tsv",
    sep="\t",
    encoding="ISO-8859-1",
    usecols=["essay_id", "essay_set", "essay", "domain1_score", "domain2_score"],
)

In [6]:
from collections import Counter

df = df[df.essay_set == ESSAY_SET]
Counter(df.domain1_score)

Counter({2: 405, 3: 817, 4: 367, 1: 167, 0: 44})

In [7]:
min_rating, max_rating = int(df.domain1_score.min()), int(df.domain1_score.max())

In [8]:
df_ft = pd.DataFrame({
    "prompt": df.essay.tolist(),
    "completion": df.domain1_score.tolist()})

df_ft.to_json(f"essay_set{ESSAY_SET}.jsonl", orient='records', lines=True)

In [9]:
df_ft.head()

Unnamed: 0,prompt,completion
0,There were many obstacles that the builders fa...,2
1,"Him from the start, there would have been many...",3
2,The builders of the Empire State Building face...,4
3,In the passage The Mooring Mast by Marcia Amid...,1
4,The builders of the Empire State Building face...,3


# Generate data for Fine Tuning

We run the following tool:

`openai tools fine_tunes.prepare_data -f essay_set6.jsonl -q`

This will generate two data splits for the fine-tuning. One for training and other for validation

In [10]:
train_file = client.files.create(file=open(f"essay_set{ESSAY_SET}_prepared_train.jsonl", "rb"), purpose="fine-tune")
valid_file = client.files.create(file=open(f"essay_set{ESSAY_SET}_prepared_valid.jsonl", "rb"), purpose="fine-tune")
fine_tuning_job = client.fine_tuning.jobs.create(training_file=train_file.id, validation_file=valid_file.id, model="babbage-002")
print(fine_tuning_job)

FineTuningJob(id='ftjob-jcg8J3u38NZhMhFxqoyakQDS', created_at=1721591701, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='babbage-002', object='fine_tuning.job', organization_id='org-JihYzTh2GjJjoPtZZ0kQdsbr', result_files=[], seed=189849559, status='validating_files', trained_tokens=None, training_file='file-jAqUSJPrHJVYA08xF3emW57W', validation_file='file-ROwYPwST2kEUcgcActpQNAkP', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [11]:
texts, scores = (
    df.essay.tolist(),
    df.domain1_score.tolist())

In [12]:
vectorizer = CountVectorizer(min_df=10)

doc_term = vectorizer.fit_transform(df.essay)

In [13]:
# Getting CVA features https://files.eric.ed.gov/fulltext/ED525309.pdf

content_vectors = {}
n = df.shape[0]
ni = doc_term.getnnz(axis=0)
for i in range(min_rating, max_rating + 1):
    essay_score_cat = df[df.domain1_score == i]
    freqs = vectorizer.transform(essay_score_cat.essay)
    
    # Get frequencies score categories
    fis = freqs.sum(axis=0)
    fis = np.asarray(fis).reshape(-1)
    try:
        max_fs = freqs.max()
    except:
        print(i)
        raise
    content_vectors[i] = fis/max_fs * np.log(n / ni)

In [14]:
content_vectors[3].shape

(1107,)

In [16]:
source_text = """
When the Empire State Building was conceived, it was planned as the world’s tallest building, taller even than the new Chrysler Building that was being constructed at Forty-second Street and Lexington Avenue in New York. At seventy-seven stories, it was the tallest building before the Empire State began construction, and Al Smith was determined to outstrip it in height.
The architect building the Chrysler Building, however, had a trick up his sleeve. He secretly constructed a 185-foot spire inside the building, and then shocked the public and the media by hoisting it up to the top of the Chrysler Building, bringing it to a height of 1,046 feet, 46 feet taller than the originally announced height of the Empire State Building.
Al Smith realized that he was close to losing the title of world’s tallest building, and on December 11, 1929, he announced that the Empire State would now reach the height of 1,250 feet. He would add a top or a hat to the building that would be even more distinctive than any other building in the city. John Tauranac describes the plan:
[The top of the Empire State Building] would be more than ornamental, more than a spire or dome or a pyramid put there to add a desired few feet to the height of the building or to mask something as mundane as a water tank. Their top, they said, would serve a higher calling. The Empire State Building would be equipped for an age of transportation that was then only the dream of aviation pioneers.
This dream of the aviation pioneers was travel by dirigible, or zeppelin, and the Empire State Building was going to have a mooring mast at its top for docking these new airships, which would accommodate passengers on already existing transatlantic routes and new routes that were yet to come.
The Age of Dirigibles
By the 1920s, dirigibles were being hailed as the transportation of the future. Also known today as blimps, dirigibles were actually enormous steel-framed balloons, with envelopes of cotton fabric filled with hydrogen and helium to make them lighter than air. Unlike a balloon, a dirigible could be maneuvered by the use of propellers and rudders, and passengers could ride in the gondola, or enclosed compartment, under the balloon.
Dirigibles had a top speed of eighty miles per hour, and they could cruise at seventy miles per hour for thousands of miles without needing refueling. Some were as long as one thousand feet, the same length as four blocks in New York City. The one obstacle to their expanded use in New York City was the lack of a suitable landing area. Al Smith saw an opportunity for his Empire State Building: A mooring mast added to the top of the building would allow dirigibles to anchor there for several hours for refueling or service, and to let passengers off and on. Dirigibles were docked by means of an electric winch, which hauled in a line from the front of the ship and then tied it to a mast. The body of the dirigible could swing in the breeze, and yet passengers could safely get on and off the dirigible by walking down a gangplank to an open observation platform.
The architects and engineers of the Empire State Building consulted with experts, taking tours of the equipment and mooring operations at the U.S. Naval Air Station in Lakehurst, New Jersey. The navy was the leader in the research and development of dirigibles in the United States. The navy even offered its dirigible, the Los Angeles, to be used in testing the mast. The architects also met with the president of a recently formed airship transport company that planned to offer dirigible service across the Pacific Ocean.
When asked about the mooring mast, Al Smith commented:
[It’s] on the level, all right. No kidding. We’re working on the thing now. One set of engineers here in New York is trying to dope out a practical, workable arrangement and the Government people in Washington are figuring on some safe way of mooring airships to this mast.
Designing the Mast
The architects could not simply drop a mooring mast on top of the Empire State Building’s flat roof. A thousand-foot dirigible moored at the top of the building, held by a single cable tether, would add stress to the building’s frame. The stress of the dirigible’s load and the wind pressure would have to be transmitted all the way to the building’s foundation, which was nearly eleven hundred feet below. The steel frame of the Empire State Building would have to be modified and strengthened to accommodate this new situation. Over sixty thousand dollars’ worth of modifications had to be made to the building’s framework.
Rather than building a utilitarian mast without any ornamentation, the architects designed a shiny glass and chrome-nickel stainless steel tower that would be illuminated from inside, with a stepped-back design that imitated the overall shape of the building itself. The rocket-shaped mast would have four wings at its corners, of shiny aluminum, and would rise to a conical roof that would house the mooring arm. The winches and control machinery for the dirigible mooring would be housed in the base of the shaft itself, which also housed elevators and stairs to bring passengers down to the eighty-sixth floor, where baggage and ticket areas would be located.
The building would now be 102 floors, with a glassed-in observation area on the 101st floor and an open observation platform on the 102nd floor. This observation area was to double as the boarding area for dirigible passengers.
Once the architects had designed the mooring mast and made changes to the existing plans for the building’s skeleton, construction proceeded as planned. When the building had been framed to the 85th floor, the roof had to be completed before the framing for the mooring mast could take place. The mast also had a skeleton of steel and was clad in stainless steel with glass windows. Two months after the workers celebrated framing the entire building, they were back to raise an American flag again—this time at the top of the frame for the mooring mast.
The Fate of the Mast
The mooring mast of the Empire State Building was destined to never fulfill its purpose, for reasons that should have been apparent before it was ever constructed. The greatest reason was one of safety: Most dirigibles from outside of the United States used hydrogen rather than helium, and hydrogen is highly flammable. When the German dirigible Hindenburg was destroyed by fire in Lakehurst, New Jersey, on May 6, 1937, the owners of the Empire State Building realized how much worse that accident could have been if it had taken place above a densely populated area such as downtown New York.
The greatest obstacle to the successful use of the mooring mast was nature itself. The winds on top of the building were constantly shifting due to violent air currents. Even if the dirigible were tethered to the mooring mast, the back of the ship would swivel around and around the mooring mast. Dirigibles moored in open landing fields could be weighted down in the back with lead weights, but using these at the Empire State Building, where they would be dangling high above pedestrians on the street, was neither practical nor safe.
The other practical reason why dirigibles could not moor at the Empire State Building was an existing law against airships flying too low over urban areas. This law would make it illegal for a ship to ever tie up to the building or even approach the area, although two dirigibles did attempt to reach the building before the entire idea was dropped. In December 1930, the U.S. Navy dirigible Los Angeles approached the mooring mast but could not get close enough to tie up because of forceful winds. Fearing that the wind would blow the dirigible onto the sharp spires of other buildings in the area, which would puncture the dirigible’s shell, the captain could not even take his hands off the control levers. 
Two weeks later, another dirigible, the Goodyear blimp Columbia, attempted a publicity stunt where it would tie up and deliver a bundle of newspapers to the Empire State Building. Because the complete dirigible mooring equipment had never been installed, a worker atop the mooring mast would have to catch the bundle of papers on a rope dangling from the blimp. The papers were delivered in this fashion, but after this stunt the idea of using the mooring mast was shelved. In February 1931, Irving Clavan of the building’s architectural office said, “The as yet unsolved problems of mooring air ships to a fixed mast at such a height made it desirable to postpone to a later date the final installation of the landing gear.”
By the late 1930s, the idea of using the mooring mast for dirigibles and their passengers had quietly disappeared. Dirigibles, instead of becoming the transportation of the future, had given way to airplanes. The rooms in the Empire State Building that had been set aside for the ticketing and baggage of dirigible passengers were made over into the world’s highest soda fountain and tea garden for use by the sightseers who flocked to the observation decks. The highest open observation deck, intended for disembarking passengers, has never been open to the public.
""".replace("\n", " ")

In [17]:
source_text

' When the Empire State Building was conceived, it was planned as the world’s tallest building, taller even than the new Chrysler Building that was being constructed at Forty-second Street and Lexington Avenue in New York. At seventy-seven stories, it was the tallest building before the Empire State began construction, and Al Smith was determined to outstrip it in height. The architect building the Chrysler Building, however, had a trick up his sleeve. He secretly constructed a 185-foot spire inside the building, and then shocked the public and the media by hoisting it up to the top of the Chrysler Building, bringing it to a height of 1,046 feet, 46 feet taller than the originally announced height of the Empire State Building. Al Smith realized that he was close to losing the title of world’s tallest building, and on December 11, 1929, he announced that the Empire State would now reach the height of 1,250 feet. He would add a top or a hat to the building that would be even more distinc

In [18]:
dalle_list = set()
with open("./dalle_chall.txt", "r") as fp:
    for line in fp:
        for word in word_tokenize(line.strip()):
            dalle_list.add(word.lower())

In [19]:
fine_tune_results = client.fine_tuning.jobs.retrieve(fine_tuning_job.id)
ft_model = fine_tune_results.fine_tuned_model

In [22]:
text = texts[1]


def get_features(text: str, score: int):
    stopwords_set = set(stopwords.words("english"))
    text_blob = TextBlob(text)
    prompt = text

    # Best effort correcting text
    corrected_text = str(text_blob.correct())
    text = re.sub(r"[^a-zA-Z\s.,\']", " ", text)
    corrected_text = re.sub(r"[^a-zA-Z\s.,\']", " ", corrected_text)
    tokens = word_tokenize(corrected_text)
    tokens_prev = word_tokenize(text)

    # Estimate errors
    num_errors = sum(1 for w1, w2 in zip(tokens, tokens_prev) if w1 != w2)

    sentences = [word_tokenize(sent) for sent in sent_tokenize(text)]
    text_blob = TextBlob(corrected_text)

    features = {"num_errors": num_errors}

    # Surface features
    num_characters = len(text)
    words = [word for word in tokens if len(word) > 1]
    word_count = len(words)
    average_word_length = sum(len(word) for word in words) / len(words)
    num_sentences = len(sentences)
    average_sentence_length = sum(len(sent) for sent in sentences) / len(sentences)
    num_different_words = len(set(words))
    num_of_stopwords = len([word for word in words if word in stopwords_set])

    features.update(
        {
            "num_characters": num_characters,
            "word_count": word_count,
            "average_word_length": average_word_length,
            "num_sentences": num_sentences,
            "average_sentence_length": average_sentence_length,
            "num_different_words": num_different_words,
            "num_of_stopwords": num_of_stopwords,
        }
    )

    syllable_estimates = [syllables.estimate(word) for word in words]
    syllable_count = sum(syllable_estimates)

    # Readability
    flesch_reading_ease = (
        206.835
        - 1.015 * (word_count / num_sentences)
        - 84.6 * (syllable_count / word_count)
    )
    flesch_kincaid_grade_level = (
        0.39 * (word_count / num_sentences)
        + 11.8 * (syllable_count / word_count)
        - 15.59
    )

    difficult_word_count = len([word for word in words if word not in dalle_list])
    dalle_chall_readability = 0.1579 * (
        difficult_word_count / word_count * 100
    ) + 0.0496 * (word_count / num_sentences)
    # Automated readability index
    ari = (
        4.71 * (num_characters / word_count)
        + 0.5 * (word_count / num_sentences)
        - 21.43
    )

    group_t = sentences[0:10]
    group_m = sentences[len(sentences) // 2 - 5 : len(sentences) // 2 + 5]
    group_b = sentences[-10:]
    nsw = 0
    for sent in group_t + group_m + group_b:
        for word in sent:
            if syllables.estimate(word) >= 3:
                nsw += 1

    smog = 1.043 * np.sqrt(nsw) * 30 / len(sentences) + 3.1291

    # LIX
    B = len([w for w in tokens if w[0].isupper() or len(w) == 1])
    C = len([w for w in words if len(w) > 6])
    lix = word_count / B + (C * 100) / word_count

    wvi = np.log(word_count) / np.log(
        2 - np.log(num_different_words) / np.log(word_count)
    )
    gunning_fog_index = 0.4 * ((word_count / num_sentences) + 100 * (nsw / word_count))
    

    features.update(
        {
            "flesch_reading_ease": flesch_reading_ease,
            "flesch_kincaid_grade_level": flesch_kincaid_grade_level,
            "dalle_chall_readability": dalle_chall_readability,
            "ari": ari,
            "smog": smog,
            "lix": lix,
            "wvi": wvi,
            "gunning_fog_index": gunning_fog_index,
        }
    )

    # Lexical diversity
    ttr = num_different_words / word_count
    rs, word_counts = defaultdict(int), defaultdict(int)
    for word in words:
        word_counts[word] += 1
    for _, r in word_counts.items():
        rs[r] += 1

    yule_k = 1e4 * (sum(r**2 * vr for r, vr in rs.items()) - word_count) / word_count**2

    min_range, max_range, trials = 35, 50, 5
    ns = np.arange(min_range, max_range + 1)
    ttrs = []
    for idx, sample_size in enumerate(ns):
        ttr = 0
        if sample_size <= len(words):
            for trial in range(trials):
                word_list = np.random.choice(words, sample_size, replace=False)
                ttr += len(set(word_list)) / len(word_list)
            ttrs.append(ttr / trials)
    ttrs = np.array(ttrs)
    A = np.vstack([2 * (1 - ttrs) / ns[0:len(ttrs)]]).T
    y = ttrs**2
    d = np.linalg.lstsq(A, y, rcond=None)[0]
    d_estimate = d[0]
    hapax_legomena = rs[1]

    guiraud = num_different_words / np.sqrt(word_count)
    advanced_guiraud = difficult_word_count / np.sqrt(word_count)
    features.update(
        {
            "ttr": ttr,
            "yule_k": yule_k,
            "d_estimate": d_estimate,
            "hapax_legomena": rs[1],
            "guiraud": guiraud,
            "advanced_guiraud": advanced_guiraud,
        }
    )

    # POS tags
    pos_tags = defaultdict(int)
    total_pos_tags = 0
    unique_pos_tags = 0
    for _, tag in text_blob.tags:
        if tag not in pos_tags:
            unique_pos_tags += 1
        pos_tags[tag] += 1
        total_pos_tags += 1

    pos_dist = {}
    for tag, count in pos_tags.items():
        pos_dist[tag] = count/total_pos_tags
        
    features["total_pos_tags"] = total_pos_tags
    features["unique_pos_tags"] = unique_pos_tags
    features.update(pos_dist)
    features.update(pos_tags)
    
    # Discourse patterns
    doc = nlp(corrected_text)
    egrid = EntityGrid(doc)
    (
        local_coherence_PU,
        local_coherence_PW,
        local_coherence_PACC,
        local_coherence_PU_dist,
        local_coherence_PW_dist,
        local_coherence_PACC_dist,
    ) = get_local_coherence(egrid)
    
    features.update({
        "ss_transitions": egrid.get_ss_transitions(),
        "so_transitions": egrid.get_so_transitions(),
        "sx_transitions": egrid.get_sx_transitions(),
        "sn_transitions": egrid.get_sn_transitions(),
        "os_transitions": egrid.get_os_transitions(),
        "oo_transitions": egrid.get_oo_transitions(),
        "ox_transitions": egrid.get_ox_transitions(),
        "on_transitions": egrid.get_on_transitions(),
        "xs_transitions": egrid.get_xs_transitions(),
        "xo_transitions": egrid.get_xo_transitions(),
        "xx_transitions": egrid.get_xx_transitions(),
        "xn_transitions": egrid.get_xn_transitions(),
        "ns_transitions": egrid.get_ns_transitions(),
        "no_transitions": egrid.get_no_transitions(),
        "nx_transitions": egrid.get_nx_transitions(),
        "nn_transitions": egrid.get_nn_transitions(),
        "local_coherence_PU": local_coherence_PU,
        "local_coherence_PW": local_coherence_PW,
        "local_coherence_PACC": local_coherence_PACC,
        "local_coherence_PU_dist": local_coherence_PU_dist,
        "local_coherence_PW_dist": local_coherence_PW_dist,
        "local_coherence_PACC_dist": local_coherence_PACC_dist,
    })

    # CVA Features
    
    # Compute weight scores for the essay
    fi = vectorizer.transform([text])
    max_f = fi.max()
    wi = (fi / max_f).toarray() * np.log(n / ni)


    # Maximum similarity to the best score category
    max_sim_best = np.dot(wi, content_vectors[max_rating])/(norm(wi)*norm(content_vectors[max_rating]))
    max_sim_best = max_sim_best[0]

    pattern_cosine = 0
    val_cos = 0
    max_cos = 0
    for i in range(min_rating, max_rating + 1):
        # Similarity between content vectors of category i and the essay vector
        cos = np.dot(wi, content_vectors[i])/(norm(wi)*norm(content_vectors[i]))
        pattern_cosine += i * cos[0]
        
        # This is for the val cosine, e.g. cos_4 + cos_3 - cos_2 - cos_1
        if i < (min(scores) + max(scores)) // 2:
            val_cos -= cos[0]
        else:
            val_cos += cos[0]

        # We are also looking for the score category closest to the essay
        if cos[0] >= max_cos:
            max_cos_val = i
            max_cos = cos[0]
            
    fsource = vectorizer.transform([source_text])
    max_fsource = fsource.max()
    wsource = (fi / max_fsource).toarray() * np.log(n / ni)
    cos_source = np.dot(wi.squeeze(), wsource.squeeze())/(norm(wi)*norm(wsource))
    
    features.update({
        "max_cos_val": max_cos_val,
        "max_sim_best": max_sim_best,
        "pattern_cosine": pattern_cosine,
        "val_cos": val_cos,
        "similarity_source_text": cos_source,
    })
    
    # GPT4 fine-tuned model as a feature
    prompt = prompt + ' ->'
    res = client.completions.create(
        model=ft_model,
        prompt=prompt,
        max_tokens=2, temperature=0)

    gpt_score = int(re.sub(r"[^0-9]", "", res.choices[0].text)) % 10
    features["gpt_score"] = gpt_score
    
# These are experimental, based on spatial measurements
#     dist = cosine_distances(embeddings)
#     nn_distances = np.min(dist + np.diag(np.diag(dist) + 10), axis=1)
#     avg_nn_distance = np.mean(nn_distances)
#     max_nn_distance = np.max(nn_distances)
#     min_nn_distance = np.min(nn_distances)
#     r_distance = 2*np.sqrt(dist.shape[0])*avg_nn_distance
#     cum_freq_dist_nn_dist = np.mean(nn_distances <= avg_nn_distance)
#     givenness = []
#     for i in range(2, len(embeddings)):
#         x = embeddings[0:i]
#         u, s, vh = np.linalg.svd(x)
#         orthonormal_vector = vh[-1]
#         givenness.append(np.dot(embeddings[i], orthonormal_vector))

#     avg_givenness = np.mean(givenness)
#     max_givenness = np.max(givenness)
#     min_givenness = np.min(givenness)
#     givenness_proj = []
#     for i in range(2, len(embeddings)):
#         x = np.array(embeddings[0:i])
#         A = x.T
#         b = embeddings[i]
#         c = np.linalg.lstsq(A, b, rcond=None)[0]
#         bw = A.dot(c)
#         bwo = b - bw
#         N = np.dot(b, bwo)
#         G = np.dot(b, bw)
#         givenness_proj.append(N / (G + N))

#     avg_givenness_proj = np.mean(givenness_proj)
#     min_givenness_proj = np.min(givenness_proj)
#     max_givenness_proj = np.max(givenness_proj)
#     max_min_giv_ratio = max_givenness / min_givenness
#     centroid = np.sum(embeddings, axis=0) / len(embeddings)
#     dist_to_centroid = []
#     for embedding in embeddings:
#         dist_to_centroid.append(np.dot(centroid, embedding))

#     avg_dist_to_centroid = np.mean(dist_to_centroid)
#     max_dist_to_centroid = np.max(dist_to_centroid)
#     min_dist_to_centroid = np.min(dist_to_centroid)
#     std_distance = np.sqrt(np.sum(np.sum(
#         (embeddings - centroid)**2, axis=1))/len(embeddings))
#     relative_distance = std_distance / max_dist_to_centroid
#     det_dist = np.linalg.det(dist)

#     features.update({
#         "avg_nn_distance": avg_nn_distance,
#         "max_nn_distance": max_nn_distance,
#         "min_nn_distance": min_nn_distance,
#         "r_distance": r_distance,
#         "cum_freq_dist_nn_dist": cum_freq_dist_nn_dist,
#         "avg_givenness": avg_givenness,
#         "max_givenness": max_givenness,
#         "min_givenness": min_givenness,
#         "max_min_giv_ratio": max_min_giv_ratio,
#         "avg_givenness_proj": avg_givenness_proj,
#         "min_givenness_proj": min_givenness_proj,
#         "max_givenness_proj": max_givenness_proj,
#         "max_min_giv_ratio": max_min_giv_ratio,
#         "avg_dist_to_centroid": avg_dist_to_centroid,
#         "max_dist_to_centroid": max_dist_to_centroid,
#         "min_dist_to_centroid": min_dist_to_centroid,
#         "std_distance": std_distance,
#         "relative_distance": relative_distance,
#         "det_dist": det_dist,
#     })
    features["score"] = score

    return features

# print(get_features(texts[0], scores[1]))
args = list(zip(texts, scores))
train_result = pqdm(args, get_features, n_jobs=8, argument_type="args")

QUEUEING TASKS | :   0%|          | 0/1800 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1800 [00:00<?, ?it/s]

  wvi = np.log(word_count) / np.log(
  wvi = np.log(word_count) / np.log(
  wvi = np.log(word_count) / np.log(
  wvi = np.log(word_count) / np.log(


COLLECTING RESULTS | :   0%|          | 0/1800 [00:00<?, ?it/s]

In [23]:
train_results = []
for idx, res in enumerate(train_result):
    if type(res) != dict:
        continue
    train_results.append(res)
        
essay_set_features = pd.DataFrame(train_results).fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

In [24]:
X, y = essay_set_features.drop(["score", "gpt_score"], axis=1), essay_set_features.score

In [25]:
clf = RandomForestRegressor(
    n_estimators=200, max_features=1/3, max_depth=12)

In [26]:
def essay_metrics(clf, X, y_true):
    y_pred = np.clip(np.round(clf.predict(X)), min_rating, max_rating)
    qwk = quadratic_weighted_kappa(y_true, y_pred, min_rating=min_rating, max_rating=max_rating)
    ea = np.sum(y_true == y_pred) / y_true.shape[0]
    aa = np.sum((y_true - y_pred) <= 1) / y_true.shape[0]
    return {
        "qwk": qwk,
        "ea": ea,
        "aa": aa
    }

In [27]:
assert min_rating == y.min()
assert max_rating == y.max()

# DP Model Alone

In [28]:
from sklearn.model_selection import cross_validate

results = cross_validate(clf, X, y, scoring=essay_metrics, cv=10)

In [29]:
np.max(results["test_qwk"]), np.max(results["test_ea"]), np.max(results["test_aa"])

(0.8405458609989657, 0.7570621468926554, 1.0)

# GPT Model Alone

In [30]:
y_pred = essay_set_features.gpt_score
y_true = essay_set_features.score
qwk = quadratic_weighted_kappa(y_true, y_pred, min_rating=min_rating, max_rating=max_rating)
ea = np.sum(y_true == y_pred) / y_true.shape[0]
aa = np.sum((y_true - y_pred) <= 1) / y_true.shape[0]
qwk, ea, aa

(0.7810732366443675, 0.6841511562323745, 0.9966159052453468)

# Hybrid Model

In [31]:
X, y = essay_set_features.drop(["score"], axis=1), essay_set_features.score
clf = RandomForestRegressor(
    n_estimators=200, max_features=1/3, max_depth=12)
results = cross_validate(clf, X, y, scoring=essay_metrics, cv=10)
np.max(results["test_qwk"]), np.max(results["test_ea"]), np.max(results["test_aa"])

(0.8330096009767467, 0.7288135593220338, 1.0)