In [1]:
from typing import List
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import syllables
from collections import defaultdict
from pqdm.processes import pqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
import re
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import pandas as pd
import os
import spacy
from numpy.linalg import norm
from tqdm import tqdm

import os
from openai import OpenAI


client = OpenAI(api_key=os.environ["OPEN_AI_API_KEY"])

# Define QWK Metric functions

In [2]:
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the quadratic weighted kappa
    quadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def linear_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the linear weighted kappa
    linear_weighted_kappa calculates the linear weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    linear_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    linear_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = abs(i - j) / float(num_ratings - 1)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Calculates the kappa
    kappa calculates the kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.

    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.

    kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            if i == j:
                d = 0.0
            else:
                d = 1.0
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 1.0 - numerator / denominator


def mean_quadratic_weighted_kappa(kappas, weights=None):
    """
    Calculates the mean of the quadratic
    weighted kappas after applying Fisher's r-to-z transform, which is
    approximately a variance-stabilizing transformation.  This
    transformation is undefined if one of the kappas is 1.0, so all kappa
    values are capped in the range (-0.999, 0.999).  The reverse
    transformation is then applied before returning the result.

    mean_quadratic_weighted_kappa(kappas), where kappas is a vector of
    kappa values

    mean_quadratic_weighted_kappa(kappas, weights), where weights is a vector
    of weights that is the same size as kappas.  Weights are applied in the
    z-space
    """
    kappas = np.array(kappas, dtype=float)
    if weights is None:
        weights = np.ones(np.shape(kappas))
    else:
        weights = weights / np.mean(weights)

    # ensure that kappas are in the range [-.999, .999]
    kappas = np.array([min(x, .999) for x in kappas])
    kappas = np.array([max(x, -.999) for x in kappas])

    z = 0.5 * np.log((1 + kappas) / (1 - kappas)) * weights
    z = np.mean(z)
    return (np.exp(2 * z) - 1) / (np.exp(2 * z) + 1)


def weighted_mean_quadratic_weighted_kappa(solution, submission):
    predicted_score = submission[submission.columns[-1]].copy()
    predicted_score.name = "predicted_score"
    if predicted_score.index[0] == 0:
        predicted_score = predicted_score[:len(solution)]
        predicted_score.index = solution.index
    combined = solution.join(predicted_score, how="left")
    groups = combined.groupby(by="essay_set")
    kappas = [quadratic_weighted_kappa(group[1]["essay_score"], group[1]["predicted_score"]) for group in groups]
    weights = [group[1]["essay_weight"].irow(0) for group in groups]
    return mean_quadratic_weighted_kappa(kappas, weights=weights)

In [3]:
df = pd.read_csv(
    "./training_set_rel3.tsv",
    sep="\t",
    encoding="ISO-8859-1",
    usecols=["essay_id", "essay_set", "essay", "domain1_score", "domain2_score"],
)

In [5]:
from collections import Counter

essay_set_intervals = {}
for essay_set in range(1, 9):
    temp = df[df.essay_set == essay_set]
    essay_set_intervals[essay_set] = temp.domain1_score.min(), temp.domain1_score.max()
    
essay_set_intervals

{1: (2, 12),
 2: (1, 6),
 3: (0, 3),
 4: (0, 3),
 5: (0, 4),
 6: (0, 4),
 7: (2, 24),
 8: (10, 60)}

In [13]:
def normalize(row):
    target_lb = 0
    target_ub = 3
    essay_set = row["essay_set"]
    a, b = essay_set_intervals[essay_set]
    return np.clip(
        np.round(
            target_lb + (target_ub - target_lb) / (b - a) * (row["domain1_score"] - a)
        ),
        target_lb,
        target_ub,
    )

In [14]:
df["normalized_score"] = df.apply(normalize, axis=1)

In [15]:
min_rating, max_rating = int(df.normalized_score.min()), int(df.normalized_score.max())
min_rating, max_rating

(0, 3)

In [17]:
from collections import Counter

Counter(df.normalized_score)

Counter({2.0: 7414, 1.0: 3351, 3.0: 1746, 0.0: 465})

In [54]:
df_ft = pd.DataFrame({
    "prompt": df.essay.tolist(),
    "completion": df.normalized_score.tolist()})

df_ft.to_json(f"essay_set_full.jsonl", orient='records', lines=True)

In [55]:
df_ft.head()

Unnamed: 0,prompt,completion
0,"Dear local newspaper, I think effects computer...",2.0
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",2.0
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",2.0
3,"Dear Local Newspaper, @CAPS1 I have found that...",2.0
4,"Dear @LOCATION1, I know having computers has a...",2.0


# Generate data for Fine Tuning

We run the following tool:

`openai tools fine_tunes.prepare_data -f essay_set_full.jsonl -q`

This will generate two data splits for the fine-tuning. One for training and other for validation

In [56]:
train_file = client.files.create(file=open(f"essay_set_full_prepared_train.jsonl", "rb"), purpose="fine-tune")
valid_file = client.files.create(file=open(f"essay_set_full_prepared_valid.jsonl", "rb"), purpose="fine-tune")
fine_tuning_job = client.fine_tuning.jobs.create(training_file=train_file.id, validation_file=valid_file.id, model="babbage-002")
print(fine_tuning_job)

FineTuningJob(id='ftjob-050YvpQyqalHUUFDGaMXGieC', created_at=1721597975, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='babbage-002', object='fine_tuning.job', organization_id='org-JihYzTh2GjJjoPtZZ0kQdsbr', result_files=[], seed=1736061865, status='validating_files', trained_tokens=None, training_file='file-S50elqSBviOxvvpFSfKtRnf7', validation_file='file-CyOBDSWb5QkysorcyzgF840B', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [57]:
fine_tune_results = client.fine_tuning.jobs.retrieve(fine_tuning_job.id)
ft_model = fine_tune_results.fine_tuned_model

In [58]:
test_df = df.sample(int(df.shape[0] * 0.3))
test_df.essay_set.value_counts()

5    556
2    546
4    528
6    526
1    524
3    523
7    459
8    230
Name: essay_set, dtype: int64

In [59]:
texts, scores = test_df.essay.tolist(), test_df.normalized_score.tolist()
texts[0], scores[0]

("There a lot of things to life and sometimes we do need as much help as we can get. Sometimes people will do anything to get help or go anywhere to find it. Because when people feel lost it's the most scraest thing to a person. So they turn to reading books, listening to music, watching movies, also looking and reading magazins. But although thats not the best way you can go there times when you need to listen to your soul the most.     Now there was a time when know one knew how to read or even was a loud to read. But now adays reading as taking over. If you can read you can do anything , you could be the one to run to whole world one day if you read things right.      Books and magazins there are is not a big different between the two. Books can take you mind off some where that you  have never seen before and places like the wild, @LOCATION2, @LOCATION1, @CAPS1's, and high up and trees. It teaches you @LOCATION1 thing  but not every thing it teaches is good. Books can lie about wha

In [60]:
def get_scores(text: str, score: int):

    prompt = text 
    # GPT4 fine-tuned model as a feature
    prompt = prompt + '\n\n###\n\n'
    res = client.completions.create(
        model=ft_model,
        prompt=prompt,
        max_tokens=2, temperature=0)

    features = {}
    gpt_score = int(re.sub(r"[^0-9]", "", res.choices[0].text)) % 10
    features["gpt_score"] = gpt_score
    features["score"] = score

    return features

# print(get_scores(texts[0], scores[0]))
args = list(zip(texts, scores))
train_result = pqdm(args, get_scores, n_jobs=8, argument_type="args")

QUEUEING TASKS | :   0%|          | 0/3892 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/3892 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/3892 [00:00<?, ?it/s]

In [61]:
results = []
for res in train_result:
    if type(res) != dict:
        continue
    results.append(res)

In [62]:
essay_set = pd.DataFrame(results)
essay_set.head()

Unnamed: 0,gpt_score,score
0,1,1.0
1,1,2.0
2,1,1.0
3,2,2.0
4,1,1.0


# GPT Performance GLOBAL

In [64]:
y_pred = essay_set.gpt_score
y_true = essay_set.score
qwk = quadratic_weighted_kappa(y_true, y_pred, min_rating=min_rating, max_rating=max_rating)
ea = np.sum(y_true == y_pred) / y_true.shape[0]
aa = np.sum((y_true - y_pred) <= 1) / y_true.shape[0]
qwk, ea, aa

(0.8320205990645835, 0.8391167192429022, 0.998422712933754)