In [1]:
import numpy as np
import pandas as pd

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, DistilBertTokenizer, DistilBertModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, metric
import evaluate

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

try:
    torch.cuda.empty_cache()
except:
    pass

#torch.set_float32_matmul_precision('high')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class SentenceEmbedder():
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model     = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").to(device)
        self.model     = torch.compile(self.model, mode="reduce-overhead")

    # Sentence Embedding
    # https://www.sbert.net/examples/applications/computing-embeddings/README.html                                                             
    def mean_pooling(self, token_embeddings, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def extract_token_indices(self, input_ids, original_text, start_idx, end_idx):
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

        start_token_idx, end_token_idx = None, len(tokens)
        cursor = 0
        for i, token in enumerate(tokens):
            if token in ["[CLS]", "[SEP]", "[PAD]"]:
                continue

            cursor_after_token = cursor + len(token.replace("##", ""))
            while cursor_after_token < len(original_text) and original_text[cursor_after_token].isspace():
                cursor_after_token += 1
                
            if cursor >= start_idx and start_token_idx is None:
                start_token_idx = i
            if cursor_after_token >= end_idx:
                end_token_idx = i + 1
                break
            
            cursor = cursor_after_token

        assert start_token_idx and end_token_idx
        return start_token_idx, end_token_idx
    
    def get_sentence_embeddings(self, sentences):
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            sentence_embeddings = self.mean_pooling(model_output[0], encoded_input['attention_mask'])
            return sentence_embeddings
    
    def get_sentence_embeddings_focus_on_substring(self, sentences, indices):
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        
        assert model_output["last_hidden_state"].shape[0] == len(sentences)

        final_sentence_embeddings = []
        for i in range(len(sentences)):
            token_idx_start, token_idx_end = self.extract_token_indices(
                encoded_input["input_ids"][i], sentences[i], indices[i][0], indices[i][1]
            )
            token_embeddings = model_output[0][i, token_idx_start:token_idx_end]
            attention_mask   = encoded_input['attention_mask'][i, token_idx_start:token_idx_end]
            sentence_embeddings = self.mean_pooling(
                torch.unsqueeze(token_embeddings, 0), 
                torch.unsqueeze(attention_mask,   0)
            )
            final_sentence_embeddings.append(torch.squeeze(sentence_embeddings, 0))

        return torch.stack(final_sentence_embeddings)


In [3]:
def make_base_dataset():
    dataset = load_dataset("humicroedit", "subtask-1")

    train_df = pd.DataFrame(dataset["train"])
    val_df   = pd.DataFrame(dataset["validation"])
    test_df  = pd.DataFrame(dataset["test"])

    dfs = [train_df, test_df, val_df]
    for df in dfs:
        def edit_the_headline(original, edit):
            openIdx  = original.index("<")
            closeIdx = original.index("/>") + len("/>")
            return original[:openIdx] + edit + original[closeIdx:]
        
        df["original_sentence"] = df["original"].apply(lambda s: s.replace("<", "").replace("/>", ""))
        df["edited_sentence"]   = df.apply(lambda row: edit_the_headline(row["original"], row["edit"]), axis=1)

        df["original_word_start_idx"] = df["original"].apply(lambda s: s.index("<"))        
        df["original_word_end_idx"]   = df["original"].apply(lambda s: s.index("/>") - 1)

        df["edited_word_start_idx"] = df["original"].apply(lambda s: s.index("<"))
        df["edited_word_end_idx"]   = df.apply(lambda row: row["edited_word_start_idx"] + len(row["edit"]), axis=1)

        df["all_scores"]       = df["grades"].apply(lambda s: sorted([int(c) for c in s]))
        df["normalized_score"] = df["meanGrade"] / 3.0

    return train_df, val_df, test_df

# Tensor of shape (row_count, 2, embedding_dimensions).
# dim=1 signifies the original and the edited sentence (0 original, 1 edited).
def precompute_embeddings(df, embedder, compute_embeddings_batch_size=None, device="cpu"):
    original_embeddings = []
    edited_embeddings   = []
    
    if compute_embeddings_batch_size is None:
        compute_embeddings_batch_size = len(df.index)

    for i in range(0, len(df.index), compute_embeddings_batch_size):
        rows = df[i:i + compute_embeddings_batch_size]
        original_embeddings.append(embedder.get_sentence_embeddings(rows["original_sentence"].to_list()).to(device))
        edited_embeddings  .append(embedder.get_sentence_embeddings(rows["edited_sentence"]  .to_list()).to(device))
        
    return torch.stack((
        torch.cat(original_embeddings), 
        torch.cat(edited_embeddings)
    ), dim=1)

def precompute_embeddings_focus_on_edit_word(df, embedder, compute_embeddings_batch_size=None, device="cpu"):
    original_embeddings = []
    edited_embeddings   = []
    
    if compute_embeddings_batch_size is None:
        compute_embeddings_batch_size = len(df.index)
        
    for i in range(0, len(df.index), compute_embeddings_batch_size):
        rows = df[i:i + compute_embeddings_batch_size]

        original = embedder.get_sentence_embeddings_focus_on_substring(
            list(rows["original_sentence"]),
            list(zip(rows["original_word_start_idx"], rows["original_word_end_idx"]))
        )
        edited = embedder.get_sentence_embeddings_focus_on_substring(
            list(rows["edited_sentence"]),
            list(zip(rows["edited_word_start_idx"], rows["edited_word_end_idx"]))
        )

        original_embeddings.append(original.to(device))
        edited_embeddings  .append(edited  .to(device))
        
    return torch.stack((
        torch.cat(original_embeddings), 
        torch.cat(edited_embeddings)
    ), dim=1)

class SentenceEmbeddingsDataset(Dataset):
    def __init__(self, dataframe, precomputed_embeddings, device="cpu"):
        self.df         = dataframe
        self.embeddings = precomputed_embeddings
        print(f"{self.embeddings.shape=}")

        score_counts = torch.zeros(len(self.df.index), 4)
        for i, scores in self.df["all_scores"].items():
            for score in scores:
                score_counts[i, score] += 1
        self.score_counts = score_counts.to(device)

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.score_counts[idx]
    
class TokenizedSentencesDataset(Dataset):
    def __init__(self, dataframe, device="cpu"):
        self.df = dataframe
        
        score_counts = torch.zeros(len(self.df.index), 4)
        for i, scores in self.df["all_scores"].items():
            for score in scores:
                score_counts[i, score] += 1
        self.score_counts = score_counts.to(device)

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
        output = tokenizer(
            self.df["original_sentence"].tolist(),
            self.df["edited_sentence"].tolist(), 
            return_tensors='pt', truncation=True, padding=True
        ).to(device)
        
        self.input_ids      = output["input_ids"]
        self.attention_mask = output["attention_mask"]

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        return {
            "input_ids":      self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
        }, self.score_counts[idx]

In [4]:
cached_sentence_embedder = SentenceEmbedder()
cached_base_dataset = make_base_dataset()

RuntimeError: Windows not yet supported for torch.compile

In [None]:
cached_dataset_naive_sentence_embeddings = [
    SentenceEmbeddingsDataset(
        df, 
        precompute_embeddings(df, cached_sentence_embedder, compute_embeddings_batch_size=512, device=device),
        device=device
    )
    for df in cached_base_dataset
]

In [None]:
cached_dataset_edited_words_embeddings = [
    SentenceEmbeddingsDataset(
        df, 
        precompute_embeddings_focus_on_edit_word(df, cached_sentence_embedder, compute_embeddings_batch_size=512, device=device),
        device=device
    )
    for df in cached_base_dataset
]

In [None]:
cached_dataset_tokenized_sentences = [TokenizedSentencesDataset(df, device=device) for df in cached_base_dataset]

In [None]:
def experiment_predict_mean_score(train_dataset, val_dataset, test_dataset):
    batch_size   = 2048
    num_epochs   = 200
    lr           = 1e-4
    weight_decay = 0


    score_weights = torch.arange(0, 4, dtype=torch.float, device=device).unsqueeze(1)
    def reshape_batch(batch):
        inputs, labels = batch
        inputs = torch.flatten(inputs, start_dim=-2)
        labels = labels / torch.sum(labels, dim=-1, keepdim=True)
        actual_mean_score = torch.squeeze(labels @ score_weights) / 3
        return inputs, actual_mean_score
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size)

    one_embedding, _ = train_dataset[0]
    input_features = torch.numel(one_embedding)

    layers = []
    #while input_features > 50:
    #    layers.append(nn.Linear(input_features, input_features // 2))
    #    layers.append(nn.Dropout1d(0.1))
    #    layers.append(nn.Tanh())
    #    input_features //= 2
    
    layers.append(nn.Dropout(0.3))
    layers.append(nn.Linear(input_features, 10))
    layers.append(nn.Tanh())
    layers.append(nn.Linear(10, 1))
    #layers.append(nn.Sigmoid())

    model = nn.Sequential(*layers)
    model = model.to(device)
    model = torch.compile(model)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    print(model)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = reshape_batch(data)
            labels *= 3
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for i, data in enumerate(val_loader, 0):
                inputs, labels = reshape_batch(data)
                labels *= 3
                outputs = model(inputs).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}")
            print(f"Epoch {epoch+1}, RMSE: {np.sqrt(running_loss/len(train_loader))}, Val RMSE: {np.sqrt(val_loss/len(val_loader))}")

    # Evaluation
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            inputs, labels = reshape_batch(data)
            labels *= 3
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            test_loss += loss.item()
    print(f"Test Loss: {test_loss/len(test_loader)}")
    print(f"Test RMSE: {np.sqrt(test_loss/len(test_loader))}")

    # Excel output
    predicted_scores = []
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            inputs, labels = reshape_batch(data)
            labels *= 3
            outputs = model(inputs).squeeze()
            predicted_scores.extend(outputs.tolist())
            
    excel_df = test_dataset.df.copy()
    excel_df["normalized_predicted_score"] = predicted_scores
    excel_df.to_excel("experiment_predict_mean_score.xlsx")


torch.cuda.empty_cache()
torch._dynamo.reset()

experiment_predict_mean_score(*cached_dataset_naive_sentence_embeddings)
#experiment_predict_mean_score(*cached_dataset_edited_words_embeddings)

In [None]:
def experiment_predict_score_distribution(train_dataset, val_dataset, test_dataset):
    batch_size   = 2048
    num_epochs   = 500
    lr           = 1e-4
    weight_decay = 0

    def reshape_batch(batch):
        inputs, labels = batch
        inputs = torch.flatten(inputs, start_dim=-2)
        labels = labels / torch.sum(labels, dim=-1, keepdim=True)
        return inputs, labels
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size)

    one_embedding, _ = train_dataset[0]
    input_features = torch.numel(one_embedding)

    layers = []
    #while input_features > 50:
    #    layers.append(nn.Linear(input_features, input_features // 2))
    #    layers.append(nn.Dropout1d(0.1))
    #    layers.append(nn.Tanh())
    #    input_features //= 2
    
    layers.append(nn.Dropout(0.3))
    layers.append(nn.Linear(input_features, 4))
    #layers.append(nn.Tanh())
    #layers.append(nn.Linear(10, 4)) # logits

    model = nn.Sequential(*layers)
    model = torch.compile(model.to(device))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    print(model)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = reshape_batch(data)
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        
        if epoch % 10 == 0:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for i, data in enumerate(val_loader, 0):
                    inputs, labels = reshape_batch(data)
                    outputs = model(inputs).squeeze()
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()

            print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}")

    # Evaluation
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            inputs, labels = reshape_batch(data)
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            test_loss += loss.item()
    print(f"Test Loss: {test_loss/len(test_loader)}")

    # Excel output
    predicted_scores = []
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            inputs, labels = reshape_batch(data)
            logits = model(inputs).squeeze()

            p = torch.softmax(logits, dim=1)
            score_weights = torch.arange(0, 4, dtype=torch.float, device=device).unsqueeze(1)
            mean_score = torch.squeeze(p @ score_weights) / 3
            predicted_scores.extend(mean_score.tolist())
            
    excel_df = test_dataset.df.copy()
    excel_df["normalized_predicted_score"] = predicted_scores
    excel_df.to_excel("experiment_predict_score_distribution.xlsx")


torch.cuda.empty_cache()
torch._dynamo.reset()

#experiment_predict_score_distribution(*cached_dataset_edited_words_embeddings)
experiment_predict_score_distribution(*cached_dataset_naive_sentence_embeddings)

In [None]:
def experiment_predict_score_distribution_finetune_distillbert(train_dataset, val_dataset, test_dataset):
    batch_size   = 16 
    num_epochs   = 1000
    lr           = 1e-4
    lr_bert      = 3e-5
    weight_decay = 0

    def reshape_batch(batch):
        inputs, labels = batch
        labels = labels / torch.sum(labels, dim=-1, keepdim=True)
        return inputs, labels

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size)

    distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
    distilbert = distilbert.to(device)
    #distilbert = torch.compile(distilbert)

    for param in distilbert.parameters():
        param.requires_grad = False
    
    layers = [
              nn.Linear(768, 768),
              nn.Dropout1d(0.3),
              nn.Tanh(),
              nn.Linear(768, 4)] # logits

    model = nn.Sequential(*layers)
    model = torch.compile(model.to(device))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam([
        { "params": distilbert.parameters(), "lr": lr_bert },
        { "params": model.parameters(),      "lr": lr, "weight_decay": weight_decay },
    ])

    print(model)

    def logits_from_inputs(distilbert, model, inputs):
        outputs = distilbert(**inputs).last_hidden_state[:, 0, :]
        logits = model(outputs).squeeze()
        return logits
        
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = reshape_batch(data)
            logits = logits_from_inputs(distilbert, model, inputs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        if epoch % 1 == 0:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for i, data in enumerate(val_loader, 0):
                    inputs, labels = reshape_batch(data)
                    logits = logits_from_inputs(distilbert, model, inputs)
                    loss = criterion(logits, labels)
                    val_loss += loss.item()

            print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}")

    # Evaluation
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            inputs, labels = reshape_batch(data)
            logits = logits_from_inputs(distilbert, model, inputs)
            loss = criterion(logits, labels)
            test_loss += loss.item()
    print(f"Test Loss: {test_loss/len(test_loader)}")

    # evaluate the model and print the RMS loss on the test set
    model.eval()
    mse_loss = torch.nn.MSELoss()
    with torch.no_grad():
        predictions = []
        actuals = []
        for i, data in enumerate(test_loader, 0):
            inputs, labels = reshape_batch(data)
            logits = logits_from_inputs(distilbert, model, inputs)
            predictions.extend(torch.argmax(logits, dim=1).tolist())
            actuals.extend(torch.argmax(labels, dim=1).tolist())
        print(f"Test Accuracy: {metric.compute(predictions=predictions, references=actuals)}")

    # Excel output
    predicted_scores = []
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            inputs, labels = reshape_batch(data)
            logits = logits_from_inputs(distilbert, model, inputs)

            p = torch.softmax(logits, dim=1)
            score_weights = torch.arange(0, 4, dtype=torch.float, device=device).unsqueeze(1)
            mean_score = torch.squeeze(p @ score_weights) / 3
            predicted_scores.extend(mean_score.tolist())
            
    excel_df = test_dataset.df.copy()
    excel_df["normalized_predicted_score"] = predicted_scores
    excel_df.to_excel("experiment_predict_score_distribution_finetune_distillbert.xlsx")


torch.cuda.empty_cache()
torch._dynamo.reset()

experiment_predict_score_distribution_finetune_distillbert(*cached_dataset_tokenized_sentences)

In [None]:
def experiment_predict_mean_score_finetune_distilbert_huggingface(train_df, val_df, test_df):
    train_dataset = TokenizedSentencesDataset(train_df, device="cpu")
    val_dataset   = TokenizedSentencesDataset(val_df,   device="cpu")
    test_dataset  = TokenizedSentencesDataset(test_df,  device="cpu")

    batch_size   = 16
    num_epochs   = 4
    lr           = 5e-5
    weight_decay = 0.0001

    class NormalizedLabelsDataset(Dataset):
        def __init__(self, base_dataset):
            self.base_dataset = base_dataset
        def __len__(self):
            return len(self.base_dataset)
        def __getitem__(self, idx):
            inputs, labels = self.base_dataset[idx]
            labels = labels / torch.sum(labels, dim=-1, keepdim=True)
            score_weights     = torch.arange(0, 4, dtype=torch.float).unsqueeze(1)
            actual_mean_score = torch.squeeze(labels @ score_weights) / 3
            return { "labels": actual_mean_score, **inputs }

    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)
    #model = torch.compile(model.to(device))

    args = TrainingArguments(
        "distilbert-base-uncased-finetuned-subtask-1",
        evaluation_strategy = "steps",
        eval_steps=200,
        save_strategy = "steps",
        save_steps=200,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="mse",
    )

    mae_metric = evaluate.load("mae")
    mse_metric = evaluate.load("mse")
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions, labels = torch.tensor(predictions),  torch.tensor(labels)
        predictions, labels = torch.squeeze(predictions), torch.squeeze(labels)
        predictions *= 3
        labels      *= 3
        
        #p = torch.softmax(predictions, dim=1)
        #score_weights = torch.arange(0, 4, dtype=torch.float).unsqueeze(1)
        #predicted_mean_score = torch.squeeze(p      @ score_weights) / 3
        #actual_mean_score    = torch.squeeze(labels @ score_weights) / 3
        #return metric.compute(predictions=predicted_mean_score, references=actual_mean_score)
        mse = mse_metric.compute(predictions=predictions, references=labels)
        mae = mae_metric.compute(predictions=predictions, references=labels)
        return {
            "mae":  mae["mae"],
            "mse":  mse["mse"],
            "rmse": np.sqrt(mse["mse"]),
        }

    trainer = Trainer(
        model,
        args,
        train_dataset=NormalizedLabelsDataset(train_dataset),
        eval_dataset=NormalizedLabelsDataset(val_dataset),
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.evaluate()
    out = trainer.predict(NormalizedLabelsDataset(test_dataset))
    predictions = np.squeeze(out.predictions)
    # compute metrics
    predictions *= 3
    

    excel_df = test_dataset.df.copy()
    excel_df["normalized_predicted_score"] = predictions
    excel_df.to_excel("experiment_predict_mean_score_finetune_distillbert.xlsx")


torch.cuda.empty_cache()
torch._dynamo.reset()

experiment_predict_mean_score_finetune_distilbert_huggingface(*cached_base_dataset)

In [None]:
from collections import defaultdict

import openai
openai.api_key = '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'

from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
from torch.utils.data import DataLoader


def make_base_dataset():
    dataset = load_dataset("humicroedit", "subtask-1")

    train_df = pd.DataFrame(dataset["train"])
    val_df   = pd.DataFrame(dataset["validation"])
    test_df  = pd.DataFrame(dataset["test"])

    dfs = [train_df, test_df, val_df]
    for df in dfs:
        def edit_the_headline(original, edit):
            openIdx  = original.index("<")
            closeIdx = original.index("/>") + len("/>")
            return original[:openIdx] + edit + original[closeIdx:]
        
        df["original_sentence"] = df["original"].apply(lambda s: s.replace("<", "").replace("/>", ""))
        df["edited_sentence"]   = df.apply(lambda row: edit_the_headline(row["original"], row["edit"]), axis=1)

        df["original_word_start_idx"] = df["original"].apply(lambda s: s.index("<"))        
        df["original_word_end_idx"]   = df["original"].apply(lambda s: s.index("/>") - 1)

        df["edited_word_start_idx"] = df["original"].apply(lambda s: s.index("<"))
        df["edited_word_end_idx"]   = df.apply(lambda row: row["edited_word_start_idx"] + len(row["edit"]), axis=1)

        df["all_scores"]       = df["grades"].apply(lambda s: sorted([int(c) for c in s]))
        df["normalized_score"] = df["meanGrade"] / 3.0

    return train_df, val_df, test_df

model = SentenceTransformer('all-MiniLM-L6-v2')
train_examples = [
    InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1),
    InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.ContrastiveLoss(model=model)

model.fit([(train_dataloader, train_loss)], show_progress_bar=True)

In [None]:
from collections import defaultdict
import json
import time
import requests
import openai
openai.api_key = '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'

train_df = cached_base_dataset[0]
# random sample of 10 from train_df

top_df = train_df.sort_values(by=["meanGrade"], ascending=True).head(50).sample(5)
bot_df = train_df.sort_values(by=["meanGrade"], ascending=False).head(50).sample(5)

a = list(train_df["original"].sample(10)) + list(top_df["original"]) + list(bot_df["original"])
b = list(train_df["edit"].sample(10))     + list(top_df["edit"])     + list(bot_df["edit"])
for aa, bb in zip(a, b):
    aa = aa.replace("<", "[ ").replace("/>", f" => {bb} ]")
    print(aa)

def generate_explanations(df, filename, completions_per_headline=1):
    prompt = """
        The following news headlines have been edited to be more humorous.
        The format of the headline is "text text [[ original word => edited word ]] text text".
        Explain what kind of humorous response the edit wanted to elicit, and wether it suceeeded or fell flat.
        You are not to be too easily offended. Answer as concisely as possible. When explaining something refer to the exact part in the headline.
        Do not use more than 3 sentences. Only output the explanation, nothing else.

        Headline:
        REPLACE_WITH_HEADLINE
    """

    completions = defaultdict(list)
    for i in range(len(df.index)):
        headline = df.iloc[i]
        original = headline["original"]
        edit     = headline["edit"]
        combined = original.replace("<", "[[ ").replace("/>", f" => {edit} ]]")
        
        this_prompt = prompt.replace("REPLACE_WITH_HEADLINE", combined)

        print("generating", i, "/", len(df.index))

        response = None
        while True:
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        { "role": "user", "content": this_prompt }
                    ],
                    temperature=0.7,
                    max_tokens=128,
                    n=completions_per_headline
                )
                break
            except Exception as e:
                print(e)
                print("retrying...")
                time.sleep(1)

        for choice in response["choices"]:
            completions[original].append(choice["message"]["content"])

    with open(filename, "w") as f:
        json.dump(completions, f, indent=4)

    return completions

def generate_explanations2(df, filename, completions_per_headline=1):
    prompt = """
        The following news headlines have been edited to be more humorous.
        The format of the headline is "text text [[ original word => edited word ]] text text".
        Explain what kind of humorous response the edit wanted to elicit, and wether it suceeeded or fell flat.
        You are not to be too easily offended. Answer as concisely as possible. When explaining something refer to the exact part in the headline.
        Do not use more than 3 sentences. Only output the explanation, nothing else.

        Headline:
        REPLACE_WITH_HEADLINE
    """

    completions = defaultdict(list)
    for i in range(len(df.index)):
        headline = df.iloc[i]
        original = headline["original"]
        edit     = headline["edit"]
        combined = original.replace("<", "[[ ").replace("/>", f" => {edit} ]]")
        
        this_prompt = prompt.replace("REPLACE_WITH_HEADLINE", combined)

        print("generating", i, "/", len(df.index))

        request = {
            'user_input': this_prompt,
            'history': {'internal': [], 'visible': []},
            'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
            'character': 'Example',
            'instruction_template': 'Vicuna-v1.1',

            'regenerate': False,
            '_continue': False,
            'stop_at_newline': True,
            'chat_prompt_size': 2048,
            'chat_generation_attempts': 1,
            'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',

            'max_new_tokens': 250,
            'do_sample': True,
            'temperature': 0.7,
            'top_p': 0.1,
            'typical_p': 1,
            'repetition_penalty': 1.18,
            'top_k': 40,
            'min_length': 0,
            'no_repeat_ngram_size': 0,
            'num_beams': 1,
            'penalty_alpha': 0,
            'length_penalty': 1,
            'early_stopping': True,
            'seed': -1,
            'add_bos_token': True,
            'truncation_length': 2048,
            'ban_eos_token': False,
            'skip_special_tokens': True,
            'stopping_strings': []
        }

        response = requests.post("http://127.0.0.1:5000/api/v1/chat", json=request)
        print(response.status_code)
        if response.status_code == 200:
            result = response.json()['results'][0]['history']
            print(json.dumps(result, indent=4))
            print()
            print(result['visible'][-1][1])
            completions[original].append(result['visible'][-1][1])

    with open(filename, "w") as f:
        json.dump(completions, f, indent=4)

    return completions


x = generate_explanations2(cached_base_dataset[0], "explanations_train.json", 1)
print(x)


James Comey set to break another oath by sharing fictional [ memos => kale ] to Congress on May 30th
[ Trump => Presidents ] Makes Headlines With Fox News Interview
Pelosi : Trump ’s insecurity fueling [ fraud => Cookie ] investigation
Inside secret court hearing in Mueller 's Trump-Russia [ probe => embrace ] 
Some U.S. [ borrowers => energy ] jailed over civil debts , new ACLU report shows
More than 50 detained in immigration raids at Asian [ restaurants => space ] in Mississippi
Violent protests between Pro and Anti [ Trump => nugget ] folks in Berkeley
An American [ Journalist => voters ] Is Facing A Felony Trial This Week — In The United States
Trump Swaps His Beloved Burgers for Salads and Soups in New [ Diet => cake ] 
American CEOs send letter to House : Kill the ' made in [ America => shade ] ' tax
Carnage in [ Kabul => butchery ] adds to US challenges in Afghanistan
Democratic Lawmakers Sue [ Trump => Geyser ] , Handing The President Another Legal Challenge
Democratic divisio

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
with open("completions_1703.json", "w") as f:
    json.dump(completions, f, indent=4)