In [1]:
from tqdm.notebook import tqdm_notebook
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os


# enable progress bar functionality
tqdm_notebook().pandas()

DATA_DIR = os.path.join("..", "data")
OUTPUT_DIR = "..\output"
INTERMEDIATE_DIR = os.path.join("..", "intermediate")

0it [00:00, ?it/s]

In [36]:
df = pd.read_csv(os.path.join(INTERMEDIATE_DIR, "loov_input_data.csv"))
df = df.drop("Unnamed: 0", axis=1)
df.id.astype(int)
df

ValueError: could not convert string to float: '0059830c'

In [3]:
df.dtypes

id           object
text         object
prompt_id     int64
generated     int64
llm          object
source       object
embedding    object
dtype: object

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


#https://aclanthology.org/2020.aespen-1.6.pdf
vectorizer = TfidfVectorizer(strip_accents="unicode",
                             ngram_range=(3,5), 
                             max_df=0.9, 
                             min_df=0.05)
vectorizer.fit(df.text)

In [5]:
import skops.io as sio


file = os.path.join(INTERMEDIATE_DIR, "best_model.skops")
best_model = sio.load(file, trusted=True)
best_model

In [30]:
from sklearn.base import clone


def validate(essay_id):
    essay_train = df[~df.id.eq(essay_id)]
    essay_test = df[df.id.eq(essay_id)]
    
    if essay_test.shape[0] != 1:
        raise ValueError(f"Error id={essay_id} shape={essay_test.shape}")
        
    model = clone(best_model)
    model = model.fit(vectorizer.transform(essay_train.text), essay_train.generated)

    return model.predict_proba(vectorizer.transform(essay_test.text))


def csv_output(df: pd.DataFrame, filename: str) -> None:
    """
    Save a pandas DataFrame to a CSV file.

    :param df: The DataFrame to be saved.
    :type df: pd.DataFrame

    :param filename: The name of the CSV file.
    :type filename: str

    :return: This function does not return anything.
    :rtype: None
    """
    file = os.path.join(OUTPUT_DIR, filename)
    df.to_csv(file, encoding = 'utf8')
    print(f"File saved successfully as {file}")


def batch_validate(essay_id, file):
    try:
        # get rid of nested array
        res = validate(essay_id)[0]
    except Exception as e:
        print(e)
        return

    res_df = pd.DataFrame({"id": [essay_id], "proba1": [res[0]], "proba2": [res[1]]})
    
    # if no error append results to disk
    df = pd.read_csv(file).loc[:, ["id", "proba1", "proba2"]]
    new_df = pd.concat([df, res_df])
    new_df.to_csv(file, encoding="utf-8")

In [41]:
import random


file = os.path.join(INTERMEDIATE_DIR, "loov_intermediate_res.csv")

# create or overwrite empty file
try:
    previous_progress_df = pd.read_csv(file)
except FileNotFoundError:
    # create empty csv file
    pd.DataFrame(list()).to_csv(file)

ids = df[df.generated == 1].id
previous_ids = {str(id) for id in previous_progress_df.id}
new_ids = [id for id in ids if str(id) not in previous_ids]
random.shuffle(new_ids)

In [43]:
print("Running Leave One Out validation for generated texts...")
for id in tqdm(new_ids):
    batch_validate(id, file=file)

Running Leave One Out validation for generated texts...


  0%|          | 0/3701 [00:00<?, ?it/s]

KeyboardInterrupt: 