In [1]:
from tqdm.notebook import tqdm_notebook
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os
import notebook_config


# Enable progress bar functionality for pandas (adds `progress_apply` and friends).
# `pandas` is invoked on the class itself: instantiating `tqdm_notebook()` first
# creates a throwaway bar that shows up as a stray empty "0it" line under the cell.
tqdm_notebook.pandas()


# Directory holding intermediate artefacts, addressed relative to the parent of
# the current working directory.
INTERMEDIATE_DIR = os.path.join("..", notebook_config.INTERMEDIATE_DIR)

0it [00:00, ?it/s]

In [2]:
# Read the leave-one-out input table and discard the index column that was
# serialised into the CSV as "Unnamed: 0".
loov_input_path = os.path.join(INTERMEDIATE_DIR, notebook_config.LOOV_INPUT_NAME)
df = pd.read_csv(loov_input_path).drop(columns="Unnamed: 0")
df

Unnamed: 0,id,text,prompt_id,generated,llm,source,embedding
0,2399651113280,Cars have been a major part of our lives for a...,0,1,PaLM,Konstantina Liagkou,[ 4.3449631e-01 1.2764874e+00 -2.2267876e+00 ...
1,2399637356832,"Limiting car usage has many advantages, such a...",0,1,PaLM,Konstantina Liagkou,[-0.9719449 -0.68819636 -3.2451413 -0.070603...
2,2399609023088,"""America's love affair with it's vehicles seem...",0,1,PaLM,Konstantina Liagkou,[ 1.02260339e+00 1.06476414e+00 -1.46690214e+...
3,2399651627056,"Cars are convenient, but they can be harmful t...",0,1,PaLM,Konstantina Liagkou,[-1.1270489e+00 -1.0655222e+00 -2.8863614e+00 ...
4,2399605263872,"Cars are a convenient way to get around, but t...",0,1,PaLM,Konstantina Liagkou,[-0.27686608 -0.5261516 -3.4441886 0.026801...
...,...,...,...,...,...,...,...
5248,fe6ff9a5,There has been a fuss about the Elector Colleg...,1,0,Human,Competition,[-1.3147941 -0.10259935 -2.1935065 0.314474...
5249,ff669174,Limiting car usage has many advantages. Such a...,0,0,Human,Competition,[-3.51573974e-01 8.29198062e-01 -3.43038702e+...
5250,ffa247e0,There's a new trend that has been developing f...,0,0,Human,Competition,[-0.8698616 -0.04979787 -1.9698174 -0.281999...
5251,ffc237e9,As we all know cars are a big part of our soci...,0,0,Human,Competition,[-8.27753067e-01 -1.23205185e-01 -2.32754445e+...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Reference: https://aclanthology.org/2020.aespen-1.6.pdf
tfidf_settings = {
    "strip_accents": "unicode",
    "ngram_range": (3, 5),
    "max_df": 0.9,
    "min_df": 0.05,
}
vectorizer = TfidfVectorizer(**tfidf_settings)
# NOTE(review): the vectorizer is fitted on every essay in `df`, including the
# ones that validate() later holds out, so its vocabulary and document
# frequencies have seen the held-out text — confirm this is intended.
vectorizer.fit(df.text)

In [5]:
import skops.io as sio


# Location of the persisted model inside the intermediate directory.
file = os.path.join(INTERMEDIATE_DIR, notebook_config.MODEL_FILE_NAME)
# NOTE(review): per the skops documentation, trusted=True bypasses the
# untrusted-type audit on load, and newer skops releases expect an explicit
# list of trusted types instead — confirm the file comes from a source you
# control and check the installed skops version.
best_model = sio.load(file, trusted=True)
# Bare last expression so the notebook renders the loaded estimator.
best_model

In [30]:
from sklearn.base import clone


def validate(essay_id):
    """
    Run one leave-one-out fold for a single essay.

    A fresh clone of ``best_model`` is fitted on every row of ``df`` except the
    one whose ``id`` equals ``essay_id``, then asked for class probabilities on
    that held-out row.

    :param essay_id: Value of the ``id`` column identifying the held-out essay.

    :return: Whatever ``predict_proba`` returns for the single held-out row.

    :raises ValueError: If ``essay_id`` does not match exactly one row of ``df``.
    """
    is_held_out = df.id.eq(essay_id)
    essay_train = df[~is_held_out]
    essay_test = df[is_held_out]

    # The fold is only meaningful when the id is unique in the data set.
    if essay_test.shape[0] != 1:
        raise ValueError(f"Error id={essay_id} shape={essay_test.shape}")

    # clone() gives an unfitted copy, so folds never share fitted state.
    model = clone(best_model)
    train_features = vectorizer.transform(essay_train.text)
    model = model.fit(train_features, essay_train.generated)

    test_features = vectorizer.transform(essay_test.text)
    return model.predict_proba(test_features)


def csv_output(df: pd.DataFrame, filename: str, output_dir=None) -> None:
    """
    Save a pandas DataFrame to a CSV file.

    :param df: The DataFrame to be saved.
    :type df: pd.DataFrame

    :param filename: The name of the CSV file.
    :type filename: str

    :param output_dir: Directory the file is written into. When ``None``
        (the default) the module-level ``OUTPUT_DIR`` is used, which keeps
        existing two-argument calls working unchanged.
    :type output_dir: str or None

    :return: This function does not return anything.
    :rtype: None
    """
    if output_dir is None:
        # NOTE(review): OUTPUT_DIR is not defined in any visible cell; on a
        # fresh kernel this fallback raises NameError unless it is defined
        # elsewhere — pass `output_dir` explicitly or define the constant.
        output_dir = OUTPUT_DIR
    file = os.path.join(output_dir, filename)
    df.to_csv(file, encoding='utf8')
    print(f"File saved successfully as {file}")


def batch_validate(essay_id, file):
    """
    Validate one essay and append its result to the progress CSV.

    Runs :func:`validate` for ``essay_id``; on success the essay's id, text and
    the probability at index 1 of the prediction are appended to ``file`` so an
    interrupted run can be resumed. If validation raises, the error is printed
    and nothing is written.

    :param essay_id: Value of the ``id`` column identifying the essay.

    :param file: Path of the CSV file that accumulates results.
    :type file: str

    :return: This function does not return anything.
    :rtype: None
    """
    try:
        # get rid of nested array
        res = validate(essay_id)[0]
    except Exception as e:
        # Best effort: report the failing essay and carry on with the rest.
        print(e)
        return

    # Look the text up by row. validate() has already verified that exactly one
    # row matches, so .iloc[0] is safe. The module-level `df` is only read here;
    # it must never be rebound inside this function, otherwise Python treats it
    # as a local and this lookup fails with UnboundLocalError.
    essay_text = df.loc[df.id.eq(essay_id), "text"].iloc[0]
    res_df = pd.DataFrame({"id": [essay_id], "text": [essay_text], "proba": [res[1]]})

    # if no error append results to disk
    columns = ["id", "text", "proba"]
    try:
        previous_df = pd.read_csv(file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        previous_df = pd.DataFrame(columns=columns)
    # Drops the serialised index column and tolerates a header-less placeholder
    # file, where the expected columns are missing altogether.
    previous_df = previous_df.reindex(columns=columns)

    if previous_df.empty:
        new_df = res_df
    else:
        new_df = pd.concat([previous_df, res_df], ignore_index=True)
    new_df.to_csv(file, encoding="utf-8")

In [41]:
import random


file = os.path.join(INTERMEDIATE_DIR, notebook_config.LOOV_RES_NAME)

# Load the results of a previous (possibly interrupted) run so it can be resumed.
result_columns = ["id", "text", "proba"]
try:
    previous_progress_df = pd.read_csv(file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable progress yet; bind the name so the code below still works.
    previous_progress_df = pd.DataFrame()

if previous_progress_df.empty:
    # Start from a placeholder that already carries the expected header, so the
    # file can be read back and extended right away.
    previous_progress_df = pd.DataFrame(columns=result_columns)
    previous_progress_df.to_csv(file)
elif "id" not in previous_progress_df.columns:
    # Refuse to guess (or overwrite) when the file holds something else.
    raise ValueError(f"{file} has no 'id' column; cannot resume from it")

# Only generated essays are validated; skip the ones already on disk.
ids = df[df.generated == 1].id
# Compare as strings: ids read back from CSV may differ in type from `df.id`.
previous_ids = {str(done_id) for done_id in previous_progress_df.id}
new_ids = [essay_id for essay_id in ids if str(essay_id) not in previous_ids]
# Unseeded on purpose or not, the processing order differs between runs.
random.shuffle(new_ids)

In [43]:
print("Running Leave One Out validation for generated texts...")
# The loop variable is deliberately not named `id`: a top-level loop variable
# stays in the kernel namespace and would shadow the builtin id() afterwards.
for essay_id in tqdm(new_ids):
    batch_validate(essay_id, file=file)

Running Leave One Out validation for generated texts...


  0%|          | 0/3701 [00:00<?, ?it/s]

KeyboardInterrupt: 