In [1]:
from tqdm.notebook import tqdm_notebook
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os


# Enable progress-bar support on pandas objects (`progress_apply` and friends).
# `pandas()` is called on the class itself: instantiating `tqdm_notebook()`
# first creates an empty bar that renders as a stray "0it [00:00, ?it/s]".
tqdm_notebook.pandas()

# Input and output folders, given relative to the current working directory.
DATA_DIR = os.path.join("..", "data")
# Built with os.path.join like DATA_DIR: the previous literal "..\output"
# relied on a Windows path separator and on the invalid escape sequence "\o".
OUTPUT_DIR = os.path.join("..", "output")

0it [00:00, ?it/s]

In [2]:
# Read the leave-one-out input table and discard the "Unnamed: 0" column
# (presumably an index that was saved along with the data), then display it.
loov_input_path = os.path.join(OUTPUT_DIR, "loov_input_data.csv")
df = pd.read_csv(loov_input_path).drop(columns=["Unnamed: 0"])
df

Unnamed: 0,id,text,prompt_id,generated,embedding
0,2003001870352,cars have been a major part of our lives for a...,0,1,[ 8.0638006e-02 1.6982207e+00 -2.5302460e+00 ...
1,2002999238416,"limiting car usage has many advantages, such a...",0,1,[-0.8870298 -0.45862535 -3.3064713 -0.091987...
2,2003043890224,"""america's love affair with it's vehicles seem...",0,1,[ 1.3898681 1.1529934 -1.213773 -2.872517...
3,2003002359856,"cars are convenient, but they can be harmful t...",0,1,[-0.90010715 -0.37471777 -3.003909 -0.017782...
4,2003032440576,"cars are a convenient way to get around, but t...",0,1,[-8.3860934e-02 -2.7152100e-01 -3.4049506e+00 ...
...,...,...,...,...,...
5248,fe6ff9a5,there has been a fuss about the elector colleg...,1,0,[-1.42343497e+00 5.88029288e-02 -2.23422146e+...
5249,ff669174,limiting car usage has many advantages. such a...,0,0,[-2.03082055e-01 8.64372015e-01 -3.53538036e+...
5250,ffa247e0,there's a new trend that has been developing f...,0,0,[-0.56401914 0.08619051 -1.9395874 -0.461474...
5251,ffc237e9,as we all know cars are a big part of our soci...,0,0,[-0.4084281 0.26007837 -2.2451966 -1.063174...


In [3]:
# Column dtypes after loading; note that `embedding` comes back as `object`
# rather than a numeric type.
df.dtypes

id           object
text         object
prompt_id     int64
generated     int64
embedding    object
dtype: object

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _parse_embedding(value):
    """Return one embedding as a numeric vector.

    Embeddings read back from a CSV file are text such as
    ``"[ 0.08 1.69 -2.53]"``; those are converted to a 1-D float array.
    Any value that is not a string is returned unchanged.

    Raises
    ------
    ValueError
        If a string embedding contains a token that is not a number.
    """
    if not isinstance(value, str):
        return value

    # Brackets and commas are separators only; split() also absorbs the line
    # breaks that appear inside long array strings.
    tokens = value.replace("[", " ").replace("]", " ").replace(",", " ").split()
    try:
        return np.array(tokens, dtype=float)
    except ValueError as exc:
        raise ValueError(
            f"Cannot parse embedding {value[:40]!r}; the stored text may be truncated"
        ) from exc


def get_data_from_text(vectorizer: TfidfVectorizer,
                       data_train: pd.DataFrame,
                       data_test: pd.DataFrame) -> dict[str, np.ndarray]:
    """Build text features, embedding features and labels for a train/test pair.

    The vectorizer is fitted on the training text only and then applied to both
    frames. Because ``fit`` is called on the object that was passed in, the
    caller's vectorizer is fitted as a side effect.

    Parameters
    ----------
    vectorizer
        Vectorizer exposing ``fit`` and ``transform``.
    data_train, data_test
        Frames with the columns ``text``, ``embedding`` and ``generated``.
        ``embedding`` values may be numeric sequences or their text form
        (as produced by a CSV round trip).

    Returns
    -------
    dict
        ``x_train`` / ``x_test``: output of ``vectorizer.transform`` on the text;
        ``embed_x_train`` / ``embed_x_test``: embeddings stacked with ``np.array``;
        ``y_train`` / ``y_test``: values of the ``generated`` column.
    """
    vectorizer = vectorizer.fit(data_train.text)
    x_train = vectorizer.transform(data_train.text)
    x_test = vectorizer.transform(data_test.text)

    # Text embeddings must be parsed first, otherwise np.array would simply
    # collect the raw strings instead of building a numeric matrix.
    embed_x_train = np.array([_parse_embedding(x) for x in data_train.embedding])
    embed_x_test = np.array([_parse_embedding(x) for x in data_test.embedding])

    y_train = data_train.generated.values
    y_test = data_test.generated.values

    return {"x_train": x_train,
            "y_train": y_train,
            "x_test": x_test,
            "y_test": y_test,
            "embed_x_train": embed_x_train,
            "embed_x_test": embed_x_test}

In [5]:
from sklearn.model_selection import train_test_split


# Stratified 70/30 split on the `generated` label with a fixed seed.
data_train, data_test = train_test_split(
    df,
    train_size=0.7,
    test_size=0.3,
    stratify=df.generated,
    random_state=42,
)

# Reference: https://aclanthology.org/2020.aespen-1.6.pdf
# (presumably the source of these vectorizer settings; confirm)
vectorizer = TfidfVectorizer(
    strip_accents="unicode",
    ngram_range=(3, 5),
    max_df=0.9,
    min_df=0.05,
)
data = get_data_from_text(vectorizer, data_train, data_test)

# Unpack the returned dictionary into notebook-level names.
x_train, x_test = data["x_train"], data["x_test"]
embed_x_train, embed_x_test = data["embed_x_train"], data["embed_x_test"]
y_train, y_test = data["y_train"], data["y_test"]

In [6]:
from sklearn.ensemble import AdaBoostClassifier


# Template estimator for the leave-one-out runs; `validate` clones it for each fit.
# random_state is pinned (same seed as the train/test split) so that repeated
# runs of the notebook build the same ensemble.
best_model = AdaBoostClassifier(n_estimators=200, random_state=42)

In [7]:
from sklearn.base import clone


def validate(essay_id):
    """Leave-one-out prediction for a single essay.

    Fits a fresh clone of ``best_model`` on every row of ``df`` except the one
    whose ``id`` equals ``essay_id``, then scores that held-out row. Text is
    converted with the notebook-level ``vectorizer`` as it is; the vectorizer
    is not refitted here.

    Parameters
    ----------
    essay_id
        Value to look up in ``df.id``.

    Returns
    -------
    The ``predict_proba`` output for the held-out essay (a single row), or
    ``None`` when ``essay_id`` does not match exactly one row of ``df``
    (a message is printed in that case).
    """
    is_held_out = df.id.eq(essay_id)
    essay_train = df[~is_held_out]
    essay_test = df[is_held_out]

    if essay_test.shape[0] != 1:
        print(f"Error id={essay_id} shape={essay_test.shape}")
        return None

    model = clone(best_model)
    model = model.fit(vectorizer.transform(essay_train.text), essay_train.generated)
    # Pass the text column, not the frame: iterating a DataFrame yields its
    # column names, so those would be vectorized instead of the essay itself.
    return model.predict_proba(vectorizer.transform(essay_test.text))

In [None]:
from concurrent.futures import ThreadPoolExecutor


probs = []
# Ids of every essay labelled as generated; each one is held out once.
ids = df.loc[df.generated == 1, "id"]
# Number of worker threads. The author tuned this on their own machine from
# CPU usage; adjust it for your hardware.
n_jobs = 5

print("Running Leave One Out validation for generated texts...")
with ThreadPoolExecutor(max_workers=n_jobs) as pool:
    # map() yields results in the same order as `ids`; tqdm only wraps the
    # iterator to report progress.
    loo_results = pool.map(validate, ids)
    probs = list(tqdm(loo_results, total=len(ids)))

Running Leave One Out validation for generated texts...


  0%|          | 0/3878 [00:00<?, ?it/s]

In [None]:
# NOTE(review): `similarity_df` is not created in any cell visible above; this
# cell needs it to exist already, with one row per entry of `probs`. Confirm
# where it comes from before relying on Restart & Run All.
# Each entry of `probs` is a `predict_proba` result; x[0][0] takes the first
# class column of its first row. `validate` returns None for ids it could not
# isolate, so those are stored as NaN instead of failing on `None[0][0]`.
similarity_df["probs"] = [np.nan if x is None else x[0][0] for x in probs]
similarity_df