# TF-IDF score: Kullback–Leibler divergence

In [1]:
# Change working directory to parent so we can import as we would from main.py
%cd ..

from __future__ import annotations

import gc
import re
import nltk
import h5py
import numpy as np
import pandas as pd
import multiprocessing as mp

from IPython.display import Image
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import configuration as config
import functions.kullback_leibler_divergence as kld_funcs

/mnt/arkk/llm_detector/classifier


Plan here is to take our sampling distributions of text frequency, inverse document frequency (TF-IDF) scores for human and synthetic text and use them to generate a function that takes a TF-IDF score and converts it into a Kullback-Leibler divergence (KLD) score. See the figure below from the [Wikipedia article on KLD](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).

Workflow is as follows:
1. Get kernel density estimate of TF-IDF score distribution for human and synthetic text fragments in training data.
2. Calculated KLD between the human and synthetic TF-IDF score distributions.
3. Get get kernel density estimate of KLD.
4. Use probability density function of KLD kernel density estimate to calculate KLD score for each text fragment in the training and testing data.
5. Add the KLD score as a new feature.

The above will be done individually for each fragment length bin and the combined data. This way the KLD score feature in each bin will capture the TF-IDF score distribution for text fragments in that specific length regime, rather that for the whole dataset.

In [2]:
Image(url = 'https://raw.githubusercontent.com/gperdrizet/llm_detector/benchmarking/benchmarking/notebooks/images/KL-Gauss-Example.png')

## 1. TF-IDF score
Before calculating a Kullback-Leibler divergence score for the TF-IDF score, we need to calculate the TF-IDF score itself for each fragment.

The TF-IDF score created for this project involves scoring each text fragment with TF-IDF term frequencies derived from the human and synthetic text fragments in the training data. The TF-IDF score is a product normalized difference calculated as:

$$ (human - synthetic)(human + synthetic) $$

Where human and synthetic refer to average TF-IDF by term for a given text fragment where the term TF-IDF values were derived from the human or synthetic text in the training dataset.

## 1.1. Get human and synthetic text strings

In [3]:
# Get and set up stop words and an instance of the Word Net
# Lemmatizer for use in cleaning text for vectorization
nltk.download('stopwords', quiet = True)
nltk.download('wordnet', quiet = True)
stop_words = stopwords.words('english')

sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

def clean_text(text: str = None) -> str:
    '''Takes a text string and cleans it for vectorization.
    Returns cleaned text as string.'''
    
    # Lowercase everything
    text = text.lower()

    # Replace everything with space except (a-z, A-Z, ".", "?", "!", ",")
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    # Remove URLs 
    text = re.sub(r"http\S+", "",text)
    
    # Remove html tags
    html = re.compile(r'<.*?>') 
    text = html.sub(r'',text)
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'

    # Remove punctuations
    for p in punctuations:
        text = text.replace(p,'')
        
    # Remove stopwords
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    
    # Remove emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)
    
    text = emoji_pattern.sub(r'', text)
    
    return text

In [4]:
def get_text(bin_training_data_df: pd.DataFrame) ->tuple[pd.Series, pd.Series]:
    '''Gets and cleans human and synthetic text from training data.'''

    # If we have more than 10,000 text fragments, take a random sample of 10,000
    # to keep memory utilization under control during vectorization, then get the
    # text strings for the human and synthetic text fragments in the sample
    if len(bin_training_data_df) > 10000:
    
        training_data_df_sample = bin_training_data_df.sample(n = 10000, random_state = 42)
        training_data_df_sample.reset_index(inplace = True, drop = True)

        human_texts = training_data_df_sample['String'][training_data_df_sample['Source'] == 'human']
        synthetic_texts = training_data_df_sample['String'][training_data_df_sample['Source'] == 'synthetic']

    # If the dataset has 10,000 or less text fragments, directly pull all of the
    # text fragment strings for human and synthetic fragments from the data
    else:
        
        human_texts = bin_training_data_df['String'][bin_training_data_df['Source'] == 'human']
        synthetic_texts = bin_training_data_df['String'][bin_training_data_df['Source'] == 'synthetic']

    # Clean text for vectorization
    human_texts = human_texts.apply(lambda x: clean_text(x))
    synthetic_texts = synthetic_texts.apply(lambda x: clean_text(x))

    return human_texts, synthetic_texts

## 1.2. Get term TF-IDF values for human and synthetic text from training data

In [5]:
def get_term_tf_idf(human_texts: pd.Series, synthetic_texts: pd.Series) -> dict:
    '''Takes cleaned human and synthetic text as Pandas series, gets term TF-IDF values
    for each and returns as dictionary of look-up tables where key is term feature and
    value is term TF-IDF.'''

    # Dictionary to hold TF-IDF look-up tables
    tfidf_luts = {}

    # Loop twice to process the human and synthetic texts the same way
    for text_source, texts in zip(['human', 'synthetic'], [human_texts, synthetic_texts]):

        # Fit the TF-IDF vectorizer
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_vectors = tfidf_vectorizer.fit_transform(texts)

        # Convert the vectors to numpy and replace zeros with NAN
        tfidf = tfidf_vectors.toarray()
        tfidf[tfidf == 0] = np.nan

        # Take the log2 and average the columns (i.e. get average TF-IDF per word)
        log_tfidf = np.log2(tfidf)
        log_tfidf_mean = np.nanmean(log_tfidf, axis = 0)

        # Get the words
        features = tfidf_vectorizer.get_feature_names_out()

        # Release some memory
        del tfidf_vectorizer
        del tfidf_vectors
        _ = gc.collect()

        # Add result to look-up table
        tfidf_luts[text_source] = dict(zip(features, log_tfidf_mean))

    return tfidf_luts

## 1.3. Score each text fragment

In [6]:
def tf_idf_score_text_fragments(data_df: pd.DataFrame, tfidf_luts: dict = None) -> dict:
    '''Takes features dataframe and dictionary containing term TF-IDF look-up tables.
    scores text fragments from dataframe with product normalized difference in log2 TF-IDF mean.
    Adds TF-IDF score and log2 TF-IDF mean'''

    # Holders for new features
    tfidf_scores = []
    human_tfidf = []
    synthetic_tfidf = []

    # Get the text fragments
    texts = data_df['String']

    # Loop on dataframe rows
    for text in texts:

        # Clean the text
        text = clean_text(text)

        # Split the text into words
        words = text.split(' ')

        # Score the words using the human and synthetic luts only scoring words for which we have
        # a human and a synthetic TF-IDF value
        scored_word_count = 0
        human_tfidf_sum = 0
        synthetic_tfidf_sum = 0

        for word in words:

            if word in tfidf_luts['human'].keys() and word in tfidf_luts['synthetic'].keys():
                human_tfidf_sum += tfidf_luts['human'][word]
                synthetic_tfidf_sum += tfidf_luts['synthetic'][word]

                scored_word_count += 1

        # Get the means, protecting from division by zero
        if scored_word_count == 0:

            human_tfidf_mean = 0
            synthetic_tfidf_mean = 0

        elif scored_word_count != 0:
            
            human_tfidf_mean = human_tfidf_sum / scored_word_count
            synthetic_tfidf_mean = synthetic_tfidf_sum / scored_word_count

        # Get the product normalized TF-IDF score
        dmean_tfidf = human_tfidf_mean - synthetic_tfidf_mean
        product_normalized_dmean_tfidf = dmean_tfidf * (human_tfidf_mean + synthetic_tfidf_mean)

        # Add to results
        human_tfidf.append(human_tfidf_mean)
        synthetic_tfidf.append(synthetic_tfidf_mean)
        tfidf_scores.append(product_normalized_dmean_tfidf)

    # Add new feature back to dataframe
    data_df['Human TF-IDF'] = human_tfidf
    data_df['Synthetic TF-IDF'] = synthetic_tfidf
    data_df['TF-IDF score'] = tfidf_scores

    return data_df


## 1.4. Add TF-IDF score to data in length bin

In [7]:
def add_tf_idf_score(
        bin_training_data_df: pd.DataFrame, 
        bin_testing_data_df: pd.DataFrame,
        worker_num: str,
        bin_id: int
) -> tuple[pd.DataFrame, pd.DataFrame]:
    
    '''Takes training and testing datasets in dataframes. Uses training
    data to calculate term TF-IDF for human and synthetic data. Uses those
    term TF-IDF values to calculate product normalized TF-IDF score for
    each text fragment in the training and testing data. Adds TF-IDF score
    to dataframes as new features and return the updated dataframes.'''

    try:
        human_texts, synthetic_texts = get_text(bin_training_data_df)

    except Exception as err_string:
        print(f'\nWorker {worker_num} - get_text() error: {err_string}', end = '')

    try:
        tfidf_luts = get_term_tf_idf(human_texts, synthetic_texts)

    except Exception as err_string:
        print(f'\nWorker {worker_num} - get_term_tf_idf() error: {err_string}', end = '')

    try:
        bin_training_data_df = tf_idf_score_text_fragments(bin_training_data_df, tfidf_luts)
        bin_testing_data_df = tf_idf_score_text_fragments(bin_testing_data_df, tfidf_luts)

    except Exception as err_string:
        print(f'\nWorker {worker_num} - tf_idf_score_text_fragments() error: {err_string}', end = '')

    return bin_id, bin_training_data_df, bin_testing_data_df

## 1.5. Bring it all together
Now, let's parallelize the calculation over the length bins in the dataset.

In [8]:
def tf_idf_score(
        hdf5_file: str,
        score_sample: bool = False,
) -> None:

    '''Main function to parallelize computation of TF-IDF score
    over length bins.'''

    # Get the bins from the hdf5 file's metadata
    data_lake = h5py.File(hdf5_file, 'r')
    bins = dict(data_lake.attrs.items())
    data_lake.close()

    # Calculate worker number whichever is less, the number of avalible
    # CPU or the humber of bins
    n_workers = min(20, len(list(bins.keys())))

    # Instantiate worker pool
    pool = mp.Pool(
        processes = n_workers,
        maxtasksperchild = 1
    )

    # Holder for returns from workers
    async_results = []

    # Open a connection to the hdf5 dataset via PyTables with Pandas
    data_lake = pd.HDFStore(hdf5_file)

    # Loop on the bins
    for worker_num, bin_id in enumerate(bins.keys()):

        # Pull the training features for this bin
        bin_training_features_df = data_lake[f'training/{bin_id}/features']
        print(f'\nWorker {worker_num} - {len(bin_training_features_df)} fragments in {bin_id}', end = '')

        # Pull the testing features for this bin
        bin_testing_features_df = data_lake[f'testing/{bin_id}/features']

        # Take sample if desired
        if score_sample is True:
            bin_training_features_df = bin_training_features_df.sample(frac = 0.1)
            bin_testing_features_df = bin_testing_features_df.sample(frac = 0.1)

        async_results.append(
            pool.apply_async(add_tf_idf_score,
                args = (
                    bin_training_features_df,
                    bin_testing_features_df,
                    worker_num,
                    bin_id
                )
            )
        )

    # Clean up
    pool.close()
    pool.join()

    ##### Collect and save the results #########################################

    # Get the results
    new_results = [async_result.get() for async_result in async_results]

    # Add the new results
    for new_result in new_results:

        # Parse the result
        bin_id = new_result[0]
        training_features_df = new_result[1]
        testing_features_df = new_result[2]

        # Print info for sanity check
        print(f'\n\n{bin_id} training features:\n')
        training_features_df.info()

        # Put data back into hdf5
        data_lake.put(f'training/{bin_id}/features', training_features_df)
        data_lake.put(f'testing/{bin_id}/features', testing_features_df)

    data_lake.close()

In [9]:
# The dataset we want to bin - omit the file extension, it will be 
# added appropriately for the input and output files
dataset_name = 'falcon-7b_scores_v2_10-1000_words'

# Input file path
input_file = f'{config.DATA_PATH}/{dataset_name}.h5'

# Option to sample 10% of the data for rapid testing and development
sample = False

# Run the Kullback-Leibler score calculation on the TF-IDF score
tf_idf_score(
        hdf5_file = input_file,
        score_sample = True # Run on 10% of data for rapid development
)


Worker 0 - 8532 fragments in bin_100
Worker 1 - 8081 fragments in bin_150
Worker 2 - 6995 fragments in bin_200
Worker 3 - 5999 fragments in bin_250
Worker 4 - 5193 fragments in bin_300
Worker 5 - 4169 fragments in bin_350
Worker 6 - 2571 fragments in bin_400
Worker 7 - 1146 fragments in bin_450
Worker 8 - 435 fragments in bin_500
Worker 9 - 325 fragments in bin_600
Cleaned text is: <class 'pandas.core.series.Series'>
Worker 10 - 23937 fragments in combined
Cleaned text is: <class 'pandas.core.series.Series'>
TF-IDF look-up table is: <class 'dict'>
Cleaned text is: <class 'pandas.core.series.Series'>
TF-IDF look-up table is: <class 'dict'>
Cleaned text is: <class 'pandas.core.series.Series'>
TF-IDF look-up table is: <class 'dict'>
Cleaned text is: <class 'pandas.core.series.Series'>
Cleaned text is: <class 'pandas.core.series.Series'>
Cleaned text is: <class 'pandas.core.series.Series'>
Cleaned text is: <class 'pandas.core.series.Series'>
TF-IDF look-up table is: <class 'dict'>
TF-IDF 

## 1.6. TF-IDF Kullback-Leibler divergence score

In [10]:
# Run the Kullback-Leibler score calculation on the TF-IDF score
kld_funcs.kullback_leibler_score(
        feature_name = 'TF-IDF score',
        hdf5_file = input_file,
        padding = 0.1,
        sample_frequency = 0.001,
        score_sample = sample
)


Worker 0 - 853 fragments in bin_100
Worker 1 - 808 fragments in bin_150
Worker 2 - 700 fragments in bin_200
Worker 3 - 600 fragments in bin_250
Worker 4 - 519 fragments in bin_300
Worker 0 - adding Kullback-Leibler score to training features
Worker 5 - 417 fragments in bin_350
Worker 6 - 257 fragments in bin_400
Worker 2 - adding Kullback-Leibler score to training features
Worker 7 - 115 fragments in bin_450
Worker 8 - 44 fragments in bin_500
Worker 9 - 32 fragments in bin_600
Worker 1 - adding Kullback-Leibler score to training features
Worker 5 - adding Kullback-Leibler score to training features
Worker 10 - 2394 fragments in combined
Worker 3 - adding Kullback-Leibler score to training features
Worker 6 - adding Kullback-Leibler score to training features
Worker 4 - adding Kullback-Leibler score to training features
Worker 7 - adding Kullback-Leibler score to training features
Worker 8 - adding Kullback-Leibler score to training features
Worker 9 - adding Kullback-Leibler score to 