# Perplexity ratio score: Kullback–Leibler divergence

In [1]:
# Change working directory to parent so we can import as we would from main.py
%cd ..

# import pickle
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from IPython.display import Image

import functions.notebook_helper as helper_funcs
import functions.notebook_plotting as plot_funcs
import configuration as config

/mnt/arkk/llm_detector/classifier


The plan here is to take our sampling distributions of perplexity ratio (PR) scores for human and synthetic text and use them to generate a function that takes a perplexity ratio score and converts it into a Kullback-Leibler divergence (KLD) score. See the figure below from the [Wikipedia article on KLD](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).

Workflow is as follows:
1. Get kernel density estimate of PR score distribution for human and synthetic text fragments in training data.
2. Calculate KLD between the human and synthetic PR score distributions.
3. Get kernel density estimate of KLD.
4. Use probability density function of KLD kernel density estimate to calculate KLD score for each text fragment in the training and testing data.
5. Add the KLD score as a new feature.

The above will be done individually for each fragment length bin and the combined data. This way the KLD score feature in each bin will capture the PR score distribution for text fragments in that specific length regime, rather than for the whole dataset.

In [2]:
# Display the example KLD figure, loaded by URL. This stays the last
# expression in the cell so the notebook renders it.
Image(
    url='https://raw.githubusercontent.com/gperdrizet/llm_detector/benchmarking/benchmarking/notebooks/images/KL-Gauss-Example.png'
)

We need to build a set of functions we can call to generate and add the KLD score feature. Then we will apply them in a loop to each length bin. This should/could be parallelized over the bins.

## 1. Perplexity ratio score kernel density estimate

In [3]:
def pr_score_kde(data_df: pd.DataFrame, n_bins: int = 100) -> tuple:
    '''Fits kernel density estimates (KDEs) to the perplexity ratio score
    distributions of human and synthetic text.

    Args:
        data_df: dataframe with 'Perplexity ratio score' and 'Source'
            columns. Rows of human text carry the 'Source' value 'human'.
        n_bins: not used by this function; kept so that existing calls
            which pass it keep working.

    Returns:
        Tuple of (human KDE, synthetic KDE), each a scipy gaussian_kde.
    '''

    scores = data_df['Perplexity ratio score']
    is_human = data_df['Source'] == 'human'

    # Split the scores by source. Every row that is not labeled 'human' is
    # treated as synthetic, so the two KDEs are fitted to disjoint sets of
    # scores instead of both being fitted to the human scores.
    human_scores = scores[is_human]
    synthetic_scores = scores[~is_human]

    # Get KDEs
    human_pr_score_kde = gaussian_kde(human_scores)
    synthetic_pr_score_kde = gaussian_kde(synthetic_scores)

    return human_pr_score_kde, synthetic_pr_score_kde

## 2. Perplexity ratio score distribution Kullback-Leibler divergence

In [None]:
from math import log2

In [None]:
def kl_divergence(p, q):
    '''Computes pointwise Kullback-Leibler divergence terms for two
    sequences of values, paired up in order.

    Each pair (p_i, q_i) contributes p_i * log2(p_i / q_i). When either
    member of a pair is not strictly positive, the term is np.nan instead.
    The terms are returned one per pair as a numpy array; they are not
    summed.'''

    terms = [
        p_i * log2(p_i / q_i) if (p_i > 0 and q_i > 0) else np.nan
        for p_i, q_i in zip(p, q)
    ]

    return np.asarray(terms)

In [None]:
def pr_score_kld(
        data_df: pd.DataFrame, 
        human_pr_score_kde, 
        synthetic_pr_score_kde,
        padding: float = 0.1,
        sample_frequency: float = 0.001
):
    '''Evaluates the human and synthetic perplexity ratio score KDEs on a
    regular grid and returns the pointwise Kullback-Leibler divergence
    terms of synthetic relative to human.

    The grid starts at the smallest score in data_df minus padding, stops
    before the largest score plus padding, and steps by sample_frequency.
    The padded points are not removed here. Any nan or infinite term is
    replaced with 0 so the result has one value per grid point.'''

    scores = data_df['Perplexity ratio score']

    # Extend the sampled range a little past the data on both sides,
    # otherwise the kernel density estimate tends to droop at the edges
    lower_bound = min(scores) - padding
    upper_bound = max(scores) + padding
    sample_points = np.arange(lower_bound, upper_bound, sample_frequency).tolist()

    # Density of each distribution at every sample point
    human_density = human_pr_score_kde.pdf(sample_points)
    synthetic_density = synthetic_pr_score_kde.pdf(sample_points)

    # Pointwise KL divergence terms, synthetic relative to human
    kld = kl_divergence(synthetic_density, human_density)

    # Zero out nan and +/-inf in place so the length does not change
    kld[~np.isfinite(kld)] = 0

    return kld

## x. Put it all together

Now we need to build a set of functions we can call to generate and add the KLD score feature. Then we will apply them in a loop to each length bin. This should/could be parallelized over the bins.

In [4]:
# The length bins, as [lower, upper] bounds: one catch-all 'combined' bin
# plus windows named after their upper bound. Each window spans 100
# lengths and starts 50 after the previous one, so neighbors overlap.
bins = {
    'combined': [0, np.inf],
    **{f'bin_{upper}': [upper - 99, upper] for upper in range(100, 601, 50)}
}

In [7]:
# Reopen our hdf5 file with pandas so we can work with dataframes. The
# with block closes the store even if one of the bins raises.
with pd.HDFStore(config.LENGTH_BINNED_DATASET) as data_lake:

    # Loop on the bins
    for bin_id in bins:

        # Pull the training features for this bin
        print(f'\nGetting training features for bin: {bin_id}')
        bin_training_features_df = data_lake[f'training/{bin_id}/features']

        # Calculate the PR score distribution kernel density estimates
        print(f'Calculating kernel density estimates for bin: {bin_id}')
        human_pr_score_kde, synthetic_pr_score_kde = pr_score_kde(bin_training_features_df)
        print(f'Kernel density estimates are type: {type(human_pr_score_kde)}')

        # Calculate the Kullback-Leibler divergences. The result gets its
        # own name: assigning it to pr_score_kld would replace the function
        # and make the call fail on the next bin.
        print(f'Calculating Kullback-Leibler divergence for bin: {bin_id}')
        bin_pr_score_kld = pr_score_kld(
            bin_training_features_df,
            human_pr_score_kde,
            synthetic_pr_score_kde,
            padding = 0.1,
            sample_frequency = 0.001
        )
        print(f'Calculating Kullback-Leibler divergence is type: {type(bin_pr_score_kld)}')


Getting training features for bin: combined
Calculating kernel density estimates for bin: combined

Getting training features for bin: bin_100
Calculating kernel density estimates for bin: bin_100

Getting training features for bin: bin_150
Calculating kernel density estimates for bin: bin_150

Getting training features for bin: bin_200
Calculating kernel density estimates for bin: bin_200

Getting training features for bin: bin_250
Calculating kernel density estimates for bin: bin_250

Getting training features for bin: bin_300
Calculating kernel density estimates for bin: bin_300

Getting training features for bin: bin_350
Calculating kernel density estimates for bin: bin_350

Getting training features for bin: bin_400
Calculating kernel density estimates for bin: bin_400

Getting training features for bin: bin_450
Calculating kernel density estimates for bin: bin_450

Getting training features for bin: bin_500
Calculating kernel density estimates for bin: bin_500

Getting training 