# Perplexity ratio score: Kullback–Leibler divergence

In [1]:
# Change working directory to parent so we can import as we would from main.py
%cd ..

from __future__ import annotations

# import pickle
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from IPython.display import Image

# import functions.notebook_helper as helper_funcs
# import functions.notebook_plotting as plot_funcs
import configuration as config

/mnt/arkk/llm_detector/classifier


Plan here is to take our sampling distributions of perplexity ratio (PR) scores for human and synthetic text and use them to generate a function that takes a perplexity ratio score and converts it into a Kullback-Leibler divergence (KLD) score. See the figure below from the [Wikipedia article on KLD](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).

Workflow is as follows:
1. Get kernel density estimate of PR score distribution for human and synthetic text fragments in training data.
2. Calculated KLD between the human and synthetic PR score distributions.
3. Get get kernel density estimate of KLD.
4. Use probability density function of KLD kernel density estimate to calculate KLD score for each text fragment in the training and testing data.
5. Add the KLD score as a new feature.

The above will be done individually for each fragment length bin and the combined data. This way the KLD score feature in each bin will capture the PR score distribution for text fragments in that specific length regime, rather that for the whole dataset.

In [2]:
Image(url = 'https://raw.githubusercontent.com/gperdrizet/llm_detector/benchmarking/benchmarking/notebooks/images/KL-Gauss-Example.png')

We need to build a set of functions we can call to generate and add the KLD score feature. Then we will apply them in a loop to each length bin. This should/could be parallelized over the bins.

## 1. Perplexity ratio score kernel density estimate

In [3]:
def get_pr_score_kdes(data_df: pd.DataFrame) -> tuple[gaussian_kde, gaussian_kde]:
    '''Takes Pandas dataframe with 'Perplexity ratio score' text 'Source'
    features. Gets kernel density estimates of perplexity ratio score 
    distributions for human and synthetic text. Returns KDEs.'''

    # Get PR score density in the bins for human and synthetic scores separately
    human_scores = data_df['Perplexity ratio score'][data_df['Source'] == 'human']
    synthetic_scores = data_df['Perplexity ratio score'][data_df['Source'] == 'synthetic']

    # Get KDEs
    human_pr_score_kde = gaussian_kde(human_scores)
    synthetic_pr_score_kde = gaussian_kde(synthetic_scores)

    return human_pr_score_kde, synthetic_pr_score_kde

## 2. Perplexity ratio score distribution Kullback-Leibler divergence

In [4]:
from math import log2

In [5]:
def kl_divergence(p: list, q: list) -> np.ndarray:
    '''Takes two lists, calculates Kullback-Leibler divergence'''

    # Holder for results
    results = []

    # Loop on lists of values
    for i, j in zip(p, q):

        # Check for zeros
        if i > 0 and j > 0:

            # Add KLD to results
            kld_value = i * log2(i/j)
            results.append(kld_value)

        # Add np.nan for cases where we have zeros
        else:
            results.append(np.nan)
            
    # Return the result as numpy array
    return np.asarray(results)

In [6]:
def get_pr_score_kld(
        data_df: pd.DataFrame, 
        human_pr_score_kde: gaussian_kde, 
        synthetic_pr_score_kde: gaussian_kde,
        padding: float = 0.1,
        sample_frequency: float = 0.001
) -> tuple[np.ndarray, np.ndarray]:
    
    '''Takes kernel density estimates of perplexity ratio score distributions for
    human and synthetic data and original dataset. Calculates Kullback-Leibler
    divergences of distributions at set of regularly spaced sample points covering
    the original data's range plus some padding on either edge. Returns the 
    Kullback-Leibler divergence values and the sample points used to calculate them.'''

    # Get PR scores
    pr_scores = data_df['Perplexity ratio score']

    # Get a list of points covering the range of score values and extend
    # the left and right edges a little bit, otherwise the kernel density
    # estimate tends to droop at the edges of the range. We will clip
    # the padding off later.
    x = np.arange(
        min(pr_scores) - padding, 
        max(pr_scores) + padding, 
        sample_frequency
    )

    # Get fitted values for the points
    human_fitted_values = human_pr_score_kde.pdf(x)
    synthetic_fitted_values = synthetic_pr_score_kde.pdf(x)
    print(f'  Human fitted values: {human_fitted_values[3]}')
    print(f'  Human fitted values range: {min(human_fitted_values), max(human_fitted_values)}')
    print(f'  Synthetic fitted values: {synthetic_fitted_values[:3]}')
    print(f'  Synthetic fitted values range: {min(synthetic_fitted_values), max(synthetic_fitted_values)}')

    # Calculate the KL divergences of the fitted values
    kld = kl_divergence(synthetic_fitted_values, human_fitted_values)
    print(f'  Raw KLD values: {kld[:3]}')
    print(f'  Raw KLD range: {min(kld), max(kld)}')

    # Get rid of any np.nan, without changing the length
    mask = np.isnan(kld)
    kld[mask] = 0

    # Get rid of any inf without changing the length
    mask = np.isinf(kld)
    kld[mask] = 0

    print(f'  NAN/INF filtered KLD values: {kld[:3]}\n')

    return kld, x

## 3. Perplexity ratio score distribution Kullback-Leibler divergence

In [7]:
def get_kld_kde(kld: np.ndarray, x: np.ndarray) -> gaussian_kde:
    '''Takes list of Kullback-Leibler divergence values, and regularly
    spaced sample points taken from original data's range used to generate 
    them. Generates and returns gaussian kernel density estimate. Trick 
    here is that the KLD values are 'density' as they are derived from 
    the KDEs of the PR score distributions. Therefore they need to be 
    converted back to 'raw' data.'''

    # Convert the KLD 'density' values into integer 'count' values
    print(f'  KLD values: {kld[:3]}')

    # Shift the kld values so that they are non-negative
    kld = kld + abs(min(kld))

    # Then scale the values so when we convert to integer we get good
    # resolution, e.g. we don't want to collapse 2.1, 2.2, 2.3 etc.,
    # to 2. Instead, 2100.0, 2200.0, 2300.0 become 2100, 2200, 2300 etc.
    kld = kld * 1000

    # Convert to integer
    kld_counts = kld.astype(int)
    print(f'  KLD counts: {kld_counts[:3]}')

    # Now, construct a list where each value of x appears a number of times
    # equal to it's KLD 'count'
    kld_scores = []

    for i in range(len(kld_counts)):
        kld_scores.extend([x[i]] * kld_counts[i])

    print(f'  KLD scores: {kld_scores[:3]}\n')

    # Then, run a KDE on the reconstructed KLD scores
    kld_kde = gaussian_kde(kld_scores)

    return kld_kde

## x. Put it all together

Now we need to build a set of functions we can call to generate and add the KLD score feature. Then we will apply them in a loop to each length bin. This should/could be parallelized over the bins.

In [8]:
# The length bins
bins = {
    'combined': [0, np.inf],
    'bin_100': [1, 100],
    'bin_150': [51, 150],
    'bin_200': [101, 200],
    'bin_250': [151, 250],
    'bin_300': [201, 300],
    'bin_350': [251, 350],
    'bin_400': [301, 400],
    'bin_450': [351, 450],
    'bin_500': [401, 500],
    'bin_550': [451, 550],
    'bin_600': [501, 600]
}

In [9]:
# Reopen out hdf5 file with pandas so we can work with dataframes
data_lake = pd.HDFStore(config.LENGTH_BINNED_DATASET)

# Loop on the bins
for bin_id in bins.keys():

    # Pull the training features for this bin
    print(f'\nGetting training features for bin: {bin_id}')
    bin_training_features_df = data_lake[f'training/{bin_id}/features']
    print(f' Training features are type: {type(bin_training_features_df)}')

    # Calculate the PR score distribution kernel density estimates
    print('\n Calculating kernel density estimates')
    human_pr_score_kde, synthetic_pr_score_kde = get_pr_score_kdes(bin_training_features_df)
    print(f' Kernel density estimates are type: {type(human_pr_score_kde)}')

    # Calculate the Kullback-Leibler divergence
    print('\n Calculating Kullback-Leibler divergence')
    pr_score_kld, x = get_pr_score_kld(
        bin_training_features_df, 
        human_pr_score_kde, 
        synthetic_pr_score_kde,
        padding = 0.1,
        sample_frequency = 0.001
    )
    print(f' Kullback-Leibler divergence is type: {type(pr_score_kld)}')

    # Get kernel density estimate of Kullback-Leibler divergence
    print('\n Calculating Kullback-Leibler kernel density estimate')
    kld_kde = get_kld_kde(pr_score_kld, x)
    print(f' Kullback-Leibler divergence kernel density estimate is type: {type(kld_kde)}')

data_lake.close()


Getting training features for bin: combined
 Training features are type: <class 'pandas.core.frame.DataFrame'>

 Calculating kernel density estimates
 Kernel density estimates are type: <class 'scipy.stats._kde.gaussian_kde'>

 Calculating Kullback-Leibler divergence
  Human fitted values: 1.3373083605512705e-246
  Human fitted values range: (8.51732283820921e-251, 7.076898076747471)
  Synthetic fitted values: [1.08554668e-23 2.77509003e-23 7.02762987e-23]
  Synthetic fitted values range: (2.1028846232550933e-165, 7.231800928324723)
  Raw KLD values: [8.18966407e-21 2.08443204e-20 5.25537845e-20]
  Raw KLD range: (-3.4733882330304398, 25.892866567424857)
  NAN/INF filtered KLD values: [8.18966407e-21 2.08443204e-20 5.25537845e-20]

 Kullback-Leibler divergence is type: <class 'numpy.ndarray'>

 Calculating Kullback-Leibler kernel density estimate
  KLD values: [8.18966407e-21 2.08443204e-20 5.25537845e-20]
  KLD counts: [3473 3473 3473]
  KLD scores: [0.1590156, 0.1590156, 0.1590156]


  kld_value = i * log2(i/j)


 Kullback-Leibler divergence kernel density estimate is type: <class 'scipy.stats._kde.gaussian_kde'>

Getting training features for bin: bin_350
 Training features are type: <class 'pandas.core.frame.DataFrame'>

 Calculating kernel density estimates
 Kernel density estimates are type: <class 'scipy.stats._kde.gaussian_kde'>

 Calculating Kullback-Leibler divergence
  Human fitted values: 2.055268881338936e-115
  Human fitted values range: (7.357601476572686e-119, 10.23236415621304)
  Synthetic fitted values: [1.83598253e-30 6.64365075e-30 2.37318499e-29]
  Synthetic fitted values range: (3.581326042091337e-83, 10.192676061116998)
  Raw KLD values: [5.39134283e-28 1.93774900e-27 6.87488307e-27]
  Raw KLD range: (-2.707458628031779, 73.63025128815259)
  NAN/INF filtered KLD values: [5.39134283e-28 1.93774900e-27 6.87488307e-27]

 Kullback-Leibler divergence is type: <class 'numpy.ndarray'>

 Calculating Kullback-Leibler kernel density estimate
  KLD values: [5.39134283e-28 1.93774900e-