<a href="https://colab.research.google.com/github/ephipie/human-ai-parallel-detection/blob/main/LLM_Detection_03_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances

import openai
import random
import math
from google.colab import userdata
from tenacity import retry, stop_after_attempt, wait_exponential
import os



In [None]:
# Mount Google Drive at /content/drive; the parquet paths used in the
# following cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the input table from the mounted Drive. Later cells read the text
# columns `chunk_1`, `chunk_2`, `gpt`, `llama`, the `domain` column, and the
# matching `<name>_embeddings` columns from this frame.
working_df = pd.read_parquet("/content/drive/MyDrive/shared_data/llm_detection_data_embeds.parquet")

In [None]:
# Display the column index of the loaded frame (inspection only).
working_df.columns

In [None]:
# Preview the first rows of the loaded frame (inspection only).
working_df.head()

## Embedding Similarity Based Predictions

In [None]:
%%time

def add_similarity_preds(df):
    """
    Return a copy of ``df`` enriched with embedding-similarity predictions.

    For each of ``chunk_2``, ``gpt`` and ``llama`` a ``<name>_sim`` column is
    added holding the row-wise cosine similarity between that column's
    embedding and the ``chunk_1`` embedding. For each model a
    ``chunk_2_vs_<model>`` column is then added: 1 when the original
    ``chunk_2`` is strictly more similar to ``chunk_1`` than the model's
    continuation, otherwise 0. The input frame is not modified.
    """
    anchor = "chunk_1"
    human = "chunk_2"
    generators = ["gpt", "llama"]

    def _embedding_matrix(name):
        # Turn a column of per-row embedding sequences into one 2-D array.
        return np.array(df[f"{name}_embeddings"].tolist())

    out = df.copy()
    anchor_vecs = _embedding_matrix(anchor)

    # Cosine similarity = 1 - cosine distance, computed row by row.
    for name in (human, *generators):
        distances = paired_cosine_distances(anchor_vecs, _embedding_matrix(name))
        out[f"{name}_sim"] = 1 - distances

    # Does the style embedding rank the original continuation above the
    # model-generated one?
    for name in generators:
        human_wins = out[f"{human}_sim"] > out[f"{name}_sim"]
        out[f"{human}_vs_{name}"] = np.where(human_wins, 1, 0)

    return out

# Build the enriched frame: `working_df` plus the `<name>_sim` similarity
# columns and the 0/1 `chunk_2_vs_<model>` columns. `working_df` is left as is.
enriched_df = add_similarity_preds(working_df)

In [None]:
# Per-domain mean of the 0/1 indicators, i.e. the fraction of rows in each
# domain where the original chunk_2 was more similar to chunk_1 than the
# model-generated continuation.
enriched_df[['domain','chunk_2_vs_gpt','chunk_2_vs_llama']].groupby('domain').mean()

## LLM as a Judge Predictions

In [None]:

def prompt_original_vs_llm(T1: str, A: str, B: str) -> str:
    """
    Build the judge prompt for a single A/B authorship comparison.

    The prompt shows the source passage ``T1`` followed by the two candidate
    continuations ``A`` and ``B`` and asks for a one-letter answer naming the
    continuation written by the original author of ``T1``.
    """
    # One list entry per prompt line. The three inputs go through f-strings
    # so non-string values are formatted rather than rejected by ``join``.
    prompt_lines = [
        "You are an expert at detecting the style of written text",
        "And identifying whether a text is written by a human or an AI language model.",
        "",
        "You are given an original text T1 and two possible continuations A and B.",
        "One of these is written by the original author of T1 and the other is generated",
        "by an LLM instructed to continue T1 in the same style.",
        "",
        "Your task is to decide which of two continuations (A or B) is the genuine",
        "author-written follow-up to a source text (T1).",
        "",
        "Input",
        "=====",
        "T1:",
        f"{T1}",
        "",
        "Continuation A:",
        f"{A}",
        "",
        "Continuation B:",
        f"{B}",
        "",
        "",
        "Answer with a single letter A or B.",
        "",
        "",
    ]
    return "\n".join(prompt_lines)


@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(
        multiplier=1,    # Base multiplier for the exponential back-off
        min=1,          # Minimum delay between retries (seconds)
        max=8           # Maximum delay between retries (seconds)
    ),
    reraise=True  # Re-raise the last exception if all attempts fail
)
def _query_llm_with_logprobs(
    prompt: str,
    *,
    client: openai.Client,
    model: str,
) -> tuple[str, float]:
    """
    Ask the judge model for a one-token A/B verdict and return it with P(A).

    Args:
        prompt: Full user prompt describing T1 and the two continuations.
        client: OpenAI client used to issue the chat completion request.
        model: Name of the judge model.

    Returns:
        ``(choice, pA)``. ``choice`` is the reply text, stripped and
        upper-cased; it is expected to be "A" or "B" but is not validated
        here. ``pA`` is the probability of "A" renormalised over the "A" and
        "B" probability mass found among the top alternatives returned for
        the first generated token.

    Raises:
        ValueError: if neither "A" nor "B" appears among the returned top
            alternatives, so P(A) cannot be computed.
        Exception: anything raised by the client call. Because of the
            ``@retry`` decorator every failure (including the ValueError
            above) is retried, and the last exception is re-raised once the
            attempts are exhausted.
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": "You are a strict evaluator. Reply with a single letter: A or B."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
        max_tokens=1,
        logprobs=True,
        top_logprobs=2,           # request the two most likely first tokens
    )

    first_choice = resp.choices[0]
    top_lp = first_choice.logprobs.content[0].top_logprobs

    # Sum probability mass per letter. Tokens are stripped and upper-cased so
    # that variants such as " A" or "a" count towards the same letter, which
    # matches how `choice` is normalised below.
    mass = {"A": 0.0, "B": 0.0}
    for tok in top_lp:
        letter = tok.token.strip().upper()
        if letter in mass:
            mass[letter] += math.exp(tok.logprob)

    total = mass["A"] + mass["B"]
    if total <= 0.0:
        # Without this guard the normalisation below would fail with an
        # uninformative ZeroDivisionError.
        seen = [tok.token for tok in top_lp]
        raise ValueError(
            f"Neither 'A' nor 'B' found among top logprob tokens: {seen!r}"
        )
    pA = mass["A"] / total

    choice = first_choice.message.content.strip().upper()  # expected "A" or "B"
    return choice, pA


# ──────────────────────────────────────────────────────────
# Main entry point: adds the LLM-judge columns to a DataFrame
# ──────────────────────────────────────────────────────────
def add_llm_judge_predictions(
    *,
    df: pd.DataFrame,
    client: openai.Client,
    rival_models: list[str],
    judge_model: str,
    base_col: str = "chunk_1",
    seed: int = 42,
) -> pd.DataFrame:
    """
    For every row: ask LLM which continuation (chunk_2 vs each rival
    text column) is the better follow-up to base_col.

    Make a copy of the df, enrich and return.

    Args:
        df: Input frame. Must contain ``base_col``, ``chunk_2``, one text
            column per entry of ``rival_models`` and a ``domain`` column
            (used only in the progress output).
        client: OpenAI client passed through to the judge query.
        rival_models: Names of the columns holding rival continuations.
        judge_model: Name of the model used as judge.
        base_col: Column holding the source text shown as T1.
        seed: Seed for the RNG that decides which continuation is shown as
            A and which as B, so the assignment is reproducible.

    Returns:
        A copy of ``df`` with two extra columns per rival:
        ``llm_chunk2_vs_<rival>`` holds the winning column name ("chunk_2" or
        the rival), or an ``"error: ..."`` string when the query failed or
        the judge answered something other than A/B;
        ``llm_chunk2_vs_<rival>_prob`` holds the judge's probability that
        chunk_2 is the original, or NaN on error.

    Raises:
        ValueError: if a rival is named "chunk_2", since the two candidates
            could then not be told apart.
    """
    df = df.copy()
    # One RNG shared across all rivals and rows, seeded for reproducibility.
    rnd = random.Random(seed)
    for rival in rival_models:
        if rival == "chunk_2":
            raise ValueError("A rival column must differ from 'chunk_2'")
        print(rival)
        out_choice = f"llm_chunk2_vs_{rival}"
        out_prob   = f"{out_choice}_prob"
        df[out_choice] = ""
        df[out_prob]   = np.nan

        for i, (idx, row) in enumerate(df.iterrows(), start=1):
            # Randomise the A/B position so the judge cannot exploit a fixed
            # ordering of human vs. generated text.
            pair = [("chunk_2", row["chunk_2"]), (rival, row[rival])]
            rnd.shuffle(pair)
            (name_a, text_a), (name_b, text_b) = pair

            prompt = prompt_original_vs_llm(T1=row[base_col], A=text_a, B=text_b)

            try:
                choice, pA = _query_llm_with_logprobs(prompt, model=judge_model, client=client)
            except Exception as exc:
                # Best effort: record the failure for this row and move on.
                print(f"Error querying LLM: {exc}")
                df.at[idx, out_choice] = f"error: {exc}"
                df.at[idx, out_prob]   = np.nan
                continue

            if choice not in ("A", "B"):
                # An off-script answer must not be counted as a rival win.
                print(f"Error querying LLM: unexpected answer {choice!r}")
                df.at[idx, out_choice] = f"error: unexpected answer {choice!r}"
                df.at[idx, out_prob]   = np.nan
                continue

            winner = name_a if choice == "A" else name_b
            # pA is P(continuation A is the original); flip it when chunk_2
            # was shown as B.
            p_chunk2 = pA if name_a == "chunk_2" else 1 - pA

            df.at[idx, out_choice] = winner
            df.at[idx, out_prob]   = float(p_chunk2)
            if i % 10 == 0:
                print(f"Row number {i} of {len(df)} for {row['domain']}")
                print(winner,float(p_chunk2) )

    return df

In [None]:
def process_batch(df_batch, client, rival_models, judge_model, batch_num, output_folder):
    """
    Run the LLM judge on one batch, using a parquet file as a checkpoint.

    The result is stored as ``batch<batch_num>.parquet`` inside
    ``output_folder``. If that file already exists it is loaded and returned
    without querying the judge again; otherwise the batch is judged, written
    to that path and returned.
    """
    checkpoint = os.path.join(output_folder, f"batch{batch_num}.parquet")

    # Guard clause: reuse an earlier result when one is on disk.
    if os.path.exists(checkpoint):
        print(f"Batch {batch_num} already processed. Skipping.")
        return pd.read_parquet(checkpoint)

    print(f"Processing batch {batch_num}...")
    judged = add_llm_judge_predictions(
        df=df_batch,
        client=client,
        rival_models=rival_models,
        judge_model=judge_model,
    )
    judged.to_parquet(checkpoint)
    print(f"Batch {batch_num} saved to {checkpoint}")
    return judged


def process_batches(df, client, rival_models, judge_model, batch_size, output_folder):
    """
    Split ``df`` into consecutive batches and run the LLM judge on each one.

    Args:
        df: Frame to process; rows are taken in positional order.
        client: OpenAI client forwarded to ``process_batch``.
        rival_models: Rival column names forwarded to ``process_batch``.
        judge_model: Judge model name forwarded to ``process_batch``.
        batch_size: Number of rows per batch (the last batch may be smaller).
        output_folder: Directory for the per-batch parquet files; created if
            it does not exist.

    Returns:
        List with one processed DataFrame per batch, in order. Empty when
        ``df`` has no rows.
    """
    # exist_ok avoids the separate existence check.
    os.makedirs(output_folder, exist_ok=True)

    processed_batches = []
    # Iterate over the `df` argument (not a notebook global) so the function
    # processes whatever frame the caller passes in.
    for start in range(0, len(df), batch_size):
        batch_df = df.iloc[start:start + batch_size]
        batch_num = start // batch_size + 1  # 1-based, used in the file name
        processed_batches.append(
            process_batch(batch_df, client, rival_models, judge_model, batch_num, output_folder)
        )

    return processed_batches



In [None]:
# Main processing logic: run the LLM judge over `enriched_df` in batches.
# The API key is fetched through google.colab.userdata instead of being
# hard-coded in the notebook.
client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
# Text columns holding the model-generated continuations compared to chunk_2.
rival_models = ['gpt', 'llama']
# Model that acts as the judge.
judge_model = 'gpt-4o'
batch_size = 20  # Rows per batch; each batch is saved to its own parquet file
# Folder on the mounted Drive that receives the per-batch parquet files.
output_folder = '/content/drive/MyDrive/shared_data/llm_detection_data_preds_batched'
processed_batches = process_batches(enriched_df, client, rival_models, judge_model, batch_size, output_folder)

In [None]:
# NOTE(review): `final` is not used anywhere in this cell; the import looks
# accidental (possibly an editor auto-import) - confirm before removing.
from typing_extensions import final
# Stitch the per-batch frames into one; ignore_index=True discards the
# original row labels and renumbers the result from 0.
final_llm_judge_df = pd.concat(processed_batches, ignore_index=True)
print("All batches processed.")
# Persist the combined predictions next to the input data on Drive.
final_llm_judge_df.to_parquet('/content/drive/MyDrive/shared_data/llm_detection_data_preds_final.parquet')