In [1]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under /content/drive/MyDrive. This import only exists
# inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import re

def strip_to_assistant_output(text: str):
    """
    Keep only the model's reply from a chat-formatted blob.

    From a blob like:
        'Paragraph:\\n...\\nassistant\\nMODEL_OUTPUT'
    keep only `MODEL_OUTPUT` (with surrounding whitespace stripped).

    The marker is a line that consists solely of the word 'assistant'
    (case-insensitive; spaces/tabs around it are tolerated). Only the FIRST
    such line is used as the split point. The word 'assistant' appearing
    inside a sentence -- even at the end of a line -- is NOT a marker, so
    ordinary prose is never truncated.

    Args:
        text: The raw cell value. Missing values (None / NaN) are passed
            through untouched; any other non-string is converted with str().

    Returns:
        The text after the marker line, stripped. If the marker is the last
        line (the model produced nothing) the result is ''. If there is no
        marker line, the original text, stripped.
    """
    if pd.isna(text):
        return text

    t = str(text)

    # `^` + MULTILINE anchors the marker to the start of a line, and the
    # trailing alternative requires the line to end right after it (newline,
    # or end of string when the reply is empty). Without the anchors, text
    # such as "ask your assistant\n..." would be cut in the middle.
    parts = re.split(
        r'^[ \t]*assistant[ \t]*(?:\r?\n|\Z)',
        t,
        maxsplit=1,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    if len(parts) == 2:
        return parts[1].strip()

    # Fallback: no marker line -> return the original text, stripped
    return t.strip()


In [None]:
def _strip_column_in_place(csv_path, column, done_message):
    """
    Load `csv_path`, reduce `column` to the text after the 'assistant' marker
    via strip_to_assistant_output, write the frame back to the SAME path
    (overwriting it, without the index), and print `done_message` followed by
    the path. Returns the cleaned dataframe.
    """
    frame = pd.read_csv(csv_path)
    frame[column] = frame[column].apply(strip_to_assistant_output)
    frame.to_csv(csv_path, index=False)
    print(done_message, csv_path)
    return frame


# 1) GPT-4 file (in case any rows have the same pattern)
path_gpt4 = "/content/drive/MyDrive/test_dataset_mode_transfer_cleaned.csv"
df_gpt4 = _strip_column_in_place(path_gpt4, "output_gpt4", "Cleaned GPT-4:")


# 2) LoRA file  (column name is **output**)
path_lora = "/content/drive/MyDrive/new_test_with_lora.csv"
df_lora = _strip_column_in_place(path_lora, "output", "Cleaned LoRA:")


# 3) Qwen base file
path_qwen = "/content/drive/MyDrive/test_eval_qwen_base.csv"
df_qwen = _strip_column_in_place(path_qwen, "output_qwen_base", "Cleaned Qwen base:")


Cleaned GPT-4: /content/drive/MyDrive/test_dataset_mode_transfer_cleaned.csv
Cleaned LoRA: /content/drive/MyDrive/new_test_with_lora.csv
Cleaned Qwen base: /content/drive/MyDrive/test_eval_qwen_base.csv


In [None]:
 import pandas as pd
from sentence_transformers import SentenceTransformer, util

 #---------- 1. Load the sentence-transformer model once ----------
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def add_content_similarity(
    csv_path: str,
    orig_col: str,
    rewr_col: str,
    save_path: str = None,
    new_col: str = "content_similarity",
):
    """
    Score how closely each rewrite matches its source text.

    Loads a CSV, embeds the texts in `orig_col` and `rewr_col` with the
    module-level sentence-transformer `model`, and adds the cosine similarity
    of each (original, rewrite) pair as `new_col`.

    Args:
        csv_path: CSV file to load.
        orig_col: Column holding the source text.
        rewr_col: Column holding the rewritten text.
        save_path: If not None, the augmented frame is written to this path
            as CSV (without the index).
        new_col: Name of the similarity column to add.

    Returns:
        The full dataframe with `new_col` added. Rows where either text is
        missing are kept and hold NaN in `new_col`.

    NOTE(review): this function is defined in more than one cell of this
    notebook; whichever cell ran last is the definition in effect.
    """
    print(f"\n=== Processing: {csv_path} ===")
    df = pd.read_csv(csv_path)

    # Only rows where BOTH texts are present can be scored.
    has_both = df[[orig_col, rewr_col]].notna().all(axis=1)
    valid_index = df.index[has_both]
    print(f"Rows before NA drop: {len(df)}, after: {len(valid_index)}")

    if len(valid_index) > 0:
        # Cast to str: pandas may have parsed a numeric-looking cell as a
        # number, which the tokenizer cannot handle.
        orig_texts = df.loc[has_both, orig_col].astype(str).tolist()
        rewr_texts = df.loc[has_both, rewr_col].astype(str).tolist()

        # Encode original and rewritten texts
        encode_kwargs = dict(
            batch_size=32,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        orig_embs = model.encode(orig_texts, **encode_kwargs)
        rewr_embs = model.encode(rewr_texts, **encode_kwargs)

        # The embeddings are unit-normalized, so the row-wise dot product IS
        # the cosine similarity of pair i. This avoids materializing the full
        # N x N similarity matrix just to read its diagonal.
        similarities = (orig_embs * rewr_embs).sum(dim=1)
        scores = pd.Series(
            similarities.cpu().numpy(), index=valid_index, name=new_col
        )
    else:
        # Nothing to encode; keep going so the column still exists downstream.
        print("No rows with both texts present; skipping encoding.")
        scores = pd.Series(index=valid_index, dtype="float64", name=new_col)

    # Assignment aligns on the index, so rows without both texts get NaN.
    df[new_col] = scores

    # Print summary stats (scored rows only)
    print(scores.describe())

    # Optionally save
    if save_path is not None:
        df.to_csv(save_path, index=False)
        print(f"Saved with {new_col} to: {save_path}")

    return df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# ---------- 1. Load the sentence-transformer model once ----------
# `model` is a module-level global; add_content_similarity() below reads it
# when it calls model.encode().
# NOTE(review): an earlier cell in this notebook loads the same checkpoint
# into the same name, so running this cell as well loads it again.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def add_content_similarity(
    csv_path: str,
    orig_col: str,
    rewr_col: str,
    save_path: str = None,
    new_col: str = "content_similarity",
):
    """
    Score how closely each rewrite matches its source text.

    Loads a CSV, embeds the texts in `orig_col` and `rewr_col` with the
    module-level sentence-transformer `model`, and adds the cosine similarity
    of each (original, rewrite) pair as `new_col`.

    Args:
        csv_path: CSV file to load.
        orig_col: Column holding the source text.
        rewr_col: Column holding the rewritten text.
        save_path: If not None, the augmented frame is written to this path
            as CSV (without the index).
        new_col: Name of the similarity column to add.

    Returns:
        The full dataframe with `new_col` added. Rows where either text is
        missing are kept and hold NaN in `new_col`.

    NOTE(review): this function is defined in more than one cell of this
    notebook; whichever cell ran last is the definition in effect.
    """
    print(f"\n=== Processing: {csv_path} ===")
    df = pd.read_csv(csv_path)

    # Only rows where BOTH texts are present can be scored.
    has_both = df[[orig_col, rewr_col]].notna().all(axis=1)
    valid_index = df.index[has_both]
    print(f"Rows before NA drop: {len(df)}, after: {len(valid_index)}")

    if len(valid_index) > 0:
        # Cast to str: pandas may have parsed a numeric-looking cell as a
        # number, which the tokenizer cannot handle.
        orig_texts = df.loc[has_both, orig_col].astype(str).tolist()
        rewr_texts = df.loc[has_both, rewr_col].astype(str).tolist()

        # Encode original and rewritten texts
        encode_kwargs = dict(
            batch_size=32,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        orig_embs = model.encode(orig_texts, **encode_kwargs)
        rewr_embs = model.encode(rewr_texts, **encode_kwargs)

        # The embeddings are unit-normalized, so the row-wise dot product IS
        # the cosine similarity of pair i. This avoids materializing the full
        # N x N similarity matrix just to read its diagonal.
        similarities = (orig_embs * rewr_embs).sum(dim=1)
        scores = pd.Series(
            similarities.cpu().numpy(), index=valid_index, name=new_col
        )
    else:
        # Nothing to encode; keep going so the column still exists downstream.
        print("No rows with both texts present; skipping encoding.")
        scores = pd.Series(index=valid_index, dtype="float64", name=new_col)

    # Assignment aligns on the index, so rows without both texts get NaN.
    df[new_col] = scores

    # Print summary stats (scored rows only)
    print(scores.describe())

    # Optionally save
    if save_path is not None:
        df.to_csv(save_path, index=False)
        print(f"Saved with {new_col} to: {save_path}")

    return df

# ---------- 2. Apply to three datasets ----------
# Each job is the keyword-argument set for one add_content_similarity() call.
# Jobs run one after another so an earlier result is bound even if a later
# file fails to load.

# 1) ChatGPT-4.0 outputs
gpt4_job = {
    "csv_path": "/content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4.csv",
    "orig_col": "raw_content",  # source paragraph column
    "rewr_col": "output_gpt4",
    "save_path": "/content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4_with_sim.csv",
}
df_gpt4 = add_content_similarity(**gpt4_job)

# 2) LoRA fine-tuned model outputs
lora_job = {
    "csv_path": "/content/drive/MyDrive/eval_with_lora.csv",
    "orig_col": "raw_content",
    "rewr_col": "output",
    "save_path": "/content/drive/MyDrive/eval_with_lora_with_sim.csv",
}
df_lora = add_content_similarity(**lora_job)

# 3) Qwen base model outputs
# NOTE(review): "tragets" in the save_path below looks like a typo for
# "targets" (compare csv_path). Left unchanged so anything that already reads
# the file under this name keeps working.
qwen_base_job = {
    "csv_path": "/content/drive/MyDrive/external_genre_validation_400_with_targets_qwen_base.csv",
    "orig_col": "raw_content",
    "rewr_col": "output_qwen_base",
    "save_path": "/content/drive/MyDrive/external_genre_validation_400_with_tragets_qwen_base_with_sim.csv",
}
df_qwen_base = add_content_similarity(**qwen_base_job)

# ---------- 3. Quick comparison of average content similarity ----------
# Series.mean() skips NaN, so rows that could not be scored are ignored.

print("\n=== Mean content similarity comparison ===")
for _label, _frame in (
    ("GPT-4.0:    ", df_gpt4),
    ("LoRA model: ", df_lora),
    ("Qwen base:  ", df_qwen_base),
):
    print(_label, _frame['content_similarity'].mean())



=== Processing: /content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4.csv ===
Rows before NA drop: 400, after: 400
count    400.000000
mean       0.784204
std        0.127696
min        0.330325
25%        0.706197
50%        0.825155
75%        0.878716
max        0.962491
Name: content_similarity, dtype: float64
Saved with content_similarity to: /content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4_with_sim.csv

=== Processing: /content/drive/MyDrive/eval_with_lora.csv ===
Rows before NA drop: 400, after: 400
count    400.000000
mean       0.764088
std        0.181613
min        0.124340
25%        0.694524
50%        0.825268
75%        0.894697
max        1.000000
Name: content_similarity, dtype: float64
Saved with content_similarity to: /content/drive/MyDrive/eval_with_lora_with_sim.csv

=== Processing: /content/drive/MyDrive/external_genre_validation_400_with_targets_qwen_base.csv ===
Rows before NA drop: 400, after: 400
count    400.000000
mean       0.773067
std       

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# ---------- 1. Load the sentence-transformer model once ----------
# `model` is a module-level global; add_content_similarity() below reads it
# when it calls model.encode().
# NOTE(review): earlier cells in this notebook load the same checkpoint into
# the same name, so running this cell as well loads it again.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def add_content_similarity(
    csv_path: str,
    orig_col: str,
    rewr_col: str,
    save_path: str = None,
    new_col: str = "content_similarity",
):
    """
    Score how closely each rewrite matches its source text.

    Loads a CSV, embeds the texts in `orig_col` and `rewr_col` with the
    module-level sentence-transformer `model`, and adds the cosine similarity
    of each (original, rewrite) pair as `new_col`.

    Args:
        csv_path: CSV file to load.
        orig_col: Column holding the source text.
        rewr_col: Column holding the rewritten text.
        save_path: If not None, the augmented frame is written to this path
            as CSV (without the index).
        new_col: Name of the similarity column to add.

    Returns:
        The full dataframe with `new_col` added. Rows where either text is
        missing are kept and hold NaN in `new_col`.

    NOTE(review): this function is defined in more than one cell of this
    notebook; whichever cell ran last is the definition in effect.
    """
    print(f"\n=== Processing: {csv_path} ===")
    df = pd.read_csv(csv_path)

    # Only rows where BOTH texts are present can be scored.
    has_both = df[[orig_col, rewr_col]].notna().all(axis=1)
    valid_index = df.index[has_both]
    print(f"Rows before NA drop: {len(df)}, after: {len(valid_index)}")

    if len(valid_index) > 0:
        # Cast to str: pandas may have parsed a numeric-looking cell as a
        # number, which the tokenizer cannot handle.
        orig_texts = df.loc[has_both, orig_col].astype(str).tolist()
        rewr_texts = df.loc[has_both, rewr_col].astype(str).tolist()

        # Encode original and rewritten texts
        encode_kwargs = dict(
            batch_size=32,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        orig_embs = model.encode(orig_texts, **encode_kwargs)
        rewr_embs = model.encode(rewr_texts, **encode_kwargs)

        # The embeddings are unit-normalized, so the row-wise dot product IS
        # the cosine similarity of pair i. This avoids materializing the full
        # N x N similarity matrix just to read its diagonal.
        similarities = (orig_embs * rewr_embs).sum(dim=1)
        scores = pd.Series(
            similarities.cpu().numpy(), index=valid_index, name=new_col
        )
    else:
        # Nothing to encode; keep going so the column still exists downstream.
        print("No rows with both texts present; skipping encoding.")
        scores = pd.Series(index=valid_index, dtype="float64", name=new_col)

    # Assignment aligns on the index, so rows without both texts get NaN.
    df[new_col] = scores

    # Print summary stats (scored rows only)
    print(scores.describe())

    # Optionally save
    if save_path is not None:
        df.to_csv(save_path, index=False)
        print(f"Saved with {new_col} to: {save_path}")

    return df

# ---------- 2. Apply to your three datasets ----------
# Each job is the keyword-argument set for one add_content_similarity() call.
# Jobs run one after another so an earlier result is bound even if a later
# file fails to load.

# 1) ChatGPT-4.0 outputs
gpt4_job = {
    "csv_path": "/content/drive/MyDrive/test_dataset_mode_transfer_cleaned.csv",
    "orig_col": "raw_content",  # source paragraph column
    "rewr_col": "output_gpt4",
    "save_path": "/content/drive/MyDrive/test_dataset_mode_transfer_cleaned_with_sim.csv",
}
df_gpt4 = add_content_similarity(**gpt4_job)

# 2) LoRA fine-tuned model outputs
lora_job = {
    "csv_path": "/content/drive/MyDrive/new_test_with_lora.csv",
    "orig_col": "raw_content",
    "rewr_col": "output",
    "save_path": "/content/drive/MyDrive/new_test_with_lora_with_sim.csv",
}
df_lora = add_content_similarity(**lora_job)

# 3) Qwen base model outputs
qwen_base_job = {
    "csv_path": "/content/drive/MyDrive/test_eval_qwen_base.csv",
    "orig_col": "raw_content",
    "rewr_col": "output_qwen_base",
    "save_path": "/content/drive/MyDrive/test_eval_qwen_base_with_sim.csv",
}
df_qwen_base = add_content_similarity(**qwen_base_job)

# ---------- 3. Quick comparison of average content similarity ----------
# Series.mean() skips NaN, so rows that could not be scored are ignored.

print("\n=== Mean content similarity comparison ===")
for _label, _frame in (
    ("GPT-4.0:    ", df_gpt4),
    ("LoRA model: ", df_lora),
    ("Qwen base:  ", df_qwen_base),
):
    print(_label, _frame['content_similarity'].mean())



=== Processing: /content/drive/MyDrive/test_dataset_mode_transfer_cleaned.csv ===
Rows before NA drop: 192, after: 192
count    192.000000
mean       0.802684
std        0.101832
min        0.453865
25%        0.764214
50%        0.819607
75%        0.880220
max        0.959604
Name: content_similarity, dtype: float64
Saved with content_similarity to: /content/drive/MyDrive/test_dataset_mode_transfer_cleaned_with_sim.csv

=== Processing: /content/drive/MyDrive/new_test_with_lora.csv ===
Rows before NA drop: 192, after: 192
count    192.000000
mean       0.767604
std        0.141400
min        0.218418
25%        0.705887
50%        0.803855
75%        0.866683
max        0.996067
Name: content_similarity, dtype: float64
Saved with content_similarity to: /content/drive/MyDrive/new_test_with_lora_with_sim.csv

=== Processing: /content/drive/MyDrive/test_eval_qwen_base.csv ===
Rows before NA drop: 192, after: 192
count    192.000000
mean       0.782222
std        0.141910
min        0.19