In [None]:
# --- Mount Google Drive and run full inter-prompt, row-level comparison ---
from google.colab import drive
drive.mount('/content/drive')

import os, itertools
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer, util

Mounted at /content/drive


In [None]:


# === CONFIGURATION ===
# Files are looked up as BASE_DIR/<prompt>/<run>/<paper>.txt by the code below.
BASE_DIR = "/content/drive/Shareddrives/Danika-work4/2025/Crawler/MycoPapers_CleanCopy"   # <<-- CHANGE to your folder
PROMPTS = ["baseline", "few_shot", "cot"]         # top-level prompt folders
RUNS = ["1", "2", "3"]                            # subdirs for each run
# CSV with one row per (paper, prompt pair), written at the end of this cell.
OUT_FILE = os.path.join(BASE_DIR, "inter_prompt_rowlevel.csv")

# === MODEL ===
# Sentence-embedding model used for the cosine-similarity metric; loaded once
# here and reused for every comparison in the main loop.
model = SentenceTransformer("all-MiniLM-L6-v2")

# === HELPERS ===
def read_file(path):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    Best-effort reader: any exception raised while opening or reading the
    file is reported on stdout and an empty list is returned instead.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            stripped = (raw.strip() for raw in handle)
            # Materialise inside the ``with`` so decoding errors are caught below.
            return [text for text in stripped if text]
    except Exception as e:
        print(f"⚠️ Error reading {path}: {e}")
        return []

def jaccard_similarity(text_a, text_b):
    """Compute the Jaccard overlap of the token sets of two texts.

    Args:
        text_a, text_b: sequences of strings (e.g. the lines of a file). Each
            is joined with single spaces into one document before tokenizing.

    Returns:
        float: |A ∩ B| / |A ∪ B| over the unique tokens found by
        ``CountVectorizer(binary=True, stop_words="english")``. Returns 0.0
        when either input is empty/None or when no token survives
        tokenization and stop-word removal.
    """
    if not text_a or not text_b:
        return 0.0

    docs = [" ".join(text_a), " ".join(text_b)]
    try:
        # binary=True -> each cell is 0/1 token presence, so sums count unique tokens.
        presence = CountVectorizer(binary=True, stop_words="english").fit_transform(docs)
    except ValueError:
        # CountVectorizer raises ValueError on an empty vocabulary (e.g. both
        # documents contain only stop words). Treat that as "no overlap"
        # rather than aborting the caller's loop.
        return 0.0

    row_a, row_b = presence[0], presence[1]
    inter = row_a.multiply(row_b).sum()
    union = row_a.sum() + row_b.sum() - inter
    return float(inter / union) if union else 0.0

def list_papers(prompt):
    """Collect the paper IDs available for a given prompt.

    Scans ``BASE_DIR/<prompt>/<run>`` for every run in ``RUNS`` and returns
    the union of ``.txt`` file basenames (extension removed). A paper is
    included if it appears in at least one run folder, not necessarily all of
    them. Run paths that are missing or are not directories are skipped.

    Returns:
        set[str]: paper IDs found for this prompt (empty if none).
    """
    papers = set()
    for run in RUNS:
        run_dir = os.path.join(BASE_DIR, prompt, run)
        # isdir (not exists): a stray file with the run's name would make
        # os.listdir raise NotADirectoryError.
        if not os.path.isdir(run_dir):
            continue
        papers.update(
            os.path.splitext(name)[0]
            for name in os.listdir(run_dir)
            if name.endswith(".txt")
        )
    return papers

# === DETECT COMMON PAPERS ===
# Keep only papers that list_papers() returns for every prompt. Individual
# (prompt, run) files may still be absent; those combinations are skipped below.
common_papers = set.intersection(*(list_papers(p) for p in PROMPTS))
print(f"✅ Found {len(common_papers)} papers common to all prompts\n")

# === MAIN LOOP ===
# For each paper and each unordered prompt pair, compare every run of one
# prompt against every run of the other and aggregate the two metrics.
records = []
for paper in tqdm(sorted(common_papers)):
    # Read each available (prompt, run) output once per paper instead of once
    # per comparison; unreadable or empty files are left out.
    texts = {}
    for prompt, run in itertools.product(PROMPTS, RUNS):
        path = os.path.join(BASE_DIR, prompt, run, f"{paper}.txt")
        if not os.path.exists(path):
            continue
        lines = read_file(path)
        if lines:
            texts[(prompt, run)] = lines
    if not texts:
        continue

    # Embed every document for this paper in a single batch and reuse the
    # vectors across all prompt pairs. Batched encoding may differ from
    # pairwise encoding by floating-point noise only.
    # NOTE(review): the encoder presumably truncates inputs longer than its
    # max sequence length, so long outputs would be compared on their leading
    # tokens only — confirm this is acceptable for these files.
    keys = list(texts)
    embeddings = model.encode([" ".join(texts[k]) for k in keys], convert_to_tensor=True)
    emb_by_key = {key: embeddings[i] for i, key in enumerate(keys)}

    for pa, pb in itertools.combinations(PROMPTS, 2):
        cos_sims, jac_sims = [], []
        for ra, rb in itertools.product(RUNS, RUNS):
            ka, kb = (pa, ra), (pb, rb)
            if ka not in texts or kb not in texts:
                continue

            # Cosine similarity of whole-document embeddings + token-set Jaccard
            cos_sims.append(util.cos_sim(emb_by_key[ka], emb_by_key[kb]).item())
            jac_sims.append(jaccard_similarity(texts[ka], texts[kb]))

        if cos_sims:
            records.append({
                "paper": paper,
                "pair": f"{pa}_vs_{pb}",
                "cosine_mean": sum(cos_sims)/len(cos_sims),
                # Sample std across run combinations; NaN when only one comparison exists.
                "cosine_std": pd.Series(cos_sims).std(),
                "jaccard_mean": sum(jac_sims)/len(jac_sims),
                "jaccard_std": pd.Series(jac_sims).std(),
                "n_comparisons": len(cos_sims)
            })

# === SAVE RESULTS ===
# Explicit columns keep the CSV header and the groupby below valid even when
# no comparison could be made (records == []).
out = pd.DataFrame(
    records,
    columns=["paper", "pair", "cosine_mean", "cosine_std",
             "jaccard_mean", "jaccard_std", "n_comparisons"],
)
out.to_csv(OUT_FILE, index=False)

print("\n✅ Inter-prompt row-level similarity complete.")
print("Results saved to:", OUT_FILE)
print("\nMean similarities by prompt pair:")
print(out.groupby("pair")[["cosine_mean","jaccard_mean"]].mean().round(3))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Found 91 papers common to all prompts



100%|██████████| 91/91 [17:02<00:00, 11.24s/it]


✅ Inter-prompt row-level similarity complete.
Results saved to: /content/drive/Shareddrives/Danika-work4/2025/Crawler/MycoPapers_CleanCopy/inter_prompt_rowlevel.csv

Mean similarities by prompt pair:
                      cosine_mean  jaccard_mean
pair                                           
baseline_vs_cot             0.906         0.582
baseline_vs_few_shot        0.904         0.558
few_shot_vs_cot             0.916         0.608





In [None]:
# ==============================================
# 📊 Inter-Prompt Row-Level Extraction Analysis
# ==============================================

import pandas as pd

# --- CONFIGURATION ---
in_file = "/content/drive/Shareddrives/Danika-work4/2025/Crawler/MycoPapers_CleanCopy/inter_prompt_rowlevel.csv"  # 👈 update path if needed
out_file = "/content/drive/Shareddrives/Danika-work4/2025/Crawler/MycoPapers_CleanCopy/inter_prompt_rowlevel_summary.csv"

# --- LOAD DATA ---
# One row per (paper, prompt pair) as written by the comparison cell.
df = pd.read_csv(in_file)

# --- CLEANUP ---
# Ensure numeric columns are properly typed; unparseable values become NaN
# and are ignored by the aggregations below.
for col in ["cosine_mean", "cosine_std", "jaccard_mean", "jaccard_std"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# --- AGGREGATE BY PROMPT PAIR ---
# mean_*_mean / sd_*_mean : mean and spread of the per-paper means across papers.
# mean_*_std              : average within-paper spread across run combinations.
summary = (
    df.groupby("pair")
      .agg(
          mean_cosine_mean=("cosine_mean", "mean"),
          sd_cosine_mean=("cosine_mean", "std"),
          mean_jaccard_mean=("jaccard_mean", "mean"),
          sd_jaccard_mean=("jaccard_mean", "std"),
          mean_cosine_std=("cosine_std", "mean"),
          mean_jaccard_std=("jaccard_std", "mean"),
          total_comparisons=("n_comparisons", "sum")
      )
      .reset_index()
      .sort_values(by="mean_cosine_mean", ascending=False)
)

# --- SAVE RESULTS ---
summary.to_csv(out_file, index=False)
print(f"✅ Saved summary to: {out_file}\n")

# --- DISPLAY RESULTS ---
print("=== Inter-Prompt Row-Level Similarity Summary ===\n")
print(summary.round(3))
print("\n--------------------------------------------------")

# --- INTERPRETATION ---
if summary.empty:
    # Nothing to rank; indexing summary.iloc[0] would raise IndexError.
    print("\nNo prompt-pair rows found in the input file; nothing to interpret.")
else:
    # summary is sorted by mean cosine, descending.
    top_pair = summary.iloc[0]
    lowest_pair = summary.iloc[-1]
    # Report the observed Jaccard range instead of a hard-coded one.
    jac_lo = summary["mean_jaccard_mean"].min()
    jac_hi = summary["mean_jaccard_mean"].max()

    print(f"""
Interpretation Summary
----------------------
• The highest overall semantic similarity (cosine) is between **{top_pair['pair']}**
  (mean cosine ≈ {top_pair['mean_cosine_mean']:.3f}, Jaccard ≈ {top_pair['mean_jaccard_mean']:.3f}),
  indicating these two prompts extract highly similar information semantically.

• The lowest similarity is between **{lowest_pair['pair']}**
  (cosine ≈ {lowest_pair['mean_cosine_mean']:.3f}), suggesting greater stylistic or interpretive
  divergence between those prompt formulations.

• Across all pairs, cosine values above ~0.8 generally imply strong semantic overlap.
  Mean Jaccard here ranges from {jac_lo:.3f} to {jac_hi:.3f}; token overlap that is lower than the
  cosine score points to surface-form variability (differences in phrasing or structure).

• mean_cosine_std and mean_jaccard_std measure spread across run combinations within a paper
  (run-to-run variability); sd_cosine_mean and sd_jaccard_mean measure how much the
  per-paper means vary across papers.

These findings suggest that while the three prompt styles differ stylistically,
they extract substantively similar data from the source papers,
supporting their interchangeability for factual extraction tasks.
""")


✅ Saved summary to: /content/drive/Shareddrives/Danika-work4/2025/Crawler/MycoPapers_CleanCopy/inter_prompt_rowlevel_summary.csv

=== Inter-Prompt Row-Level Similarity Summary ===

                   pair  mean_cosine_mean  sd_cosine_mean  mean_jaccard_mean  \
2       few_shot_vs_cot             0.916           0.034              0.608   
0       baseline_vs_cot             0.906           0.033              0.582   
1  baseline_vs_few_shot             0.904           0.038              0.558   

   sd_jaccard_mean  mean_cosine_std  mean_jaccard_std  total_comparisons  
2            0.134            0.027             0.073                779  
0            0.123            0.031             0.069                804  
1            0.115            0.030             0.067                792  

--------------------------------------------------

Interpretation Summary
----------------------
• The highest overall semantic similarity (cosine) is between **few_shot_vs_cot**
  (mean cosine ≈ 