In [11]:
# Source-code fragments to embed; each entry is a single piece of code held
# as a plain string.
snippets = [
    "def add(a, b): return a + b",
    "class Person: ...",
    # ... remaining entries elided here (10 snippets in total)
]



In [12]:
from sentence_transformers import SentenceTransformer

# Display label -> sentence-transformers checkpoint identifier.
model_names = {
    "MiniLM": "all-MiniLM-L6-v2",
    "DistilRoBERTa": "all-distilroberta-v1",
    "MPNet": "all-mpnet-base-v2"
}

# Load models (one SentenceTransformer instance per label).
models = {name: SentenceTransformer(path) for name, path in model_names.items()}

# Compute embeddings per snippet.
# `encode` is given the whole list so each model runs one batched pass
# instead of one call per snippet. The batch result is split back into a
# list of per-snippet vectors, so embeddings_per_snippet[label][i] is still
# the vector for snippets[i], exactly as downstream cells expect.
# NOTE(review): batching may change values by floating-point rounding only;
# confirm if bit-exact reproducibility with earlier runs matters.
embeddings_per_snippet = {
    name: list(model.encode(snippets)) for name, model in models.items()
}



In [13]:
import numpy as np

# Smallest embedding length found across every model and every snippet.
all_dims = [
    vec.shape[0]
    for vectors in embeddings_per_snippet.values()
    for vec in vectors
]
min_dim = min(all_dims)

# Cut every vector down to its first `min_dim` components so that vectors
# from different models have equal length. The dict is updated in place
# (same keys, new lists), so later cells see the truncated vectors.
# NOTE(review): this only equalises lengths; it does not align the models'
# embedding spaces, so cross-model similarities may not be meaningful —
# confirm this is the intended comparison.
for model_key, vectors in embeddings_per_snippet.items():
    embeddings_per_snippet[model_key] = [vec[:min_dim] for vec in vectors]



In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# For each snippet, print the cosine similarity between every ordered pair
# of models (self-pairs and both orderings included).
for number, _ in enumerate(snippets, start=1):
    print(f"\nSnippet {number}:")

    # cosine_similarity expects 2-D input, so reshape each vector to (1, dim).
    row_vectors = {
        label: embeddings_per_snippet[label][number - 1].reshape(1, -1)
        for label in model_names
    }

    for left, left_vec in row_vectors.items():
        for right, right_vec in row_vectors.items():
            score = cosine_similarity(left_vec, right_vec)[0][0]
            print(f"{left} vs {right}: {score:.4f}")




Snippet 1:
MiniLM vs MiniLM: 1.0000
MiniLM vs DistilRoBERTa: -0.0542
MiniLM vs MPNet: 0.0010
DistilRoBERTa vs MiniLM: -0.0542
DistilRoBERTa vs DistilRoBERTa: 1.0000
DistilRoBERTa vs MPNet: 0.1018
MPNet vs MiniLM: 0.0010
MPNet vs DistilRoBERTa: 0.1018
MPNet vs MPNet: 1.0000

Snippet 2:
MiniLM vs MiniLM: 1.0000
MiniLM vs DistilRoBERTa: 0.0230
MiniLM vs MPNet: -0.0001
DistilRoBERTa vs MiniLM: 0.0230
DistilRoBERTa vs DistilRoBERTa: 1.0000
DistilRoBERTa vs MPNet: -0.0104
MPNet vs MiniLM: -0.0001
MPNet vs DistilRoBERTa: -0.0104
MPNet vs MPNet: 1.0000


In [16]:
import pandas as pd

# Build a long-format table with one row per (snippet, model, model) triple
# and write it to CSV.
model_order = list(model_names)
csv_columns = ["Snippet", "Model1", "Model2", "Cosine_Similarity"]

rows = []
for i in range(len(snippets)):
    # One call on the stacked per-model vectors returns the full pairwise
    # matrix for this snippet, replacing a separate 1x1 call per model pair.
    stacked = [embeddings_per_snippet[m][i] for m in model_order]
    sim_matrix = cosine_similarity(stacked)

    for r, m1 in enumerate(model_order):
        for c, m2 in enumerate(model_order):
            rows.append({
                "Snippet": i+1,
                "Model1": m1,
                "Model2": m2,
                "Cosine_Similarity": sim_matrix[r, c]
            })

# Explicit columns keep the CSV header present even when `rows` is empty.
df = pd.DataFrame(rows, columns=csv_columns)
df.to_csv("snippet_similarity_per_snippet.csv", index=False)

