In [5]:
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd

t5base_model = None

# Function to calculate sharpened cosine similarity
def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    cosine_similarity = torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
    return cosine_similarity ** exponent

#provides similarity scores of a test_phrase against an array of phrases
def compare_phrases(test_phrase, phrases):
    global t5base_model
    t5base_model = SentenceTransformer('sentence-t5-base')
    if torch.cuda.is_available():
        t5base_model = t5base_model.to(torch.device("cuda"))
    model = t5base_model

    scores = []
    test_embedding = model.encode(test_phrase, convert_to_tensor=True, show_progress_bar=False)

    for phrase in phrases:
        compare_embedding = model.encode(phrase, convert_to_tensor=True, show_progress_bar=False)
        score = sharpened_cosine_similarity(test_embedding, compare_embedding).item()
        scores.append(score)

    return scores

In [24]:
mean_prompt = """Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style."""

df1 = pd.read_csv("./data/0401/nbroad-v2.csv")
df2 = pd.read_csv("./data/0401/gemma_suppl_rewrite.csv")
df3 = pd.read_csv("./data/0401/chatgpt_prompts.csv")
df4 = pd.read_csv("./data/0401/70k_gemma_template_built.csv")
prompts = list(set(
    df1['rewrite_prompt'].tolist() 
    + df2['rewrite_prompts'].tolist() 
    + df3['rewrite_prompt'].tolist()
    + df4['prompt'].tolist()
))

scores = compare_phrases(mean_prompt, prompts)
print(scores)

[0.6023841500282288, 0.5626571178436279, 0.540762186050415, 0.6486250162124634, 0.5804912447929382, 0.5250277519226074, 0.600343644618988, 0.5244511365890503, 0.5178366899490356, 0.5407268404960632, 0.5804808735847473, 0.6121029257774353, 0.5584560632705688, 0.633171796798706, 0.5762584805488586, 0.5437319874763489, 0.6129854917526245, 0.52012038230896, 0.5383003354072571, 0.5875656604766846, 0.561095118522644, 0.5497333407402039, 0.5531956553459167, 0.575480580329895, 0.498405396938324, 0.48270922899246216, 0.5848982334136963, 0.5559563040733337, 0.6335229873657227, 0.5851528644561768, 0.5488042831420898, 0.499215692281723, 0.5392624139785767, 0.6135519742965698, 0.5255184173583984, 0.6087932586669922, 0.5758660435676575, 0.5993449091911316, 0.5769421458244324, 0.5581408143043518, 0.6045089364051819, 0.5344499349594116, 0.5655802488327026, 0.5518800020217896, 0.525172770023346, 0.5439275503158569, 0.5657556653022766, 0.5545430779457092, 0.5237072110176086, 0.5025174617767334, 0.518009

In [30]:
from scipy import stats
import random

sorted_scores = sorted(scores)
a = random.sample(sorted_scores, 200)
print(stats.describe(a))
print(len(a))


DescribeResult(nobs=650, minmax=(0.3179638385772705, 0.7755288481712341), mean=0.5577316333238895, variance=0.0014601008094937376, skewness=-0.2117073925190482, kurtosis=4.735329024759366)
650


In [23]:
import numpy as np

scores_np = np.array(scores)
sorted_scores_np = np.sort(scores_np)[::-1]
sorted_scores_np
subset = []
max_length = 0

for i in range(len(scores_np)):
    for j in range(i, len(scores_np)):
        current_subset = scores_np[i:j+1]
        current_mean = np.mean(current_subset)
        
        if current_mean > 0.60 and current_mean < 0.65 and len(current_subset) > max_length:
            subset = current_subset
            max_length = len(current_subset)

subset


array([0.73257053, 0.47477695, 0.42390472, 0.69656402, 0.75117433,
       0.52197391])