In [43]:
import csv
import random

from datasets import load_dataset, concatenate_datasets
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Fetch the NLTK resources this notebook relies on: 'punkt' for
# word_tokenize and 'stopwords' for stopwords.words('english').
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Load the "train" split of each of the four source datasets by repository id.
# The next cell reads a "conversation" column from the two LDJnr datasets,
# "instruction"/"response" from llm-eval, and "question"/"answer" from
# im-feeling-curious.
verified_camel = load_dataset("LDJnr/Verified-Camel", split="train")
pure_dove = load_dataset("LDJnr/Pure-Dove", split="train")
llm_eval = load_dataset("shahules786/llm-eval", split="train")
im_feeling_curious = load_dataset("xiyuez/im-feeling-curious", split="train")

Downloading readme:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/643k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [36]:
# Normalise every dataset to a list of (prompt, answer) tuples.
# For the two conversation-style datasets only the opening turn of each
# conversation is kept.
verified_camel_examples = [
    (opening["input"], opening["output"])
    for opening in (turns[0] for turns in verified_camel["conversation"])
]

pure_dove_examples = [
    (opening["input"], opening["output"])
    for opening in (turns[0] for turns in pure_dove["conversation"])
]

# The remaining two datasets already hold prompts and answers in parallel columns.
llm_eval_examples = [
    (instruction, response)
    for instruction, response in zip(llm_eval["instruction"], llm_eval["response"])
]

im_feeling_curious_examples = [
    (question, answer)
    for question, answer in zip(im_feeling_curious["question"], im_feeling_curious["answer"])
]

In [37]:
# English stop-word set (NLTK ships these lowercased) and the stemmer shared
# by every preprocess() call.
stop_words = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()

# Preprocessing
def preprocess(text):
    """Tokenize ``text``, drop English stop words, and stem the remaining tokens.

    Args:
        text: The raw string to normalise.

    Returns:
        A single string of Porter-stemmed tokens joined by spaces.
    """
    tokens = word_tokenize(text)
    filtered = [
        porter_stemmer.stem(w)
        for w in tokens
        # Compare case-insensitively: the stop-word list is lowercase, so a
        # plain membership test lets capitalised stop words ("What", "How",
        # "The", ...) slip through into the clustering features.
        if w.lower() not in stop_words
    ]
    return " ".join(filtered)

def cluster(data: list[str], n_clusters: int, random_state=None):
    """Group texts into K-Means clusters over their TF-IDF vectors.

    Each text is run through ``preprocess`` and vectorised with TF-IDF before
    K-Means assigns it a cluster label.

    Args:
        data: The texts to cluster.
        n_clusters: Requested number of clusters. Capped at ``len(data)``,
            because K-Means cannot fit more clusters than samples.
        random_state: Optional seed forwarded to ``KMeans`` so a run can be
            reproduced. The default ``None`` keeps the unseeded behaviour.

    Returns:
        A list with one entry per non-empty cluster, ordered by cluster label.
        Each entry is the ascending list of positions in ``data`` assigned to
        that cluster. Empty ``data`` yields an empty list.
    """
    # Nothing to vectorise: TfidfVectorizer would raise on an empty corpus.
    if not data:
        return []

    # Preprocess data
    preprocessed_data = [preprocess(text) for text in data]

    # Vectorization
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(preprocessed_data)

    # Clustering (never ask for more clusters than there are samples)
    model = KMeans(
        n_clusters=min(n_clusters, len(data)),
        init='k-means++',
        max_iter=100,
        n_init=1,
        random_state=random_state,
    )
    model.fit(X)

    # Collect the positions belonging to each label; enumerate() walks the
    # positions in ascending order, so every member list is already sorted.
    members_by_label = {}
    for position, label in enumerate(model.labels_):
        members_by_label.setdefault(int(label), []).append(position)

    cluster_indices = [
        members_by_label[label]
        for label in sorted(members_by_label)
    ]

    return cluster_indices

In [33]:
def pick_indices(data, indices):
    """Return the items of ``data`` found at ``indices``, in the order given.

    Args:
        data: Any object supporting ``data[i]`` lookup.
        indices: Iterable of keys/positions to look up.

    Returns:
        A new list holding ``data[i]`` for each ``i`` in ``indices``.
    """
    selected = []
    for position in indices:
        selected.append(data[position])
    return selected

In [41]:
num_samples = 20

# For each source dataset: cluster its prompts into `num_samples` groups,
# draw one random member position from every group, and keep the
# (prompt, answer) pair stored at that position.
examples = [
    dataset[random.choice(member_positions)]
    for dataset in (
        verified_camel_examples,
        pure_dove_examples,
        llm_eval_examples,
        im_feeling_curious_examples,
    )
    for member_positions in cluster(
        [prompt for prompt, _answer in dataset],
        num_samples,
    )
]

In [46]:
# Wrap every sampled (prompt, answer) pair in the four-field record that is
# written to the prompts file: a "False" refusal flag, an empty system
# prompt, the user prompt, and the assistant answer.
samples_to_add = [
    {"Refusal": "False", "System": "", "User": prompt, "Assistant": answer}
    for prompt, answer in examples
]

In [47]:
# Append the sampled rows to the shared prompts file in the column order
# Refusal, System, User, Assistant.
# NOTE: the file is opened in append mode, so re-running this cell writes the
# same rows again.
# The encoding is pinned to UTF-8 because the sampled text contains non-ASCII
# characters and the platform default encoding is not guaranteed to handle them.
with open('../prompts.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(
        [sample["Refusal"], sample["System"], sample["User"], sample["Assistant"]]
        for sample in samples_to_add
    )


In [None]:
# Read the prompts file back to check what was appended. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform default.
with open('../prompts.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    result = list(reader)

# Display the last 20 rows of the file.
result[-20:]

[['False',
  '',
  'What team has won the most Rose Bowls?',
  'USC has played the most times in the Rose Bowl, with 33 appearances, followed by Michigan (20); Ohio State, Stanford, and Washington (14 each); and UCLA (12). Alabama, 4–1–1 in Rose Bowls, has made the most appearances of any team outside the Pac-12 and Big Ten conferences.'],
 ['False',
  '',
  'How fast does a 22 caliber bullet travel?',
  'Many .22 LR cartridges use bullets lighter than the standard 40 gr, fired at even higher velocities. Hyper-velocity bullets usually weigh around 30 to 32 gr (1.9 to 2.1 g) and can have a muzzle velocity of 1,400 to 1,800 feet per second (430 to 550 m/s).'],
 ['False',
  '',
  'Why are the atomic masses listed on the periodic table not whole numbers?',
  'The atomic mass reported on a periodic table is the weighted average of all the naturally occuring isotopes. Being an average it would be most unlikely to be a whole number. The mass of an individual atom in atomic mass units is the m