In [1]:
from openai import OpenAI

import json
import os
import numpy as np
from tqdm import tqdm
from datasets import DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import faiss

In [2]:
# Build the CSV path from its components instead of a backslash-separated
# literal: sequences such as "\d" and "\p" are invalid string escapes, and a
# hard-coded "\" separator only resolves on Windows.
data_path = os.path.join('..', 'data', 'dataset', 'processed', 'clean_data_gpt2.csv')
dataset = load_dataset('csv', data_files=data_path)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'essay', 'label'],
        num_rows: 9766
    })
})


Split the data into train and test sets with a 9:1 ratio.

In [4]:
# Hold out 10% of the rows as the test split; the seed is fixed at 42.
full_train = dataset['train']
split_dataset = full_train.train_test_split(test_size=0.1, seed=42)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'essay', 'label'],
        num_rows: 8789
    })
    test: Dataset({
        features: ['prompt', 'essay', 'label'],
        num_rows: 977
    })
})


Next, build the embedding database.

In [5]:
# Shared OpenAI client used by the embedding helper functions in this notebook.
# No credentials are passed explicitly here.
# NOTE(review): presumably the API key is picked up from the environment — confirm.
client = OpenAI()

In [9]:
def generate_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single text, or None on failure.

    Args:
        text: Value passed as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response, or
        ``None`` if any exception was raised while requesting or unpacking it.
    """
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        # Include the underlying cause so failures can be diagnosed; this
        # mirrors the message format used by generate_embeddings_batch.
        print(f"Error generating embedding: {e}")
        return None

In [6]:
def generate_embeddings_batch(texts, model="text-embedding-ada-002"):
    """Request embeddings for several texts in one call.

    Args:
        texts: Value passed as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model to request.

    Returns:
        A list holding the ``embedding`` of every item in the response, in
        response order, or ``None`` if any exception was raised (the error is
        printed).
    """
    try:
        response = client.embeddings.create(input=texts, model=model)
        vectors = []
        for item in response.data:
            vectors.append(item.embedding)
    except Exception as e:
        print(f"Error generating embeddings for batch: {e}")
        return None
    return vectors

In [7]:
# Embed every training sample in batches, then build and persist a FAISS
# inner-product index together with the metadata needed to interpret hits.
embeddings = []
metadata = []

batch_size = 100  # number of samples sent per embeddings request
train_split = split_dataset['train']
failed_batch_starts = []  # starting row of each batch whose request failed

for i in tqdm(range(0, len(train_split), batch_size), desc="Generating embeddings"):
    # The sliced batch is accessed by column name below.
    batch_samples = train_split[i:i+batch_size]

    combined_texts = [
        f"Prompt: {prompt}\nEssay: {essay}"
        for prompt, essay in zip(batch_samples['prompt'], batch_samples['essay'])
    ]

    batch_embeddings = generate_embeddings_batch(combined_texts)
    if batch_embeddings is None:
        # Keep going (best effort), but remember what was skipped so the
        # gap in the index is reported instead of passing unnoticed.
        failed_batch_starts.append(i)
        continue

    embeddings.extend(batch_embeddings)
    metadata.extend([
        {"prompt": prompt, "essay": essay, "label": label}
        for prompt, essay, label in zip(
            batch_samples['prompt'], batch_samples['essay'], batch_samples['label']
        )
    ])

if failed_batch_starts:
    print(
        f"Warning: {len(failed_batch_starts)} batch(es) failed and were skipped "
        f"(starting rows: {failed_batch_starts})"
    )

# Without this guard an all-failed run dies later with an unrelated-looking
# indexing/shape error.
if not embeddings:
    raise RuntimeError("No embeddings were generated; cannot build the FAISS index.")

# Row i of the index must describe metadata[i]; a mismatch would make every
# search result point at the wrong essay.
if len(embeddings) != len(metadata):
    raise RuntimeError(
        f"Embedding/metadata count mismatch: {len(embeddings)} vs {len(metadata)}"
    )

# save the embeddings

embeddings_np = np.array(embeddings, dtype=np.float32)
# Normalised in place (the return value is not used); with unit-length
# vectors the inner product computed by IndexFlatIP equals cosine similarity.
faiss.normalize_L2(embeddings_np)

dimension = embeddings_np.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings_np)

# save the results
faiss.write_index(index, "faiss_index_train.bin")


with open("embeddings_dataset_train.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)

Generating embeddings: 100%|██████████| 88/88 [03:27<00:00,  2.35s/it]


In [None]:
# Reload the persisted FAISS index and the metadata list saved alongside it.
index = faiss.read_index("faiss_index_train.bin")
with open("embeddings_dataset_train.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Search hits are looked up as metadata[row]; fail fast if the two files were
# produced by different runs and no longer line up.
if index.ntotal != len(metadata):
    raise ValueError(
        f"Index has {index.ntotal} vectors but metadata has {len(metadata)} entries"
    )


def search_cosine_similarity(query_text, top_k=3):
    """Return the metadata of the ``top_k`` indexed samples closest to a query.

    The query is embedded with ``generate_embedding``, L2-normalised, and
    searched against the module-level ``index``.

    Args:
        query_text: Text to embed and search for.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of dicts, one per hit in the order returned by the index. Each
        is a copy of the corresponding ``metadata`` entry with an added
        ``"similarity"`` key holding the score reported by the index. Returns
        an empty list when the query embedding could not be generated.
    """
    query_embedding = generate_embedding(query_text)
    if query_embedding is None:
        return []

    query_embedding_np = np.array([query_embedding], dtype=np.float32)
    faiss.normalize_L2(query_embedding_np)

    distances, indices = index.search(query_embedding_np, top_k)

    results = []
    for score, idx in zip(distances[0], indices[0]):
        # FAISS pads with label -1 when fewer than top_k neighbours exist;
        # used as a list index that would silently return the last entry.
        if idx < 0:
            continue
        # Copy so the shared metadata entries are not mutated by a search.
        result = dict(metadata[int(idx)])
        result["similarity"] = score
        results.append(result)
    return results


# Example query text containing a "Prompt:" section followed by an "Essay:"
# section; it is passed to search_cosine_similarity right after this literal.
query = """Prompt: some people believe that eventually all jobs will be done by artificially intelligence robots.
what is your opinions?
  Essay: Hello dear, Long time, am happy to hear from you, because even me i have missed you too, 
  Since you left i been lonely even to work around in the evening as we used to do, going to church, market and so on, 
  it has not been easy for me, Hearing from you show me that our relationship is still intact. Based on your request, 
  i have told you time without number that anywhere you find yourself try to adapt, i know it will not going to be easy with two of us but you have to same to me. 
  I will suggest for you to join school club, it will keep you busy both soul and body, you know how student life is at list every Friday is club neither in the school 
  environment or outside the school, apart from joining club, try to engage yourself also in school choir which i know you know how to sing very well, While singing i 
  believe you will forget some of our movement and sleepover we used to do. So also to be participating in some school activities like playing valley ball, which you know 
  that you have the height, so try to make use of your height in other to make your self happy, playing with one or two people in the valley ball pitch one day you will make a new friend, 
  and may be you may not even remember again. Last i will try to come and see you by next week. I wish you well see you then bye.
"""
# Look up the three nearest indexed samples for the query and print each hit
# with its stored fields and similarity score.
results = search_cosine_similarity(query, top_k=3)

for rank, result in enumerate(results, start=1):
    print(f"Result {rank}:")
    print(f"  Prompt: {result['prompt']}")
    print(f"  Essay: {result['essay']}")
    print(f"  Label: {result['label']}")
    print(f"  Similarity: {result['similarity']}")
    print()

Result 1:
  Prompt: some people believe that eventually all jobs will be done by artificially intelligence robots.
what is your opinions?
  Essay: Hello dear, Long time, am happy to hear from you, because even me i have missed you too, Since you left i been lonely even to work around in the evening as we used to do, going to church, market and so on, it has not been easy for me, Hearing from you show me that our relationship is still intact. Based on your request, i have told you time without number that anywhere you find yourself try to adapt, i know it will not going to be easy with two of us but you have to same to me. I will suggest for you to join school club, it will keep you busy both soul and body, you know how student life is at list every Friday is club neither in the school environment or outside the school, apart from joining club, try to engage yourself also in school choir which i know you know how to sing very well, While singing i believe you will forget some of our mov

In [None]:
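# Earlier one-request-per-sample version of the embedding build, kept for
# reference only; the batched loop earlier in this notebook replaces it.
# embeddings = []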
# metadata = []
# for sample in tqdm(split_dataset['train'], desc="Generating embeddings"):
#     combined_text = f"Prompt: {sample['prompt']}\nEssay: {sample['essay']}"
    
#     embedding = generate_embedding(combined_text)
#     if embedding is not None:
#         embeddings.append(embedding)
#         metadata.append({
#             "prompt": sample["prompt"],
#             "essay": sample["essay"],
#             "label": sample["label"]
#         })

                         

# # save the embeddings

# embeddings_np = np.array(embeddings, dtype=np.float32)
# faiss.normalize_L2(embeddings_np)  

# dimension = len(embeddings_np[0])  
# index = faiss.IndexFlatIP(dimension)  
# index.add(embeddings_np)  

# # save the results
# faiss.write_index(index, "faiss_index.bin")


# with open("embeddings_dataset.json", "w", encoding="utf-8") as f:
#     json.dump(metadata, f, ensure_ascii=False, indent=4)



Generating embeddings:  15%|█▌        | 1359/8789 [04:35<19:43,  6.28it/s]  