In [1]:
from openai import OpenAI

import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import faiss

In [2]:
dataset = load_dataset('csv', data_files='..\data\dataset\processed\clean_data_gpt2.csv')
print(type(dataset))
dataset = pd.DataFrame(dataset["train"])
print(dataset)

label_col = dataset["label"]
print(label_col)

<class 'datasets.dataset_dict.DatasetDict'>
                                                 prompt  \
0     Rich countries often give money to poorer coun...   
1     Rich countries often give money to poorer coun...   
2     Some countries achieve international sports by...   
3     Some countries achieve international sports by...   
4     Some countries achieve international sports by...   
...                                                 ...   
9761  Some people think that in the modern world we ...   
9762  Some people think that in the modern world we ...   
9763  Some people think that in the morden world we ...   
9764  Some people think that in the modern world we ...   
9765  Some people think that in the modern world we ...   

                                                  essay  label  
0     Poverty represents a worldwide crisis. It is t...      6  
1     Human beings are facing many challenges nowada...      4  
2     Whether countries should only invest facilitie

split the train and test part for 9:1

In [3]:
train_data, test_data = train_test_split(dataset, test_size=0.1, stratify=label_col, random_state=42)
split_dataset = dict()
split_dataset["train"] = train_data
split_dataset["test"] = test_data
# print(split_dataset)

Now, start to create embedding database

In [4]:
client = OpenAI()

In [5]:
def generate_embedding(text, model="text-embedding-ada-002"):
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Error generating embedding")
        return None

In [6]:
def generate_embeddings_batch(texts, model="text-embedding-ada-002"):
    try:
        response = client.embeddings.create(input=texts, model=model)
        return [item.embedding for item in response.data]
    except Exception as e:
        print(f"Error generating embeddings for batch: {e}")
        return None

In [7]:
embeddings = []
metadata = []


batch_size = 100  

for i in tqdm(range(0, len(split_dataset['train']), batch_size), desc="Generating embeddings"):
    batch_samples = split_dataset['train'][i:i+batch_size]

    
    
    combined_texts = [
        f"Prompt: {prompt}\nEssay: {essay}"
        for prompt, essay in zip(batch_samples['prompt'], batch_samples['essay'])
    ]
    
    batch_embeddings = generate_embeddings_batch(combined_texts)
    if batch_embeddings is not None:
        embeddings.extend(batch_embeddings)
        metadata.extend([
            {"prompt": prompt, "essay": essay, "label": label}
            for prompt, essay, label in zip(
                batch_samples['prompt'], batch_samples['essay'], batch_samples['label']
            )
        ])
    

# save the embeddings

embeddings_np = np.array(embeddings, dtype=np.float32)
faiss.normalize_L2(embeddings_np)  

dimension = len(embeddings_np[0])  
index = faiss.IndexFlatIP(dimension)  
index.add(embeddings_np)  

# save the reults
faiss.write_index(index, "faiss_index_train.bin")


with open("embeddings_dataset_train.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)

Generating embeddings: 100%|██████████| 88/88 [03:27<00:00,  2.35s/it]


RAG using topics

In [7]:
embeddings = []
metadata = []


batch_size = 100  

for i in tqdm(range(0, len(split_dataset['train']), batch_size), desc="Generating embeddings"):
    batch_samples = split_dataset['train'][i:i+batch_size]

    
    
    topics = [
        f"{prompt}"
        for prompt in batch_samples['prompt']
    ]
    
    batch_embeddings = generate_embeddings_batch(topics)
    if batch_embeddings is not None:
        embeddings.extend(batch_embeddings)
        metadata.extend([
            {"prompt": prompt, "essay": essay, "label": label}
            for prompt, essay, label in zip(
                batch_samples['prompt'], batch_samples['essay'], batch_samples['label']
            )
        ])
    

# save the embeddings

embeddings_np = np.array(embeddings, dtype=np.float32)
faiss.normalize_L2(embeddings_np)  

dimension = len(embeddings_np[0])  
index = faiss.IndexFlatIP(dimension)  
index.add(embeddings_np)  

# save the reults
faiss.write_index(index, "faiss_index_train_topics.bin")


with open("embeddings_dataset_train_topics.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)

Generating embeddings: 100%|██████████| 88/88 [01:05<00:00,  1.35it/s]


using test samples

In [8]:
embeddings = []
metadata = []


batch_size = 100  

for i in tqdm(range(0, len(split_dataset['test']), batch_size), desc="Generating embeddings"):
    batch_samples = split_dataset['test'][i:i+batch_size]

    
    
    topics = [
        f"{prompt}"
        for prompt in batch_samples['prompt']
    ]
    
    batch_embeddings = generate_embeddings_batch(topics)
    if batch_embeddings is not None:
        embeddings.extend(batch_embeddings)
        metadata.extend([
            {"prompt": prompt, "essay": essay, "label": label}
            for prompt, essay, label in zip(
                batch_samples['prompt'], batch_samples['essay'], batch_samples['label']
            )
        ])
    

# save the embeddings

embeddings_np = np.array(embeddings, dtype=np.float32)
faiss.normalize_L2(embeddings_np)  

dimension = len(embeddings_np[0])  
index = faiss.IndexFlatIP(dimension)  
index.add(embeddings_np)  

# save the reults
faiss.write_index(index, "faiss_index_test_topics.bin")


with open("embeddings_dataset_test_topics.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)

Generating embeddings: 100%|██████████| 10/10 [00:08<00:00,  1.18it/s]


In [9]:
index = faiss.read_index("faiss_index_train_topics.bin")
with open("embeddings_dataset_train_topics.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)


def search_cosine_similarity(query_text, top_k=3):
    
    query_embedding = generate_embedding(query_text)
    if query_embedding is None:
        return []
    
    
    query_embedding_np = np.array([query_embedding], dtype=np.float32)
    faiss.normalize_L2(query_embedding_np)
    
    
    distances, indices = index.search(query_embedding_np, top_k)
    
    
    results = []
    for i, idx in enumerate(indices[0]):
        result = metadata[idx]
        result["similarity"] = distances[0][i]  
        results.append(result)
    return results


query = """Many people use social media to keep in touch with other people and for news events. Do the advantages of this outweigh the disadvantages?
"""
results = search_cosine_similarity(query, top_k=3)

for i, result in enumerate(results):
    print(f"Result {i+1}:")
    print(f"  Prompt: {result['prompt']}")
    print(f"  Essay: {result['essay']}")
    print(f"  Label: {result['label']}")
    print(f"  Similarity: {result['similarity']}")
    print()

Result 1:
  Prompt: Many people use social media to keep in touch with other people and for news events. Do the advantages of this outweigh the disadvantages?
  Essay: In the modern era, the masses are frequently using social media platforms like Instagram, Facebook and YouTube to contact other folks and to keep themselves updated about current affairs around the world. I believe the pros of this phenomenon will outbalance the cons, as we will see in this essay.

To begin with, the tendency of making use of social media these days has many advantages for people dwelling on different continents. With aid of these online websites and ,applications one can confabulate with their loved ones residing in some corner of this gigantic world. Now, the public can stay in touch with their families and friends with the snap of their fingers. This idea has lessened the communication barriers and was considered a dream of the past which has now become a stark reality owing to the advancement in scie

In [None]:
# embeddings = []
# metadata = []
# for sample in tqdm(split_dataset['train'], desc="Generating embeddings"):
#     combined_text = f"Prompt: {sample['prompt']}\nEssay: {sample['essay']}"
    
#     embedding = generate_embedding(combined_text)
#     if embedding is not None:
#         embeddings.append(embedding)
#         metadata.append({
#             "prompt": sample["prompt"],
#             "essay": sample["essay"],
#             "label": sample["label"]
#         })

                         

# # save the embeddings

# embeddings_np = np.array(embeddings, dtype=np.float32)
# faiss.normalize_L2(embeddings_np)  

# dimension = len(embeddings_np[0])  
# index = faiss.IndexFlatIP(dimension)  
# index.add(embeddings_np)  

# # save the reults
# faiss.write_index(index, "faiss_index.bin")


# with open("embeddings_dataset.json", "w", encoding="utf-8") as f:
#     json.dump(metadata, f, ensure_ascii=False, indent=4)



Generating embeddings:  15%|█▌        | 1359/8789 [04:35<19:43,  6.28it/s]  

In [5]:
import time
index = faiss.read_index("faiss_index_train.bin")
with open("embeddings_dataset_test.json", "r", encoding="utf-8") as f:
    metadata_test = json.load(f)

with open("embeddings_dataset_train.json", "r", encoding="utf-8") as f:
    metadata_train = json.load(f)

def search_cosine_similarity(query_text, dataset, top_k=3):
    
    query_embedding = generate_embedding(query_text)
    if query_embedding is None:
        return []
    
    
    query_embedding_np = np.array([query_embedding], dtype=np.float32)
    faiss.normalize_L2(query_embedding_np)
    
    
    distances, indices = index.search(query_embedding_np, top_k)
    
    
    results = []
    for i, idx in enumerate(indices[0]):
        result = dataset[idx]
        result["similarity"] = distances[0][i]  
        results.append(result)
    return results

correct = 0
cnt = 0
timeout_duration = 10
for data in metadata_test:
    query = f"Prompt: {data['prompt']}\nEssay: {data['essay']}"
    results = search_cosine_similarity(query, metadata_train, top_k=2)

    answer = float(data['label'])
    curr_score = 0
    weight = [0.8, 0.2]
    for i, result in enumerate(results):
        curr_score += result['label']*weight[i]

    # print(curr_score)
    print(cnt)


    if (abs(answer - curr_score) <= 1):
        correct += 1
    
    cnt += 1
    if cnt == 150:
        break
    # print("=====================")

print(f"Accuracy: {correct/cnt}")




0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
Accuracy: 0.18666666666666668


# RAG with topic_essay combined similarity

create embedding database using essay content

In [10]:
embeddings = []
metadata = []


batch_size = 100  

for i in tqdm(range(0, len(split_dataset['train']), batch_size), desc="Generating embeddings"):
    batch_samples = split_dataset['train'][i:i+batch_size]

    
    
    essay = [
        f"{essay}"
        for essay in batch_samples['essay']
    ]
    
    batch_embeddings = generate_embeddings_batch(essay)
    if batch_embeddings is not None:
        embeddings.extend(batch_embeddings)
        metadata.extend([
            {"prompt": prompt, "essay": essay, "label": label}
            for prompt, essay, label in zip(
                batch_samples['prompt'], batch_samples['essay'], batch_samples['label']
            )
        ])
    

# save the embeddings

embeddings_np = np.array(embeddings, dtype=np.float32)
faiss.normalize_L2(embeddings_np)  

dimension = len(embeddings_np[0])  
index = faiss.IndexFlatIP(dimension)  
index.add(embeddings_np)  

# save the reults
faiss.write_index(index, "faiss_index_train_essay.bin")


with open("embeddings_dataset_train_essay.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)

Generating embeddings: 100%|██████████| 88/88 [03:02<00:00,  2.08s/it]


test

In [11]:
index = faiss.read_index("faiss_index_train_essay.bin")
with open("embeddings_dataset_train_essay.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)


def search_cosine_similarity(query_text, top_k=3):
    
    query_embedding = generate_embedding(query_text)
    if query_embedding is None:
        return []
    
    
    query_embedding_np = np.array([query_embedding], dtype=np.float32)
    faiss.normalize_L2(query_embedding_np)
    
    
    distances, indices = index.search(query_embedding_np, top_k)
    
    
    results = []
    for i, idx in enumerate(indices[0]):
        result = metadata[idx]
        result["similarity"] = distances[0][i]  
        results.append(result)
    return results


query = """ "Some people prefers living in a stable condidtion. They were doing almost the same routine every day, going to the same bookshop or gorceries on weekend and same things to do at work. There are not much changes in their lives. Others like to living in the changing environment. They keep trying new things, go to the different restrauants or meeting new friends. Sometimes they even challenging themselves by moving to other positions at work.\r\nI agree with both views because these two views are not conflict. One can meeting new friends in the same coffee store or can meeting old friends in different coffee sotres. The change and unchange only represent some parts of their lives. Like the aforementioned example, one can go to the same gorcery shop every weekend but go out with different friends at Saturday night. Or one can go to different cafes but to have the same order of coffee.\r\nThe change and unchage are exist in each other. The world is changing all the time that no one can walking in the same street twice. This is a philosephy saying. Because as time flows the stree that you stepped on was no longer the same street, the leaves may change or the tree may changeBut if that is the same path you walk to company every day, it is the same street in your plan. That means the street is relatively changed/unchaged. On one can stop the world changing, and what can be unchange is ourself, or our minds. We can have the same mindset everyday in different environments."

"""
results = search_cosine_similarity(query, top_k=3)

for i, result in enumerate(results):
    print(f"Result {i+1}:")
    print(f"  Prompt: {result['prompt']}")
    print(f"  Essay: {result['essay']}")
    print(f"  Label: {result['label']}")
    print(f"  Similarity: {result['similarity']}")
    print()

Result 1:
  Prompt: Write about the following topic.Some people prefer to spend their lives doing the same things and avoiding change. Others, however, think that change is always a good thing.Discuss both these views and give your own opinion.Give reasons for your answer and include any relevant examples from your own knowledge or experience.
  Essay: Some people prefers living in a stable condidtion. They were doing almost the same routine every day, going to the same bookshop or gorceries on weekend and same things to do at work. There are not much changes in their lives. Others like to living in the changing environment. They keep trying new things, go to the different restrauants or meeting new friends. Sometimes they even challenging themselves by moving to other positions at work.
I agree with both views because these two views are not conflict. One can meeting new friends in the same coffee store or can meeting old friends in different coffee sotres. The change and unchange onl

In [15]:
# check for unformity 
df_topic = pd.read_json("./embeddings_dataset_train_topics.json")
df_essay = pd.read_json("./embeddings_dataset_train_essay.json")

print(df_topic.equals(df_essay))

True


#### check for startification and distributions

In [23]:
label_map = {
    0: '<4',
    1: 4.0,
    2: 4.5,
    3: 5.0,
    4: 5.5,
    5: 6.0,
    6: 6.5,
    7: 7.0,
    8: 7.5,
    9: 8.0,
    10: 8.5,
    11: 9.0
}

df= pd.read_json("./embeddings_dataset_test_topics.json")
# print(df)

class_distribution = df["label"].value_counts(normalize=True)
print(f"the distribution of classes: {class_distribution}")

prompt_list = df["prompt"]
essay_list = df["essay"]
label_list = df["label"]
grade_list = []

for i in label_list:
    grade_list.append(label_map[i])

export_dict = dict()
export_dict["prompt"] = prompt_list
export_dict["essay"] = essay_list
export_dict["label"] = label_list
export_dict["grade"] = grade_list

df_export = pd.DataFrame(export_dict)

output_path = "./embeddings_dataset_test_topic.csv"
df_export.to_csv(output_path, index=False, encoding='utf-8', mode='w')

the distribution of classes: label
7     0.142272
5     0.119754
6     0.119754
8     0.112590
3     0.103378
4     0.097236
9     0.076766
2     0.062436
1     0.055271
0     0.053224
10    0.042989
11    0.014330
Name: proportion, dtype: float64


In [31]:
path = "./selected_embeddings_dataset_test.csv"
output_path = "./manual_embedding_dataset_test.json"
df = pd.read_csv(path)
print(f"distribution of mannually selected samples: {df['label'].value_counts(normalize=True)}")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.to_json(output_path, orient="records", indent=4)


distribution of mannually selected samples: label
7     0.142857
6     0.122449
5     0.122449
8     0.112245
4     0.102041
3     0.102041
9     0.071429
1     0.061224
2     0.061224
0     0.051020
10    0.040816
11    0.010204
Name: proportion, dtype: float64
