In [1]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2

In [2]:
import pandas as pd
import openai
from tqdm import tqdm

In [3]:
df = pd.read_csv("nodes_with_examples_final.csv")

In [4]:
len(df)

1190

In [5]:
df.head()

Unnamed: 0,word,romaji,explanation,properties,examples
0,パチパチ,pachipachi,"crackling, cracking, snapping, popping, clicki...",popular; onomatopoeic or mimetic word; adverb;...,囲炉裏の側にいるとぱちぱちという音が聞こえてくる。; 日本には口にいれるとパチパチとはじける...
1,ハラハラ,harahara,"to feel anxious, to feel nervous, to be kept i...",popular; onomatopoeic or mimetic word; noun or...,偉い人が集まるパーティで自分が失礼なことしないかどうかハラハラする。; 紅葉の季節が終わって...
2,あっさり,assari,"easily, readily, quickly, flatly (refuse); lig...",popular; JLPT N1; onomatopoeic or mimetic word...,このラーメンはすごくあっさりした味。; 白大豆よりこの黒大豆で作られた味噌汁はあっさりしてい...
3,あべこべ,abekobe,"contrary, opposite, inverse, reverse, back-to-...",popular; JLPT N1; onomatopoeic or mimetic word...,
4,あやふや,ayafuya,"uncertain, vague, ambiguous",popular; JLPT N1; onomatopoeic or mimetic word...,


In [None]:
GPT_APIKEY = input("Your GPT-3 API Key: ")
openai.api_key = GPT_APIKEY
EMBEDDING_MODEL = "text-embedding-ada-002"

In [7]:
def get_embedding(text, model=EMBEDDING_MODEL):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

In [8]:
df["embedding"] = df["explanation"].apply(lambda x: get_embedding(x))

In [9]:
df.to_csv("with_embedding.csv")

In [10]:
from openai.embeddings_utils import cosine_similarity

def search_synonyms(ddf, index, cutoff=0.9):
    query_embedding = ddf.iloc[index]["embedding"]
    res = ddf["embedding"].apply(lambda x: cosine_similarity(x, query_embedding)).reset_index(name="similarity")
    
    return res[res["similarity"] > cutoff]

In [11]:
nodes = []
edges = set()


for i, row in tqdm(df.iterrows()):
    
    nodes.append([i, row["word"], row["romaji"], row["explanation"], row["properties"], row["examples"]])


    result_index = search_synonyms(df, i)
    for j, r in result_index.iterrows():
        if i != r["index"]:
            min_tuple = int(min(i, r["index"]))
            max_tuple = int(max(i, r["index"]))
            edges.add(f"{min_tuple},{max_tuple},{round(r['similarity'], 5)}")

1190it [13:40,  1.45it/s]


In [12]:
node_df = pd.DataFrame(nodes, columns =['index', 'word', 'romaji', 'explanation', 'properties', 'examples'])

node_df.to_csv("nodes.csv", sep=",", index = False)

In [13]:
edges_content = ",".join(["from", "to", "similarity"]) + "\n"

for e in edges:
    edges_content += ",".join(e.split(",")) + "\n"

with open("edges.csv", "w") as output:
    output.write(edges_content)

In [15]:
node_df["examples"].isna()

0       False
1       False
2       False
3        True
4        True
        ...  
1185     True
1186     True
1187     True
1188     True
1189     True
Name: examples, Length: 1190, dtype: bool