In [1]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.35.13-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 h

In [2]:
from openai import OpenAI
import os

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [3]:
# Never paste the API key into the notebook: read it from the environment.
# Set OPENAI_API_KEY before running this cell; when it is missing, None is
# passed and the client reports the missing key at construction time instead
# of failing later on the first request.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [4]:
# Embed one sample sentence, then show the vector length and its first
# five components.
text_to_embed = "문장의 의미를 가지고 임베딩합니다."
embedding = (
    client.embeddings
    .create(model="text-embedding-3-small", input=text_to_embed)
    .data[0]
    .embedding
)
print(len(embedding), embedding[:5], sep="\n")

1536
[0.06261987239122391, 0.06903039664030075, 0.004282640293240547, 0.029844114556908607, 0.007974065840244293]


# 임베딩을 사용한 검색

In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the embeddings API.

    Args:
        text: Value passed as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model. Defaults to
            ``"text-embedding-3-small"``, the model this function used
            before the parameter existed. Vectors produced by different
            models are not comparable, so query with the same model that
            produced the vectors being searched.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding


In [6]:
# Download the review CSV from the workshop's GitHub repository and write it
# to the working directory under the name given by -O (an existing file of
# that name is overwritten).
!wget https://raw.githubusercontent.com/dhrim/2024_kangnam_hallym_workshop/main/material/data/fine_food_reviews_with_embeddings_1k.csv -O fine_food_reviews_with_embeddings_1k.csv

--2024-07-11 06:45:58--  https://raw.githubusercontent.com/dhrim/2024_kangnam_hallym_workshop/main/material/data/fine_food_reviews_with_embeddings_1k.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35046315 (33M) [text/plain]
Saving to: ‘fine_food_reviews_with_embeddings_1k.csv’


2024-07-11 06:45:59 (188 MB/s) - ‘fine_food_reviews_with_embeddings_1k.csv’ saved [35046315/35046315]



In [7]:
from ast import literal_eval

# CSV written by the download cell; the "embedding" column is read as text.
datafile_path = "fine_food_reviews_with_embeddings_1k.csv"

df = pd.read_csv(datafile_path)
# Parse each stored literal back into a Python object and wrap it in a
# NumPy array so it can be used for numeric work.
df["embedding"] = df["embedding"].map(lambda raw: np.array(literal_eval(raw)))

In [8]:
from sklearn.metrics.pairwise import cosine_similarity


# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    """Return the ``n`` reviews most similar to ``product_description``.

    The description is embedded with ``get_embedding`` and compared with
    every vector in ``df.embedding`` by cosine similarity.

    Args:
        df: DataFrame with an ``embedding`` column (one 1-D vector per row,
            all of the same length as the query embedding) and a
            ``combined`` text column.
        product_description: Query text to embed.
        n: Number of top matches to return.
        pprint: When true, print the first 200 characters of each match
            followed by a blank line.

    Returns:
        A Series with the ``combined`` text of the top ``n`` rows, ordered by
        descending similarity, with ``"Title: "`` removed and
        ``"; Content:"`` replaced by ``": "``.

    Side effects:
        Adds (or overwrites) a float ``similarity`` column on ``df`` in
        place, as earlier versions of this function did.
    """
    product_embedding = np.array(get_embedding(product_description)).reshape(1, -1)

    if len(df) == 0:
        # Nothing to compare against; keep the column so callers still see it.
        similarities = np.empty(0, dtype=float)
    else:
        # One batched call instead of one call per row. Taking column 0 of the
        # (rows, 1) result stores plain floats; the per-row version stored a
        # 1x1 array in every cell.
        review_matrix = np.array(df.embedding.tolist())
        similarities = cosine_similarity(review_matrix, product_embedding)[:, 0]
    df["similarity"] = similarities

    # regex=False: both patterns are literal text, independent of the pandas
    # version's default for ``regex``.
    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .combined.str.replace("Title: ", "", regex=False)
        .str.replace("; Content:", ": ", regex=False)
    )
    if pprint:
        for r in results:
            print(r[:200])
            print()
    return results


# English query; the three best matches are printed and kept in `results`.
results = search_reviews(df, product_description="delicious beans", n=3)

Delicious!:  I enjoy this white beans seasoning, it gives a rich flavor to the beans I just love it, my mother in law didn't know about this Zatarain's brand and now she is traying different seasoning

Fantastic Instant Refried beans:  Fantastic Instant Refried Beans have been a staple for my family now for nearly 20 years.  All 7 of us love it and my grown kids are passing on the tradition.

Delicious:  While there may be better coffee beans available, this is my first purchase and my first time grinding my own beans.  I read several reviews before purchasing this brand, and am extremely 



In [9]:
# English query for pet food; the three best matches are printed.
results = search_reviews(df, product_description="pet food", n=3)

Great food!:  I wanted a food for a a dog with skin problems. His skin greatly improved with the switch, though he still itches some.  He loves the food. No recalls, American made with American ingred

Great food!:  I wanted a food for a a dog with skin problems. His skin greatly improved with the switch, though he still itches some.  He loves the food. No recalls, American made with American ingred

Good food:  The only dry food my queen cat will eat. Helps prevent hair balls. Good packaging. Arrives promptly. Recommended by a friend who sells pet food.



In [10]:
# Korean query (roughly "dog feed") run against the same review data.
results = search_reviews(df, product_description="강아지 먹이", n=3)

Great food!:  I wanted a food for a a dog with skin problems. His skin greatly improved with the switch, though he still itches some.  He loves the food. No recalls, American made with American ingred

Great food!:  I wanted a food for a a dog with skin problems. His skin greatly improved with the switch, though he still itches some.  He loves the food. No recalls, American made with American ingred

Great for small breeds with sensitive systems:  I have two small mixed breed dogs each under 10 pounds. This is to only food I have found that they can eat with out any problems.



In [11]:
# Korean query (colloquially "dog's meal") run against the same review data.
results = search_reviews(df, product_description="강아지 밥", n=3)

chow time:  a fine delicacy, with a pleasant smell and taste  that my dogs look forward to. Packaging is easy to store and dispose

Yummy Chummies:  All my dogs love them. Healthy treats. Not great for training as they crumble when you try to break them into pieces

TOXIC INGREDIENTS!  WILL RETURN....:  After reviewing the 3-star review on Amazon, I checked the label on the container. Sure enough, this product contains propylene glycol!!!! as one of the top ingre



In [12]:
# Korean query (roughly "pet food") run against the same review data.
results = search_reviews(df, product_description="애완동물 음식", n=3)

Yummy Chummies:  All my dogs love them. Healthy treats. Not great for training as they crumble when you try to break them into pieces

Woody and Arlo's favorites!:  My two dogs LOVE these!  I cut each strip into bite-sized six pieces.  I like the fact that they are quite a healthy treat and low fat.<br /><a href="http://www.amazon.co

fine for 13 y.o. doggy:  My Am Staff is about 13 years old and is a very "food-driven" dog. He seems to like this version of Eukanuba just fine, although he is not a picky eater at all! The kibble see



# 임베딩을 사용한 분류

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [14]:
# split data into train and test
x_train, x_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)

# train random forest classifier
# The forest is seeded like the split above; without random_state each run
# builds different trees, so the printed report would change between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)

# Per-class precision/recall/F1 on the held-out 20%.
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

           1       1.00      0.50      0.67        20
           2       1.00      0.38      0.55         8
           3       1.00      0.18      0.31        11
           4       1.00      0.26      0.41        27
           5       0.75      1.00      0.86       134

    accuracy                           0.78       200
   macro avg       0.95      0.46      0.56       200
weighted avg       0.83      0.78      0.74       200

