In [2]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m25.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0


In [4]:
!pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8


In [5]:
# imports
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding

In [8]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

from google.colab import drive
drive.mount('/content/drive')

#!kaggle datasets list

!kaggle datasets download -d snap/amazon-fine-food-reviews


mkdir: cannot create directory ‘/root/.kaggle’: File exists
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Downloading amazon-fine-food-reviews.zip to /content
 95% 229M/242M [00:02<00:00, 102MB/s]
100% 242M/242M [00:02<00:00, 92.2MB/s]


In [9]:
# https://www.kaggle.com/snap/amazon-fine-food-reviews
!unzip /content/amazon-fine-food-reviews.zip

Archive:  /content/amazon-fine-food-reviews.zip
  inflating: Reviews.csv             
  inflating: database.sqlite         
  inflating: hashes.txt              


In [10]:
# Read the reviews CSV, keep the columns used below, drop incomplete rows,
# and build the single text field that will be embedded.
# (The original example pointed at "data/fine_food_reviews_1k.csv", a
# pre-filtered subset; here the full extracted file is used.)
input_datapath = "/content/Reviews.csv"

df = (
    pd.read_csv(input_datapath, index_col=0)
    [["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
    .dropna()
)

# Summary and body are stripped of surrounding whitespace and joined per row.
title_part = "Title: " + df["Summary"].str.strip()
content_part = "; Content: " + df["Text"].str.strip()
df["combined"] = title_part + content_part
del title_part, content_part  # temporaries only; keep the namespace as before

df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [11]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(568427, 7)

In [12]:
# Keep the `top_n` most recent reviews, discarding any that are too long to embed.
top_n = 50

# Tokenizer settings this cell depends on. The embedding-parameters cell further
# down assigns the same values, but on a fresh top-to-bottom run it has not
# executed yet when this cell runs, which would raise NameError here.
embedding_encoding = "cl100k_base"  # encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# Take 2 * top_n candidates (latest by Time), assuming fewer than half of them
# will be filtered out below; Time is only needed for the ordering.
df = df.sort_values("Time").tail(top_n * 2).drop(columns="Time")

encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens per review, then keep the last top_n rows within the limit.
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

50

In [23]:
# Settings for the embedding request and the tokenizer that goes with it.
max_tokens = 8000  # cutoff used when filtering; text-embedding-ada-002 accepts at most 8191
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
embedding_model = "text-embedding-ada-002"

In [25]:
import configparser

# Read the OpenAI API key from an INI-style file kept on the mounted Drive,
# so the key itself never appears in the notebook.
config_path = '/content/drive/MyDrive/openapi.txt'
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it did parse; an empty list means the file is missing or unreadable.
if not config.read(config_path):
    raise FileNotFoundError(
        f"Could not read {config_path}; is Google Drive mounted and the file present?"
    )

# Expected layout: a [global] section containing an OPENAI_API_KEY entry.
secret_key = config['global']['OPENAI_API_KEY']

In [26]:
import os
import openai

# Give the openai client the key loaded into `secret_key` earlier.
openai.api_key = secret_key

In [28]:
# The OpenAI API key must be configured before this runs; see the client README:
# https://github.com/openai/openai-python#usage

# get_embedding is called once per row, so this may take a few minutes.
df["embedding"] = df["combined"].apply(
    lambda review_text: get_embedding(review_text, engine=embedding_model)
)

# Persist the frame, embeddings included, for later use.
df.to_csv("fine_food_reviews_with_embeddings_1k.csv")

In [29]:
# Preview only the first rows rather than rendering the whole frame with its
# long embedding vectors.
df.head(10)

Unnamed: 0_level_0,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
346131,B004TJF3BE,A2TZKSY1ZWPOU9,5,Great Hot Cider!!!,It is hard to find much of anything sugarfree ...,Title: Great Hot Cider!!!; Content: It is hard...,46,"[0.0007878253818489611, -0.004287755116820335,..."
135891,B001ACMCLM,A2PCNXBSKCABG5,4,GOOD GLUTEN FREE BREAD STCK MIX,Makes very good break sticks.. Also can be use...,Title: GOOD GLUTEN FREE BREAD STCK MIX; Conten...,52,"[-0.0012241910444572568, -0.009551214054226875..."
182238,B004LM9KHW,A1AOOCCQ27K9IT,3,French Vanilla Wolfgang Puck,Product is easy to use.... Just cut or tear pa...,Title: French Vanilla Wolfgang Puck; Content: ...,123,"[0.004020157735794783, -0.014732792973518372, ..."
354603,B000LKU3A6,A2YRK0YLBN5CC2,3,"Good flavor, but a wet mess","I got the teriyaki flavor and, while the flavo...","Title: Good flavor, but a wet mess; Content: I...",267,"[-0.017305126413702965, 0.0020150248892605305,..."
320388,B008JA73RG,AFJFXN42RZ3G2,4,Neither too sweet nor fizzy,V8 V-Fusion may appear to be the typical energ...,Title: Neither too sweet nor fizzy; Content: V...,230,"[-0.0018178685568273067, -0.031288571655750275..."
486553,B000MUT928,AMV75AVRSNM0L,3,Crunchy strong and ok taste,"I thought the pocket coffee was good, not sure...",Title: Crunchy strong and ok taste; Content: I...,71,"[0.004615475423634052, -0.028094196692109108, ..."
355352,B0007PNKRS,A1TED4G0PWZPQV,5,Came as expected,It was tasty and fresh. The other one I bought...,Title: Came as expected; Content: It was tasty...,32,"[0.016167430207133293, -0.0235698614269495, 0...."
402156,B0006349WQ,A21BT40VZCCYT4,5,Good Training Treat,My dog will come in from outside when I am tra...,Title: Good Training Treat; Content: My dog wi...,48,"[-0.024458859115839005, -0.017603976652026176,..."
131484,B001ANXL84,A3NZ74QTATJ45W,5,Best electrolyte replacement drink,I'm a disabled Vet with 80% of my kidneys gone...,Title: Best electrolyte replacement drink; Con...,228,"[-0.0006665267865173519, 0.01338726095855236, ..."
519038,B008TZJUOA,A1LHOKYENR7HP2,5,Cute!,Bought these to decorate cupcakes for a kid's ...,Title: Cute!; Content: Bought these to decorat...,32,"[-0.01840221881866455, 0.0013174761552363634, ..."
