# Get Embeddings

In [2]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set up Azure OpenAI

In [6]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
openai.api_type = "azure"
openai.api_base = os.getenv("SOUTH_CENTRAL_US_OPENAI_API_BASE")
openai.api_version = "2022-12-01"
openai.api_key = os.getenv("SOUTH_CENTRAL_US_OPENAI_API_KEY")

## Load Data

In [4]:
import pandas as pd

df_orig = pd.read_csv("../data/rottentomatoes-20movies-wordcount.csv", sep='\t')
df = df_orig.copy()
df

Unnamed: 0,Movie,Publish,Review,Date,Score,Word_Count
0,SOLO: A STAR WARS STORY,Stuff.co.nz,The formula is strong with this one.,2018-05-24,70.0,7
1,BLACK PANTHER,Gone With The Twins,Just about the same as every other Marvel title.,2020-05-12,50.0,9
2,DUNKIRK,Screen Zealots,This is one heck of a stunning war picture.,2018-12-20,80.0,9
3,KNIVES OUT,Student Edge,Don't fear: No spoilers here. All you need to ...,2019-11-26,80.0,17
4,KNIVES OUT,Deep Focus Review,"Sharp and funny, Knives Out exceeds expectatio...",2022-02-23,100.0,29
...,...,...,...,...,...,...
6635,ROGUE ONE: A STAR WARS STORY,Movie Nation,This is more like it...the 'Star Wars' movie J...,2016-12-13,75.0,13
6636,ROGUE ONE: A STAR WARS STORY,Newsday,"This ""Star Wars"" spinoff doesn't spin very far...",2016-12-13,75.0,19
6637,ROGUE ONE: A STAR WARS STORY,Metro,Boasts thin characters played by great actors ...,2016-12-13,40.0,37
6638,ROGUE ONE: A STAR WARS STORY,Den of Geek,Rogue One builds to one of the best third acts...,2016-12-13,80.0,14


## Deploy a model
ref: 
- https://learn.microsoft.com/en-us/azure/cognitive-services/openai/concepts/models
- https://learn.microsoft.com/en-us/azure/cognitive-services/openai/concepts/models#text-search-embedding


In [7]:
# id of desired_model
desired_model = "text-search-davinci-doc-001" # suitable for Search, context relevance, information retrieval

# list models deployed with embeddings capability
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    if deployment["status"] != "succeeded":
        continue
    
    model = openai.Model.retrieve(deployment["model"])
    if model["id"] == desired_model:
        deployment_id = deployment["id"]
        
# if not model deployed, deploy one
if not deployment_id:
    print('No deployment with status: succeeded found.')

    # Now let's create the deployment
    print(f'Creating a new deployment with model: {desired_model}')
    result = openai.Deployment.create(model=desired_model, scale_settings={"scale_type":"standard"})
    deployment_id = result["id"]
    print(f'Successfully created {desired_model} with deployment_id {deployment_id}')
else:
    print(f'Found a succeeded deployment of "{desired_model}" that supports text search with id: {deployment_id}.')

No deployment with status: succeeded found.
Creating a new deployment with model: text-search-davinci-doc-001
Successfully created text-search-davinci-doc-001 with deployment_id deployment-0e7bf012a67d441f93a281114bd5ed31


## Get Embeddings
ref: https://learn.microsoft.com/en-us/azure/cognitive-services/openai/tutorials/embeddings?tabs=bash

In [8]:
# 첫 행으로 임베딩 테스트 
input = 'Movie title: ' + df['Movie'][0] + '\n' + df['Review'][0]
input

embedding = openai.Embedding.create(
    input=input,
    deployment_id=deployment_id)

# embedding
len(embedding["data"][0]["embedding"])

'Movie title: SOLO: A STAR WARS STORY\nThe formula is strong with this one.'

12288

In [10]:
from ratelimiter import RateLimiter

@RateLimiter(max_calls=50, period=60) # OpenAI API의 요청 제한을 고려하여 60초 동안 50개의 요청만 허용하도록 설정 
def request_api(df, deployment_id, i):
    try:
        input = 'Movie title: ' + df['Movie'][i] + '\n' + df['Review'][i]
        embedding = openai.Embedding.create(input=input, deployment_id=deployment_id)
        df['embedding'].iloc[i] = embedding['data'][0]['embedding']
    except Exception as err:
        print(i)
        print(f"Unexpected {err=}, {type(err)=}")

In [11]:
df['embedding'] = ''

for i in range(len(df)): # This takes over 133 minutes.
#for i in range(0,2):
    request_api(df, deployment_id, i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [12]:
df

[autoreload of requests.exceptions failed: Traceback (most recent call last):
  File "c:\Users\eunjee\anaconda3\lib\site-packages\IPython\extensions\autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "c:\Users\eunjee\anaconda3\lib\site-packages\IPython\extensions\autoreload.py", line 394, in superreload
    module = reload(module)
  File "c:\Users\eunjee\anaconda3\lib\imp.py", line 314, in reload
    return importlib.reload(module)
  File "c:\Users\eunjee\anaconda3\lib\importlib\__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 783, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "c:\Users\eunjee\anaconda3\lib\site-packages\requests\exceptions.py", line 9, in <module>
    from .compat import JSONDecodeError as CompatJSONDecodeError
ImportError: cannot import name 'JSON

Unnamed: 0,Movie,Publish,Review,Date,Score,Word_Count,embedding
0,SOLO: A STAR WARS STORY,Stuff.co.nz,The formula is strong with this one.,2018-05-24,70.0,7,"[-0.013439337722957134, 0.006897337269037962, ..."
1,BLACK PANTHER,Gone With The Twins,Just about the same as every other Marvel title.,2020-05-12,50.0,9,"[-0.006859001703560352, 0.0037438718136399984,..."
2,DUNKIRK,Screen Zealots,This is one heck of a stunning war picture.,2018-12-20,80.0,9,"[-0.003785009030252695, 0.004640915431082249, ..."
3,KNIVES OUT,Student Edge,Don't fear: No spoilers here. All you need to ...,2019-11-26,80.0,17,"[0.0009526872891001403, 0.016423344612121582, ..."
4,KNIVES OUT,Deep Focus Review,"Sharp and funny, Knives Out exceeds expectatio...",2022-02-23,100.0,29,"[-0.005653353873640299, 0.010235012508928776, ..."
...,...,...,...,...,...,...,...
6635,ROGUE ONE: A STAR WARS STORY,Movie Nation,This is more like it...the 'Star Wars' movie J...,2016-12-13,75.0,13,"[-0.012857149355113506, 0.012755712494254112, ..."
6636,ROGUE ONE: A STAR WARS STORY,Newsday,"This ""Star Wars"" spinoff doesn't spin very far...",2016-12-13,75.0,19,"[-0.013092534616589546, 0.0068380581215023994,..."
6637,ROGUE ONE: A STAR WARS STORY,Metro,Boasts thin characters played by great actors ...,2016-12-13,40.0,37,"[-0.005384239833801985, 0.011295164003968239, ..."
6638,ROGUE ONE: A STAR WARS STORY,Den of Geek,Rogue One builds to one of the best third acts...,2016-12-13,80.0,14,"[-0.018050068989396095, 0.007940508425235748, ..."


## Save data

In [13]:
df.to_csv("../data/rottentomatoes-20movies-embeddings.csv", sep='\t', index=False)