# Personalised Movie Reviews

In [67]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set up Azure OpenAI

In [68]:
import os
import openai
from dotenv import load_dotenv

# Read settings from a local .env file into the environment (python-dotenv).
load_dotenv()

# Configure the openai client for an Azure OpenAI resource.
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
# Endpoint and key come from the environment; os.getenv yields None when a variable is unset.
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_key = os.getenv("OPENAI_API_KEY")

True

### 모델 배포

In [69]:
# Embedding model that needs a usable deployment
query_model = "text-embedding-ada-002"

# Walk the existing deployments; when several match, the last one listed wins
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    # Only deployments that finished successfully are candidates
    if deployment["status"] == "succeeded":
        model = openai.Model.retrieve(deployment["model"])
        if model["id"] == query_model:
            deployment_id = deployment["id"]

if deployment_id:
    print(f'Found a succeeded deployment of "{query_model}" that supports text search with id: {deployment_id}.')
else:
    # No suitable deployment exists yet, so create one
    print('No deployment with status: succeeded found.')

    print(f'Creating a new deployment with model: {query_model}')
    result = openai.Deployment.create(model=query_model, scale_settings={"scale_type":"standard"})
    deployment_id = result["id"]
    print(f'Successfully created {query_model} with deployment_id {deployment_id}')

Found a succeeded deployment of "text-embedding-ada-002" that supports text search with id: text-embedding-ada-002.


### 데이터 로드

In [78]:
import ast

import pandas as pd
import numpy as np

fname = '../data/rottentomatoes-20movies-embeddings.csv'
df = pd.read_csv(fname, delimiter='\t', index_col=False)

# Drop rows with missing values. The surviving rows keep their original index
# labels (no reset), and those labels are what later cells look rows up by.
df.dropna(inplace=True)

# Each embedding is stored in the file as the text of a Python list. Parse it
# with ast.literal_eval instead of eval so that file contents are only ever
# read as literals and never executed as code, then convert to a numpy array.
df["embedding"] = df['embedding'].apply(ast.literal_eval).apply(np.array)
df.head()
df.shape

Unnamed: 0,Movie,Publish,Review,Date,Score,Word_Count,embedding
0,SOLO: A STAR WARS STORY,Stuff.co.nz,The formula is strong with this one.,2018-05-24,70.0,7,"[0.01451321691274643, -0.014196277596056461, -..."
1,BLACK PANTHER,Gone With The Twins,Just about the same as every other Marvel title.,2020-05-12,50.0,9,"[-0.00993192009627819, -0.04493372514843941, -..."
2,DUNKIRK,Screen Zealots,This is one heck of a stunning war picture.,2018-12-20,80.0,9,"[-0.021182995289564133, -0.009731178171932697,..."
3,KNIVES OUT,Student Edge,Don't fear: No spoilers here. All you need to ...,2019-11-26,80.0,17,"[-0.001303257653489709, -0.016508987173438072,..."
4,KNIVES OUT,Deep Focus Review,"Sharp and funny, Knives Out exceeds expectatio...",2022-02-23,100.0,29,"[0.006400220561772585, -0.02247047796845436, -..."


(6640, 7)

In [93]:
# Number of rows (reviews) per movie title, most frequent first.
df['Movie'].value_counts()

Movie
JOKER                               380
CAPTAIN MARVEL                      373
ONCE UPON A TIME... IN HOLLYWOOD    372
AVENGERS: ENDGAME                   370
US                                  358
STAR WARS: THE RISE OF SKYWALKER    351
A STAR IS BORN                      340
BLACK PANTHER                       339
AVENGERS: INFINITY WAR              329
SOLO: A STAR WARS STORY             321
STAR WARS: THE LAST JEDI            320
SPIDER-MAN: FAR FROM HOME           319
DUNKIRK                             316
KNIVES OUT                          311
TOY STORY 4                         309
READY PLAYER ONE                    308
WONDER WOMAN                        307
1917                                306
FIRST MAN                           306
ROGUE ONE: A STAR WARS STORY        305
Name: count, dtype: int64

### Count Token

In [None]:
import tiktoken
encoding = tiktoken.get_encoding('p50k_base')

# Token count of every review, assigned as a whole column in one step.
# Writing element-by-element through `df['token_count'].loc[idx] = ...` is
# chained assignment: pandas warns about it and, with Copy-on-Write enabled,
# the write never reaches `df`. Mapping over the column also gives an integer
# column instead of an object column seeded with empty strings.
df['token_count'] = df['Review'].map(lambda review: len(encoding.encode(review)))

df.head()

### 프롬프트 구성

In [81]:
# Return the embedding vector for a query string
def get_embedding(text, deployment_id=deployment_id):
    """
    Embed `text` using the given embedding deployment.

    The default `deployment_id` is whatever the module-level `deployment_id`
    held when this function was defined.
    Returns the first embedding in the response as a numpy array.
    """
    response = openai.Embedding.create(input=text, deployment_id=deployment_id)
    return np.array(response["data"][0]["embedding"])

# Similarity score between two vectors
def vector_similarity(x, y):
    """
    Score how similar `x` and `y` are by taking their dot product.

    OpenAI embeddings are normalized to length 1, so for them the dot product
    is the same value as the cosine similarity.
    """
    return np.dot(x, y)

# Score every document section against the query and return them most-similar first
def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed `query` and compare it with each pre-computed embedding in `contexts`.

    `contexts` is iterated with `.items()`, yielding (doc_index, doc_embedding) pairs.
    Returns a list of (similarity, doc_index) tuples sorted in descending order.
    """
    query_embedding = get_embedding(query)

    scored_sections = []
    for doc_index, doc_embedding in contexts.items():
        score = vector_similarity(query_embedding, doc_embedding)
        scored_sections.append((score, doc_index))

    # Tuples compare by similarity first, then by doc_index
    scored_sections.sort(reverse=True)
    return scored_sections

In [82]:
MAX_SECTION_LEN = 500 # maximum number of tokens for the context sections
SEPARATOR = "\n* " # string placed in front of each section
ENCODING = "p50k_base"  # encoding for text-davinci-003

# Tokenizer for ENCODING, and the token cost of one separator.
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

In [83]:
# Find the sections most relevant to the query and build a prompt from them plus the query
def construct_prompt(query: str, context_embeddings: pd.Series, df: pd.DataFrame) -> str:
    """
    Build a question-answering prompt whose context is the reviews most similar to the query.

    Args:
        query: The user's question.
        context_embeddings: Mapping-like object whose `.items()` yields
            (index label, embedding) pairs; callers pass the `embedding` column of `df`.
        df: Frame with `Movie`, `Review` and `token_count` columns, addressed by
            the same index labels as `context_embeddings`.

    Returns:
        The prompt text: instruction header, selected reviews, then the question.

    Side effects:
        Prints how many sections were selected and the index label and movie of each.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(query, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    # (index label, movie title) for each selected section. Keeping the label
    # as-is means the diagnostics below also work for non-integer labels and
    # do not have to look the row up in `df` a second time.
    chosen_section_labels = []

    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]

        # NOTE(review): only the review's token_count and the separator are
        # charged against the budget; the 'movie title: ...' prefix is not, so
        # the rendered context can be longer than MAX_SECTION_LEN tokens.
        chosen_sections_len += document_section['token_count'] + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(SEPARATOR +
                               'movie title: ' + document_section['Movie'] + ' ' +
                               document_section['Review'].replace("\n", " "))

        chosen_section_labels.append((section_index, document_section['Movie']))

    # Diagnostic information
    print(f"Selected {len(chosen_sections)} document sections, with indexes:")
    for section_index, movie in chosen_section_labels:
        print(f"{section_index} {movie}")

    header = """Answer the question truthfully using context, if unsure, say "I don't know."\n\nContext:\n"""
    prompt = header + "".join(chosen_sections) + "\n\n Q: " + query + "\n A:"

    return prompt

### 프롬프트 예시 

In [84]:
# Build the prompt for a sample question and show it
query = 'Summarise reviews of Captain Marvel.'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)

Selected 18 document sections, with indexes:
3054 CAPTAIN MARVEL
2839 CAPTAIN MARVEL
2845 CAPTAIN MARVEL
3067 CAPTAIN MARVEL
2735 CAPTAIN MARVEL
2745 CAPTAIN MARVEL
3038 CAPTAIN MARVEL
3061 CAPTAIN MARVEL
2777 CAPTAIN MARVEL
2725 CAPTAIN MARVEL
2814 CAPTAIN MARVEL
3065 CAPTAIN MARVEL
2895 CAPTAIN MARVEL
3082 CAPTAIN MARVEL
3073 CAPTAIN MARVEL
2778 CAPTAIN MARVEL
2887 CAPTAIN MARVEL
2898 CAPTAIN MARVEL
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: CAPTAIN MARVEL In short, Captain Marvel is a blast.
* movie title: CAPTAIN MARVEL Captain Marvel does the job it was meant to do, and I can understand how some people might like it more than others.
* movie title: CAPTAIN MARVEL Arrives in theaters as an adequate superhero film, complete with many of the strengths and weaknesses of the genre.
* movie title: CAPTAIN MARVEL Captain Marvel is enjoyable enough as popcorn entertainment.
* movie title: CAPTAIN MARVEL Captain Marvel is a good f

In [85]:
def retrieve_information(prompt, deployment_id="text-davinci-003"):
    """
    Send `prompt` to a completion deployment and print the generated text.

    Args:
        prompt: Full prompt text to complete.
        deployment_id: Name of the completion deployment to call. The default
            is assumed to be deployed already.

    Returns:
        None. The completion is printed rather than returned. If the request
        fails, the error is printed and not re-raised.
    """
    try:
        # Request API
        response = openai.Completion.create(
            deployment_id=deployment_id,
            prompt=prompt,
            temperature=1,
            max_tokens=3000,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=1
        )

        # response
        result = response['choices'][0]['text']
        print(result)
    except Exception as err:
        # Best-effort reporting. The handler must only use names that exist in
        # this function; referring to a loop variable from another cell would
        # raise NameError here and hide the original error.
        print(f"Unexpected {err=}, {type(err)=}")

    return

### Example Queries

In [91]:
# Summarisation question about a single movie
query = 'Summarise reviews of Captain Marvel.'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 18 document sections, with indexes:
3054 CAPTAIN MARVEL
2839 CAPTAIN MARVEL
2845 CAPTAIN MARVEL
3067 CAPTAIN MARVEL
2735 CAPTAIN MARVEL
2745 CAPTAIN MARVEL
3038 CAPTAIN MARVEL
3061 CAPTAIN MARVEL
2777 CAPTAIN MARVEL
2725 CAPTAIN MARVEL
2814 CAPTAIN MARVEL
3065 CAPTAIN MARVEL
2895 CAPTAIN MARVEL
3082 CAPTAIN MARVEL
3073 CAPTAIN MARVEL
2778 CAPTAIN MARVEL
2887 CAPTAIN MARVEL
2898 CAPTAIN MARVEL
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: CAPTAIN MARVEL In short, Captain Marvel is a blast.
* movie title: CAPTAIN MARVEL Captain Marvel does the job it was meant to do, and I can understand how some people might like it more than others.
* movie title: CAPTAIN MARVEL Arrives in theaters as an adequate superhero film, complete with many of the strengths and weaknesses of the genre.
* movie title: CAPTAIN MARVEL Captain Marvel is enjoyable enough as popcorn entertainment.
* movie title: CAPTAIN MARVEL Captain Marvel is a good f

In [88]:
# Recommendation question about a single movie
query = 'Should I watch Ready Player One?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 14 document sections, with indexes:
4971 READY PLAYER ONE
4808 READY PLAYER ONE
4967 READY PLAYER ONE
4753 READY PLAYER ONE
4932 READY PLAYER ONE
5035 READY PLAYER ONE
4858 READY PLAYER ONE
4755 READY PLAYER ONE
4959 READY PLAYER ONE
4856 READY PLAYER ONE
4775 READY PLAYER ONE
4779 READY PLAYER ONE
4934 READY PLAYER ONE
4789 READY PLAYER ONE
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: READY PLAYER ONE Ready Player One is enjoyably diverting and speaks to the nerd in all of us in some shape or form.
* movie title: READY PLAYER ONE "Ready Player One" is an engrossing thrill ride that feeds off of your pop culture nostalgia.
* movie title: READY PLAYER ONE Ready Player One should make fans of 80s movies, music and video games geek out, but it - and the convoluted way the story's unveiled - might alienate others.
* movie title: READY PLAYER ONE If you go to the movies to watch big blockbusters with big explosions and effec

In [89]:
# Question that asks for reasons against watching, with a stated preference
query = 'Why shouldn\'t I watch spiderman? I am big fan of visual effects.'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 12 document sections, with indexes:
2309 SPIDER-MAN: FAR FROM HOME
2213 SPIDER-MAN: FAR FROM HOME
2101 SPIDER-MAN: FAR FROM HOME
2090 SPIDER-MAN: FAR FROM HOME
2066 SPIDER-MAN: FAR FROM HOME
2156 SPIDER-MAN: FAR FROM HOME
2042 SPIDER-MAN: FAR FROM HOME
2207 SPIDER-MAN: FAR FROM HOME
4963 READY PLAYER ONE
2293 SPIDER-MAN: FAR FROM HOME
2054 SPIDER-MAN: FAR FROM HOME
2232 SPIDER-MAN: FAR FROM HOME
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: SPIDER-MAN: FAR FROM HOME The story is pyrotechnical. The soundtrack is multi-decibel. There is no room for wit, thought, emotion or seriously challenging novelty. But, simultaneously, I'd rather watch Holland do this rubbish than most movie actors.
* movie title: SPIDER-MAN: FAR FROM HOME There's not a beat in this film that doesn't land and isn't flawlessly executed, the climatic fight sequence is something that I never thought I'd see on the big screen and it was a glorious thing t

In [51]:
# Recommendation question with a stated dislike
query = 'I am not a big fan of lengthy movie, should I watch 1917?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 17 document sections, with indexes:
879 1917
933 1917
753 1917
821 1917
759 1917
786 1917
774 1917
798 1917
778 1917
817 1917
674 1917
959 1917
895 1917
826 1917
775 1917
719 1917
905 1917
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: 1917 Sitting through it is like watching someone else playing a video game for two solid hours, and not an especially compelling one at that.
* movie title: 1917 No, the long takes don't transform "1917" into the cinema event of 2019.
* movie title: 1917 Considering the subject matter, I was left extremely impressed but not particularly moved.
* movie title: 1917 This is a movie one does not watch so much as witness. It simply must be seen.
* movie title: 1917 In other words, "1917" often seems built more to wow audience than make them feel. And it may well have been a better film set around extended cuts than fully committing to the one-take gimmick.
* movie title: 1917 With the stakes bei

In [57]:
# Question comparing two movies
query = 'I love visual effects, should I watch Captain Marvel or TOY STORY?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 16 document sections, with indexes:
2774 CAPTAIN MARVEL
2823 CAPTAIN MARVEL
1931 TOY STORY 4
2887 CAPTAIN MARVEL
3086 CAPTAIN MARVEL
1744 TOY STORY 4
1833 TOY STORY 4
1793 TOY STORY 4
1758 TOY STORY 4
2895 CAPTAIN MARVEL
3054 CAPTAIN MARVEL
3038 CAPTAIN MARVEL
1842 TOY STORY 4
3003 CAPTAIN MARVEL
2841 CAPTAIN MARVEL
1805 TOY STORY 4
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: CAPTAIN MARVEL Captain Marvel gives everything a fan girl or boy could want - spectacle, action, heart, and inspiration.
* movie title: CAPTAIN MARVEL Great visual effects and acting by Brie Larson make for an enjoyable watch that embraces a confident and smart woman character, something rarely seen in this genre.
* movie title: TOY STORY 4 The visuals are so impressive that I reached the point where I wasn't sure whether Cooley and his army of Pixar technicians inserted "real" elements like rain and grass into a digitally animated landscape, like

In [53]:
# Open-ended recommendation question that names no movie
query = 'I love emotional movies, what movie should I watch?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 15 document sections, with indexes:
5719 DUNKIRK
1817 TOY STORY 4
5843 DUNKIRK
2369 AVENGERS: ENDGAME
2003 TOY STORY 4
3106 US
2998 CAPTAIN MARVEL
3347 US
2463 AVENGERS: ENDGAME
1977 TOY STORY 4
5657 STAR WARS: THE LAST JEDI
1917 TOY STORY 4
2665 AVENGERS: ENDGAME
4762 READY PLAYER ONE
5652 STAR WARS: THE LAST JEDI
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: DUNKIRK It's an emotional gauntlet, as you'll be glued to the edge of your seats with your eyes staring at the screening.
* movie title: TOY STORY 4 This might be just the most emotional experience you'll have in a cinema all year.
* movie title: DUNKIRK Dunkirk brings a lump to the throat and a tear to the eye. Highly recommended.
* movie title: AVENGERS: ENDGAME It's one of the funniest movies of the 20-plus in the franchise, but it's also the one that made me the most emotional. But more than anything, it left me satisfied and thrilled that I kept watching them 

In [54]:
# Yes/no question about a single movie
query = 'Is Joker a scary movie?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 14 document sections, with indexes:
1110 JOKER
1245 JOKER
1164 JOKER
1028 JOKER
1349 JOKER
1019 JOKER
1295 JOKER
1269 JOKER
1336 JOKER
1038 JOKER
1256 JOKER
1331 JOKER
1314 JOKER
1151 JOKER
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: JOKER Intense and disturbing, Joker is a well-crafted movie, if rather open-ended in its intentions.
* movie title: JOKER The Joker is deadly serious, a bleak but oddly beautiful horror film that evokes the nightmarish nihilism of Martin Scorsese's Taxi Driver.
* movie title: JOKER Joker is a devastatingly bold, brashly aggressive, truly haunting homage to the clown prince of crime. It stands proud as one of the most troubling, worrisome and delightfully distressing pieces of comic book cinema.
* movie title: JOKER Joker is a wicked trip.
* movie title: JOKER Joker is wild, crazy, and intense, and I was left speechless by the end of the film. Joaquin Phoenix delivers a spine-chilling perfo

In [55]:
# Question about the genre of a single movie
query = 'What type of movie is Joker?'
prompt = construct_prompt(query=query, context_embeddings=df['embedding'], df=df)
print(prompt)
retrieve_information(prompt=prompt)

Selected 12 document sections, with indexes:
1314 JOKER
1110 JOKER
1336 JOKER
1346 JOKER
1028 JOKER
1242 JOKER
1347 JOKER
1164 JOKER
1227 JOKER
1245 JOKER
1169 JOKER
1299 JOKER
Answer the question truthfully using context, if unsure, say "I don't know."

Context:

* movie title: JOKER A violent, nihilistic horror film masquerading as both a character drama and a comic book movie
* movie title: JOKER Intense and disturbing, Joker is a well-crafted movie, if rather open-ended in its intentions.
* movie title: JOKER Whether Joker is a social commentary on issues such as poverty or mental illness, a new mysterious take on the best known DC Comics villain, or another unforgettable piece of cinema by Martin Scorsese, you'll need to see to believe it.
* movie title: JOKER More character study than comic book movie, and anchored by an Oscar-worthy Joaquin Phoenix, Joker is a bravura blockbuster that proves you don't need superpowered scraps to dazzle.
* movie title: JOKER Joker is a wicked tri