# Step #1 Imports and Connections

In [55]:
# preview version - work in progress

import openai
from azure.core.credentials import AzureKeyCredential
from azure.keyvault.secrets import SecretClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
import plotly.express as px
from sklearn.decomposition import PCA
import pandas as pd
from azure.identity import AzureCliCredential
import pandas as pd
from azure.search.documents.indexes.models import (
        SearchIndex,
        SearchField,
        SearchFieldDataType,
        SimpleField,
        SearchableField,
        VectorSearch,
        VectorSearchAlgorithmConfiguration,
    )

from datasets import load_dataset

keyvaultname = 'your keyvault'

# keyvault authentication 
client = SecretClient(f"https://{keyvaultname}.vault.azure.net/", AzureCliCredential())

# This is set to `azure`
openai.api_type = "azure"

# The API key for your Azure OpenAI resource.
openai.api_key = client.get_secret('openai713eastus-api-key').value

# The base URL for your Azure OpenAI resource. e.g. "https://<your resource name>.openai.azure.com"
openai.api_base = client.get_secret('openai713eastus-api-endpoint').value 

# Currently Chat Completion API have the following versions available: 2023-07-01-preview
openai.api_version = "2023-07-01-preview"

# The Azure Search endpoint for your Azure Search resource.
service_endpoint = client.get_secret('azure-search-api-endpoint').value 

# The name of your Azure Search index.
index_name = "mov-index"

# The API key for your Azure Search resource.
key = client.get_secret('azure-search-api-key').value  

# Step #2 Load the Data

In [38]:
# Required Libraries
import pandas as pd
import openai

# Load a sample news dataset from a CSV file
# Delimiter is set to '\t' assuming the CSV is tab-separated
df_orig = pd.read_csv("../data/movie_recommendations/imdb_top_1000.csv")
# https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows

# Make a copy of the original DataFrame for processing
content_df = df_orig.copy()

# Select the first 5 rows for demonstration or testing purposes
content_df = content_df.head(100)
content_df = content_df[['Series_Title', 'Overview', 'Genre', 'Director', 'IMDB_Rating']]

# Function to get embeddings for content using OpenAI's API
def get_embeddings(content):
    """
    Fetch embeddings for given content using OpenAI API.
    
    Parameters:
    - content (str): Text content for which embedding is to be fetched.
    
    Returns:
    - list: The embedding for the given content.
    """
    response = openai.Embedding.create(input=content, engine='text-embedding-ada-002')['data'][0]['embedding']
    return response

# Initialize a new column 'embedding' with empty strings
content_df['embedding'] = ''

# Iterate through each row in the DataFrame to get embeddings
for index, row in content_df.iterrows():
    try:
        # Fetch embedding for the content in the current row
        embedding = get_embeddings(row['Series_Title'] + row['Overview'])
        
        # Store the embedding in the 'embedding' column of the current row
        content_df.at[index, 'embedding'] = embedding
    except Exception as err:
        # Print the error details if there's an issue fetching the embedding
        print(f"Unexpected error: {err} of type {type(err)}")

# Display the processed DataFrame with embeddings
display(content_df)

Unnamed: 0,Series_Title,Overview,Genre,Director,IMDB_Rating,embedding
0,The Shawshank Redemption,Two imprisoned men bond over a number of years...,Drama,Frank Darabont,9.3,"[0.005192748736590147, -0.038253773003816605, ..."
1,The Godfather,An organized crime dynasty's aging patriarch t...,"Crime, Drama",Francis Ford Coppola,9.2,"[0.0051362221129238605, -0.03805561363697052, ..."
2,The Dark Knight,When the menace known as the Joker wreaks havo...,"Action, Crime, Drama",Christopher Nolan,9.0,"[-0.0033200737088918686, -0.04070454090833664,..."
3,The Godfather: Part II,The early life and career of Vito Corleone in ...,"Crime, Drama",Francis Ford Coppola,9.0,"[0.007154673337936401, -0.03457772359251976, 0..."
4,12 Angry Men,A jury holdout attempts to prevent a miscarria...,"Crime, Drama",Sidney Lumet,9.0,"[-0.00941151287406683, -0.02089163102209568, -..."
...,...,...,...,...,...,...
95,Amélie,Amélie is an innocent and naive girl in Paris ...,"Comedy, Romance",Jean-Pierre Jeunet,8.3,"[-0.007555707823485136, -0.02741081640124321, ..."
96,Snatch,"Unscrupulous boxing promoters, violent bookmak...","Comedy, Crime",Guy Ritchie,8.3,"[-0.019652578979730606, -0.028809500858187675,..."
97,Requiem for a Dream,The drug-induced utopias of four Coney Island ...,Drama,Darren Aronofsky,8.3,"[0.00561836501583457, -0.0028779387939721346, ..."
98,American Beauty,A sexually frustrated suburban father has a mi...,Drama,Sam Mendes,8.3,"[-0.007404738571494818, -0.012670470401644707,..."


# Step #3 Fill the Database with Content

In [35]:
# Defining the vector in azure cognitive search

def get_news_index(name: str):
    # we are not only storing the vector but also additional fields 
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String, sortable=True, filterable=True),
        SearchableField(name="director", type=SearchFieldDataType.String, sortable=True, filterable=True),
        SearchableField(name="rating", type=SearchFieldDataType.String, sortable=True, filterable=True),
        SearchableField(name="overview", type=SearchFieldDataType.String),
        SearchField(
            name="descriptionVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_configuration="my-vector-config",
        ),
        SearchableField(
            name="genre", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True
        ),
    ]
    vector_search = VectorSearch(
        algorithm_configurations=[VectorSearchAlgorithmConfiguration(name="my-vector-config", kind="hnsw")]
    )
    return SearchIndex(name=name, fields=fields, vector_search=vector_search)

credential = AzureKeyCredential(key)
index_client = SearchIndexClient(service_endpoint, credential)
# Check if the index exists and if not create it
try:
    index = get_news_index(index_name)
    index = index_client.get_index(index.name)
    display('Index already exists')
except Exception:
    index_client.create_index(index)
    display('Index created')

'Index already exists'

In [54]:
# Uploading the content to Azure Cognitive Search
def get_news_documents(df):
    
    docs = []

    for index, row in df.iterrows():
        docs.append({
            "id": str(index),
            "title": row['Series_Title'],
            "genre": row['Genre'],
            "rating": str(row['IMDB_Rating']),
            "director": row['Director'],
            "overview": row['Overview'],
            "descriptionVector": row['embedding']
        })
    return docs

docs = get_news_documents(content_df)
client = SearchClient(service_endpoint, index_name, credential)
client.upload_documents(documents=docs)


[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cfa60>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf940>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf7c0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cff40>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf040>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf580>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf850>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf280>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf910>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf520>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x1341b3cf2e0>,
 <azure.search.docume

# Step #5 Visuzalizing the data

In [47]:
# get all content from azure cognitive search
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

results = search_client.search(
    search_text=None,
    select=["id", "title", "overview", "genre", "descriptionVector", "director", "rating"]
)

df = pd.DataFrame(columns=['id', 'title', 'genre', 'overview', 'director', 'rating', 'embedding'])

data_list = []

for result in results:
    data_list.append({
        'title': result['title'],
        'id': result['id'],
        'score': result['@search.score'],
        'rating': result['rating'],
        'genre': result['genre'],
        'overview': result['overview'],
        'director': result['director'],
        'embedding': result['descriptionVector']
    })

df_data = pd.DataFrame(data_list)
df_data.head(5)

Unnamed: 0,title,id,score,rating,genre,overview,director,embedding
0,The Shawshank Redemption,0,1.0,9.3,Drama,Two imprisoned men bond over a number of years...,Frank Darabont,"[0.0051927487, -0.038253773, 0.012054708, -0.0..."
1,Soorarai Pottru,20,1.0,8.6,Drama,"Nedumaaran Rajangam ""Maara"" sets out to make t...",Sudha Kongara,"[-0.0044106543, -0.039114844, 0.0010093993, -0..."
2,Shichinin no samurai,31,1.0,8.6,"Action, Adventure, Drama",A poor village under attack by bandits recruit...,Akira Kurosawa,"[-0.012948995, -0.024345139, -0.02152177, 0.00..."
3,Nuovo Cinema Paradiso,45,1.0,8.5,"Drama, Romance",A filmmaker recalls his childhood when falling...,Giuseppe Tornatore,"[0.0069512404, -0.00813443, 0.0044048103, -0.0..."
4,Avengers: Infinity War,60,1.0,8.4,"Action, Adventure, Sci-Fi",The Avengers and their allies must be willing ...,Anthony Russo,"[-0.02783558, -0.04945764, 0.0030488227, -0.02..."


In [57]:
# Assuming df_data is your dataframe with 'embedding', 'title', 'genre', 'director', and 'overview' columns

# Extract embeddings and apply PCA
embeddings = pd.DataFrame(df_data['embedding'].tolist())
reduced_embeddings = PCA(n_components=2).fit_transform(embeddings)

# Add PCA results to the dataframe
df_data['x'] = reduced_embeddings[:, 0]
df_data['y'] = reduced_embeddings[:, 1]

# Create an interactive scatter plot using plotly
fig = px.scatter(df_data, x='x', y='y', hover_data=['title', 'genre', 'director', 'overview'], color='genre')

# Set titles and labels
fig.update_layout(title='Interactive Embeddings Plot', 
    xaxis_title='PCA 1', 
    yaxis_title='PCA 2', 
    xaxis=dict(scaleanchor="y", scaleratio=1),  # Ensure 1:1 aspect ratio
    yaxis=dict(scaleanchor="x", scaleratio=1),
    width=700,   # set the width of the plot in pixels
    height=600
)

# Show the plot
fig.show()

# Step #6 Serarch using Vector Search

In [59]:
# simple vector search
def single_vector_search(query):

    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
    results = search_client.search(
        search_text=None,
        vector=get_embeddings(query),
        top=3,  
        vector_fields="descriptionVector",
        select=["id", "title", "overview", "genre"],
    )
    return results
        
query = "super hero"
results = single_vector_search(query)

print(results)
for result in results:  
    print(f"title: {result['title']}")  
    print(f"id: {result['id']}")
    print(f"Score: {result['@search.score']}")  
    print(f"genre: {result['genre']}") 
    print(f"overview: {result['overview']}") 

top_k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.Vector'> and will be ignored


<iterator object azure.core.paging.ItemPaged at 0x1341a2790d0>
title: Avengers: Infinity War
id: 60
Score: 0.8388787
genre: Action, Adventure, Sci-Fi
overview: The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos before his blitz of devastation and ruin puts an end to the universe.
title: The Dark Knight
id: 2
Score: 0.8345863
genre: Action, Crime, Drama
overview: When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.
title: Spider-Man: Into the Spider-Verse
id: 58
Score: 0.83421373
genre: Animation, Action, Adventure
overview: Teen Miles Morales becomes the Spider-Man of his universe, and must join with five spider-powered individuals from other dimensions to stop a threat for all realities.


In [65]:
# simple vector search with filter
def single_vector_search_with_filter(query):

    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
    results = search_client.search(
        search_text=None,
        vector=get_embeddings(query),
        top=3,  
        vector_fields="descriptionVector",
        filter="genre eq 'Drama'",
        select=["id", "title", "overview", "genre"],
    )
    return results
        
query = "animal hero"
results = single_vector_search_with_filter(query)

print(results)
for result in results:  
    print(f"title: {result['title']}")  
    print(f"id: {result['id']}")
    print(f"Score: {result['@search.score']}")  
    print(f"genre: {result['genre']}") 
    print(f"overview: {result['overview']}") 

top_k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.Vector'> and will be ignored


<iterator object azure.core.paging.ItemPaged at 0x134739c71c0>
title: Soorarai Pottru
id: 20
Score: 0.81495625
genre: Drama
overview: Nedumaaran Rajangam "Maara" sets out to make the common man fly and in the process takes on the world's most capital intensive industry and several enemies who stand in his way.
title: American History X
id: 40
Score: 0.8110039
genre: Drama
overview: A former neo-nazi skinhead tries to prevent his younger brother from going down the same wrong path that he did.
title: Jagten
id: 88
Score: 0.8106593
genre: Drama
overview: A teacher lives a lonely life, all the while struggling over his son's custody. His life slowly gets better as he finds love and receives good news from his son, but his new luck is about to be brutally shattered by an innocent little lie.


# Step #7 Search using Hybrid Search

In [68]:
# simple hybrid search without filter
def single_vector_search_with_filter(query):
    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

    results = search_client.search(
        search_text=query,
        vector=get_embeddings(query),
        top=3,  
        vector_fields="descriptionVector",
        #filter="genre eq 'Drama'",
        select=["id", "title", "overview", "genre"],
    )
    return results

query = "money money money"
results = single_vector_search_with_filter(query)

print(results)
for result in results:  
    print(f"title: {result['title']}")  
    print(f"id: {result['id']}")
    print(f"Score: {result['@search.score']}")  
    print(f"genre: {result['genre']}") 
    print(f"overview: {result['overview']}") 

top_k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.Vector'> and will be ignored


<iterator object azure.core.paging.ItemPaged at 0x1341a21b880>
title: City Lights
id: 52
Score: 0.03333333507180214
genre: Comedy, Drama, Romance
overview: With the aid of a wealthy erratic tippler, a dewy-eyed tramp who has fallen in love with a sightless flower girl accumulates money to be able to help her medically.
title: Psycho
id: 49
Score: 0.016393441706895828
genre: Horror, Mystery, Thriller
overview: A Phoenix secretary embezzles $40,000 from her employer's client, goes on the run, and checks into a remote motel run by a young man under the domination of his mother.
title: Il buono, il brutto, il cattivo
id: 12
Score: 0.016129031777381897
genre: Western
overview: A bounty hunting scam joins two men in an uneasy alliance against a third in a race to find a fortune in gold buried in a remote cemetery.
