## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
#!pip install -r requirements.txt -q

In [None]:
import os
import pandas as pd

import hopsworks

from hsfs import embedding
from hsfs.feature import Feature

from openai import OpenAI
from sentence_transformers import SentenceTransformer

from dotenv import load_dotenv

from pipelines.guidelines import get_reports_df
from functions.pdf_preprocess import get_file_paths

import config

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load variables defined in a local .env file into the process environment.
# NOTE(review): presumably this supplies the API keys used by later cells
# (e.g. for Hopsworks / OpenAI) — confirm which variables are expected.
load_dotenv()

## <span style="color:#ff5f27">💾 Download guidelines and extract text </span>

In [None]:
# Collect the paths of the source documents found under the "data" directory.
# NOTE(review): despite the name `urls`, these look like local file paths — confirm.
urls = get_file_paths("data")

In [None]:
# Build a DataFrame from the collected documents.
# NOTE(review): later cells rely on a 'text' column being present in the result — confirm the full schema in pipelines.guidelines.
ai_report_text_processed_df = get_reports_df(urls)

In [None]:
# Display the extracted DataFrame as the cell output for a quick visual check
ai_report_text_processed_df

## <span style="color:#ff5f27">⚙️ Create Embeddings for Semantic Search </span>

### AI report embeddings

In [None]:
# Instantiate the sentence-embedding model named in the config,
# then move it to the configured device
report_sentence_transformer = SentenceTransformer(config.MODEL_SENTENCE_TRANSFORMER)
report_sentence_transformer = report_sentence_transformer.to(config.DEVICE)

# Renumber the rows 0..n-1 so the positionally built columns below
# line up with the DataFrame's index
ai_report_text_processed_df = ai_report_text_processed_df.reset_index(drop=True)

# Encode every entry of the 'text' column and store one embedding
# (converted to a plain Python list) per row
text_embeddings = report_sentence_transformer.encode(ai_report_text_processed_df['text'])
ai_report_text_processed_df['embeddings'] = pd.Series(text_embeddings.tolist())

# Number the rows sequentially; 'context_id' serves as the primary key
# of the feature group created later in this notebook
ai_report_text_processed_df['context_id'] = list(range(len(ai_report_text_processed_df)))

# Show the DataFrame with the new 'embeddings' and 'context_id' columns
ai_report_text_processed_df

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
# Log in to Hopsworks and obtain a handle to the project's feature store;
# `fs` is used by the cells below to create feature groups and feature views.
project = hopsworks.login()
fs = project.get_feature_store() 

## <span style="color:#ff5f27;"> 🪄 Create Feature Groups  </span>

### Administrative protocols feature group

In [None]:
# Set up an embedding index for the feature group
administrative_protocols_emb = embedding.EmbeddingIndex()

# Register the 'embeddings' feature in the index, sized to the vector
# dimension reported by the sentence-transformer model
embedding_dimension = report_sentence_transformer.get_sentence_embedding_dimension()
administrative_protocols_emb.add_embedding("embeddings", embedding_dimension)

In [None]:
# Explicit schema for the feature group, one row per feature:
# (feature name, offline type, online type)
protocol_feature_specs = [
    ("name", "string", "varchar(100)"),
    ("url", "string", "varchar(100)"),
    ("source", "string", "varchar(100)"),
    ("page_number", "bigint", "bigint"),
    ("paragraph", "bigint", "bigint"),
    ("text", "string", "VARCHAR(5900)"),
    ("year", "bigint", "bigint"),
    ("timestamp", "timestamp", "timestamp"),
    ("embeddings", "array<double>", "varbinary(100)"),
    ("context_id", "bigint", "bigint"),
]
administrative_protocols_features = [
    Feature(name=feature_name, type=offline_type, online_type=online_type)
    for feature_name, offline_type, online_type in protocol_feature_specs
]

# Get or create the 'administrative_protocols' feature group, keyed on
# 'context_id', online-enabled and backed by the embedding index
administrative_protocols_fg = fs.get_or_create_feature_group(
    name="administrative_protocols",
    version=1,
    description='Medical and administrative protocols from a Swedish forensic psychiatric examination unit.',
    primary_key=['context_id'],
    online_enabled=True,
    embedding_index=administrative_protocols_emb,
)

# A missing id means the feature group has not been saved yet, so register
# it with the explicit schema before any data is written
is_new_feature_group = administrative_protocols_fg.id is None
if is_new_feature_group:
    administrative_protocols_fg.save(administrative_protocols_features)

# Write the processed documents (with embeddings) into the feature group
administrative_protocols_fg.insert(ai_report_text_processed_df)

In [None]:
# Schema for user feedback: feature name -> offline type
feedback_feature_types = {
    'feedback_id': 'string',
    'user_query': 'string',
    'assistant_response': 'string',
    'like': 'string',
    'feedback': 'string',
    'session_id': 'string',
    'timestamp': 'timestamp',
}
user_feedback_features = [
    Feature(name=feature_name, type=feature_type)
    for feature_name, feature_type in feedback_feature_types.items()
]

# Get or create the 'user_feedback' feature group: keyed on 'feedback_id',
# offline only (online_enabled=False), created with stream=True
feedback_fg = fs.get_or_create_feature_group(
    name="user_feedback",
    version=1,
    description='User feedback on responses from the RMV assistant chatbot',
    primary_key=['feedback_id'],
    online_enabled=False,
    stream=True,
)

# A missing id means the feature group has not been saved yet, so register
# it with the schema above; no data is inserted here
feedback_group_is_new = feedback_fg.id is None
if feedback_group_is_new:
    feedback_fg.save(user_feedback_features)

## <span style="color:#ff5f27;">🪄 Create Feature Views </span>

In [None]:
# Columns of the feature group exposed through the feature view
# (the 'timestamp', 'embeddings' and 'context_id' features are not selected)
protocol_columns = ["name", "url", "source", "page_number", "paragraph", "text", "year"]
protocol_query = administrative_protocols_fg.select(protocol_columns)

# Get or create the 'administrative_protocols' feature view on top of that query
feature_view = fs.get_or_create_feature_view(
    name="administrative_protocols",
    version=1,
    description='Medical and administrative protocols from a Swedish forensic psychiatric examination unit for RAG system',
    query=protocol_query,
)

---