## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
#!pip install -r requirements.txt -q

In [None]:
import os
import pandas as pd

import hopsworks

from hsfs import embedding
from hsfs.feature import Feature

from openai import OpenAI
from sentence_transformers import SentenceTransformer

from pipelines.stanford_reports import get_reports_df
from pipelines.portfolio_companies import get_portfolio_df

import config

import warnings
warnings.filterwarnings('ignore')

## <span style="color:#ff5f27">💾 Download Stanford AI index reports and extract text </span>

In [None]:
# Build the Stanford AI Index report DataFrame with the project pipeline helper
# (imported from pipelines.stanford_reports). Later cells read its 'text' column
# to compute embeddings.
# NOTE(review): presumably this downloads the reports and extracts their text,
# as the section heading says — the helper's body is not visible here.
ai_report_text_processed_df = get_reports_df()

## <span style="color:#ff5f27">💾 Fetch EQT X portfolio companies' websites and extract text </span>

In [None]:
# Build the EQT X portfolio DataFrame with the project pipeline helper
# (imported from pipelines.portfolio_companies). Later cells read its 'text'
# column to compute embeddings.
# NOTE(review): presumably this fetches the company websites and extracts their
# text, as the section heading says — the helper's body is not visible here.
eqt_x_portfolio_text_processed_df = get_portfolio_df()

## <span style="color:#ff5f27">⚙️ Create Embeddings for Semantic Search </span>

### AI report embeddings

In [None]:
# Load the SentenceTransformer checkpoint named in config and move it to the configured device
report_sentence_transformer = SentenceTransformer(
    config.MODEL_SENTENCE_TRANSFORMER,
).to(config.DEVICE)

# Start from a clean 0..n-1 index so every derived column lines up row by row
ai_report_text_processed_df = ai_report_text_processed_df.reset_index(drop=True)

# encode() is documented for a str or a list of str, so hand it a plain list
# instead of a pandas Series (a Series is indexed by label, not by position).
report_embeddings = report_sentence_transformer.encode(
    ai_report_text_processed_df['text'].tolist(),
)

# Store one embedding (a list of floats) per row; the explicit index keeps the
# assignment aligned with the frame regardless of how the Series is built.
ai_report_text_processed_df['embeddings'] = pd.Series(
    report_embeddings.tolist(),
    index=ai_report_text_processed_df.index,
)

# Sequential row id 0..n-1; the feature group below uses it as its primary key
ai_report_text_processed_df['context_id'] = range(len(ai_report_text_processed_df))

# Preview the first rows with the added 'embeddings' and 'context_id' columns
# (the full frame is too large to be a useful cell output)
ai_report_text_processed_df.head()

### Portfolio company embeddings

In [None]:
# Load the SentenceTransformer checkpoint named in config and move it to the configured device.
# This reads the same config.MODEL_SENTENCE_TRANSFORMER / config.DEVICE values as the
# report encoder, so both encoders produce vectors of the same dimension.
portfolio_sentence_transformer = SentenceTransformer(
    config.MODEL_SENTENCE_TRANSFORMER,
).to(config.DEVICE)

# Start from a clean 0..n-1 index so every derived column lines up row by row
eqt_x_portfolio_text_processed_df = eqt_x_portfolio_text_processed_df.reset_index(drop=True)

# encode() is documented for a str or a list of str, so hand it a plain list
# instead of a pandas Series (a Series is indexed by label, not by position).
portfolio_embeddings = portfolio_sentence_transformer.encode(
    eqt_x_portfolio_text_processed_df['text'].tolist(),
)

# Store one embedding (a list of floats) per row; the explicit index keeps the
# assignment aligned with the frame regardless of how the Series is built.
eqt_x_portfolio_text_processed_df['embeddings'] = pd.Series(
    portfolio_embeddings.tolist(),
    index=eqt_x_portfolio_text_processed_df.index,
)

# Sequential row id 0..n-1
eqt_x_portfolio_text_processed_df['context_id'] = range(len(eqt_x_portfolio_text_processed_df))

# Display only the extracted 'text' column as a sanity check of the scraped content
eqt_x_portfolio_text_processed_df.text

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
# Log in to Hopsworks; the returned project handle is the entry point for the
# feature store used by every cell below.
project = hopsworks.login()
# Feature store handle: used to create the feature groups and feature views.
fs = project.get_feature_store()

## <span style="color:#ff5f27;"> 🪄 Create Feature Groups  </span>

### Stanford AI reports feature group

In [None]:
# Vector size reported by the report encoder; the index is declared with it
report_embedding_dim = report_sentence_transformer.get_sentence_embedding_dimension()

# Embedding index with a single vector feature, the 'embeddings' column
stanford_report_emb = embedding.EmbeddingIndex()
stanford_report_emb.add_embedding("embeddings", report_embedding_dim)

In [None]:
# Explicit schema as (feature name, offline type, online type), in column order.
# NOTE(review): 'url' is capped at varchar(100) online — confirm no report URL is longer.
stanford_report_schema = [
    ("name", "string", "varchar(100)"),
    ("url", "string", "varchar(100)"),
    ("source", "string", "varchar(100)"),
    ("page_number", "bigint", "bigint"),
    ("paragraph", "bigint", "bigint"),
    ("text", "string", "VARCHAR(5900)"),
    ("year", "bigint", "bigint"),
    ("timestamp", "timestamp", "timestamp"),
    ("embeddings", "array<double>", "varbinary(100)"),
    ("context_id", "bigint", "bigint"),
]

stanford_report_features = [
    Feature(name=feature_name, type=offline_type, online_type=online_type)
    for feature_name, offline_type, online_type in stanford_report_schema
]

# Get or create the 'stanford_reports' feature group: online-enabled, keyed by
# the per-row 'context_id', with the embedding index attached.
stanford_reports_fg = fs.get_or_create_feature_group(
    name="stanford_reports",
    version=1,
    description="Stanford AI report.",
    primary_key=["context_id"],
    online_enabled=True,
    embedding_index=stanford_report_emb,
)

# save() receives the explicit Feature list; insert() then writes the DataFrame rows
stanford_reports_fg.save(stanford_report_features)
stanford_reports_fg.insert(ai_report_text_processed_df)

### EQT X portfolio feature group

In [None]:
# Vector size reported by the portfolio encoder; the index is declared with it
portfolio_embedding_dim = portfolio_sentence_transformer.get_sentence_embedding_dimension()

# Embedding index with a single vector feature, the 'embeddings' column
portfolio_emb = embedding.EmbeddingIndex()
portfolio_emb.add_embedding("embeddings", portfolio_embedding_dim)

In [None]:
# Explicit schema as (feature name, offline type, online type), in column order.
# NOTE(review): 'url' is capped at varchar(100) online — confirm no company URL is longer.
portfolio_schema = [
    ("name", "string", "varchar(100)"),
    ("url", "string", "varchar(100)"),
    ("source", "string", "varchar(100)"),
    ("page_number", "bigint", "bigint"),
    ("paragraph", "bigint", "bigint"),
    ("text", "string", "VARCHAR(5900)"),
    ("year", "bigint", "bigint"),
    ("timestamp", "timestamp", "timestamp"),
    ("embeddings", "array<double>", "varbinary(100)"),
    ("context_id", "bigint", "bigint"),
]

portfolio_features = [
    Feature(name=feature_name, type=offline_type, online_type=online_type)
    for feature_name, offline_type, online_type in portfolio_schema
]

# Get or create the 'eqt_portfolio' feature group: online-enabled, with the
# embedding index attached.
# NOTE(review): the primary key here is 'name', whereas the 'stanford_reports'
# group keys on 'context_id'. If the portfolio frame can hold several rows per
# company, rows sharing a name would collide on this key — confirm one row per name.
portfolio_fg = fs.get_or_create_feature_group(
    name="eqt_portfolio",
    version=1,
    description="EQT portfolio companies.",
    primary_key=["name"],
    online_enabled=True,
    embedding_index=portfolio_emb,
)

# save() receives the explicit Feature list; insert() then writes the DataFrame rows
portfolio_fg.save(portfolio_features)
portfolio_fg.insert(eqt_x_portfolio_text_processed_df)

## <span style="color:#ff5f27;">🪄 Create Feature Views </span>

In [None]:
# Columns exposed by the view; 'timestamp', 'embeddings' and 'context_id' are left out
stanford_report_view_columns = [
    "name",
    "url",
    "source",
    "page_number",
    "paragraph",
    "text",
    "year",
]

# Get or create version 1 of the 'stanford_reports' feature view on top of the
# feature group of the same name
feature_view = fs.get_or_create_feature_view(
    name="stanford_reports",
    version=1,
    description="Stanford reports for RAG system",
    query=stanford_reports_fg.select(stanford_report_view_columns),
)

In [None]:
# Columns exposed by the view; 'year', 'timestamp', 'embeddings' and 'context_id'
# are left out
portfolio_view_columns = [
    "name",
    "url",
    "source",
    "page_number",
    "paragraph",
    "text",
]

# Get or create version 1 of the 'eqt_portfolio' feature view on top of the
# feature group of the same name.
# NOTE: this rebinds `feature_view`, which the previous cell also assigns, so
# only the portfolio view stays reachable under that name afterwards.
feature_view = fs.get_or_create_feature_view(
    name="eqt_portfolio",
    version=1,
    description="Text data from EQT portfolio companies for RAG system",
    query=portfolio_fg.select(portfolio_view_columns),
)

---