### Installs and Imports

In [None]:
!pip install -q cassandra-driver
!pip install -q cassio
!pip install -q langchain
!pip install -q typing-inspect==0.8.0 typing_extensions==4.5.0
!pip install -q pydantic==1.10.11
!pip install -q flask-sqlalchemy
!pip install -q unstructured

In [None]:
! pip install -q --progress-bar off \
    "git+https://github.com/hemidactylus/langchain@updated-full-preview--lab#egg=langchain&subdirectory=libs/langchain" \
    "cassio>=0.1.1" \
    "google-cloud-aiplatform>=1.25.0" \
    "jupyter>=1.0.0" \
    "openai==0.27.7" \
    "python-dotenv==1.0.0" \
    "tensorflow-cpu==2.12.0" \
    "tiktoken==0.4.0" \
    "transformers>=4.29.2" 

In [None]:
import cassandra
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json
import os
import pandas as pd
import fiddler as fdl

In [None]:
# Display the version of the installed cassandra driver package.
cassandra.__version__

### Connect to DataStax

In [None]:
# Path to the Secure Connect Bundle (SCB). The file name is autogenerated when you
# download your SCB; if yours is different, update the file name below.
cloud_config = {'secure_connect_bundle': 'datastax_auth/secure-connect-fiddlerai.zip'}

# Prefer a token supplied through the ASTRA_DB_APPLICATION_TOKEN environment variable,
# so the real secret never has to be typed into (and saved with) the notebook.
# The placeholder is kept as a fallback for anyone who still edits it in place.
ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN', 'Your Token')
# Cell output is stored inside the notebook file, so never echo the token itself;
# only confirm that a value is present.
print(f"TOKEN: <hidden, {len(ASTRA_DB_APPLICATION_TOKEN)} characters>")

# The user name is the literal string "token"; the application token is the password.
auth_provider = PlainTextAuthProvider("token", ASTRA_DB_APPLICATION_TOKEN)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)

# Open the session that the rest of the notebook uses for all CQL statements.
session = cluster.connect()

In [None]:
# Make 'fiddlerai' the session's working keyspace, so that table names written
# without a keyspace prefix in later cells resolve against it.
session.set_keyspace('fiddlerai')

In [None]:
# Keep an OPENAI_API_KEY that is already exported in the environment. The placeholder
# (replace it with your key) is only used when nothing is set, so running this cell
# can no longer overwrite a real key with the placeholder text.
os.environ.setdefault("OPENAI_API_KEY", 'OPENAI Token')

In [None]:
# Choose the LLM provider. The value is also appended to the name of the vector table
# created further down. The trailing comment lists other possible values.
# NOTE(review): only the OpenAI setup is visible in this notebook; confirm before switching.
llmProvider = 'OpenAI'  # 'GCP_VertexAI', 'Azure_OpenAI'

In [None]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, DirectoryLoader, DataFrameLoader

In [None]:
from langchain.vectorstores.cassandra import Cassandra

In [None]:
import os
import openai

In [None]:
# Select the plain OpenAI API flavor via the environment.
# NOTE(review): presumably read by the openai client library; confirm.
os.environ['OPENAI_API_TYPE'] = 'open_ai'
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
# Completion model, created with temperature 0.
llm = OpenAI(temperature=0)
# Embedding wrapper with default settings; passed to the index creator later on.
myEmbedding = OpenAIEmbeddings()
print('LLM+embeddings from OpenAI')

In [None]:
# The vector table is named after the provider chosen earlier.
table_name = f'fiddler_doc_snippets_{llmProvider}'

# Index creator backed by the Cassandra vector store: documents are split into
# chunks of up to 6000 characters with no overlap, embedded with myEmbedding,
# and written to `table_name` in the 'fiddlerai' keyspace through the open session.
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Cassandra,
    embedding=myEmbedding,
    text_splitter=CharacterTextSplitter(chunk_size=6000, chunk_overlap=0),
    vectorstore_kwargs=dict(
        session=session,
        keyspace='fiddlerai',
        table_name=table_name,
    ),
)

In [None]:
# Load the documentation snippets that will feed the vector index
# (the file name indicates the 24.7 docs release).
df = pd.read_csv('documentation_data/vector_index_feed_24.7.csv')
# Display the frame so it can be checked by eye before anything is uploaded.
df

Please double-check your dataframe before running the next cell.
The next cell's `TRUNCATE` statement is commented out as a safety measure; once you uncomment and run it, the existing snippets from the last version of the docs are deleted.

In [None]:
#session.execute("TRUNCATE TABLE fiddler_doc_snippets_openai")

In [None]:
# Prepare the loader: it reads documents from the dataframe, taking each
# document's content from the "text" column.
loader = DataFrameLoader(df, page_content_column="text")
# Show how many documents the loader produces.
len(loader.load())

In [None]:
# This step takes the documents from the dataframe loader, generates their
# embeddings, and pushes them to DataStax.
index = index_creator.from_loaders([loader])
# Display the resulting index object.
index

You are done. Please check the Chatbot to ensure it is working and returning answers from the latest docs we just uploaded.

## Sample code for querying the datastax Vector DB to ensure we have the right data there

In [None]:
# Sanity check in two parts: count the stored snippets, then peek at a few of them.
cqlSelect = f'SELECT count(*) FROM fiddlerai.{table_name};'  # (Not a production-optimized query ...)

# count(*) yields exactly one row with a single column. Read it by position: with the
# driver's default named-tuple rows, `row.count` would resolve to tuple.count(),
# not to the column value.
snippet_count = session.execute(cqlSelect).one()[0]
print(f'Snippets stored in {table_name}: {snippet_count}')

# The per-row dump below needs actual table rows (the count row has none of these
# columns), so fetch a small sample with its own query.
cqlSample = f'SELECT * FROM fiddlerai.{table_name} LIMIT 3;'
rows = session.execute(cqlSample)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    # depending on the cassIO version, the underlying Cassandra table can have different structure ...
    try:
        # you are using the new cassIO 0.1.0+ : congratulations :)
        print(f'    row_id:            {row.row_id}')
        print(f'    vector:            {str(row.vector)[:64]} ...')
        # Truncated like the legacy branch so a full chunk is not dumped to the output.
        print(f'    body_blob:         {row.body_blob[:64]} ...')
        print(f'    metadata_s:        {row.metadata_s}')
    except AttributeError:
        # Please upgrade your cassIO to the latest version ...
        print(f'    document_id:      {row.document_id}')
        print(f'    embedding_vector: {str(row.embedding_vector)[:64]} ...')
        print(f'    document:         {row.document[:64]} ...')
        print(f'    metadata_blob:    {row.metadata_blob}')

print('\n...')

In [None]:
# Smoke test: ask one question through the index and let the LLM answer it.
query = "What is Fiddler?"
index.query(query, llm=llm)

In [None]:
# DDL for the chatbot history table: question, response and source documents are each
# stored as text next to a 1536-dimension float vector, plus user feedback, a comment,
# free-form metadata and a timestamp.
# IF NOT EXISTS keeps the statement safe to re-run: without it, executing the DDL a
# second time fails because the table already exists.
createTableSQL = """CREATE TABLE IF NOT EXISTS fiddler_chatbot_history (
    row_id text PRIMARY KEY,
    response text,
    response_vector vector<float, 1536>,
    source_docs text,
    source_docs_vector vector<float, 1536>,
    question text,
    question_vector vector<float, 1536>,
    comment text,
    feedback int,
    metadata_s map<text, text>,
    ts timestamp)"""
# Optional table settings, kept for reference (not part of the statement above):
# ) WITH additional_write_policy = '99p'
#     AND bloom_filter_fp_chance = 0.01
#     AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
#     AND comment = ''
#     AND compaction = {'class': 'org.apache.cassandra.db.compaction.UnifiedCompactionStrategy'}
#     AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
#     AND crc_check_chance = 1.0
#     AND default_time_to_live = 0
#     AND gc_grace_seconds = 864000
#     AND max_index_interval = 2048
#     AND memtable_flush_period_in_ms = 0
#     AND min_index_interval = 128
#     AND read_repair = 'BLOCKING'
#     AND speculative_retry = '99p';"""

In [None]:
# Run the CREATE TABLE statement held in createTableSQL against the current keyspace.
session.execute(createTableSQL)

In [None]:
# Request an embedding for a sample sentence to confirm that embedding access works.
EMBEDDING_MODEL = "text-embedding-ada-002"
response = openai.Embedding.create(model=EMBEDDING_MODEL, input='How are you doing')
# Display the raw API response.
response

In [None]:
def pandas_factory(colnames, rows):
    """Row factory that hands query results back as a pandas DataFrame.

    Args:
        colnames: Column names of the result set; they become the DataFrame columns.
        rows: The result rows, one sequence of values per row.

    Returns:
        A DataFrame with one row per result row and one column per name.
    """
    frame = pd.DataFrame(data=rows, columns=colnames)
    return frame

# From here on, every query result on this session is built by pandas_factory,
# i.e. delivered as a DataFrame instead of the driver's default row objects.
session.row_factory = pandas_factory
# A fetch size of None turns off result paging, so a query's rows arrive in one fetch
# rather than page by page.
session.default_fetch_size = None

In [None]:
# Read every row of the 'squad' table (no keyspace prefix, so the session's
# current keyspace is used).
rows = session.execute('SELECT * from squad')
    
# Assumes session.row_factory was set to pandas_factory in an earlier cell: the result
# set then holds the DataFrame in `_current_rows`. This is a private driver attribute,
# so it may change between driver versions.
df_baseline = rows._current_rows
df_baseline

In [None]:
# Inspect the dtype pandas assigned to each column of the baseline frame.
column_types = df_baseline.dtypes

print(column_types)

In [None]:
# Replace each value in the 'answers' column with its string representation.
df_baseline['answers'] = df_baseline['answers'].apply(str)