In [None]:
!pip install cassandra-driver

In [None]:
#!pip install fiddler-client==2.1.0

In [None]:
!pip install langchain

In [None]:
!pip install typing-inspect==0.8.0 typing_extensions==4.5.0

In [None]:
pip install pydantic -U

In [None]:
pip install pydantic==1.10.11

In [None]:
!pip install flask-sqlalchemy

In [None]:
!pip install unstructured

In [None]:
! pip install -q --progress-bar off \
    "git+https://github.com/hemidactylus/langchain@updated-full-preview--lab#egg=langchain&subdirectory=libs/langchain" \
    "cassio>=0.1.1" \
    "google-cloud-aiplatform>=1.25.0" \
    "jupyter>=1.0.0" \
    "openai==0.27.7" \
    "python-dotenv==1.0.0" \
    "tensorflow-cpu==2.12.0" \
    "tiktoken==0.4.0" \
    "transformers>=4.29.2" 
exit()

In [25]:
import cassandra
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json
import pandas as pd
import fiddler as fdl

In [26]:
fdl.__version__

'1.8.5'

In [27]:
cassandra.__version__

'3.28.0'

In [46]:
# This secure connect bundle is autogenerated when you donwload your SCB, 
# if yours is different update the file name below
cloud_config= {
  'secure_connect_bundle': 'datastax_auth/secure-connect-fiddlerai.zip'
}

# This token json file is autogenerated when you donwload your token, 
# if yours is different update the file name below
with open("datastax_auth/danny@fiddler.ai-token.json") as f:
    secrets = json.load(f)

CLIENT_ID = secrets["clientId"]
CLIENT_SECRET = secrets["secret"]

auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

In [47]:
session.set_keyspace('fiddlerai')

In [30]:
# Set your secret(s) for LLM access:
llmProvider = 'OpenAI'  # 'GCP_VertexAI', 'Azure_OpenAI'

In [31]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, DirectoryLoader

In [32]:
from langchain.vectorstores.cassandra import Cassandra

In [33]:
import os
import openai

In [None]:
os.environ['OPENAI_API_TYPE'] = 'open_ai'
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
llm = OpenAI(temperature=0)
myEmbedding = OpenAIEmbeddings()
print('LLM+embeddings from OpenAI')

In [None]:
table_name = 'fiddler_doc_snippets_' + llmProvider

index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Cassandra,
    embedding=myEmbedding,
    text_splitter=CharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=0,
    ),
    vectorstore_kwargs={
        'session': session,
        'keyspace': 'fiddlerai',
        'table_name': table_name,
    },
)

In [None]:
#loader = TextLoader('texts/amontillado.txt', encoding='utf8')

In [None]:
loader = DirectoryLoader('readme/fiddler-2023-10-10/v23.4/', glob="**/*.md")
docs = loader.load()

In [None]:
len(docs)

In [None]:
index = index_creator.from_loaders([loader])

In [43]:
cqlSelect = f'SELECT * FROM fiddlerai.{table_name} LIMIT 30;'  # (Not a production-optimized query ...)
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    # depending on the cassIO version, the underlying Cassandra table can have different structure ...
    try:
        # you are using the new cassIO 0.1.0+ : congratulations :)
        print(f'    row_id:            {row.row_id}')
        print(f'    vector:            {str(row.vector)[:64]} ...')
        print(f'    body_blob:         {row.body_blob[:64]} ...')
        print(f'    metadata_s:        {row.metadata_s}')        
    except AttributeError:
        # Please upgrade your cassIO to the latest version ...
        print(f'    document_id:      {row.document_id}')
        print(f'    embedding_vector: {str(row.embedding_vector)[:64]} ...')
        print(f'    document:         {row.document[:64]} ...')
        print(f'    metadata_blob:    {row.metadata_blob}')

print('\n...')


Row 0:
    row_id:            2a5a3ad7f553497eadfaab74a303bfea
    vector:            [-0.006706948857754469, -0.012583066709339619, 0.007504278328269 ...
    body_blob:         We believe fairness should be ensured to all subgroups of the po ...
    metadata_s:        {'source': 'readme/fiddler-2023-10-10/v23.4/Platform Guide/fairness.md'}

Row 1:
    row_id:            aedcb2dda54b4cc88a24e9d626434f4e
    vector:            [0.0014642368769273162, -0.0029895897023379803, 0.01390048861503 ...
    body_blob:         Updates to the Fiddler containers is accomplished through a shar ...
    metadata_s:        {'source': 'readme/fiddler-2023-10-10/v23.4/Deployment Guide/deploying-fiddler/system-architecture.md'}

Row 2:
    row_id:            68e1f76319584c8fbf68829dfb7568dd
    vector:            [-0.03531530126929283, -0.01197117194533348, -0.0081756031140685 ...
    body_blob:         python Python
events_dict = grouped_df_graded.to_dict('index')
f ...
    metadata_s:        {'source':

In [None]:
query = "What is Fiddler?"
index.query(query, llm=llm)

In [61]:
createTableSQL = """CREATE TABLE fiddler_chatbot_history (
    row_id text PRIMARY KEY,
    response text,
    response_vector vector<float, 1536>,
    source_docs text,
    source_docs_vector vector<float, 1536>,
    question text,
    question_vector vector<float, 1536>,
    comment text,
    feedback int,
    metadata_s map<text, text>,
    ts timestamp)"""
# ) WITH additional_write_policy = '99p'
#     AND bloom_filter_fp_chance = 0.01
#     AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
#     AND comment = ''
#     AND compaction = {'class': 'org.apache.cassandra.db.compaction.UnifiedCompactionStrategy'}
#     AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
#     AND crc_check_chance = 1.0
#     AND default_time_to_live = 0
#     AND gc_grace_seconds = 864000
#     AND max_index_interval = 2048
#     AND memtable_flush_period_in_ms = 0
#     AND min_index_interval = 128
#     AND read_repair = 'BLOCKING'
#     AND speculative_retry = '99p';"""

In [62]:
session.execute(createTableSQL)

<cassandra.cluster.ResultSet at 0x7fefb1e818b0>

In [58]:
EMBEDDING_MODEL = "text-embedding-ada-002"
response = openai.Embedding.create(model=EMBEDDING_MODEL, input='How are you doing')
response

<OpenAIObject list at 0x7fefb250b180> JSON: {
  "data": [
    {
      "embedding": [
        -0.024863969534635544,
        -0.0014291018014773726,
        0.0032793283462524414,
        -0.028412306681275368,
        -0.023839179426431656,
        0.02565818279981613,
        -0.018599940463900566,
        -0.015768958255648613,
        -0.01575614884495735,
        -0.012246241793036461,
        0.027925530448555946,
        0.008723526261746883,
        -0.009479308500885963,
        0.007986958138644695,
        0.0032713222317397594,
        -0.017664819955825806,
        0.040274251252412796,
        -0.003122407477349043,
        -0.0039006073493510485,
        -0.012681777589023113,
        0.015077224932610989,
        -0.004678807221353054,
        -0.00045354969915933907,
        0.005236037075519562,
        -0.0179466363042593,
        -0.0028421913739293814,
        0.003980669192969799,
        -0.010369595140218735,
        0.0197912584990263,
        -0.035790793597698

In [None]:
def pandas_factory(colnames, rows):
    return pd.DataFrame(rows, columns=colnames)

session.row_factory = pandas_factory
session.default_fetch_size = None

In [None]:
rows = session.execute('SELECT * from squad')
    
df_baseline = rows._current_rows
df_baseline

In [None]:
column_types = df_baseline.dtypes

print(column_types)

In [None]:
df_baseline['answers'] = df_baseline['answers'].apply(lambda x : str(x))