## Imports and Installs

In [None]:
!pip install cassandra-driver

In [None]:
!pip install langchain

In [None]:
!pip install typing-inspect==0.8.0 typing_extensions==4.5.0

In [None]:
pip install pydantic -U

In [None]:
pip install pydantic==1.10.11

In [None]:
!pip install flask-sqlalchemy

In [None]:
!pip install unstructured
!pip install -q pypdf

In [None]:
# Install the lab dependencies in a single quiet pip invocation: a LangChain build
# taken from a GitHub branch, plus cassio, the OpenAI client and the other listed packages.
! pip install -q --progress-bar off \
    "git+https://github.com/hemidactylus/langchain@updated-full-preview--lab#egg=langchain&subdirectory=libs/langchain" \
    "cassio>=0.1.1" \
    "google-cloud-aiplatform>=1.25.0" \
    "jupyter>=1.0.0" \
    "openai==0.27.7" \
    "python-dotenv==1.0.0" \
    "tensorflow-cpu==2.12.0" \
    "tiktoken==0.4.0" \
    "transformers>=4.29.2" 
# NOTE(review): exit() ends the running session — presumably to force a kernel restart so
# the freshly installed packages are picked up. A "Run All" will stop at this cell; confirm intent.
exit()

In [None]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [None]:
import cassandra
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json
import os
import pandas as pd
import fiddler as fdl
import ast
from pypdf import PdfReader

In [None]:
# Display the version of the installed `fiddler` package (imported as `fdl`).
fdl.__version__

In [None]:
# Display the version of the installed `cassandra` driver package.
cassandra.__version__

## Connect to Astra

In [None]:
from getpass import getpass

# Path to the Secure Connect Bundle (SCB). The file name is autogenerated when you
# download your SCB; if yours is different, update the file name below.
cloud_config = {
    'secure_connect_bundle': 'datastax_auth/secure-connect-fiddlerai.zip'
}

# Keep the token out of the notebook: read it from the environment and fall back
# to an interactive (non-echoing) prompt when the variable is not set.
ASTRA_DB_APPLICATION_TOKEN = (
    os.environ.get('ASTRA_DB_APPLICATION_TOKEN')
    or getpass('Astra DB application token: ')
)

# The literal user name "token" is passed together with the application token as password.
auth_provider = PlainTextAuthProvider("token", ASTRA_DB_APPLICATION_TOKEN)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)

# `session` is the handle every later cell uses to talk to the database.
session = cluster.connect()

In [None]:
# Make 'fiddlerai' the session's current keyspace, so later statements can use unqualified table names.
session.set_keyspace('fiddlerai')

In [None]:
# Select the LLM provider label. The value is appended to the vector table name
# ('fiddler_rfp_snippets_' + llmProvider) in a later cell.
# NOTE(review): the setup cell below always builds OpenAI clients, whatever is chosen here — confirm intent.
llmProvider = 'OpenAI'  # 'GCP_VertexAI', 'Azure_OpenAI'

In [None]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, DirectoryLoader, DataFrameLoader, GoogleDriveLoader
from langchain.vectorstores.cassandra import Cassandra
import os
import openai

In [None]:
from getpass import getpass

from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

os.environ['OPENAI_API_TYPE'] = 'open_ai'

# Do not clobber a key that is already exported in the environment, and never paste
# the secret into the notebook: prompt (without echo) only when the key is missing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# Deterministic completions (temperature 0) and the default OpenAI embedding client.
llm = OpenAI(temperature=0)
myEmbedding = OpenAIEmbeddings()
print('LLM+embeddings from OpenAI')

In [None]:
# The vector table is named after the selected provider label.
table_name = 'fiddler_rfp_snippets_' + llmProvider

# Character-based splitter: chunks of up to 5000 characters with no overlap.
_snippet_splitter = CharacterTextSplitter(chunk_size=5000, chunk_overlap=0)

# Connection details handed to the Cassandra vector store.
_store_kwargs = {
    'session': session,
    'keyspace': 'fiddlerai',
    'table_name': table_name,
}

# Index factory that ties together the vector store class, the embedding client,
# the splitter and the store's connection arguments.
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Cassandra,
    embedding=myEmbedding,
    text_splitter=_snippet_splitter,
    vectorstore_kwargs=_store_kwargs,
)

## Preprocessing Documents

### Read the CSV file with Answers Library

In [67]:
import pandas as pd

# Load the RFP answer library; each value of the 'Snippet' column is one text snippet.
df = pd.read_csv('documentation_data/RFP Answer Library - Answer Library.csv')

# Single-column view of the library, then the same values as a plain Python list.
df_doc = df.loc[:, ['Snippet']]
snippets = list(df_doc['Snippet'])

# Number of snippets read from the CSV.
len(snippets)

1130

### Convert PDFs to chunked docs that can be appended to the snippets

In [None]:
# Collect the text of every page of every PDF under "documentation_data/Docs"
# (searched recursively); each page becomes one chunk. Make sure all PDF files
# are placed in that folder.
chunked_doc = []
for root, dirs, files in os.walk("documentation_data/Docs"):
    # Sort in place so the traversal (and therefore the chunk order) is reproducible.
    dirs.sort()
    for name in sorted(files):
        # Match the real extension, case-insensitively: the previous `path[-3:] == 'pdf'`
        # test missed "REPORT.PDF" and accepted any name that merely ended in "pdf".
        if not name.lower().endswith('.pdf'):
            continue
        path = os.path.join(root, name)
        reader = PdfReader(path)
        for page in reader.pages:
            file_str = page.extract_text()
            # Pages with no extractable text (e.g. scanned images) carry nothing to index.
            if file_str and file_str.strip():
                chunked_doc.append(file_str)

In [None]:
# Combined corpus as one list: the PDF page chunks first, then the answer-library snippets.
snippets_new = [*chunked_doc, *snippets]

# Total number of text pieces to be loaded.
len(snippets_new)

Reference — how to provide Google Cloud Application Default Credentials (ADC): https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to

In [None]:
# Wrap the combined snippets in a one-column DataFrame whose column is labelled 'Snippet'.
df = pd.DataFrame(snippets_new).set_axis(['Snippet'], axis=1)
df

## Finally Let's Load the Documents

`TRUNCATE TABLE fiddler_rfp_snippets_openai` deletes all existing snippets from that table in Astra DB (the table itself is kept). Run it only once you are ready to load the new documents.

In [None]:
#session.execute("TRUNCATE TABLE fiddler_rfp_snippets_openai")

In [None]:
# Build a loader over `df`; the "Snippet" column supplies each document's page content.
loader = DataFrameLoader(df, page_content_column="Snippet")

In [None]:
# Sanity check: number of documents the loader produces.
len(loader.load())

In [None]:
# Build the index from the loader using the splitter, embedding client and Cassandra
# store configured on `index_creator`.
# NOTE(review): this presumably calls the embedding API for every chunk and writes to Astra — confirm before re-running.
index = index_creator.from_loaders([loader])
index

And you are done. Please go check the application to make sure it is working. 

You can use the optional scripts below to query the Vector DB we just added the docs to 

## Optional

In [None]:
# Peek at up to 30 rows of the vector table and print a truncated view of each one.
cqlSelect = f'SELECT * FROM fiddlerai.{table_name} LIMIT 30;'  # (Not a production-optimized query ...)
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    # Depending on the cassIO version, the underlying Cassandra table can have a different structure,
    # so the newer column names are tried first and the older ones are used as a fallback.
    try:
        # Column layout of cassIO 0.1.0+.
        print(f'    row_id:            {row.row_id}')
        print(f'    vector:            {str(row.vector)[:64]} ...')
        print(f'    body_blob:         {row.body_blob[:64]} ...')
        print(f'    metadata_s:        {row.metadata_s}')        
    except AttributeError:
        # The row lacks one of the newer attributes: print the older column layout instead.
        # Consider upgrading cassIO to the latest version.
        print(f'    document_id:      {row.document_id}')
        print(f'    embedding_vector: {str(row.embedding_vector)[:64]} ...')
        print(f'    document:         {row.document[:64]} ...')
        print(f'    metadata_blob:    {row.metadata_blob}')

print('\n...')

In [None]:
# Smoke test: ask one question against the freshly built index, answered with `llm`.
query = "What is Fiddler?"
index.query(query, llm=llm)

In [None]:
# CQL schema for the chatbot history table: response / source docs / question each stored
# as text plus a 1536-dimensional float vector, along with comment, feedback, metadata and timestamp.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE fails once the table exists.
createTableSQL = """CREATE TABLE IF NOT EXISTS fiddler_chatbot_history (
    row_id text PRIMARY KEY,
    response text,
    response_vector vector<float, 1536>,
    source_docs text,
    source_docs_vector vector<float, 1536>,
    question text,
    question_vector vector<float, 1536>,
    comment text,
    feedback int,
    metadata_s map<text, text>,
    ts timestamp)"""
# ) WITH additional_write_policy = '99p'
#     AND bloom_filter_fp_chance = 0.01
#     AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
#     AND comment = ''
#     AND compaction = {'class': 'org.apache.cassandra.db.compaction.UnifiedCompactionStrategy'}
#     AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
#     AND crc_check_chance = 1.0
#     AND default_time_to_live = 0
#     AND gc_grace_seconds = 864000
#     AND max_index_interval = 2048
#     AND memtable_flush_period_in_ms = 0
#     AND min_index_interval = 128
#     AND read_repair = 'BLOCKING'
#     AND speculative_retry = '99p';"""

In [None]:
# Run the CREATE TABLE statement held in `createTableSQL` on the current session.
session.execute(createTableSQL)

In [None]:
# Smoke test of the embeddings API: embed one sample sentence and display the raw response.
EMBEDDING_MODEL = "text-embedding-ada-002"
response = openai.Embedding.create(model=EMBEDDING_MODEL, input='How are you doing')
response

In [None]:
def pandas_factory(colnames, rows):
    """Pack a query result into a pandas DataFrame.

    Args:
        colnames: Column names, used as the DataFrame's column labels.
        rows: Row values, one sequence per result row.

    Returns:
        A ``pd.DataFrame`` built from ``rows`` with ``colnames`` as its columns.
    """
    frame = pd.DataFrame(data=rows, columns=colnames)
    return frame

# From here on, results of queries on this session are built by pandas_factory (i.e. as DataFrames).
session.row_factory = pandas_factory
# NOTE(review): None presumably turns off result paging so a whole result arrives at once — confirm against the driver docs.
session.default_fetch_size = None

In [None]:
# Read the whole `squad` table from the current keyspace.
rows = session.execute('SELECT * from squad')
    
# NOTE(review): `_current_rows` is a private attribute of the result object; with the
# pandas row factory set on the session it presumably holds the DataFrame — confirm.
df_baseline = rows._current_rows
df_baseline

In [None]:
# Inspect the dtype of each column of the baseline frame.
column_types = df_baseline.dtypes

print(column_types)

In [None]:
# Replace each value of the 'answers' column with its string representation.
df_baseline['answers'] = df_baseline['answers'].apply(str)