In [114]:
from llama_index.llms.anthropic import Anthropic
from llama_index.core.query_engine import PandasQueryEngine
from llama_index.core import Settings
import os
from dotenv import load_dotenv
# tokenizer = Anthropic().tokenizer
# Settings.tokenizer = tokenizer
load_dotenv()
import pandas as pd

pd.set_option('display.max_colwidth', None)
import string
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from llama_index.readers.file import CSVReader
from pathlib import Path
from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage, SimpleDirectoryReader, ServiceContext
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool, QueryEngineTool, ToolMetadata

In [2]:
file_path = os.path.join('..','data','omfh_backup.csv')

In [3]:
df = pd.read_csv(file_path)
df.Motivation.fillna('',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Motivation.fillna('',inplace=True)


In [4]:
def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    cleaned = ' '.join(lemmatized) # Join back to a string
    return cleaned

# Apply to all texts
df['clean_text'] = df.Motivation.apply(clean)

In [101]:
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
# To customize your API key, do this
# otherwise it will lookup ANTHROPIC_API_KEY from your env variable
# llm = Anthropic(api_key="<>)
llm = Anthropic(model="claude-3-opus-20240229")

# resp = llm.complete("Paul Graham is ", max_tokens=5)


In [103]:
from llama_index.core import Settings
tokenizer = Anthropic().tokenizer
Settings.tokenizer = tokenizer
Settings.llm = Anthropic(model="claude-3-opus-20240229",api_key=ANTHROPIC_API_KEY)

In [111]:
from llama_index.core import PromptTemplate


instruction_str = """\
    1. Convert the query to executable Python code using Pandas.
    2. The final line of code should be a Python expression that can be called with the `eval()` function.
    3. The code should represent a solution to the query.
    4. PRINT ONLY THE EXPRESSION.
    5. Do not quote the expression."""

new_prompt = PromptTemplate(
    """\
    You are working with a pandas dataframe in Python.
    The name of the dataframe is `df`.
    This is the result of `print(df.head())`:
    {df_str}

    Follow these instructions:
    {instruction_str}
    Query: {query_str}

    Expression: """
)

context = """Purpose: The primary role of this agent is to assist users by providing accurate 
            information about the patients in this data set who have signed up for a breathing class. The agen has 
            the choice to query the dataframe usings pandas or using it's llm capabilities to answer the question."""

In [8]:
df_query_engine = PandasQueryEngine(df=df.head(),verbose=True,instruction_str=instruction_str)

In [9]:
df_query_engine.update_prompts({"pandas_prompt": new_prompt})
df_query_engine.query("how man females are in and where are they from?")

> Pandas Instructions:
```
df[(df['Gender'] == 'Female')]['County'].value_counts()
```
> Pandas Output: County
Liverpool     1
Merseyside    1
Name: count, dtype: int64


Response(response='County\nLiverpool     1\nMerseyside    1\nName: count, dtype: int64', source_nodes=[], metadata={'pandas_instruction_str': "df[(df['Gender'] == 'Female')]['County'].value_counts()", 'raw_pandas_output': 'County\nLiverpool     1\nMerseyside    1\nName: count, dtype: int64'})

In [None]:
file_path

In [57]:
df_head= df.head()
df_head.to_csv('../data/df_head.csv')

In [58]:
documents = CSVReader().load_data(Path('../data/df_head.csv'))

In [104]:
service_context = ServiceContext.from_defaults(chunk_size=1024, llm=llm, embed_model="local")

  service_context = ServiceContext.from_defaults(chunk_size=1024, llm=llm, embed_model="local")


In [96]:
def get_index(data, index_name):
    index = None
    if not os.path.exists(index_name):
        print("building index", index_name)
        index = VectorStoreIndex.from_documents(data, show_progress=True,llm=llm,service_context=service_context)
        index.storage_context.persist(persist_dir=index_name)
    else:
        index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_name)
        )

    return index


In [105]:
df_index = VectorStoreIndex.from_documents(documents, show_progress=True,llm=llm,service_context=service_context)




Parsing nodes:   0%|                                                                 | 0/1 [00:00<?, ?it/s][A[A

Parsing nodes: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.19it/s][A[A


Generating embeddings:   0%|                                                         | 0/2 [00:00<?, ?it/s][A[A

Generating embeddings: 100%|█████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.11s/it][A[A


In [72]:
df_index = get_index(documents,'df')

In [110]:
query_engine_llm = df_index.as_query_engine(llm=llm)
# response = query_engine_llm.query("what's the main motivation for the patients to join the class??")
print(response)

Based on the information provided, the main motivations for joining the mindfulness class include:

1. Learning techniques to better manage and accept chronic pain conditions like migraines, neck pain, knee pain, fibromyalgia, and long COVID symptoms.

2. Gaining deeper knowledge of mindfulness to apply in their professional work, such as an occupational therapist wanting to use it with patients. 

3. Finding ways to reduce stress and anxiety, listen to their body, and make time for relaxation and healing, especially for those with busy lives as full-time workers or single parents.

4. Incorporating mindfulness into their daily life, as some have been influenced by therapists to include more relaxation practices to help control pain, discomfort and unhelpful thoughts.

So in summary, the key drivers are learning skills to cope with chronic health issues, reducing stress, and applying mindfulness personally and professionally.


In [115]:
tools = [
    QueryEngineTool(
        query_engine=df_query_engine,
        metadata=ToolMetadata(
            name="df_data",
            description="this translates human lanuage into a pandas query",
        ),
    ),
    QueryEngineTool(
            query_engine=query_engine_llm,
        metadata=ToolMetadata(
            name="llm_data",
            description="This queries the llm to answer text style questions",
        ),
    ),
]

llm = Anthropic(model="claude-3-opus-20240229")
agent = ReActAgent.from_tools(tools, llm=llm, verbose=True, context=context)

In [116]:
while (prompt := input("Enter a prompt (q to quit): ")) != "q":
    result = agent.query(prompt)
    print(result)

Enter a prompt (q to quit): how many patients are there
[1;3;38;5;200mThought: I need to use a tool to help me answer the question.
Action: df_data
Action Input: {'input': 'how many rows are in the dataframe?'}
[0m> Pandas Instructions:
```
len(df)
```
> Pandas Output: 5
[1;3;34mObservation: 5
[0m[1;3;38;5;200mThought: I can answer without using any more tools.
Answer: There are 5 patients in the dataset.
[0mThere are 5 patients in the dataset.
Enter a prompt (q to quit): what is the main motivation for patients to join the class?
[1;3;38;5;200mThought: I need to use a tool to help me answer the question.
Action: llm_data
Action Input: {'input': 'what is the main motivation for patients to join the breathing class?'}
[0m[1;3;34mObservation: Based on the information provided, the main motivations for patients to join the breathing class include:

1. Learning techniques to help deal with chronic pain conditions like migraines, neck pain, knee pain, fibromyalgia, and long COVID. 