In [5]:
# Import required libraries
from langchain.document_loaders import (
    PyMuPDFLoader,  # For loading PDF files
    DirectoryLoader,  # For loading files from a directory
    TextLoader,  # For loading plain text files
    Docx2txtLoader,  # For loading DOCX files
    UnstructuredPowerPointLoader,  # For loading PPTX files
    UnstructuredExcelLoader  # For loading XLSX files
)
from langchain.schema import Document
from langchain.document_loaders.csv_loader import CSVLoader  # For loading CSV files
# For splitting text into smaller chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# For creating a vector database for similarity search
from langchain.vectorstores import Pinecone
# For generating embeddings with OpenAI's embedding model
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

from dotenv import load_dotenv  # For loading environment variables from .env file
import os

# Load environment variables from .env file
load_dotenv()

# Replace with the name of the directory carrying your data
data_directory = "new_data"

pinecone.init(api_key=os.getenv("PINECONE_API_KEY"),
              environment=os.getenv("PINECONE_ENVIRONMENT"))

# Load your documents from different sources


def get_documents():
    # Create loaders for PDF, text, CSV, DOCX, PPTX, XLSX files in the specified directory
    pdf_loader = DirectoryLoader(
        f"./{data_directory}", glob="**/*.pdf", loader_cls=PyMuPDFLoader)
    txt_loader = DirectoryLoader(
        f"./{data_directory}", glob="**/*.txt", loader_cls=TextLoader,loader_kwargs={'autodetect_encoding': True})
    csv_loader = DirectoryLoader(
        f"./{data_directory}", glob="**/*.csv", loader_cls=CSVLoader)
    docx_loader = DirectoryLoader(
        f"./{data_directory}", glob="**/*.docx", loader_cls=Docx2txtLoader)
    pptx_loader = DirectoryLoader(
        f"./{data_directory}", glob="**/*.pptx", loader_cls=UnstructuredPowerPointLoader)
    xlsx_loader = DirectoryLoader(
        f"./{data_directory}", glob="**/*.xlsx", loader_cls=UnstructuredExcelLoader)

    # Initialize the 'docs' variable
    docs = None

    # Load files using the respective loaders
    pdf_data = pdf_loader.load()
    txt_data = txt_loader.load()
    csv_data = csv_loader.load()
    docx_data = docx_loader.load()
    pptx_data = pptx_loader.load()
    xlsx_data = xlsx_loader.load()

    # Combine all loaded data into a single list
    docs = pdf_data + txt_data + csv_data + docx_data + pptx_data + xlsx_data

    # Return all loaded data
    return docs


# Get the raw documents from different sources
raw_docs = get_documents()

# Split the documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=5)

docs = text_splitter.split_documents(raw_docs)

print(f"Total docs: {len(docs)}")

Total docs: 1370


In [6]:
from doctran import Doctran
from dotenv import load_dotenv  # For loading environment variables from .env file
import os
# Load environment variables from .env file
load_dotenv()
doctran = Doctran(openai_api_key=os.getenv("OPENAI_API_KEY"))

In [7]:
for doc in docs:
    document = doctran.parse(content=doc.page_content)
    transformed_document = await document.interrogate().execute()
    doc.page_content = {
        "body":doc.page_content,
        "QnAs": transformed_document.extracted_properties
    }

In [2]:
from pinecone_text.sparse import BM25Encoder

# or from pinecone_text.sparse import SpladeEncoder if you wish to work with SPLADE

# use default tf-idf values
bm25_encoder = BM25Encoder().default()

In [7]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [8]:
corpus = [doc.page_content for doc in docs]

# fit tf-idf values on your corpus
bm25_encoder.fit(corpus)

# store the values to a json file
bm25_encoder.dump("bm25_values_for_ddw.json")

# load to your BM25Encoder object
bm25_encoder = BM25Encoder().load("bm25_values_for_ddw.json")

  0%|          | 0/1370 [00:00<?, ?it/s]

In [9]:
from langchain.retrievers import PineconeHybridSearchRetriever
import pinecone

pinecone.init(api_key=os.getenv("PINECONE_API_KEY"),
              environment=os.getenv("PINECONE_ENVIRONMENT"))

index = pinecone.Index(os.getenv("PINECONE_INDEX"))
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings, sparse_encoder=bm25_encoder, index=index
)

In [10]:
retriever.add_texts(texts=corpus,metadatas=[doc.metadata for doc in docs])

  0%|          | 0/43 [00:00<?, ?it/s]

In [1]:
import pandas as pd
from pandasai import SmartDataframe

# Read data from a CSV file
file_path = 'data\\DDW Location details shorter.csv'  # Replace 'your_file.csv' with the actual file path
df = pd.read_csv(file_path)

# Instantiate a LLM
from pandasai.llm import OpenAI
llm = OpenAI(api_token="sk-PPvmbUT1asdfasfdsfpeYT2ZSi")

# Create a SmartDataframe from the loaded data
sdf = SmartDataframe(df, config={"llm": llm})

# Chat with the SmartDataframe
sdf.chat('which exhibit includes an outdoor art installation, is open on October 25 that has contributors from multiple countries?')


'No exhibit open on October 25.'

In [1]:
from langchain.document_loaders.csv_loader import CSVLoader

In [4]:
loader = CSVLoader(file_path="test_csv.csv")
data = loader.load()

In [5]:
data

[Document(page_content='Index: 1\nOrganization Id: E84A904909dF528\nName: Liu-Hoover\nWebsite: http://www.day-hartman.org/\nCountry: Western Sahara\nDescription: Ergonomic zero administration knowledge user\nFounded: 1980\nIndustry: Online Publishing\nNumber of employees: 6852', metadata={'source': 'test_csv.csv', 'row': 0}),
 Document(page_content='Index: 2\nOrganization Id: AAC4f9aBF86EAeF\nName: Orr-Armstrong\nWebsite: https://www.chapman.net/\nCountry: Algeria\nDescription: Ergonomic radical budgetary management\nFounded: 1970\nIndustry: Import / Export\nNumber of employees: 7994', metadata={'source': 'test_csv.csv', 'row': 1}),
 Document(page_content="Index: 3\nOrganization Id: ad2eb3C8C24DB87\nName: Gill-Lamb\nWebsite: http://lin.com/\nCountry: Cote d'Ivoire\nDescription: Programmable intermediate conglomeration\nFounded: 2005\nIndustry: Apparel / Fashion\nNumber of employees: 5105", metadata={'source': 'test_csv.csv', 'row': 2}),
 Document(page_content='Index: 4\nOrganization Id

In [9]:
import boto3
from dotenv import load_dotenv
load_dotenv()
import os

# Initialize the S3 client
s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                  aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"))

# Specify your S3 bucket name and the filename you want to use in S3
bucket_name = 'directionsappbucket'
file_name = 'sample.txt'

# Specify the local file path of the image you want to upload
local_file_path = 'sample.txt'

# Upload the image to S3
s3.upload_file(local_file_path, bucket_name, file_name)

print(f"Image uploaded to S3 bucket {bucket_name} with key {file_name}")


Image uploaded to S3 bucket directionsappbucket with key sample.txt


In [10]:
import boto3
from dotenv import load_dotenv
load_dotenv()
import os

# Initialize the S3 client
s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                  aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"))

# Specify your S3 bucket name and the filename you want to retrieve from S3
bucket_name = 'directionsappbucket'
file_name = 'sample.txt'

# Specify the local file path where you want to save the downloaded file
local_file_path = 'downloaded_sample.txt'

# Download the file from S3
try:
    s3.download_file(bucket_name, file_name, local_file_path)
    print(f"File '{file_name}' downloaded from S3 to '{local_file_path}'")
except Exception as e:
    print(f"Error downloading file from S3: {e}")


File 'sample.txt' downloaded from S3 to 'downloaded_sample.txt'


In [24]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
from dotenv import load_dotenv
load_dotenv()
from langchain.agents import create_csv_agent

agent = create_csv_agent(
    ChatOpenAI(temperature=0,model_name="gpt-3.5-turbo-16k"),
    "new data\\DDW_Location_details_with_services_FINAL.csv",
    verbose=True,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)

agent.run("how many events have the public toilets facility?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to count the number of events that have the "Toilets available" facility.
Action: I will use the pandas `sum()` function to count the number of True values in the "Toilets available" column.
Action Input: `df["Toilets available"].sum()`[0m
Observation: I will use the pandas `sum()` function to count the number of True values in the "Toilets available" column. is not a valid tool, try one of [python_repl_ast].
Thought:[32;1m[1;3mI need to use the `python_repl_ast` tool to execute the pandas `sum()` function.
Action: `python_repl_ast`
Action Input: `df["Toilets available"].sum()`[0m
Observation: `python_repl_ast` is not a valid tool, try one of [python_repl_ast].
Thought:[32;1m[1;3mI need to use the `python_repl_ast` tool to execute the pandas `sum()` function.
Action: `python_repl_ast`
Action Input: `df["Toilets available"].sum()`[0m
Observation: `python_repl_ast` is not a valid tool, try one of [python_

KeyboardInterrupt: 

In [2]:
import pandas as pd
from pandasai import SmartDataframe
from pandasai.callbacks import StdoutCallback
from dotenv import load_dotenv
load_dotenv()
import os

df = pd.read_csv("new data\\DDW_Location_details_with_services_FINAL.csv")
# Instantiate a LLM
from pandasai.llm import OpenAI
from langchain.chat_models import ChatOpenAI

llm2 = ChatOpenAI()
llm = OpenAI(api_token=os.getenv("OPENAI_API_KEY"))

df = SmartDataframe(df, config={"llm": llm2,"verbose":True,"callback": StdoutCallback()})
answer = df.chat('where is the toilet?')

2023-10-07 11:46:02 [INFO] Question: where is the toilet?
2023-10-07 11:46:02 [INFO] Running PandasAI with langchain_openai-chat LLM...
2023-10-07 11:46:02 [INFO] Prompt ID: 8e74c485-d799-4adb-8e49-7f0225e69274


# TODO import all the dependencies required
import pandas as pd

def analyze_data(dfs: list[pd.DataFrame]) -> dict:
    """
    Analyze the data
    1. Prepare: Preprocessing and cleaning data if necessary
    2. Process: Manipulating data for analysis (grouping, filtering, aggregating, etc.)
    3. Analyze: Conducting the actual analysis (if the user asks to plot a chart save it to an image in temp_chart.png and do not show the chart.)
    4. Output: return a dictionary of:
    - type (possible values "text", "number", "dataframe", "plot")
    - value (can be a string, a dataframe or the path of the plot, NOT a dictionary)
    Example output: { "type": "text", "value": "The average loan amount is $15,000." }
    """
    
    # Get the dataframe dfs[0]
    df = dfs[0]
    
    # Check if the dataframe contains the column "Toilets available"
    if "Toilets available" in df.columns:
        # Filter the dataframe to only include rows where "Toilets available" is True
        filtered_df

In [3]:
answer

'The toilets are available at the following locations:\n                  Gasfabriek 5, Eindhoven, 5613CP\n          Kanaaldijknoord 57 C, Eindhoven, 5642JA\n              Dirk Boutslaan 1, Eindhoven, 5613LH\n                 Gasfabriek 12, Eindhoven, 5613CP\n                 Gasfabriek 3A, Eindhoven, 5613CP\n                 GASFABRIEK 3C, Eindhoven, 5613CP\n         Lucas Gasselstraat 2a, Eindhoven, 5613LB\n                  Gasfabriek 8, Eindhoven, 5613CP\n             Bilderdijklaan 19, Eindhoven, 5611NG\n                Kleine Berg 62, Eindhoven, 5611JW\n    Clausplein 10A (4th floor), Eindhoven, 5611XP\n                      Piazza 1, Eindhoven, 5611AE\n                Kanaalstraat 4, Eindhoven, 5611CT\n    Kanaalstraat 4 binnenplein, Eindhoven, 5611CT\n             Catharinaplein 21, Eindhoven, 5621BK\n                 Emmasingel 22, Eindhoven, 5611AZ\n            Heuvel Galerie 133, Eindhoven, 5611DK\n                    Heuvel 133, Eindhoven, 5611GP\n              Paradijslaan

In [3]:
# Import things that are needed generically
from langchain.chains import LLMMathChain
from langchain.utilities import SerpAPIWrapper
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool, StructuredTool, Tool, tool

In [4]:
llm = ChatOpenAI(temperature=0)

In [16]:
from pydantic import BaseModel, Field
from typing import Optional, Type
from langchain.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)

class TestInput(BaseModel):
    question: str = Field()

In [17]:
class CustomTest(BaseTool):
    name = "testing"
    description = "useful for when you need to answer questions about weather"
    args_schema: Type[BaseModel] = TestInput

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Use the tool."""
        return "hehe, weather is good"

    async def _arun(
        self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("Calculator does not support async")

In [20]:
tools = [
    CustomTest(),
]

In [21]:
# Construct the agent. We will use the default agent type here.
# See documentation for a full list of options.
agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)

In [22]:
agent.run(
    "How is the weather like?"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find a way to gather information about the current weather.
Action: testing
Action Input: "weather"[0m
Observation: [36;1m[1;3mhehe, weather is good[0m
Thought:[32;1m[1;3mI now know the current weather is good.
Final Answer: The weather is good.[0m

[1m> Finished chain.[0m


'The weather is good.'

In [1]:
from langchain.agents.agent_toolkits import create_python_agent
from langchain.tools.python.tool import PythonREPLTool
from langchain.python import PythonREPL
from langchain.llms.openai import OpenAI
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI

agent_executor = create_python_agent(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
    tool=PythonREPLTool(python_repl=PythonREPL(globals=globals())),
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    agent_executor_kwargs={"handle_parsing_errors": True},
)



In [3]:
agent_executor = create_python_agent(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
    tool=PythonREPLTool(),
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    agent_executor_kwargs={"handle_parsing_errors": True},
)

In [4]:
agent_executor.run("What is the 10th fibonacci number?")



[1m> Entering new AgentExecutor chain...[0m


Python REPL can execute arbitrary code. Use with caution.


[32;1m[1;3m
Invoking: `Python_REPL` with `def fibonacci(n):
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)

fibonacci(10)`


[0m[36;1m[1;3m[0m[32;1m[1;3mThe 10th Fibonacci number is 55.[0m

[1m> Finished chain.[0m


'The 10th Fibonacci number is 55.'

In [1]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
from dotenv import load_dotenv

load_dotenv()

from langchain.agents import create_csv_agent

In [2]:
agent = create_csv_agent(
    ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k"),
    "new data\\DDW_Location_details_with_services_FINAL.csv",
    verbose=True,
    return_intermediate_steps=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
)

In [3]:
res = agent("how many places have public toilets")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df['Toilets available'].value_counts()"}`


[0m[36;1m[1;3mTrue     98
False    32
Name: Toilets available, dtype: int64[0m[32;1m[1;3mThere are 98 places in the dataframe that have public toilets.[0m

[1m> Finished chain.[0m


In [4]:
res

{'input': 'how many places have public toilets',
 'output': 'There are 98 places in the dataframe that have public toilets.',
 'intermediate_steps': [(_FunctionsAgentAction(tool='python_repl_ast', tool_input={'query': "df['Toilets available'].value_counts()"}, log='\nInvoking: `python_repl_ast` with `{\'query\': "df[\'Toilets available\'].value_counts()"}`\n\n\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'python_repl_ast', 'arguments': '{\n  "query": "df[\'Toilets available\'].value_counts()"\n}'}}, example=False)]),
   True     98
   False    32
   Name: Toilets available, dtype: int64)]}

In [19]:
from langchain.agents import initialize_agent
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMMathChain
from langchain.agents.tools import Tool
from langchain.agents import load_tools
from langchain.agents import create_csv_agent
from langchain.memory import ConversationBufferMemory
import os

from langchain.agents.agent_toolkits import create_python_agent
from langchain.tools.python.tool import PythonREPLTool
from langchain.python import PythonREPL
from langchain.llms.openai import OpenAI
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(
    temperature=0, model_name="gpt-3.5-turbo-16k"
)

df = pd.read_csv("new data\\DDW_Location_details_with_services_FINAL.csv")

python_agent = create_python_agent(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
    tool=PythonREPLTool(),
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    agent_executor_kwargs={"handle_parsing_errors": True},
)

tools = load_tools(["wikipedia","llm-math"], llm=llm)

from langchain.agents import create_pandas_dataframe_agent

csv_agent = create_pandas_dataframe_agent(
   llm, df, verbose=True 
     )


# csv_agent = create_csv_agent(
#     llm=llm,
#     path="new data\\DDW_Location_details_with_services_FINAL.csv",
#     verbose=True,
#     agent_type=AgentType.OPENAI_FUNCTIONS,
#     pandas_kwargs = {"sep":",", "encoding":'Latin-1'},
#     memory=ConversationBufferMemory()
# )
csv_tool = Tool(
    name="CSV Agent",
    func=csv_agent.run,
    description="Use this tool when the user asks questions about DDW (Dutch Design Week) or any of the linked things. Use this tool when you don't know which tool to use.", # Always make description useful and not generic
)
python_tool = Tool(
    name="Python Agent",
    func=python_agent.run,
    description="Use this tool whenever you need to run the python code", # Always make description useful and not generic
)
tools.extend([csv_tool,python_tool])

agent = initialize_agent(
    llm=llm,
    tools=tools,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
    handle_parsing_errors=True,
    memory=ConversationBufferMemory(),
)

agent.run('Where is the toilet?')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThis question is not specific enough to determine which tool to use. I will use the CSV Agent to see if there is any information about toilets in the DDW dataset.
Action: CSV Agent
Action Input: "toilet"[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find information about toilets in the dataframe.
Action: I will use the `df[df['Toilets available'] == True]` command to filter the dataframe for rows where the 'Toilets available' column is True.
Action Input: df[df['Toilets available'] == True][0m
Observation: I will use the `df[df['Toilets available'] == True]` command to filter the dataframe for rows where the 'Toilets available' column is True. is not a valid tool, try one of [python_repl_ast].
Thought:[32;1m[1;3mI need to use the `df[df['Toilets available'] == True]` command to filter the dataframe for rows where the 'Toilets available' column is True.
Action: python_repl_ast
Action In

'The specific locations of the toilets at DDW can be found in the DDW dataset.'