In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os
#import OpenAIEmbeddings from langchain.embeddings
from langchain.embeddings import OpenAIEmbeddings


In [2]:
# Language: python

import os
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Load variables from a local .env file (if one is found) into the process
# environment before reading the key. The error message below tells the user
# to put the key in .env, so the file has to be read first.
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
# An empty string is as unusable as a missing variable, so reject both.
if not API_KEY:
    raise ValueError("API key not found. Please set the OPENAI_API_KEY in the .env file.")

# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype inference per chunk.
data = pd.read_csv("v2.csv", low_memory=False)
llm = ChatOpenAI(model="gpt-4o-mini")


  llm = ChatOpenAI(model="gpt-4o-mini")


In [6]:
# Language: python

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from typing import Optional

class RAGProcessor:
    """Retrieval-augmented question answering over a DataFrame.

    On construction the DataFrame is serialised to CSV text, split into
    character-bounded chunks, embedded with ``OpenAIEmbeddings`` and stored in
    a Chroma vector store. ``run`` retrieves the chunks most similar to a
    query, formats them into ``chat_prompt`` and returns the model's reply.

    NOTE(review): the CSV header line only appears at the start of the text,
    so most chunks presumably carry values without column names - verify
    whether that hurts answer quality.

    Args:
        llm: Chat model used to answer; must provide ``invoke(messages)``
            returning an object with a ``content`` attribute.
        data: DataFrame to index. The whole frame is embedded.
        chat_prompt: Prompt whose ``format_messages`` accepts the keyword
            arguments ``data`` (retrieved context) and ``query``.
        api_key: OpenAI key for the embeddings client. Falls back to the
            ``OPENAI_API_KEY`` environment variable when omitted.
        chunk_size: Maximum characters per chunk passed to the splitter.
        chunk_overlap: Characters shared between neighbouring chunks.
        top_k: Number of chunks fetched per query.
        max_context_chars: Upper bound on the length of the combined context.
        embedding_batch_size: When given, forwarded to ``OpenAIEmbeddings`` as
            ``chunk_size`` (texts per embedding request). ``None`` keeps the
            library default.
            NOTE(review): the RateLimitError recorded in this notebook reports
            an oversized single request; a smaller batch may avoid it - confirm
            against the account's tokens-per-minute limit.
    """

    def __init__(
        self,
        # LangChain types are quoted so that defining the class does not
        # require those names to be in scope at definition time.
        llm: "ChatOpenAI",
        data: pd.DataFrame,
        chat_prompt: "ChatPromptTemplate",
        api_key: Optional[str] = None,
        chunk_size: int = 200,
        chunk_overlap: int = 20,
        top_k: int = 3,
        max_context_chars: int = 1_000_000,
        embedding_batch_size: Optional[int] = None,
    ):
        self.llm = llm
        self.data = data
        self.chat_prompt = chat_prompt
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.top_k = top_k
        self.max_context_chars = max_context_chars

        # 1. Serialise the DataFrame to CSV text (header line first, no index).
        text_data = data.to_csv(index=False)

        # 2. Split the text into overlapping character-bounded chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        chunks = splitter.split_text(text_data)

        # Debug info so the cost of the embedding step is visible up front.
        print(f"Number of chunks: {len(chunks)}")
        if chunks:
            print(f"Size of first chunk: {len(chunks[0])}")

        # 3. Embed the chunks and store them in the vector DB.
        embedding_kwargs = {"openai_api_key": self.api_key}
        if embedding_batch_size is not None:
            embedding_kwargs["chunk_size"] = embedding_batch_size
        embeddings = OpenAIEmbeddings(**embedding_kwargs)
        self.db = Chroma.from_texts(chunks, embedding=embeddings)

    def retrieve(self, query: str) -> str:
        """Return the ``top_k`` most similar chunks joined by newlines.

        The result is cut to at most ``max_context_chars`` characters.
        """
        docs = self.db.similarity_search(query, k=self.top_k)
        combined_data = "\n".join(doc.page_content for doc in docs)
        # Slicing is a no-op when the text is already short enough.
        return combined_data[: self.max_context_chars]

    def run(self, query: str) -> str:
        """Answer ``query`` using retrieved context and return the reply text."""
        relevant_data = self.retrieve(query)
        messages = self.chat_prompt.format_messages(data=relevant_data, query=query)

        # Calling the chat model object directly is deprecated in
        # langchain-core; ``invoke`` is the supported entry point.
        response = self.llm.invoke(messages)
        return response.content

# Example usage:

# Chat model with temperature set to 0 for the demo.
llm = ChatOpenAI(temperature=0)

# System message receives the retrieved context via {data};
# the human message receives the question via {query}.
system_prompt_text = "You are an analysis machine, you will receive data and then you will answer questions about it:\n{data}\n"
system_template = SystemMessagePromptTemplate.from_template(system_prompt_text)
user_template = HumanMessagePromptTemplate.from_template("{query}")
prompt_messages = [system_template, user_template]
chat_prompt = ChatPromptTemplate.from_messages(prompt_messages)

# Index the whole CSV and ask one open-ended question.
df = pd.read_csv("v2.csv", low_memory=False)
processor = RAGProcessor(llm, df, chat_prompt)
demo_question = "Tell me something about the data."
answer = processor.run(demo_question)
print(answer)

KeyboardInterrupt: 

In [2]:
# Language: python

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from typing import Optional
import pandas as pd
import os 

class EventFindingAgent:
    """Retrieval-augmented agent that answers questions and assesses risk.

    On construction the DataFrame is serialised to CSV text, split into
    character-bounded chunks, embedded with ``OpenAIEmbeddings`` and stored in
    a Chroma vector store. ``run`` retrieves the chunks most similar to a
    query, formats them into ``chat_prompt`` and returns the model's reply;
    ``assess_risk`` builds a risk question from a slice of recent rows.

    NOTE(review): apart from ``assess_risk`` this mirrors ``RAGProcessor``
    defined earlier in this notebook - consider subclassing it instead of
    keeping two copies.

    Args:
        llm: Chat model used to answer; must provide ``invoke(messages)``
            returning an object with a ``content`` attribute.
        data: DataFrame to index. The whole frame is embedded.
        chat_prompt: Prompt whose ``format_messages`` accepts the keyword
            arguments ``data`` (retrieved context) and ``query``.
        api_key: OpenAI key for the embeddings client. Falls back to the
            ``OPENAI_API_KEY`` environment variable when omitted.
        chunk_size: Maximum characters per chunk passed to the splitter.
        chunk_overlap: Characters shared between neighbouring chunks.
        top_k: Number of chunks fetched per query.
        max_context_chars: Upper bound on the length of the combined context.
        embedding_batch_size: When given, forwarded to ``OpenAIEmbeddings`` as
            ``chunk_size`` (texts per embedding request). ``None`` keeps the
            library default.
            NOTE(review): the RateLimitError recorded in this notebook reports
            an oversized single request; a smaller batch may avoid it - confirm
            against the account's tokens-per-minute limit.
    """

    def __init__(
        self,
        # LangChain types are quoted so that defining the class does not
        # require those names to be in scope at definition time.
        llm: "ChatOpenAI",
        data: pd.DataFrame,
        chat_prompt: "ChatPromptTemplate",
        api_key: Optional[str] = None,
        chunk_size: int = 200,
        chunk_overlap: int = 20,
        top_k: int = 3,
        max_context_chars: int = 1_000_000,
        embedding_batch_size: Optional[int] = None,
    ):
        self.llm = llm
        self.data = data
        self.chat_prompt = chat_prompt
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.top_k = top_k
        self.max_context_chars = max_context_chars

        # 1. Serialise the DataFrame to CSV text (header line first, no index).
        text_data = data.to_csv(index=False)

        # 2. Split the text into overlapping character-bounded chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        chunks = splitter.split_text(text_data)

        # Debug info so the cost of the embedding step is visible up front.
        print(f"Number of chunks: {len(chunks)}")
        if chunks:
            print(f"Size of first chunk: {len(chunks[0])}")

        # 3. Embed the chunks and store them in the vector DB.
        embedding_kwargs = {"openai_api_key": self.api_key}
        if embedding_batch_size is not None:
            embedding_kwargs["chunk_size"] = embedding_batch_size
        embeddings = OpenAIEmbeddings(**embedding_kwargs)
        self.db = Chroma.from_texts(chunks, embedding=embeddings)

    def retrieve(self, query: str) -> str:
        """Return the ``top_k`` most similar chunks joined by newlines.

        The result is cut to at most ``max_context_chars`` characters.
        """
        docs = self.db.similarity_search(query, k=self.top_k)
        combined_data = "\n".join(doc.page_content for doc in docs)
        # Slicing is a no-op when the text is already short enough.
        return combined_data[: self.max_context_chars]

    def run(self, query: str) -> str:
        """Answer ``query`` using retrieved context and return the reply text."""
        relevant_data = self.retrieve(query)
        messages = self.chat_prompt.format_messages(data=relevant_data, query=query)

        # Calling the chat model object directly is deprecated in
        # langchain-core; ``invoke`` is the supported entry point.
        response = self.llm.invoke(messages)
        return response.content

    def assess_risk(self, recent_data: pd.DataFrame) -> str:
        """Ask the model to assess risk for ``recent_data``.

        The rows are serialised to CSV (without the index) and appended to a
        fixed instruction. The resulting text is passed to ``run``, so it
        serves both as the similarity-search query and as the user message.
        """
        recent_text = recent_data.to_csv(index=False)
        risk_query = "Assess the risk for the following data:\n" + recent_text
        return self.run(risk_query)

# Example usage:

# Chat model with temperature set to 0 for the demo.
llm = ChatOpenAI(temperature=0)

# Prompt layout: retrieved context goes into the system message via {data},
# the question goes into the human message via {query}.
system_prompt_text = "You are an analysis machine, you will receive data and then you will answer questions about it:\n{data}\n"
system_template = SystemMessagePromptTemplate.from_template(system_prompt_text)
user_template = HumanMessagePromptTemplate.from_template("{query}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_template, user_template]
)

# Build the agent over the full CSV.
df = pd.read_csv("v2.csv", low_memory=False)
agent = EventFindingAgent(llm, df, chat_prompt)

# Assess risk on the last 100 rows of the frame.
recent_row_count = 100
recent_data = df.tail(recent_row_count)
assessment = agent.assess_risk(recent_data)
print(assessment)

NameError: name 'os' is not defined

In [28]:

# Example usage:

# Chat model with temperature set to 0.
llm = ChatOpenAI(temperature=0)

# Two-message prompt: system text with the {data} slot, then the user's {query}.
user_template = HumanMessagePromptTemplate.from_template("{query}")
system_template = SystemMessagePromptTemplate.from_template(
    "You are an analysis machine, you will receive data and then you will answer questions about it:\n{data}\n"
)
chat_prompt = ChatPromptTemplate.from_messages([system_template, user_template])

# Constructing the processor embeds the entire CSV, which is the expensive step.
df = pd.read_csv("v2.csv", low_memory=False)
processor = RAGProcessor(llm, df, chat_prompt)

opening_question = "Tell me something about the data."
answer = processor.run(opening_question)
print(answer)

Retrying langchain_community.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Request too large for text-embedding-ada-002 in organization org-GLH3sSgQTbL4bdkNANZVyjSB on tokens per min (TPM): Limit 1000000, Requested 8191000. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more..


RateLimitError: Request too large for text-embedding-ada-002 in organization org-GLH3sSgQTbL4bdkNANZVyjSB on tokens per min (TPM): Limit 1000000, Requested 8191000. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.

In [18]:
# Language: python

from IPython.display import display, Markdown

# Ask for moments of abrupt change and render the reply as Markdown.
drastic_change_query = "Show me some moments in the data where a vehicle's condition changes drastically, please provide the data for each moment before and after the change occurs and justify/contextualize the changes."
answer = processor.run(drastic_change_query)
rendered_answer = Markdown(answer)
display(rendered_answer)

To identify moments where a vehicle's condition changes drastically, we can look for significant changes in metrics such as `gps_speed`, `accData`, `rpm`, or other relevant attributes. Below are two examples where drastic changes are observed:

### Moment 1: Decrease in Speed and Acceleration
1. **Before the Change (Timestamp: 2017-12-22 18:43:07)**
   - `gps_speed`: 18.7052
   - `accData`: 18.7052 (indicating acceleration)
   - `rpm`: 862.25

2. **After the Change (Timestamp: 2017-12-22 18:43:08)**
   - `gps_speed`: 16.4828 (decreased by 2.2224)
   - `accData`: 16.4828 (indicating acceleration)
   - `rpm`: 817.0 (decreased by 45.25)

#### Justification/Contextualization:
The change from a speed of 18.7052 to 16.4828 indicates a significant reduction in the vehicle's speed, which could suggest that the vehicle is decelerating possibly due to a driver reducing throttle, a change in road condition, or approaching a stop. The decrease in RPM from 862.25 to 817.0 supports this observation, indicating that the engine is running at a lower speed, which typically occurs when a vehicle is slowing down.

---

### Moment 2: Sudden Increase in Speed
1. **Before the Change (Timestamp: 2017-12-22 18:43:08)**
   - `gps_speed`: 16.4828
   - `accData`: 16.4828
   - `rpm`: 817.0

2. **After the Change (Timestamp: 2017-12-22 18:43:09)**
   - `gps_speed`: 17.4088 (increased by 0.9260)
   - `accData`: 17.4088
   - `rpm`: 804.25 (decreased by 12.75)

#### Justification/Contextualization:
In this instance, the speed increased from 16.4828 to 17.4088, indicating a slight acceleration. This could mean the driver has increased throttle or is regaining speed after a slowdown. However, the RPM dropped from 817.0 to 804.25, which is somewhat counterintuitive since an increase in speed usually correlates with an increase in RPM. This could suggest that the vehicle is shifting gears, or there could be an issue with the engine performance or transmission.

---

These moments highlight how the vehicle's condition can change drastically based on the data collected, reflecting the driver's actions, road conditions, or mechanical state of the vehicle.

In [20]:
# Language: python

from IPython.display import display, Markdown

# Ask where breakdown risk is highest and render the reply as Markdown.
breakdown_risk_query = "Find when a vehicle could be most susceptable to a breakdown based on known trends about what conditions lead to a breakdown. If it appears to be stable, find cases where it is MOST at risk."
answer = processor.run(breakdown_risk_query)
rendered_answer = Markdown(answer)
display(rendered_answer)

To assess when a vehicle could be most susceptible to a breakdown based on the provided data, we can analyze several factors that are commonly associated with vehicle breakdowns:

1. **Engine RPM (rpm)**: High RPMs can indicate stress on the engine.
2. **Battery Load (eLoad)**: Higher eLoad values can indicate strain on the electrical system.
3. **Coolant Temperature (cTemp)**: Abnormal temperatures can lead to overheating and breakdown.
4. **GPS Speed (gps_speed)**: Higher speeds could lead to increased stress on vehicle components.
5. **Fuel Efficiency Metrics (kpl)**: Lower fuel efficiency can indicate engine issues.

Let's analyze the data provided:

- **RPM**: The RPM values are all around 815 to 1010, which appear to be within a normal operating range.
- **Battery Load**: All records have an eLoad of 0.0, indicating no significant electrical load, which is not a risk factor.
- **Coolant Temperature**: The cTemp is consistently at 66.0, which is within a normal range.
- **GPS Speed**: The speeds recorded are all low (from 15.0 to 24.26), indicating that the vehicle is likely not under significant stress.
- **Fuel Efficiency (kpl)**: The kpl values are relatively stable, ranging from 15.0 to 23.0, but no significant drop in performance is observed.

### Conclusion:
From the data provided, the vehicle appears to be operating under stable conditions, with no immediate indicators of risk for breakdown. However, to determine cases where the vehicle is MOST at risk, we could define "risk" as low-speed operation with high RPMs or fluctuations in fuel efficiency.

In this dataset, there are no clear indicators of risk based on the factors mentioned. However, if we consider the lowest recorded speeds or RPMs in combination, we find:

- The lowest recorded speed is **15.0** (at multiple timestamps).
- The highest RPM is **1010.75**, which is not excessively high but should be monitored if combined with lower speeds.

Since all conditions appear stable, the vehicle is not indicated to be at significant risk of breakdown based on the provided data. For further analysis, additional data over a longer period, especially under varying conditions (like high load, steep climbs, or extreme temperatures), would be beneficial.

In [15]:
# Language: python

from IPython.display import display, Markdown

# Ask about per-trip temperature trends and render the reply as Markdown.
temperature_query = "How does temperature change inside each trip over time?"
answer = processor.run(temperature_query)
rendered_answer = Markdown(answer)
display(rendered_answer)

To analyze how temperature changes inside each trip over time, we can look at the `cTemp` column from the provided data. This column contains the temperature readings at different timestamps during the trip.

Here's a general approach to visualize and understand the temperature changes over time for each trip:

1. **Extract Relevant Data**: For each trip, we would extract the `timeStamp` and `cTemp` values.

2. **Plotting**: A line graph can be created where the x-axis represents the `timeStamp` and the y-axis represents `cTemp`. This will allow us to visualize how the temperature changes throughout the trip.

3. **Analysis**: Look for trends such as increases or decreases in temperature, patterns (e.g., does the temperature rise during specific parts of the trip?), and any significant fluctuations.

4. **Summary Statistics**: Calculate summary statistics such as mean, median, and standard deviation of temperatures for each trip to provide insights into the overall temperature trends.

Since the provided dataset is limited to a single trip (tripID = 1), we can directly analyze the `cTemp` values and their changes across the timestamps given.

Here's a brief analysis based on the provided data:

- **Timestamps and Corresponding Temperatures**:
  - 2017-12-22 18:43:05: 66.0
  - 2017-12-22 18:43:06: 66.0
  - 2017-12-22 18:43:07: 66.0
  - 2017-12-22 18:43:08: 66.0
  - 2017-12-22 18:43:09: 66.0

From the above readings, we can observe that the temperature remained constant at 66.0 degrees throughout this trip.

In conclusion, for this specific trip, there was no change in temperature over time as it remained constant. If there were more data points or multiple trips, we could further analyze trends and changes in temperature.

In [36]:
# Print each column name next to its number of distinct values.
for column_name in data.columns:
    distinct_count = data[column_name].nunique()
    print(column_name, distinct_count)

Order_ID 1000
Distance_km 785
Weather 5
Traffic_Level 3
Time_of_Day 4
Vehicle_Type 3
Preparation_Time_min 25
Courier_Experience_yrs 10
Delivery_Time_min 108


The data is small but very clean. Let's drop nulls and otherwise leave it as is.

KeyError: "['FuelLevel', 'AlarmOn', 'LastSync', 'Version', 'LastNumber'] not found in axis"