In [None]:
# pip install -U polars 
# pip install pandas

In [2]:
from datetime import datetime, timedelta
import polars as pl


df_overtime = pl.DataFrame({
    'DATE': pl.date_range(datetime(2023, 1, 1), datetime(2023, 9, 10), timedelta(days=1), time_unit="ms", eager=True),
    "EMPLOYEE_ID": 1000,
    "MON" : 6,
    "TUE" : 6,
    "WED" : 6,
    "THU" : 6,
    "FRI" : 6
})
df_overtime

DATE,EMPLOYEE_ID,MON,TUE,WED,THU,FRI
datetime[ms],i64,i64,i64,i64,i64,i64
2023-01-01 00:00:00,1000,6,6,6,6,6
2023-01-02 00:00:00,1000,6,6,6,6,6
2023-01-03 00:00:00,1000,6,6,6,6,6
2023-01-04 00:00:00,1000,6,6,6,6,6
2023-01-05 00:00:00,1000,6,6,6,6,6
2023-01-06 00:00:00,1000,6,6,6,6,6
2023-01-07 00:00:00,1000,6,6,6,6,6
2023-01-08 00:00:00,1000,6,6,6,6,6
2023-01-09 00:00:00,1000,6,6,6,6,6
2023-01-10 00:00:00,1000,6,6,6,6,6


In [7]:
for i in range(1001,1501,1):
    overtime_temp = pl.DataFrame({
    'DATE': pl.date_range(datetime(2023, 1, 1), datetime(2023, 9, 10), timedelta(days=1), time_unit="ms", eager=True),
    "EMPLOYEE_ID": i,
    "MON" : 6,
    "TUE" : 6,
    "WED" : 6,
    "THU" : 6,
    "FRI" : 6
    })
    df_overtime = pl.concat(
    [
        df_overtime,
        overtime_temp,
    ],
    how="vertical",
    )


df_overtime.shape

(632753, 7)

In [10]:
## collect standard stats from all dataset

# df_overtime.describe()

dayCols = ["MON","TUE","WED","THU","FRI"]
## average, min, max and total hours per week in dataset
df_overtimeStats = df_overtime.describe().select(
        pl.col("*").exclude(
            "DATE","EMPLOYEE_ID")).filter(
                (pl.col("describe") == "mean") 
                | (pl.col("describe") == "max") 
                | (pl.col("describe") == "min")
        ).select([
        pl.col("*"),
        pl.sum_horizontal(dayCols).alias('total')
    ])
# df_overtimeStats.item(0,6)
df_overtimeStats.head()

describe,MON,TUE,WED,THU,FRI,total
str,f64,f64,f64,f64,f64,f64
"""mean""",6.0,6.0,6.0,6.0,6.0,30.0
"""min""",6.0,6.0,6.0,6.0,6.0,30.0
"""max""",6.0,6.0,6.0,6.0,6.0,30.0


In [None]:
# getting latest stats by employee
single_employee_stats = df_overtime.filter((pl.col("EMPLOYEE_ID") == 1001) & (pl.col("DATE") == datetime(2023, 9, 10)))
single_employee_stats

In [13]:
# let's assume, today's overtime sheet is submitted by all employees
# and some of the employee accidentally booked 8 hours in a day

for i in range(1001,1501,1):
    overtime_temp = pl.DataFrame({
    'DATE': pl.date_range(datetime(2023, 9, 11), datetime(2023, 9, 12), timedelta(days=1), time_unit="ms", eager=True),
    "EMPLOYEE_ID": i,
    "MON" : 8,
    "TUE" : 6,
    "WED" : 6,
    "THU" : 6,
    "FRI" : 6
    })
    df_overtime = pl.concat(
    [
        df_overtime,
        overtime_temp,
    ],
    how="vertical",
    )

df_overtime.shape

(633753, 7)

In [14]:
# getting latest stats by employee
out = df_overtime.filter((pl.col("EMPLOYEE_ID") == 1001) & (pl.col("DATE") == datetime(2023, 9, 11)))
out

DATE,EMPLOYEE_ID,MON,TUE,WED,THU,FRI
datetime[ms],i64,i64,i64,i64,i64,i64
2023-09-11 00:00:00,1001,8,6,6,6,6


In [15]:
from decouple import config
import together
from langchain import PromptTemplate, LLMChain
from typing import Any, Dict, List, Mapping, Optional
#pydantic model version>2 will not work with langchain. you should install a model version less than 2
from pydantic import Extra, Field, root_validator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.utils import get_from_dict_or_env
import re

together.api_key = config("TOGETHER_API_KEY")

together.Models.start("togethercomputer/llama-2-70b-chat")

class TogetherLLM(LLM):
    """Together large language models."""

    model = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    together_api_key = config("TOGETHER_API_KEY")
    """Together API key"""

    temperature = 0.7
    """What sampling temperature to use."""

    max_tokens = 1024
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set."""
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self):
        """Return type of LLM."""
        return "together"

    def _call(
        self,
        prompt,
        **kwargs: Any,
    ):
        """Call to Together endpoint."""
        together.api_key = self.together_api_key
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        # Use regex substitution to remove newlines
        #cleaned_text = re.sub(r"\n", "", text)
        return text

tog_llm = TogetherLLM(
    model= "togethercomputer/llama-2-70b-chat",
    temperature=0.1,
    max_tokens=1024)

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [21]:
out = df_overtime.filter((pl.col("EMPLOYEE_ID") == 1001) & (pl.col("DATE") == datetime(2023, 9, 11)))


out.head()

DATE,EMPLOYEE_ID,MON,TUE,WED,THU,FRI
datetime[ms],i64,i64,i64,i64,i64,i64
2023-09-11 00:00:00,1001,8,6,6,6,6


In [22]:
text = "Is there any anomaly in the data. Average employee overtime hours are " + str(df_overtimeStats.item(0,6)) + " hours per week. This week, I charged total " + str(out.item(0,2) + out.item(0,3) + out.item(0,4) + out.item(0,5) + out.item(0,6)) + " overtime hours. I overtimed " +str(out.item(0,2)) + ","+str(out.item(0,3)) + ","+str(out.item(0,4)) + ","+str(out.item(0,5)) + ","+str(out.item(0,6)) + ", hours per day from Monday to Friday."

In [24]:
system_prompt = "You are an advanced assistant who is very good at logical thinking and mathematical explanation. "
instruction = "Answer the question logically that is asked in the text : \n\n {text}"
template = get_prompt(instruction, system_prompt)
print(template)

prompt = PromptTemplate(template=template, input_variables=["text"])

llm_chain = LLMChain(prompt= prompt, llm = tog_llm )

output = llm_chain.run(text)

print(output)

[INST]<<SYS>>
You are an advanced assistant who is very good at logical thinking and mathematical explanation. 
<</SYS>>

Answer the question logically that is asked in the text : 

 {text}[/INST]
 There appears to be an anomaly in the data.

The average employee overtime hours are 30.0 hours per week, but this week, 32 overtime hours were charged. This is an increase of 2 hours from the average, which could be considered an anomaly.

Furthermore, the overtime hours per day from Monday to Friday are not consistent. The hours per day are 8, 6, 6, 6, 6, which adds up to 32 hours, exceeding the average weekly overtime hours. It's unlikely that an employee would work the same number of overtime hours every day, which could indicate an error in recording or reporting the data.

Therefore, it's reasonable to conclude that there is an anomaly in the data, and further investigation is necessary to determine the cause and correct any errors.
