In [2]:
import os
import datetime
import time
import random

import numpy as np
import pandas as pd

import google.generativeai as genai
from google.generativeai import caching

In [4]:
## LIST STEPS, Ingest will alwasys be the first step.  IF NONE or CLOSED, then there is no next step
STEPS = ["INGEST", "PROCESSING", "ASSIGNED", "WORKING", "PENDING_APPROVAL", "APPROVED", "CLOSED", "NONE"]
NEXT, TMIN, TRNG, PROB = "NEXT_STEPS", "TIME_MIN", "TIME_RANGE", "PROBABILITY"

## Build a map of all the STEPS, the next steps possible, time between steps
STEP_MAP = {
    STEPS[0]: {
        NEXT: [STEPS[1], STEPS[7]],
        TMIN: [1,   -1],
        TRNG: [120, -1],
        PROB: [.3,   1],
    },
    STEPS[1]: {
        NEXT: [STEPS[0], STEPS[1], STEPS[2], STEPS[3], STEPS[7]],
        TMIN: [1,        120,      200,      360,      -1],
        TRNG: [1200,     180,      360,      360,      -1],
        PROB: [.1,       .2,       .5,       .8,        1],
    },
    STEPS[2]: {
        NEXT: [STEPS[0], STEPS[1], STEPS[2], STEPS[3], STEPS[7]],
        TMIN: [120,      200,      200,      120,      -1],
        TRNG: [1200,     300,      360,      620,      -1],
        PROB: [.03,      .06,      .09,      .6,        1],
    },
    STEPS[3]: {
        NEXT: [STEPS[0], STEPS[3], STEPS[4], STEPS[5], STEPS[7]],
        TMIN: [120,      300,      600,      1200,     -1],
        TRNG: [1200,     300,      600,      6200,     -1],
        PROB: [.03,      .06,      .9,      .95,        1],
    },
    STEPS[4]: {
        NEXT: [STEPS[0], STEPS[4], STEPS[5], STEPS[6], STEPS[7]],
        TMIN: [120,      120,      480,      1200,     -1],
        TRNG: [1200,     300,      900,      6200,     -1],
        PROB: [.03,      .06,      .9,      .91,        1],
    },
    STEPS[5]: {
        NEXT: [STEPS[0], STEPS[1], STEPS[5], STEPS[6], STEPS[7]],
        TMIN: [120,      120,      30,       1200,     -1],
        TRNG: [1200,     300,      900,      6200,     -1],
        PROB: [.03,      .12,      .15,      .93,        1],
    },
    STEPS[6]: { NEXT: [], TMIN: [], TRNG: [], PROB: [], },
    STEPS[7]: { NEXT: [], TMIN: [], TRNG: [], PROB: [], },
}

def create_example_datasheet(ids=None, timestamps=None):
    if ids is None or timestamps is None:
        raise Exception("Error: need to pass and array of ids and timestamps")
    if len(ids) != len(timestamps):
        raise Exception("Error: the number of ids and timestamps, must be identical")
    
    ## Take the steps below and generate some random data
    id_col, step_col, tmp_col = "id_col", "step_col", "tmp_col"
    data = {
        id_col: [], step_col: [], tmp_col: [],
    }

    ## Loop through all the entries in the df,
    for i in range(len(ids)): #range(5): #
        idx = ids[i]
        tmpstp = timestamps[i]

        ## if there was a transfer2 timestamp
        if tmpstp is not None:
            nxt_step = STEPS[0]
            nxt_tmStp = tmpstp + datetime.timedelta(0, random.randrange(15)) ## Increment our timestamp

            ## While our next_step doesn't equal Close or None
            while nxt_step != STEPS[6] and nxt_step != STEPS[7]:
                data[id_col].append(idx)
                data[step_col].append(nxt_step)
                data[tmp_col].append(nxt_tmStp)

                cur_step = nxt_step
                rnd_vle = random.random()

                for j in range(len(STEP_MAP[nxt_step][PROB])):
                    if rnd_vle <= STEP_MAP[nxt_step][PROB][j]:
                        nxt_step = STEP_MAP[nxt_step][NEXT][j]
                        if nxt_step == STEPS[6] or nxt_step == STEPS[7]:
                            break

                        nxt_tmStp = nxt_tmStp + datetime.timedelta(0, random.randrange(STEP_MAP[nxt_step][TMIN][j], STEP_MAP[nxt_step][TMIN][j] + STEP_MAP[nxt_step][TRNG][j]))
                        break

            if nxt_step == STEPS[6]:
                data[id_col].append(idx)
                data[step_col].append(nxt_step)
                data[tmp_col].append(nxt_tmStp)
    
    
    return pd.DataFrame(data)

In [5]:
## Generate a the timestamp and ids of our datapoints
time_stamps_min, time_stamps_rng = datetime.timedelta(0, -3600 * 24 * 2), 3000
ids, time_stamps = [], []
for i in range(5000):
    ids.append(f"idx_{i}")
    time_stamps.append(datetime.datetime.now() + time_stamps_min + datetime.timedelta(0, random.randrange(time_stamps_rng)))

## Create our example datasheet
df_sys2 = create_example_datasheet(ids, time_stamps)
## Write our dataframe out to a csv
df_sys2.to_csv("sys_log-generated.csv")
df_sys2.head()

Unnamed: 0,id_col,step_col,tmp_col
0,idx_0,INGEST,2024-10-27 17:52:48.378899
1,idx_1,INGEST,2024-10-27 18:01:23.378919
2,idx_1,PROCESSING,2024-10-27 18:06:13.378919
3,idx_2,INGEST,2024-10-27 17:49:39.378923
4,idx_3,INGEST,2024-10-27 17:37:37.378926


### Take our generated system log and load it into gemini

In [7]:
api_secret = "<Put api key here>"
genai.configure(api_key=api_secret)

model = genai.GenerativeModel("gemini-1.5-flash")

In [8]:
myfile = genai.upload_file("./data/01-csv/sys_log-generated.csv")
print(f"{myfile=}")

result = model.generate_content(
    [myfile, "\n\n", "Can you tell me about the contents of the file?"]
)
print(f"{result.text=}")

myfile=genai.File({
    'name': 'files/2rd5s2383ini',
    'display_name': 'sys_log-generated.csv',
    'mime_type': 'text/csv',
    'sha256_hash': 'ZmVlMjE0NjhlZjc4MzEzNGQ2Y2U0NTg0ODBjNmUxYmI2ZGM3MzE4OTU0Y2EyMmIzNjA0ODM1YzM2NjE1ZmFlZg==',
    'size_bytes': '528145',
    'state': 'ACTIVE',
    'uri': 'https://generativelanguage.googleapis.com/v1beta/files/2rd5s2383ini',
    'create_time': '2024-10-29T17:31:00.633503Z',
    'expiration_time': '2024-10-31T17:31:00.559335071Z',
    'update_time': '2024-10-29T17:31:00.633503Z'})
result.text="The file appears to contain data about the processing steps of various items, likely tasks or data entries, identified by unique `id_col` values. Here's a breakdown:\n\n* **id_col**: Represents a unique identifier for each item.  It seems like multiple steps can occur for the same `id_col`, indicating different stages in the processing workflow.\n* **step_col**:  Defines the processing stage each item is currently in.  These stages seem to be:\n    * **

### Cache our task files and create a GenerativeModel

In [9]:
cache = caching.CachedContent.create(
    model='models/gemini-1.5-flash-001',
    display_name='./data/01-csv/sys_log-generated.csv', 
    system_instruction=(
        'Review the file and answer questions regarding the descriptive statitics about it.'
    ),
    contents=[myfile],
    ttl=datetime.timedelta(minutes=5),
)

model = genai.GenerativeModel.from_cached_content(cached_content=cache)

In [10]:
response = model.generate_content([('How many unique ids are there?'), ('How many tasks are completed?'), ('For the completed tasks, what is the average time from ingest to complete?')])
print(response.usage_metadata)
print(" =========================== Results =========================== ")
print(response.text)

prompt_token_count: 442946
candidates_token_count: 290
total_token_count: 443236
cached_content_token_count: 442917

Let's analyze the data:

**1. Unique IDs:**

* There are **501 unique IDs** in the data. 

**2. Completed Tasks:**

* A task is considered completed when it reaches the "CLOSED" status.
* There are **291 completed tasks** in the data.

**3. Average Time to Complete:**

To calculate the average time from ingest to complete, we need to do the following:

1. **Identify the Ingest Timestamp:** For each completed task, find the timestamp when the task entered the "INGEST" state.
2. **Identify the Complete Timestamp:** For each completed task, find the timestamp when the task reached the "CLOSED" state.
3. **Calculate the Duration:** Subtract the ingest timestamp from the complete timestamp to get the duration for each task.
4. **Calculate the Average:** Sum the durations of all completed tasks and divide by the number of completed tasks.

**Unfortunately, we can't directly ca