In [1]:
import sys
sys.path.append('../')

In [118]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [119]:
import requests
import json
from dotenv import load_dotenv, find_dotenv
import os
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

from src.prompts.options import state_options
from src.prompts.system_prompt import sys_prompt
from src.prompts.user_prompt import examples, user
from src.enums.states import States
import tiktoken

In [120]:
encoder = tiktoken.get_encoding('cl100k_base')

In [121]:
print(sys_prompt)


Goal: Predict if a change of state is warranted when given a series of text.

Context: You will be provided with two items:

1. "Current State" from among the following State Options: ['Trial Start', 'Trial End', 'Delay Start', 'Delay End', 'Mistrial', 'Return to Base']
2. A series of text transmissions which you will use to make the following decisions:
    - Should the "Current State" change given the content/meaning of the text transmission.
    - If the "Current State" should be changed, predict the next state based on the content/meaning of the text transmission.
    - Prediction Rules: Predicted states must adhere to the following structured options:
        Trial Start:	 Next State Options --> ['Trial End', 'Delay Start', 'Mistrial', 'Return to Base']
        Trial End:	 Next State Options --> ['Trial Start', 'Delay Start', 'Mistrial', 'Return to Base']
        Delay Start:	 Next State Options --> ['Delay End', 'Mistrial', 'Return to Base']
        Delay End:	 Next State Option

In [90]:
state = "Trial Start"
text = "Good copy, so all call, all call, trial end."

In [122]:
def build_user_prompt(current_state: str, radio_text: str) -> str:
    """Render the user-turn prompt for a single request.

    Fills the imported `user` template with the imported `examples`, the
    state passed as `current_state`, and the transmission text `radio_text`.
    """
    return user.format(
        examples=examples,
        current_state=current_state,
        text=radio_text,
    )

In [123]:
len(encoder.encode(build_user_prompt(state, text) + sys_prompt))

912

In [124]:
_ = load_dotenv(find_dotenv(), override=True)

In [280]:
data_path = '/Users/americanthinker/Downloads/2024-05-14_all_tracks.csv'

df = pd.read_csv(data_path)
df.drop('track2', axis=1, inplace=True)

# df.fillna(value={'track1':'', 'track2':'', 'track3':'', 'track4':''}, inplace=True)

In [141]:
# Count the non-empty transmissions on each track.
# NaN is a truthy float, so a bare `if t` only removes empty strings and would
# count missing cells as transmissions whenever the frame has not been filled
# with ''. Keeping non-empty strings explicitly works in both situations.
for x in range(1,5):
    if x != 2:  # track2 was dropped from df when the CSV was loaded
        track = [t for t in df[f'track{x}'].values.tolist() if isinstance(t, str) and t]
        print(f'Track {x} length: {len(track)}')

Track 1 length: 1186
Track 3 length: 5437
Track 4 length: 4455


In [149]:
dates = df.loc[:, ['start', 'end']]

In [155]:
dates.start = pd.to_datetime(dates.start)

In [169]:
df['time'] = pd.to_datetime(df.start).dt.time

In [34]:
unique_states = ['Trial Start', 'Delay Start', 'Delay End', 'Trial End', 'Mistrial', 'RTB']

In [56]:
states = df[~df['state'].isna()]
states

Unnamed: 0,start,end,track1,track3,track4,state,notes,delay type
357,5/14/2024 8:42,5/14/2024 8:42,,"All stations, all stations, be advised, trial start, trial start.","All stations, all stations, be advised, trial start, trial start.",Trial Start,,
1524,5/14/2024 9:42,5/14/2024 9:42,,,"Be aware, we do have a fishing vessel maybe 100 yards from our location.",Delay Start,Fishing vessel in the ops area,NPC
1692,5/14/2024 9:50,5/14/2024 9:50,,,They're currently having network problems.,Delay Start,networking problem indicated,Network
2372,5/14/2024 10:23,5/14/2024 10:23,,"So 1015 for trial start, or I'm sorry, 1045 for trial start.",,Delay End,1045 trial start time,Network
2397,5/14/2024 10:27,5/14/2024 10:27,,"Thanks, Daniel.",,Delay End,"fishing vessel left field, no text as PCCU was muted at this time",NPC
3681,5/14/2024 11:48,5/14/2024 11:48,,Trial 211 has begun.,,Trial Start,,
3803,5/14/2024 11:55,5/14/2024 11:55,,,"To be advised, we got a helo flying low over the north corner of the field.",Delay Start,plane flew over part of op area,NPC
3825,5/14/2024 11:56,5/14/2024 11:56,,,"Just to confirm, there's no air vehicles up?",Delay End,,NPC
3916,5/14/2024 12:00,5/14/2024 12:00,,"Trial completion, trial completion.",,Trial End,,
4400,5/14/2024 12:20,5/14/2024 12:20,,Trial 2-2-1 has begun.,,Trial Start,,


In [41]:
# Build one {'text', 'label'} record per labeled row.
# The slice keeps the track columns followed by 'state', so after cleaning the
# last element of each row is the label and everything before it is text.
labeled_data = states.loc[:,'track1':'state']
labeled_data_cleaned = []
labeled_data_dicts = []
for alist in labeled_data.values:
    # Missing cells are NaN (a truthy float) unless they were filled with '';
    # a bare truthiness test would keep them and ' '.join would then raise
    # TypeError, so keep only non-empty strings.
    cleaned = [text for text in alist if isinstance(text, str) and text]
    labeled_data_cleaned.append(cleaned)
for alist in labeled_data_cleaned:
    datum = {'text': ' '.join(alist[:-1]), 'label': alist[-1]}
    labeled_data_dicts.append(datum)

In [113]:
ordered_labels = [d['label'] for d in labeled_data_dicts]
ordered_labels

['Trial Start',
 'Delay Start',
 'Delay Start',
 'Delay End',
 'Delay End',
 'Trial Start',
 'Delay Start',
 'Delay End',
 'Trial End',
 'Trial Start',
 'Trial End',
 'Mistrial',
 'Trial Start',
 'Delay Start',
 'Delay End',
 'Trial End',
 'Delay Start',
 'Delay End',
 'Trial Start',
 'Trial End',
 'Mistrial',
 'Delay Start',
 'Delay End',
 'Delay Start',
 'Delay End',
 'RTB']

In [104]:
labeled_data_dicts

[{'text': 'All stations, all stations, be advised, trial start, trial start. All stations, all stations, be advised, trial start, trial start.',
  'label': 'Trial Start'},
 {'text': 'Be aware, we do have a fishing vessel maybe 100 yards from our location.',
  'label': 'Delay Start'},
 {'text': "They're currently having network problems.",
  'label': 'Delay Start'},
 {'text': "So 1015 for trial start, or I'm sorry, 1045 for trial start.",
  'label': 'Delay End'},
 {'text': 'Thanks, Daniel.', 'label': 'Delay End'},
 {'text': 'Trial 211 has begun.', 'label': 'Trial Start'},
 {'text': 'To be advised, we got a helo flying low over the north corner of the field.',
  'label': 'Delay Start'},
 {'text': "Just to confirm, there's no air vehicles up?",
  'label': 'Delay End'},
 {'text': 'Trial completion, trial completion.', 'label': 'Trial End'},
 {'text': 'Trial 2-2-1 has begun.', 'label': 'Trial Start'},
 {'text': 'Good copy, so all call, all call, trial end.',
  'label': 'Trial End'},
 {'text

In [126]:
def format_response(response: requests.models.Response):
    """Extract the reply text from a chat-completion HTTP response.

    Decodes the body as JSON and returns the `content` of the first choice's
    message with leading and trailing whitespace removed.
    """
    first_choice = response.json()['choices'][0]
    reply_text = first_choice['message']['content']
    return reply_text.strip()

In [127]:
def chat_completion(current_state: str,
                    radio_text: str,
                    temperature: float=0.8,
                    max_tokens: int=4096,
                    stream: bool=False,
                    raw: bool=False,
                    timeout: float | None=120.0
                    ) -> str | dict | requests.models.Response | None:
    """Send one state-prediction request to the chat-completions endpoint.

    The endpoint URL and API key are read from the LEAPFROG_URL and
    LEAPFROG_API_KEY environment variables; a missing variable raises KeyError.

    Args:
        current_state: State inserted into the user prompt.
        radio_text: Transmission text inserted into the user prompt.
        temperature: Sampling temperature forwarded in the request body.
        max_tokens: Completion token limit forwarded in the request body.
        stream: Forwarded in the request body as-is. The reply is still parsed
            as a single JSON document, so streamed replies are not handled here.
        raw: If True, return the decoded JSON body instead of the reply text.
        timeout: Seconds passed to `requests.post`; None waits indefinitely.

    Returns:
        The stripped reply text (or the decoded JSON dict when `raw` is True)
        on HTTP 200; the `Response` object itself on any other status code;
        None when the request fails or the body cannot be parsed (the error
        is printed).
    """
    url = os.environ['LEAPFROG_URL']
    api_key = os.environ['LEAPFROG_API_KEY']
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    payload = {
        "model": "vllm",
        "messages": [
            {
                "role": "system",
                "content": sys_prompt
            },
            {
                "role": "user",
                "content": build_user_prompt(current_state, radio_text)
            }
        ],
        "stream": stream,
        "temperature": temperature,
        "max_tokens": max_tokens
    }

    # Network phase: without a timeout an unresponsive server blocks the
    # notebook cell forever.
    try:
        response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(e)
        return None

    if response.status_code != 200:
        print('Response is not 200')
        return response

    # Parse phase: stay best-effort so a malformed body does not abort a batch
    # of calls, but make the None result explicit.
    try:
        return response.json() if raw else format_response(response)
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        print(e)
        return None

In [128]:
# user_prompt = "Summarize the following document in three sentences or less:\n\n {document}\n\nSummarize:".format(document=data)

In [135]:
response = chat_completion(state, text, max_tokens=250)

In [137]:
response

'{"current_state": Delay End, "predicted_state": Return to Base, "reason": "The text indicates that an order has been given to return to base."}'

In [138]:
# Replay the labeled transmissions through the model, skipping the first record
# (its label is the starting state rather than a transition to predict).
# Loop-local names are used so this cell does not overwrite the notebook-level
# `state`, `text` and `response` used by the single-request cells.
initial_state = "Trial Start"
# State in effect before each transmission: the initial state for the first
# one, then the ground-truth label of the preceding record.
prior_states = [initial_state] + ordered_labels[1:]
for prior_state, data_point in zip(prior_states, labeled_data_dicts[1:]):
    label = data_point['label']
    radio_text = data_point['text']
    predicted = chat_completion(prior_state, radio_text, max_tokens=250)
    print(f'Predicted Response: {predicted}')
    print(f'Ground Truth: {radio_text, label}')
    print('-'*100)

Predicted Response: {"current_state": "Trial Start", "predicted_state": "Delay Start", "reason": "A fishing vessel is close to the trial area and could be a hazard to safe operations"}</s>
Ground Truth: ('Be aware, we do have a fishing vessel maybe 100 yards from our location.', 'Delay Start')
----------------------------------------------------------------------------------------------------
Predicted Response: {"current_state": Delay Start, "predicted_state": Delay End, "reason": "The text indicates that there are network problems which could delay the start of the trial."}</s>
Ground Truth: ("They're currently having network problems.", 'Delay Start')
----------------------------------------------------------------------------------------------------
Predicted Response: {"current_state": Delay Start, "predicted_state": Trial Start, "reason": "The text indicates that the trial will start at 1045."}</s>
Ground Truth: ("So 1015 for trial start, or I'm sorry, 1045 for trial start.", 'De

In [None]:
data_path = '/Users/americanthinker/Downloads/2024-05-14_all_tracks.csv'

In [324]:
def wrangle_initial_df(data_path: str, 
                       group_column: str='start', 
                       fill_state: bool=True
                       ) -> pd.DataFrame:
    """Load the transcript CSV and prepare it for per-minute grouping.

    Args:
        data_path: Path of the CSV file to read.
        group_column: Timestamp column the 'minute' key is derived from.
        fill_state: If True, forward-fill the 'state' column so each row
            carries the most recent state label above it.

    Returns:
        A DataFrame without the 'track2', 'start' and 'end' columns, with
        missing values replaced by '' and a trailing 'minute' column holding
        the time of `group_column` formatted as 'HH:MM'.
    """
    # drop track2 which is blank
    df = pd.read_csv(data_path).drop(columns='track2')

    # propagate the last seen state label onto the rows that follow it
    if fill_state:
        df['state'] = df['state'].ffill()

    # replace the remaining NaNs with empty strings; the non-inplace form is
    # used because an in-place fill that changes a column's dtype
    # (float NaN -> str) is a deprecated upcast in pandas
    df = df.fillna(value='')

    # create minute column for grouping data; it is added last so it ends up
    # as the final column of the returned frame
    df['minute'] = pd.to_datetime(df[group_column]).dt.strftime('%H:%M')

    # remove unneeded columns
    return df.drop(columns=['start', 'end'])

In [329]:
temp = wrangle_initial_df(data_path, fill_state=True)

In [330]:
temp

Unnamed: 0,track1,track3,track4,state,notes,delay type,minute
0,,,Is that independent?,,,,08:26
1,,Yes.,,,,,08:26
2,,,"Oh no, I didn't even hear it.",,,,08:26
3,,"Oh, the NavWar guy, I asked him.",,,,,08:26
4,,,Is there volume on this?,,,,08:26
...,...,...,...,...,...,...,...
10799,,,Yep.,RTB,,,17:52
10800,,,Hot wash time.,RTB,,,17:52
10801,,,Woo.,RTB,,,17:52
10802,,,It's kind of same deal.,RTB,,,17:52


In [331]:
from typing import Any

def fill_entry(grouped_minute: dict, row: list[Any]):
    """Append one row's values to the per-column lists of a minute bucket.

    The n-th list in `grouped_minute` (in key order) receives `row[n]`;
    any trailing items of `row` beyond the number of keys are ignored.
    The dict's lists are modified in place and nothing is returned.
    """
    for position, bucket in enumerate(grouped_minute.values()):
        bucket.append(row[position])

In [332]:
def group_data_by_minute(df: pd.DataFrame, 
                         merge_tracks: bool=False
                        ) -> dict:
    """Bucket the rows of `df` by the value in its last column.

    Each distinct value of the final column becomes a key whose value is a
    dict mapping every other column name to the list of that column's values
    for the rows in the bucket, in row order.

    When `merge_tracks` is True, the 'track1', 'track3' and 'track4' lists of
    every bucket are replaced by one string made of their non-empty entries
    joined with single spaces.
    """
    value_columns = df.columns[:-1]
    grouped_data = {}
    for row in df.values.tolist():
        minute_key = row[-1]
        # create the bucket on first sight, then fill it either way
        if minute_key not in grouped_data:
            grouped_data[minute_key] = {column: [] for column in value_columns}
        fill_entry(grouped_data[minute_key], row)

    if merge_tracks:
        for bucket in grouped_data.values():
            for num in (1, 3, 4):
                track_key = f'track{num}'
                bucket[track_key] = ' '.join(filter(None, bucket[track_key]))
    return grouped_data

In [337]:
def df_pipe(data_path: str, group_column: str='start', fill_state: bool=True, merge_tracks: bool=False) -> dict:
    """Run the CSV-to-grouped-dict pipeline in one call.

    Passes `data_path`, `group_column` and `fill_state` to
    `wrangle_initial_df`, then hands the resulting frame and `merge_tracks`
    to `group_data_by_minute` and returns its result.
    """
    return group_data_by_minute(
        wrangle_initial_df(data_path, group_column, fill_state),
        merge_tracks,
    )

In [341]:
data = df_pipe(data_path, merge_tracks=False)

In [342]:
data['11:56']

{'track1': ['',
  "the precautionary to the southwest, barge and tow, ETA to Emerald Bay should be, we'll give it a 14-30 ETA.",
  '',
  '',
  "Okay, we'll cross and turn to them, thank you.",
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  'Traffic, you there?'],
 'track3': ['moving with the scale sometimes, but red looks like both of them now.',
  '',
  '',
  'Cool.',
  '',
  'Cool.',
  '',
  "I'm going to head back up.",
  '',
  '',
  'Yeah.',
  'Cool.',
  '',
  '',
  '',
  '',
  'I thought in wool it used to be circles.',
  'Now they look like little boats.',
  ''],
 'track4': ['',
  '',
  "Do we have anything in the air for this one, or is this just Wambi's?",
  '',
  '',
  '',
  'Do we know?',
  "I think this one is just Wambi's because it's F-sized.",
  'Cool.',
  'Cool, thank you.',
  '',
  '',
  "Just to confirm, there's no air vehicles up?",
  'No.',
  'Cool.',
  "Then I won't call in the helo.",
  '',
  '',
  ''],
 'state': ['Delay Start',
  '

In [344]:
from group_data import df_pipe

In [None]:
test = df