# Imports

In [None]:
!pip install openai >> None

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0m

In [None]:
# Mount Google Drive at /content/drive; the dataset is later read from
# the 'drive/MyDrive/...' path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import openai
from openai import OpenAI
from posixpath import splitext
from tqdm import tqdm
import json

In [None]:
import json

# Load the annotation file. Later cells read it as a mapping from clip uid to
# a dict with (at least) the keys 'graphs', 'split' and 'summaries'.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('drive/MyDrive/UNICT_ALL_GRAPHS_FOR_DOWNSTREAM.json', 'r', encoding='utf-8') as file:
    master_data = json.load(file)

In [None]:
from google.colab import userdata

# Fetch the key stored under the name 'OpenAI_API_key' via google.colab.userdata
# so it is not hard-coded in the notebook.
api_key = userdata.get('OpenAI_API_key')
# The key is set both on the module and on an explicit client object;
# `client` is the object generate_responses calls.
openai.api_key = api_key
client = OpenAI(api_key=api_key)

# Define Prompts

In [None]:
# System prompt for graph anticipation that asks for five ranked predictions
# (shown in the example as Graph 6a-6e).
# NOTE(review): the mod2prompt table in generate_responses does not reference
# this constant — confirm whether it is still needed.
SYSTEM_PROMPT_ANTICIPATION_GRAPHS = """ You are assistant which models human behaviour very well.
You'll be provided with a sequence of graphs (1..N-1) describing the actions retrieved from a first-person view video.
Your task is to predict the next graph (N). Make 5 predictions and put most probable prediction first.
Example:
Graph 1: Camera wearer - verb - take; take - direct object - flour; take - from - package; take - with - right hand
Graph 2: Camera wearer - verb - add; add - direct object - flour; add - to - bowl; bowl - with - dough; add - with - right hand
Graph 3: Camera wearer - verb - press; press - direct object - dough; press - with - both hands
Graph 4: Camera wearer - verb - move; move - direct object - dough; move - from - bowl; move - to - scale
Graph 5: Camera wearer - verb - move; move - direct object - dough; move - from - bowl; move - to - scale
Predictions:
Graph 6a: Camera wearer - verb - remove; remove - direct object - dough; remove - from - scale; scale - to - bowl
Graph 6b: Camera wearer - verb - weigh; weigh - direct object - dough; weigh - with - scale; dough - on - scale
Graph 6c: Camera wearer - verb - pour; pour - direct object - dough; pour - from - scale; pour - into - mixing bowl
Graph 6d: Camera wearer - verb - pour; pour - direct object - object; pour - into - dough; pour - from - bottle; pour - with - right hand
Graph 6e: Camera wearer - verb - adjust; adjust - direct object - dough; adjust - with - left hand
"""

# System prompt for graph anticipation with a single predicted graph.
# generate_responses selects it for modality 'Graph'.
SYSTEM_PROMPT_ANTICIPATION_GRAPHS_ONE = """ You are assistant which models human behaviour very well.
You'll be provided with a sequence of graphs (1..N-1) describing the actions retrieved from a first-person view video.
Your task is to predict the next graph (N).
Example:
Graph 1: Camera wearer - verb - take; take - direct object - flour; take - from - package; take - with - right hand
Graph 2: Camera wearer - verb - add; add - direct object - flour; add - to - bowl; bowl - with - dough; add - with - right hand
Graph 3: Camera wearer - verb - press; press - direct object - dough; press - with - both hands
Graph 4: Camera wearer - verb - move; move - direct object - dough; move - from - bowl; move - to - scale
Graph 5: Camera wearer - verb - move; move - direct object - dough; move - from - bowl; move - to - scale
Prediction:
Graph 6: Camera wearer - verb - remove; remove - direct object - dough; remove - from - scale; scale - to - bowl
"""

# System prompt that generate_responses selects for modality 'GraphText'.
# NOTE(review): the instruction asks for a one-sentence summary of the overall
# activity, but the example ends with next-graph predictions (6a-6e), mixes
# plain-text and graph-style lines, and repeats "Example:" — confirm which task
# this prompt is meant to drive before relying on its results.
SYSTEM_PROMPT_ANTICIPATION_GRAPHS_TEXTS = """ You are a smart assistant which can model human behaviour very well.
You'll be provided with a sequence of actions retrieved from
 a first-person view video. Your task is to understand the general activity and describe it
 in one sentence. Please, provide very general summary and try avoid listing all the "atomic" activities. Example:
Example:
Graph 1: Camera wearer take flour from package with right hand
Graph 2: Camera wearer add flour to bowl with dough with right hand
Graph 3: Camera wearer press dough with both hands
Graph 4: Camera wearer - verb - move; move - direct object - dough; move - from - bowl; move - to - scale
Graph 5: Camera wearer - verb - move; move - direct object - dough; move - from - bowl; move - to - scale
Predictions:
Graph 6a: Camera wearer - verb - remove; remove - direct object - dough; remove - from - scale; scale - to - bowl; Graph 6b: Camera wearer - verb - weigh; weigh - direct object - dough; weigh - with - scale; dough - on - scale; Graph 6c: Camera wearer - verb - pour; pour - direct object - dough; pour - from - scale; pour - into - mixing bowl; Graph 6d: Camera wearer - verb - pour; pour - direct object - object; pour - into - dough; pour - from - bottle; pour - with - right hand; Graph 6e: Camera wearer - verb - adjust; adjust - direct object - dough; adjust - with - left hand
"""

# System prompt for verb-noun action anticipation with a single prediction.
# generate_responses selects it for modality 'Action'.
SYSTEM_PROMPT_ANTICIPATION_ACTIONS_ONE = """ You are assistant which models human behaviour very well.
You'll be provided with a sequence of verb-noun pairs (1..N-1) describing the actions retrieved from a first-person view video.
Your task is to predict the next action (N).
Example:
Action 1: take  flour
Action 2: add flour
Action 3: press dough
Action 4: move dough
Action 5: put dough
Prediction:
Action 6: remove dough
"""

# Prepare requests, define metrics

In [None]:
def sliding_window_sequence(lst, N=20, m=5):
    """
    Pairs each element from index N onward with the m elements preceding it.

    Every index i in [N, len(lst)) with i >= m acts as a target: the window
    lst[i-m:i] is collected together with lst[i], the element that follows it.

    :param lst: List to slide over.
    :param N: First index (0-indexed) that may be used as a target element.
    :param m: Number of preceding elements in each window.
    :return: Tuple (sequences, futures); sequences[k] is the window that
             precedes futures[k].
    """
    # Targets whose window would start before index 0 are skipped.
    targets = [i for i in range(N, len(lst)) if i >= m]
    sequences = [lst[i - m:i] for i in targets]
    futures = [lst[i] for i in targets]
    return sequences, futures

# Example usage
lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12,13,14]
N = 10  # Starting position
m = 5  # Length of the sequence

# NOTE(review): N is passed both as the start index and as the window length,
# so `m` above is unused and the printed windows have length 10 — confirm
# whether `sliding_window_sequence(lst, N, m)` was intended.
extracted_sequences, futures = sliding_window_sequence(lst, N, N)
print("Extracted Sequences:", extracted_sequences, futures)

Extracted Sequences: [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]] [11, 12, 13, 14]


In [None]:
def graph2txt(g):
    """
    Flattens a ';'-separated graph string into a plain-text phrase.

    Relations are split on '-'. The phrase consists of the first and last
    fields of the first relation, the last field of the second relation and,
    for every later relation that does not contain 'hand', its second and
    third fields.

    :param g: Graph string such as 'Camera wearer - verb - take; take - direct object - flour'.
    :return: The selected fields, stripped and joined with single spaces.
    """
    relations = g.split(';')
    head = relations[0].split('-')
    words = [
        head[0].strip(),
        head[-1].strip(),
        relations[1].split('-')[-1].strip(),
    ]
    for relation in relations[2:]:
        if 'hand' in relation:
            continue
        fields = relation.split('-')
        words.extend((fields[1].strip(), fields[2].strip()))
    return ' '.join(words)
# Quick check: the two relations containing 'hand' are left out of the result.
graph2txt('Camera wearer - verb - adjust; adjust - direct object - switch; adjust - in - box; adjust - with - left_hand; adjust - with - both_hands')

'Camera wearer adjust switch in box'

In [None]:
# Count the clips that have at least one summary, belong to the 'train' split
# and contain at least five graphs.
CNT = 0

for clip_uid, data in tqdm(master_data.items()):
    seq_len = len(data['graphs'])
    graph_split = data['split']
    # Guard clause: skip any clip that fails one of the three criteria.
    if len(data['summaries']) == 0 or graph_split != 'train' or seq_len < 5:
        continue
    CNT += 1
print(CNT)

100%|██████████| 221/221 [00:00<00:00, 196344.25it/s]

80





In [None]:
def calculate_accuracy(gt_actions, predicted_actions):
    """
    Calculates top-5 accuracy (percent, rounded to 2 decimals) for verb, noun, and action predictions.

    A verb (noun) counts as correct when it occurs among the first five
    predictions; an action counts as correct when one of the first five
    predictions matches both verb and noun. Pairs are compared by value, so
    lists and tuples may be mixed freely between ground truth and predictions.
    Top-1 accuracy is not reported by this function.

    :param gt_actions: List of ground truth (verb, noun) pairs.
    :param predicted_actions: List of lists of predicted (verb, noun) pairs; only the first 5 per GT are used.
                              An empty prediction list counts as a miss.
    :return: Dictionary with keys 'verb_top5', 'noun_top5', 'action_top5'.
             All values are 0.0 when there are no samples.
    :raises AssertionError: If the two inputs differ in length.
    """
    assert len(gt_actions) == len(predicted_actions)
    verb_top5, noun_top5, action_top5 = 0, 0, 0
    for gt, preds in zip(gt_actions, predicted_actions):
        gt_verb, gt_noun = gt
        # Normalise candidates to tuples: comparing a list GT against tuple
        # predictions (or vice versa) would otherwise never match.
        top5 = [(verb, noun) for verb, noun in preds[:5]]

        if gt_verb in [verb for verb, _ in top5]:
            verb_top5 += 1
        if gt_noun in [noun for _, noun in top5]:
            noun_top5 += 1
        if (gt_verb, gt_noun) in top5:
            action_top5 += 1

    total = len(gt_actions)

    def percent(hits):
        # Guard the empty-input case instead of dividing by zero.
        return round(hits * 100 / total, 2) if total else 0.0

    accuracies = {
        'verb_top5': percent(verb_top5),
        'noun_top5': percent(noun_top5),
        'action_top5': percent(action_top5),
    }

    return accuracies

In [None]:
def process_list(lst):
    """
    Cleans a tokenised action phrase.

    Empty tokens and the articles 'the'/'a' are dropped; if the second
    remaining token is one of 'in', 'up', 'on', 'down' it is merged into the
    first token (e.g. 'pick', 'up' -> 'pick up'); finally '-' is replaced by
    a space in every token.

    :param lst: List of string tokens.
    :return: New list of cleaned tokens.
    """
    articles = ('the', 'a')
    particles = ('in', 'up', 'on', 'down')

    tokens = []
    for token in lst:
        if token and token not in articles:
            tokens.append(token)

    # Fold a verb particle into the verb so that it stays a single token.
    if len(tokens) >= 2 and tokens[1] in particles:
        tokens[:2] = [' '.join(tokens[:2])]

    return [token.replace('-', ' ') for token in tokens]

def action2vn(responses):
    """
    Converts raw 'Action' completions into [verb, noun] pairs.

    For every completion only the first line after the first five characters
    is used (lower-cased, stripped and split on single spaces). Texts with
    more than two tokens are cleaned with process_list. The first two tokens
    become verb and noun, with '-' replaced by a space. When fewer than two
    tokens remain, the raw completion is printed and a small table of known
    single-token outputs is consulted; anything else yields ['NONE', 'NONE'].

    :param responses: List (one entry per sample) of lists of completion strings.
    :return: List of lists of [verb, noun] pairs with the same nesting as responses.
    """
    # Known single-token completions and the pairs they stand for.
    single_token_fixes = {
        'add-water-to-pastry': ('add', 'water'),
        'mix-dough': ('mix', 'dough'),
        'take-pictures': ('take', 'NONE'),
        'stand-up': ('stand up', 'NONE'),
    }
    preds = []
    for response in responses:
        pred = []
        for variant in response:
            # NOTE(review): the fixed 5-character slice presumably strips an
            # index prefix such as ' 21: '; a prefix of any other width would
            # be cut incorrectly — confirm against the raw completions.
            act = variant[5:].split('\n')[0].lower().strip().split(' ')
            if len(act) > 2:
                act = process_list(act)
            if len(act) >= 2:
                verb = act[0].replace('-', ' ')
                noun = act[1].replace('-', ' ')
            else:
                print(variant)
                # process_list may return an empty list; fall back to
                # NONE/NONE instead of indexing into it.
                key = act[0] if act else ''
                verb, noun = single_token_fixes.get(key, ('NONE', 'NONE'))
            pred.append([verb, noun])
        preds.append(pred)
    return preds
def graph2vn(responses):
    """
    Extracts [verb, noun] pairs from graph-formatted completions.

    Each completion is split on ';'. The verb is the text after the last '-'
    of the first edge and the noun is the text after the last '-' of the
    second edge, both lower-cased and stripped. A completion without a second
    edge gets an empty noun.

    :param responses: List (one entry per sample) of lists of completion strings.
    :return: List of lists of [verb, noun] pairs with the same nesting as responses.
    """
    def last_field(edge):
        return edge.split('-')[-1].lower().strip()

    preds = []
    for response in responses:
        pairs = []
        for variant in response:
            edges = variant.split(';')
            verb = last_field(edges[0])
            noun = last_field(edges[1]) if len(edges) > 1 else ''
            pairs.append([verb, noun])
        preds.append(pairs)
    return preds

# Get responses, normalize the formats

In [None]:
import time
def generate_responses(master_data, l, STOP_AFTER = -1, modality = 'Graph', lmax = 20):
    """
    Queries the completion model for next-step predictions over sliding windows of every clip.

    For each clip, every graph index i >= lmax (and >= l) is used as a target:
    the l graphs preceding it are rendered as text according to `modality`,
    appended to the system prompt chosen for that modality, and sent to the
    completions endpoint, which is asked for 5 samples.

    :param master_data: Mapping clip uid -> dict with a 'graphs' list. Each graph entry is read as
                        entry[3][0] (graph string), entry[3][1] (verb) and entry[3][2] (noun).
    :param l: Number of preceding graphs in each input window.
    :param STOP_AFTER: Return early once this many requests have been made; -1 disables the limit.
    :param modality: 'Graph', 'Action' or 'GraphText'.
    :param lmax: First graph index used as a prediction target.
    :return: Tuple (responses, gts, CNT): list of completion-text lists, list of [verb, noun]
             ground truths, and the number of requests made.
    :raises KeyError: If modality is not one of the three known values.
    """
    mod2prompt = {'Graph': SYSTEM_PROMPT_ANTICIPATION_GRAPHS_ONE, 'Action': SYSTEM_PROMPT_ANTICIPATION_ACTIONS_ONE, 'GraphText': SYSTEM_PROMPT_ANTICIPATION_GRAPHS_TEXTS}
    system_prompt = mod2prompt[modality]
    num_samples = 5  # completions requested (and kept) per window
    responses = []
    gts = []
    CNT = 0
    for data in tqdm(master_data.values()):
        graphs = data['graphs']
        indices, fut_idx = sliding_window_sequence(list(range(len(graphs))), lmax, l)
        for inp, fut in zip(indices, fut_idx):
            CNT += 1
            SEQ = ""
            for i in inp:
                if modality == 'Graph':
                    SEQ += '{} {}: {}\n'.format(modality, i+1, graphs[i][3][0])
                elif modality == 'Action':
                    SEQ += '{} {}: {} {}\n'.format(modality, i+1, graphs[i][3][1], graphs[i][3][2])
                elif modality == 'GraphText':
                    # Best effort: a graph that cannot be flattened is left
                    # out of the window. `Exception` (rather than a bare
                    # except) keeps Ctrl-C working during the long run.
                    try:
                        graphtext = graph2txt(graphs[i][3][0])
                    except Exception:
                        continue
                    SEQ += '{} {}: {}\n'.format('Action', i+1, graphtext)
            gts.append([graphs[fut][3][1], graphs[fut][3][2]])
            # NOTE(review): the cue always ends with 'Action ' even for the
            # 'Graph' modality, whose prompt example uses 'Prediction:' and
            # 'Graph N:' — confirm this is intended before changing it, since
            # it affects the shape of the completions parsed downstream.
            SEQ += 'Predictions:\nAction '

            # NOTE(review): "text-davinci-003" is a legacy completions model —
            # verify it is still served by the API.
            completion = client.completions.create(
                model="text-davinci-003",
                prompt=system_prompt+'\n'+SEQ,
                max_tokens = 64,
                n=num_samples
            )
            responses.append([choice.text for choice in completion.choices[:num_samples]])

            if CNT % 50 == 0:
                print(CNT)
            if STOP_AFTER > -1 and CNT == STOP_AFTER:
                return responses, gts, CNT
    return responses, gts, CNT

In [None]:
# Run the full anticipation experiment for the 'Graph' modality.
lmax = 20  # first graph index used as a prediction target
l = 5  # number of preceding graphs given as context
modality = 'Graph'
# STOP_AFTER=-1 means no request limit: every window of every clip is sent.
responses,gt,c = generate_responses(master_data, l=l, lmax=lmax, STOP_AFTER=-1, modality=modality)
print('Num of SEQS: ', c)

  1%|          | 2/221 [00:43<1:10:12, 19.23s/it]

50


  4%|▎         | 8/221 [01:37<36:42, 10.34s/it]

100


  4%|▍         | 9/221 [02:38<1:18:46, 22.30s/it]

150


  5%|▌         | 12/221 [03:33<1:11:36, 20.56s/it]

200


  7%|▋         | 16/221 [04:17<48:47, 14.28s/it]

250
300


  8%|▊         | 18/221 [05:42<1:22:22, 24.35s/it]

350


 10%|▉         | 22/221 [07:13<1:04:31, 19.46s/it]

400
450


 12%|█▏        | 27/221 [08:32<54:43, 16.92s/it]  

500


 14%|█▎        | 30/221 [09:50<1:00:54, 19.13s/it]

550


 16%|█▌        | 35/221 [10:57<51:49, 16.72s/it]

600


 16%|█▋        | 36/221 [11:13<50:50, 16.49s/it]

650
700


 17%|█▋        | 37/221 [13:07<1:55:34, 37.69s/it]

750
800
850
900


 19%|█▊        | 41/221 [17:35<2:11:19, 43.77s/it]

950
1000


 19%|█▉        | 43/221 [18:52<2:03:43, 41.71s/it]

1050


 21%|██        | 46/221 [20:28<1:37:43, 33.51s/it]

1100
1150


 22%|██▏       | 48/221 [22:12<1:56:17, 40.34s/it]

1200


 23%|██▎       | 50/221 [23:02<1:36:20, 33.80s/it]

1250


 24%|██▍       | 54/221 [23:28<51:47, 18.61s/it]  

1300
1350


 25%|██▍       | 55/221 [25:35<1:41:06, 36.54s/it]

1400


 27%|██▋       | 59/221 [26:44<1:05:51, 24.39s/it]

1450


 27%|██▋       | 60/221 [27:12<1:07:30, 25.16s/it]

1500


 28%|██▊       | 61/221 [28:28<1:40:03, 37.52s/it]

1550


 29%|██▉       | 64/221 [29:24<1:13:22, 28.04s/it]

1600


 29%|██▉       | 65/221 [29:55<1:14:08, 28.52s/it]

1650


 31%|███       | 68/221 [30:52<1:01:51, 24.26s/it]

1700


 33%|███▎      | 72/221 [32:15<49:07, 19.78s/it]

1750


 33%|███▎      | 73/221 [33:11<1:08:03, 27.59s/it]

1800


 40%|███▉      | 88/221 [33:55<08:24,  3.79s/it]

1850
1900


 43%|████▎     | 95/221 [35:44<13:14,  6.31s/it]

1950


 62%|██████▏   | 136/221 [36:44<01:42,  1.21s/it]

2000


 64%|██████▍   | 141/221 [37:42<06:47,  5.09s/it]

2050


 65%|██████▍   | 143/221 [38:32<14:49, 11.41s/it]

2100


 67%|██████▋   | 148/221 [39:22<10:38,  8.75s/it]

2150


 74%|███████▍  | 163/221 [39:57<02:13,  2.31s/it]

2200
2250


 76%|███████▌  | 168/221 [42:14<10:21, 11.73s/it]

2300


 77%|███████▋  | 171/221 [42:59<10:31, 12.64s/it]

2350
2400


 78%|███████▊  | 173/221 [44:34<18:59, 23.73s/it]

2450


 79%|███████▊  | 174/221 [45:45<27:11, 34.70s/it]

2500


 80%|███████▉  | 176/221 [46:22<21:12, 28.27s/it]

2550


 81%|████████  | 178/221 [46:46<15:59, 22.32s/it]

2600


 82%|████████▏ | 181/221 [48:25<16:23, 24.59s/it]

2650


 84%|████████▍ | 186/221 [49:17<07:49, 13.42s/it]

2700


 86%|████████▌ | 190/221 [50:11<05:55, 11.47s/it]

2750


 87%|████████▋ | 192/221 [50:25<04:44,  9.81s/it]

2800
2850


 88%|████████▊ | 195/221 [52:59<10:55, 25.21s/it]

2900


 92%|█████████▏| 203/221 [53:37<03:03, 10.22s/it]

2950


 96%|█████████▋| 213/221 [54:32<00:55,  6.91s/it]

3000


100%|██████████| 221/221 [55:18<00:00, 15.02s/it]

Num of SEQS:  3030





In [None]:
# Inspect the raw completion lists of the first 20 windows.
responses[:20]

[[' 21: Camera wearer - verb - switch; switch - direct object - drill; switch - to - off position; drill - with - left_hand',
  ' 21:Camera wearer - verb - turn; turn - direct object - drill; turn - off - switch; turn - with - left_hand',
  '\n Graph 21: Camera wearer - verb - remove; remove - direct object - wood; remove - from - drill; drill - with - left_hand',
  ' Graph 21: Camera wearer - verb - remove; remove - direct object - drill; remove - from - piece; remove - with - left_hand',
  ' 21: Camera wearer - verb - move; move - direct_object - drill; move_from - wood; move - to - drill box; move - with - left hand'],
 [' Graph 22: Camera wearer - verb - move; move - direct object - drill; move - from - wood; move - to - table',
  '\nGraph 22: Camera wearer - verb - insert; insert - direct object - screw; insert - with - left_hand',
  '\nGraph 22: Camera wearer - verb - replace; replace - direct object - screwdriver; replace - with - left_hand',
  ' Graph 22: Camera wearer - verb -

In [None]:
def process_list(lst):
    """
    Normalises the tokens of an action phrase.

    Steps, in order: drop empty tokens and the articles 'the'/'a'; if the
    second remaining token is 'in', 'up', 'on' or 'down', join it onto the
    first token with a space; replace '-' by a space inside every token.

    :param lst: List of string tokens.
    :return: New list with the normalised tokens.
    """
    dropped = ('the', 'a')
    particles = ('in', 'up', 'on', 'down')

    kept = [tok for tok in lst if tok and tok not in dropped]

    has_particle = len(kept) >= 2 and kept[1] in particles
    if has_particle:
        # e.g. ['pick', 'up', 'dough'] -> ['pick up', 'dough']
        merged = ' '.join([kept[0], kept[1]])
        kept = [merged] + kept[2:]

    return [tok.replace('-', ' ') for tok in kept]

def action2vn(responses):
    """
    Converts raw 'Action' completions into [verb, noun] pairs.

    For every completion only the first line after the first five characters
    is used (lower-cased, stripped and split on single spaces). Texts with
    more than two tokens are cleaned with process_list. The first two tokens
    become verb and noun, with '-' replaced by a space. When fewer than two
    tokens remain, the raw completion is printed and a small table of known
    single-token outputs is consulted; anything else yields ['NONE', 'NONE'].

    :param responses: List (one entry per sample) of lists of completion strings.
    :return: List of lists of [verb, noun] pairs with the same nesting as responses.
    """
    # Known single-token completions and the pairs they stand for.
    single_token_fixes = {
        'add-water-to-pastry': ('add', 'water'),
        'mix-dough': ('mix', 'dough'),
        'take-pictures': ('take', 'NONE'),
        'stand-up': ('stand up', 'NONE'),
    }
    preds = []
    for response in responses:
        pred = []
        for variant in response:
            # NOTE(review): the fixed 5-character slice presumably strips an
            # index prefix such as ' 21: '; a prefix of any other width would
            # be cut incorrectly — confirm against the raw completions.
            act = variant[5:].split('\n')[0].lower().strip().split(' ')
            if len(act) > 2:
                act = process_list(act)
            if len(act) >= 2:
                verb = act[0].replace('-', ' ')
                noun = act[1].replace('-', ' ')
            else:
                print(variant)
                # process_list may return an empty list; fall back to
                # NONE/NONE instead of indexing into it.
                key = act[0] if act else ''
                verb, noun = single_token_fixes.get(key, ('NONE', 'NONE'))
            pred.append([verb, noun])
        preds.append(pred)
    return preds
# responses_normalized_action_short = action2vn(responses)

In [None]:
def graph2vn(responses):
    """
    Extracts [verb, noun] pairs from graph-formatted completions.

    Each completion is split on ';'. The verb is the text after the last '-'
    of the first edge and the noun is the text after the last '-' of the
    second edge, both lower-cased and stripped. A completion without a second
    edge is printed for inspection and gets an empty noun.

    :param responses: List (one entry per sample) of lists of completion strings.
    :return: List of lists of [verb, noun] pairs with the same nesting as responses.
    """
    def last_field(edge):
        return edge.split('-')[-1].lower().strip()

    preds = []
    for response in responses:
        pairs = []
        for variant in response:
            edges = variant.split(';')
            verb = last_field(edges[0])
            if len(edges) > 1:
                noun = last_field(edges[1])
            else:
                # Malformed completion: show it and fall back to an empty noun.
                print(variant)
                noun = ''
            pairs.append([verb, noun])
        preds.append(pairs)
    return preds

In [None]:
# Reduce every raw completion to a [verb, noun] pair; graph2vn prints the
# completions that have no second ';'-separated edge.
responses_normalized = graph2vn(responses)
print(responses_normalized)


			    -verb-    		Direct Object	    -from/to-	Location
Graph 67: Camera wearer - fold - cloth - from - bed - to - bowl
 52: Camera wearer - verb - move, move - direct object - cloth, move - from - slab, move - to - tap.
 - Verb: Push
Direct Object: Paper
Using: Right Hand
Location: Table
		: Graph 44 
Camera wearer 	: verb - place 
Direct Object 	: card 
Location 	: table 
Object 		: right_hand

 106 Camera wearer - verb - cut, cut - direct object - potato, cut - with - knife
		: Graph 67
Verb 		: Move
Direct Object	: Iron
From		: Right Hand
To		: Bowl

Graph 39: Camera wearer - verb - twist
Direct Object: Steel
With: Wrench
In: Right Hand
			Object 				From/With
Graph 72: 		Cut 				Dough 			Knife
Graph 73: 		Collect 			Crumbs 			Bowl
Graph 74: 		Remove 	
	Direct Object 	Verb 	To/From 	With
Graph 23: Camera wearer 	plug	insert 	socket 	right hand

[[['switch', 'drill'], ['turn', 'drill'], ['remove', 'wood'], ['remove', 'drill'], ['move', 'drill']], [['move', 'drill'], ['insert', 'scr

In [None]:
# Normalize the ground truth the same way as the predictions: hyphens become
# spaces (e.g. 'pick-up' -> 'pick up') in both the verb and the noun.
gts_norm = [[a[0].replace('-',' '), a[1].replace('-',' ')] for a in gt] # replace pick-up to pick up - normalize GT
print(gts_norm[:10])

[['place', 'drill'], ['drill', 'wood'], ['place', 'drill'], ['take', 'wood'], ['put down', 'wood'], ['take', 'wood'], ['take', 'wood'], ['place', 'wood'], ['place', 'wood'], ['drill', 'wood']]


# Evaluation

In [None]:
def calculate_accuracy(gt_actions, predicted_actions):
    """
    Calculates top-1 and top-5 accuracy (in percent) for verb, noun, and action predictions.

    Top-1 looks only at the first prediction of each sample, top-5 at the
    first five. A verb (noun) counts as correct when it occurs among the
    considered predictions; an action counts as correct when one of them
    matches both verb and noun. Pairs are compared by value, so lists and
    tuples may be mixed freely between ground truth and predictions.

    :param gt_actions: List of ground truth (verb, noun) pairs.
    :param predicted_actions: List of lists of predicted (verb, noun) pairs; only the first 5 per GT are used.
                              An empty prediction list counts as a miss.
    :return: Dictionary with keys 'verb_top1', 'noun_top1', 'action_top1',
             'verb_top5', 'noun_top5', 'action_top5'. All values are 0.0 when there are no samples.
    :raises AssertionError: If the two inputs differ in length.
    """
    assert len(gt_actions) == len(predicted_actions)
    verb_top1, noun_top1, action_top1 = 0, 0, 0
    verb_top5, noun_top5, action_top5 = 0, 0, 0
    for gt, preds in zip(gt_actions, predicted_actions):
        gt_verb, gt_noun = gt
        # Normalise candidates to tuples: comparing a list GT against tuple
        # predictions (or vice versa) would otherwise never match.
        top5 = [(verb, noun) for verb, noun in preds[:5]]
        target = (gt_verb, gt_noun)

        # Check top-1 accuracy (a sample without predictions is a miss)
        if top5:
            pred_verb, pred_noun = top5[0]
            if gt_verb == pred_verb:
                verb_top1 += 1
            if gt_noun == pred_noun:
                noun_top1 += 1
            if target == top5[0]:
                action_top1 += 1

        # Check top-5 accuracy
        if gt_verb in [verb for verb, _ in top5]:
            verb_top5 += 1
        if gt_noun in [noun for _, noun in top5]:
            noun_top5 += 1
        if target in top5:
            action_top5 += 1

    total = len(gt_actions)

    def percent(hits):
        # Guard the empty-input case instead of dividing by zero.
        return hits * 100 / total if total else 0.0

    accuracies = {
        'verb_top1': percent(verb_top1),
        'noun_top1': percent(noun_top1),
        'action_top1': percent(action_top1),
        'verb_top5': percent(verb_top5),
        'noun_top5': percent(noun_top5),
        'action_top5': percent(action_top5),
    }

    return accuracies

In [None]:
# Score the normalized predictions against the normalized ground truth.
accuracies = calculate_accuracy(gts_norm, responses_normalized)
print("Accuracies Graphs 5:", accuracies)

Accuracies Graphs 5: {'verb_top1': 3.3333333333333335, 'noun_top1': 48.84488448844885, 'action_top1': 1.881188118811881, 'verb_top5': 9.537953795379538, 'noun_top5': 66.03960396039604, 'action_top5': 5.247524752475248}


In [None]:
# Show the first three normalized ground-truth [verb, noun] pairs.
gts_norm[:3]

[['place', 'drill'], ['drill', 'wood'], ['place', 'drill']]

In [None]:
# Show the predictions for the first three windows.
# NOTE(review): the recorded output below lists tuples and different verbs
# than the earlier print of responses_normalized, whereas graph2vn builds
# lists — the output appears to come from a different run; re-execute to confirm.
responses_normalized[:3]

[[('remove', 'drill'),
  ('withdraw', 'drill'),
  ('withdraw', 'drill'),
  ('remove', ''),
  ('withdraw', 'drill')],
 [('remove', 'wood'),
  ('place', 'drill'),
  ('pick', 'screw'),
  ('remove', 'drill'),
  ('fix', 'wood')],
 [('remove', 'drill'),
  ('withdraw', 'drill'),
  ('remove', 'drill'),
  ('withdraw', 'drill'),
  ('withdraw', 'drill')]]