In [1]:
from monitorlib import load_pcdata
import pandas as pd

def sync_positions(data,sample,i):
    """Return the cursor position recorded closest in time to screenshot ``i``.

    The timestamp of the ``i``-th screenshot of ``sample`` is compared with
    every timestamp in ``data[sample]['moves']``; the move with the smallest
    absolute time difference wins, whether it happened before or after the
    screenshot.

    Args:
        data: mapping ``sample -> {'screenshots': ..., 'moves': ...}`` whose
            values are indexable by column name (``timestamp``, ``px``, ``py``).
        sample: key of the sample to look at.
        i: label of the screenshot inside ``data[sample]['screenshots']``.

    Returns:
        Tuple ``(px, py)`` read from the selected move.
    """
    recording = data[sample]
    moves = recording['moves']
    shot_time = recording['screenshots']['timestamp'][i]

    # Index label of the move with the smallest absolute gap to the screenshot.
    nearest = abs(moves['timestamp'] - shot_time).idxmin()
    return moves['px'][nearest], moves['py'][nearest]

def sync_MKevents(sample_df, data, sample):
    """Attach mouse and keyboard (M&K) events to the screenshots of one sample.

    A ``mouse_keyboard`` column is (re)created on ``sample_df`` and filled with
    ``None``. Every click, scroll and key event found in ``data[sample]`` is
    then written onto the row of the latest screenshot taken at or before the
    event. Clicks and scrolls also overwrite that row's ``px``/``py`` with the
    event position.

    Args:
        sample_df: frame with one row per screenshot (same index as
            ``data[sample]['screenshots']``) and ``px``/``py`` columns.
            Modified in place.
        data: mapping ``sample -> {'screenshots', 'clicks', 'scrolls', 'keys'}``.
        sample: key of the sample to process.

    Returns:
        None. ``sample_df`` is mutated.

    Notes:
        * When two events land on the same screenshot a message is printed and
          the later event replaces the earlier one.
        * An event that happened before the first screenshot has no row to be
          attached to; it is reported and skipped instead of raising.
    """

    def set_value(sample_df, tdiff, event, position=None):
        # tdiff = screenshot timestamps - event timestamp, so non-positive
        # entries are the screenshots taken at or before the event.
        earlier = tdiff[tdiff <= 0]
        if earlier.empty:
            print('no screenshot before M&K event', event)
            return
        # Smallest absolute difference == screenshot immediately before the event.
        closest_index = earlier.abs().idxmin()
        previous = sample_df.at[closest_index, 'mouse_keyboard']
        if previous is not None:
            print('colision between M&K events', previous, event)   # what to do?
        sample_df.loc[closest_index, 'mouse_keyboard'] = event
        if position is not None:
            sample_df.loc[closest_index, 'px'] = position[0]
            sample_df.loc[closest_index, 'py'] = position[1]

    # Initial values
    sample_df['mouse_keyboard'] = [None] * len(sample_df)
    timestamps_df = data[sample]['screenshots']['timestamp']

    # Clicks (iterate over values so the events' own index labels do not matter)
    clicks = data[sample]['clicks']
    for ti, button, click_px, click_py in zip(
        clicks['timestamp'], clicks['button'], clicks['px'], clicks['py']
    ):
        set_value(sample_df, timestamps_df - ti, button, (click_px, click_py))

    # Scrolls
    # NOTE(review): dy == 1 is labelled 'scroll_down' and dy == -1 'scroll_up';
    # confirm this matches the sign convention of the recorder.
    scroll_names = {1: 'scroll_down', -1: 'scroll_up'}
    scrolls = data[sample]['scrolls']
    for ti, dy, scroll_px, scroll_py in zip(
        scrolls['timestamp'], scrolls['dy'], scrolls['px'], scrolls['py']
    ):
        scroll = scroll_names.get(dy, 'what?_scroll')
        set_value(sample_df, timestamps_df - ti, scroll, (scroll_px, scroll_py))

    # Keyboard events (keys are stored quoted, e.g. "'a'"; drop the quotes)
    keys = data[sample]['keys']
    for ti, key in zip(keys['timestamp'], keys['key']):
        set_value(sample_df, timestamps_df - ti, key.strip("'"))

def sync(data):
    """Build one synchronized frame per sample.

    For every key of ``data`` a new DataFrame is created with the screenshot
    paths (``img_path``), the cursor position matched to each screenshot by
    ``sync_positions`` (``px``, ``py``) and the mouse/keyboard events attached
    by ``sync_MKevents`` (``mouse_keyboard``).

    Args:
        data: mapping ``sample -> recordings`` as consumed by
            ``sync_positions`` and ``sync_MKevents``.

    Returns:
        List of DataFrames, one per sample, in the iteration order of ``data``.
    """
    synced = []
    for sample in data.keys():
        print(f'processing {sample}')
        screenshots = data[sample]['screenshots']

        sample_df = pd.DataFrame()
        sample_df['img_path'] = list(screenshots['img_path'])

        # Sync cursor positions to screenshots
        xs, ys = [], []
        for shot in range(len(screenshots)):
            px, py = sync_positions(data, sample, shot)
            xs.append(px)
            ys.append(py)
        sample_df['px'] = xs
        sample_df['py'] = ys

        # Sync mouse and keyboard events to screenshots
        sync_MKevents(sample_df, data, sample)

        synced.append(sample_df)
    return synced

data = load_pcdata('data')

# Synchronization. Order in time: 1) M&K events, 2) screenshots and 3) cursor positions
samples = sync(data)

# Screenshots without an event carry None in 'mouse_keyboard'; give them an
# explicit label. Only that column is filled, so the numeric px/py columns can
# never receive the string placeholder.
for sample in samples:
    sample['mouse_keyboard'] = sample['mouse_keyboard'].fillna("no_action")

# Tokenization: sorted vocabulary of every action label seen across the samples
tokens = set()
for sample in samples:
    tokens.update(sample['mouse_keyboard'].unique().tolist())
tokens = sorted(tokens)
print(len(tokens))
print(tokens)

# Add timestamp column. File names look like '<counter>_<seconds>.jpg'; the
# file name is taken from the end of the path so the directory depth is irrelevant.
for sample in samples:
    sample['time'] = sample['img_path'].map(
        lambda path: float(path.rsplit('/', 1)[-1].rsplit('_', 1)[-1].split('.jpg')[0])
    )

samples[0]

processing sample1
processing sample2
processing sample3
processing sample4
colision between M&K events scroll_down scroll_down
processing sample5
processing sample6
processing sample7
7
['Button.left', 'Key.enter', 'Key.esc', 'no_action', 'o', 'scroll_down', 'z']


Unnamed: 0,img_path,px,py,mouse_keyboard,time
0,data/sample1/0000000000_0.04096198081970215.jpg,570,428,no_action,0.040962
1,data/sample1/0000000001_0.12202715873718262.jpg,570,428,no_action,0.122027
2,data/sample1/0000000002_0.19535470008850098.jpg,570,428,no_action,0.195355
3,data/sample1/0000000003_0.26665711402893066.jpg,570,428,no_action,0.266657
4,data/sample1/0000000004_0.3381388187408447.jpg,570,428,no_action,0.338139
...,...,...,...,...,...
726,data/sample1/0000000726_50.409887075424194.jpg,1360,300,no_action,50.409887
727,data/sample1/0000000727_50.47743463516235.jpg,1360,300,no_action,50.477435
728,data/sample1/0000000728_50.54669976234436.jpg,1360,300,no_action,50.546700
729,data/sample1/0000000729_50.616905212402344.jpg,1360,300,no_action,50.616905


Entendiendo lo que estoy haciendo: 

Tipos de problema: 
* Problema de Aproximación de acciones de M&K 
* Problema de traduccion de instrucciones escritas a acciones de K&M
* Problema de crear un modelo del mundo que aprenda a predecir la dinámica (screen, cursor position, acciones M&K) y usarlo para finetuning de tareas específicas (un PC-GPT)
* Problema crear una base de datos para
* Problema crear una funcion de valor (mediante RLHF) para evaluar la ejecucion de micro-tareas y usarla para entrenar un agente por RL

1) Aproximar la secuencia de acciones K&M no basta; se requiere generalizar para poder adaptar la política a los cambios.
2) Reproducir con código la secuencia de acciones colectadas puede fallar si las condiciones, como iniciar sesión, no son las mismas.

No tiene sentido esforzarme por aproximar las acciones. Mejor enfócate en:
1) recolectar una base de datos SINGULAR para un modelo del mundo de PC
2) reproducir el agente propuesto por 2022 para su benchmark
3) eficientarlo y adaptarlo para mis datos.
4) Entender ViT con LoRA

Creo que necesitamos una accion de espera antes de la siguiente accion
y modelo que evalue cuando una accion se ejecuto correctamente. 

IDEA: Aplicar RLHF para aprender una función de valor que evalúe qué tan bien se ejecutó una acción-instrucción
Y luego usarla para entrenar un agente con PPO


Lecciones: 
El espacio de acciones debe incluir el tiempo de espera (+ coordenadas de clicks)
* La tarea puede ser completada en menor tiempo que los humanos (y algo independiente del tiempo)
* No necesito grabar todos los screenshots ni todas las posiciones: podría quedarme con los estados que corresponden a los eventos con acciones.
* Aproximar la secuencia de acciones no me va a servir para generalizar, y generalizar es indispensable para tener una política robusta a los cambios
Generalizar implica captar la relación entre las partes importantes del screenshot, las acciones y los tiempos de espera.
* Las apps como zoom cambian un poco su interfaz de usuario inesperadamente, por lo que un programa debe ser capaz de adaptarse.
* El código del replayer es muy sensible a cambios pequeños
* Aun cuando reducir de 700 a 18 steps es una gran estrategia, también va a provocar zonas ciegas en el actuar del modelo. Así que debo escalar a procesar toda la secuencia eventualmente.


Luego:
* Entrenar un modelo que prediga (M&K action (clasif), cursor position (regresión), waiting time (regresión)) from (screenshots - vae features)
Luego:
* replicar el artículo de DeepMind




In [13]:
import pyautogui
import time

def pcontroller(action, position, delay):
    """Replay one recorded mouse/keyboard action with pyautogui, then wait.

    The cursor is first moved to ``position``; the action is then performed:

    * ``'Button.left'`` / ``'Button.right'`` / ``'Button.middle'``: click with
      that mouse button.
    * ``'scroll_down'`` / ``'scroll_up'``: scroll one step (-1 / +1).
    * ``'Key.enter'`` / ``'Key.esc'``: press that key.
    * a single alphabetic character: type it.

    Any other action (including unsupported ``Button.*``, ``scroll_*`` and
    ``Key.*`` labels) is reported on stdout and nothing is sent to the machine.

    Args:
        action: action label, as found in the ``mouse_keyboard`` column.
        position: ``(x, y)`` screen coordinates for the cursor.
        delay: seconds to sleep after the action (also applied when the
            action is not recognised).
    """
    mouse_buttons = {'Button.left': 'left', 'Button.right': 'right', 'Button.middle': 'middle'}
    scroll_clicks = {'scroll_down': -1, 'scroll_up': 1}
    special_keys = {'Key.enter': 'enter', 'Key.esc': 'esc'}

    # Mouse
    if action in mouse_buttons:
        pyautogui.moveTo(position[0], position[1])
        pyautogui.click(button=mouse_buttons[action])
    elif action in scroll_clicks:
        pyautogui.moveTo(position[0], position[1])
        pyautogui.scroll(scroll_clicks[action])
    # Keyboard
    elif action in special_keys:
        pyautogui.moveTo(position[0], position[1])
        pyautogui.press(special_keys[action])
    # Write with keyboard
    elif len(action) == 1 and action.isalpha():
        pyautogui.moveTo(position[0], position[1])
        pyautogui.write(action)
    else:
        # Previously unsupported Button./scroll_/Key. labels were dropped
        # without any message; report every action that cannot be replayed.
        print(f'This action {action} does not exist')
    time.sleep(delay)

# Tool for debugging actions
from PIL import Image, ImageDraw, ImageFont

def viz_actions(actions_df, delay=1.0):
    """Show each action drawn on top of its screenshot (debugging aid).

    For every row of ``actions_df`` the image at ``img_path`` is opened and
    annotated: mouse-button actions get a red dot at ``(px, py)`` labelled with
    the row index, every other action has its label written near the centre
    of the image. The annotated image is opened with ``Image.show`` and the
    function sleeps ``delay`` seconds before moving on to the next row.

    Args:
        actions_df: frame with ``img_path``, ``px``, ``py`` and
            ``mouse_keyboard`` columns.
        delay: seconds to wait between two images.
    """
    def load_font(points):
        # arial.ttf is not installed on every system; fall back to Pillow's
        # built-in font instead of failing before anything is shown.
        try:
            return ImageFont.truetype("arial.ttf", points)
        except OSError:
            return ImageFont.load_default()

    mfont = load_font(40)
    kfont = load_font(80)
    size = 10
    radius = size / 2
    for index, action in actions_df.iterrows():
        px = action['px']
        py = action['py']
        # The context manager releases the image file once it has been shown.
        with Image.open(action['img_path']) as img:
            draw = ImageDraw.Draw(img)
            if 'Button.' in action['mouse_keyboard']:
                draw.ellipse([px - radius, py - radius, px + radius, py + radius], fill="red")
                draw.point((px, py), fill='yellow')
                draw.text((px, py), str(index), fill='red', font=mfont)
            else:
                # Centre taken from the image itself rather than assuming 1920x1080.
                draw.text((img.width // 2 - 100, img.height // 2), action['mouse_keyboard'], fill='red', font=kfont)
            img.show()
        time.sleep(delay)

In [14]:
import numpy as np

sample = samples[0]

# Extract just actions
actions_df = sample[sample['mouse_keyboard'] != 'no_action'].copy()
# Delay after each action = time until the next action; the last action gets 0.0.
# diff/shift also works when no action was recorded at all.
actions_df['delay'] = actions_df['time'].diff().shift(-1).fillna(0.0)

# Adjusting manually: corrected coordinates keyed by screenshot index.
manual_fixes = {
    179: {'py': 600},
    426: {'px': 1100, 'py': 765},
    589: {'px': 1025, 'py': 790},
}
for label, fix in manual_fixes.items():
    # Assigning through .loc to a missing label would silently append a new,
    # mostly-NaN row, so only existing actions are adjusted.
    if label not in actions_df.index:
        print(f'skipping manual adjustment: no action at index {label}')
        continue
    for column, value in fix.items():
        actions_df.loc[label, column] = value

# Visualization
#viz_actions(actions_df, delay=3.0)

# Execution: replays the actions on this machine (moves the mouse, types keys).
# Set RUN_REPLAY to False to run the notebook without taking over the desktop.
RUN_REPLAY = True
if RUN_REPLAY:
    time.sleep(2)
    for index, action in actions_df.iterrows():
        pcontroller(action['mouse_keyboard'], (action['px'], action['py']), action['delay'])
    pyautogui.alert('The execution has been terminated.')

In [11]:
# Export the replayable actions; index=False leaves the frame's index labels out of the file.
actions_df.to_csv('actions_df - sample0.csv', index=False)

In [12]:
# Read the exported CSV back to inspect what was written.
pd.read_csv('actions_df - sample0.csv')

Unnamed: 0,img_path,px,py,mouse_keyboard,time,delay
0,data/sample1/0000000020_1.4988057613372803.jpg,28,131,Button.left,1.498806,1.714897
1,data/sample1/0000000044_3.213702917098999.jpg,1754,45,Button.left,3.213703,0.772685
2,data/sample1/0000000055_3.9863882064819336.jpg,1754,45,z,3.986388,0.200026
3,data/sample1/0000000058_4.186413764953613.jpg,1754,45,o,4.186414,0.142454
4,data/sample1/0000000060_4.3288679122924805.jpg,1754,46,o,4.328868,0.567354
5,data/sample1/0000000068_4.89622163772583.jpg,1754,46,Key.enter,4.896222,7.822318
6,data/sample1/0000000179_12.718539237976074.jpg,1040,567,Button.left,12.718539,3.662455
7,data/sample1/0000000232_16.380994081497192.jpg,1075,275,Button.left,16.380994,11.310436
8,data/sample1/0000000393_27.69142985343933.jpg,1017,564,Button.left,27.69143,1.586738
9,data/sample1/0000000415_29.278167486190796.jpg,1080,294,Button.left,29.278167,0.773925
