In [1]:
import firebase_admin
from firebase_admin import firestore
import os
import pandas as pd
from pathlib import Path
import sys
sys.path.append(os.path.dirname(os.getcwd() ))

import visualize


In [2]:
# Connect to Firestore using the service-account key in 'key.json'
# (path is relative to the current working directory).
cred = firebase_admin.credentials.Certificate('key.json')
try:
  # Re-running this cell must not call initialize_app() a second time: the SDK
  # raises ValueError when the default app already exists, so reuse it.
  default_app = firebase_admin.get_app()
except ValueError:
  # No default app yet (fresh kernel): create it.
  default_app = firebase_admin.initialize_app(cred)
db = firestore.client()
# Handles to the two production collections used throughout the notebook.
instructions_ref = db.collection('instructions')
verify_ref = db.collection('verification')


In [3]:
# Materialise both production collections once; later cells reuse these lists.
list_instruction = [*instructions_ref.get()]
list_verification = [*verify_ref.get()]
# Displayed as the cell output: (number of instructions, number of verifications).
(len(list_instruction), len(list_verification))


(3801, 4246)

In [4]:
# Handles to the '*_sandbox' counterparts of the two collections.
instructions_ref_sandbox, verify_ref_sandbox = (
  db.collection(collection_name)
  for collection_name in ('instructions_sandbox', 'verification_sandbox'))

# Materialise both sandbox collections and display their sizes.
list_instruction_sandbox = [*instructions_ref_sandbox.get()]
list_verification_sandbox = [*verify_ref_sandbox.get()]
(len(list_instruction_sandbox), len(list_verification_sandbox))

(52, 317)

In [5]:
# Qualified workers
# The CSV is looked up in the parent of the current working directory.
current_dir = os.getcwd()
parent_dir = Path(current_dir).parent.absolute()
path_qualified = os.path.join(parent_dir, "qualified_workers.csv")

# Read without a header row, so the columns are labelled 0, 1, ...;
# column 0 is the one turned into the list of qualified worker ids.
qualified_workers_frame = pd.read_csv(path_qualified, header=None)
qualified_work_list = qualified_workers_frame[0].tolist()

qualified_work_list

['1',
 'AY7WPVKHVNBLG',
 'A1198W1SPF1R4',
 'A3QAL3X23LHJRV',
 'A3GWRDHAURRNK6',
 'A2DDPSXH2X96RF',
 'ALYR5CI2SM2JC',
 'A320QA9HJFUOZO',
 'APRZ7BR8C0ZMQ',
 'A3862RIFFUV141',
 'A1AKL5YH9NLD2V',
 'A2WX434EAQOE29',
 'AYIFHDQSXQJ6B',
 'A21LONLNBOB8Q',
 'A3UENPLNM9AQBK']

In [19]:
# Set qualified instructions as RVS_excellent  - UPDATES DATA!!!
# Each snapshot is converted to a dict once, and worker membership is tested
# against a set rather than by scanning the list for every document.
qualified_worker_ids = set(qualified_work_list)
qualified_instructions = [
  record['key']
  for record in (doc.to_dict() for doc in list_instruction)
  if record['work_id'] in qualified_worker_ids]

# Writes to Firestore: one update per matching instruction document.
# The loop variable is not called `id`, so the builtin id() stays usable.
for instruction_key in qualified_instructions:
  instructions_ref.document(instruction_key).update(
    {'review': 'RVS_excellent'})


In [None]:
# Split instructions to train, dev, test sets.
import json

instruction_dir = "/mnt/hackney/human_data"

def save_split(split_name, split_set, out_dir=None):
  """Write one dataset split as JSON Lines (one JSON document per line).

  Args:
    split_name: File name of the split, e.g. "train.json".
    split_set: Sized iterable of samples. Objects that `json` cannot serialise
      natively are written as their `__dict__`.
    out_dir: Directory to write into. When omitted, the notebook-level
      `instruction_dir` is used.

  An existing file with the same name is overwritten.
  """
  target_dir = instruction_dir if out_dir is None else out_dir
  split_path = os.path.join(target_dir, split_name)
  # Mode 'w' truncates an existing file, which replaces the former
  # exists() / remove() / open(..., 'a') sequence with a single step.
  # The context manager flushes on close, so no per-line flush is needed.
  with open(split_path, 'w') as outfile:
    for sample in split_set:
      json.dump(sample, outfile, default=lambda o: o.__dict__)
      outfile.write('\n')

  print (f"Size of {split_name} set:{len(split_set)} saved to: {split_path}")


# Save set splits

# NOTE(review): 'X' looks like a placeholder for the held-out worker ids;
# confirm the real ids before running. With identical lists, the dev and test
# files contain the same samples.
dev_ids = ['X']
test_ids = ['X']

# Convert every snapshot to a dict once and keep the Manhattan samples; the
# three splits below are then filtered by worker id from this one list.
manhattan_samples = [
  sample for sample in (doc.to_dict() for doc in list_instruction)
  if 'Manhattan' in sample['region']]
held_out_ids = set(dev_ids) | set(test_ids)

test_set = [
  sample for sample in manhattan_samples if sample['work_id'] in test_ids]

save_split("test.json", test_set)

dev_set = [
  sample for sample in manhattan_samples if sample['work_id'] in dev_ids]

save_split("dev.json", dev_set)

# Everything written by a worker that is in neither held-out list.
train_set = [
  sample for sample in manhattan_samples if sample['work_id'] not in held_out_ids]

save_split("train.json", train_set)


In [6]:
# check the new workers ID
# Keep Manhattan instructions that carry a 'date_start' later than 2022-11-01
# (the comparison is done on the stored value as-is).
recent_manhattan_records = []
for doc in list_instruction:
    record = doc.to_dict()
    is_recent_manhattan = (
        'Manhattan' in record['region']
        and 'date_start' in record
        and '2022-11-01' < record['date_start'])
    if is_recent_manhattan:
        recent_manhattan_records.append(record)
df_instruction_manhattan = pd.DataFrame(recent_manhattan_records)

df_verification = pd.DataFrame([doc.to_dict() for doc in list_verification])

# Column names present on both sides receive the default merge suffixes:
# _x for the instruction frame, _y for the verification frame.
df_manhattan = pd.merge(
    df_instruction_manhattan, df_verification,
    left_on='key', right_on='key_instruction')

work_ids = list(set(df_manhattan['work_id_x'].to_list()))

# Print the verification results of every worker not yet on the qualified list.
shown_columns = ['date_start_x', 'review', 'hit_id_x', 'dist_m']
for work_id in work_ids:
    if work_id not in qualified_work_list:
        print (f"--------{work_id}--------")
        print (df_manhattan.loc[df_manhattan['work_id_x'] == work_id, shown_columns])


--------A1FS8SBR4SDWYG--------
                  date_start_x         review                        hit_id_x  \
97  2022-11-21 16:16:03.310371  RVS_excellent  373L46LKP779OZDA4FAYCEM5VMHKJE   

    dist_m  
97    71.0  
--------AZLZA0Q87TJZO--------
                    date_start_x review                        hit_id_x  \
1103  2022-11-30 14:47:31.865705    NaN  3K8CQCU3KE2W2PYC8RLTTWXHRHBWNE   

      dist_m  
1103     4.0  
--------A39MOCMDRIOSJM--------
                   date_start_x         review  \
544  2022-12-02 11:51:32.243747  RVS_excellent   

                           hit_id_x  dist_m  
544  3FBEFUUYRK6RORPH853K9WE449Y6AK    21.0  
--------A1IQV3QUWRA8G1--------
                  date_start_x review                        hit_id_x  dist_m
98  2022-11-14 03:20:29.155615    NaN  3B6F54KMR2D0H1AEJZDUZ0HTIW5S14    27.0
99  2022-12-11 14:05:48.859196    NaN  3G9UA71JVVVLTK6QLLLP89HJQAR7JM  2537.0


In [7]:
# Check the stack of RVS_excellent
# `all_data` holds every instruction; `x` is the subset reviewed as
# RVS_excellent. The displayed shape is that of the rows in `x` whose
# 'verified_n' equals 0.
all_data = pd.DataFrame([doc.to_dict() for doc in list_instruction])
is_excellent = all_data['review'] == 'RVS_excellent'
x = all_data[is_excellent]
x[x['verified_n'] == 0].shape

(378, 16)

In [8]:
# merge instructions with verification
df_instruction = pd.DataFrame([doc.to_dict() for doc in list_instruction])
df_ver = pd.DataFrame([doc.to_dict() for doc in list_verification])

# One row per (instruction, verification) pair. Columns that exist in both
# frames get the default suffixes _x (instruction) and _y (verification).
instruction_ver = pd.merge(
  df_instruction, df_ver, left_on='key', right_on='key_instruction')


In [9]:
# Valid instruction
# A verification row counts as valid when its 'dist_m' is below this threshold.
dist_threshold = 100


def _report_valid(label, valid_rows, verified_rows):
  """Print the share of verified instructions that have a valid verification.

  Args:
    label: Prefix of the printed line ("All", "Manhattan", ...).
    valid_rows: Merged rows whose 'dist_m' is below `dist_threshold`.
    verified_rows: All merged rows of the same population.

  Returns:
    (valid, n_verified): `valid_rows` reduced to one row per
    'key_instruction', and the number of distinct 'key_x' in `verified_rows`.
  """
  # Only the number of distinct instructions is needed. De-duplicating on the
  # key avoids the column-wise groupby().min() over list-valued and mixed-type
  # columns, which is wasted work and can raise TypeError on recent pandas.
  valid = valid_rows.drop_duplicates(subset=['key_instruction'])
  n_verified = verified_rows.drop_duplicates(subset=['key_x']).shape[0]
  # An empty population yields NaN instead of a ZeroDivisionError.
  percent = round(100*valid.shape[0]/n_verified, 3) if n_verified else float('nan')
  print (f"{label} valid for theshold {dist_threshold}: {percent}" )
  return valid, n_verified


manhattan_instruction_ver = instruction_ver[instruction_ver['region']=='Manhattan']
pittsburgh_instruction_ver = instruction_ver[instruction_ver['region']=='Pittsburgh']


all_valid_instructions = instruction_ver[instruction_ver['dist_m']<dist_threshold]
manhattan_valid_instructions = all_valid_instructions[all_valid_instructions['region']=='Manhattan']
pittsburgh_valid_instructions = all_valid_instructions[all_valid_instructions['region']=='Pittsburgh']


valid_instructions, len_instructions = _report_valid(
  "All", all_valid_instructions, instruction_ver)

manhattan_valid_instructions, len_manhattan_instructions = _report_valid(
  "Manhattan", manhattan_valid_instructions, manhattan_instruction_ver)

pittsburgh_valid_instructions, len_pittsburgh_instructions = _report_valid(
  "Pittsburgh", pittsburgh_valid_instructions, pittsburgh_instruction_ver)




All valid for theshold 100: 74.338
Manhattan valid for theshold 100: 73.337
Pittsburgh valid for theshold 100: 76.554


In [9]:
# assure all points are of the same format  - UPDATES DATA!!!
# (the Firestore updates themselves are currently commented out, so as written
# this cell only reports the fields that would be rewritten)
import math
import util


def _in_expected_bounds(yx):
  """True when yx[0] > 40 and yx[1] < -73, the bounds every point must satisfy."""
  return yx[0] > 40 and yx[1] < -73


def _normalize_point(point):
  """Convert a stored point into a `[y, x]` list of floats.

  Args:
    point: Either a list (first two entries are used) or a value understood
      by `util.point_from_str_point`.

  Returns:
    (yx, needs_fix): the normalised point, and True when the stored value was
    not already a list whose first two entries are floats.

  Raises:
    AssertionError: if the coordinates are not floats or fall outside the
      expected bounds. The offending value is part of the message.
  """
  if isinstance(point, list):
    needs_fix = not (isinstance(point[0], float) and isinstance(point[1], float))
    yx = [float(point[0]), float(point[1])]
  else:
    needs_fix = True
    parsed = util.point_from_str_point(point)
    yx = [parsed.y, parsed.x]
    # Swap the coordinates only when the reversed order is the one that
    # satisfies the bounds.
    if not _in_expected_bounds(yx) and _in_expected_bounds([parsed.x, parsed.y]):
      yx = [parsed.x, parsed.y]

  # The message is a real string: `assert cond, print(...)` would attach None.
  assert isinstance(yx[0], float) and isinstance(yx[1], float), (
    f"non-float coordinates: {type(yx[0])} {type(yx[1])}")
  assert _in_expected_bounds(yx), f"point outside expected bounds: {yx}"
  return yx, needs_fix


list_instruction_all_x = [x.to_dict() for x in list_instruction]
for sample in list_instruction_all_x:
  for key, point in sample.items():
    # Only fields whose name contains 'point' are checked.
    if 'point' not in key:
      continue

    point, FLAG_prob = _normalize_point(point)

    doc_id = sample['key']
    # instructions_ref.document(doc_id).update(
    #       {key: point})
    if FLAG_prob:
      print (f"Writng task PROB with {key}: {sample['key']} ")


# verify
list_verification_x = [x.to_dict() for x in list_verification]
for sample in list_verification_x:
  for key, point in sample.items():
    if 'point' not in key:
      continue

    point, FLAG_prob = _normalize_point(point)
    if not FLAG_prob:
      # Already a list of in-bounds floats: nothing to report or rewrite.
      continue

    if 'key' not in sample:
      # Without a document key the record cannot be addressed; report it.
      print (sample['assignmentId'])
      continue

    print (f"Verify PROB with {key}: {sample['key']} ")
    doc_id = sample['key']
    # verify_ref.document(doc_id).update(
    #       {key: point})

    

In [10]:
import util
import math

# UPDATE!!!
# Substring searched for in instruction texts by the schema check further down.
content = "marked in red"

# Check all distances are correct
# For every merged row, compare the goal/start points stored on the
# instruction (_x) with the ones stored on the verification (_y).
for row_idx in range(instruction_ver.shape[0]):
  sample = instruction_ver.iloc[row_idx]

  goal_verification = sample['rvs_goal_point_y']
  goal_instruction = sample['rvs_goal_point_x']
  if goal_verification != goal_instruction:
    point_goal_y = util.point_from_list_coord_yx(goal_verification)
    point_goal_x = util.point_from_list_coord_yx(goal_instruction)
    dist = round(util.get_distance_between_points(point_goal_y, point_goal_x))
    # Differences of at most 1 (after rounding) are not reported.
    if dist > 1:
      print (f"goal point - key: {sample['key_x']}| {sample['rvs_goal_point_x']}!= {sample['rvs_goal_point_y']}")
    verification_goal_is_nan = (
      not isinstance(goal_verification, list) and math.isnan(goal_verification))
    if verification_goal_is_nan:
      id = sample['key_y']
      # verify_ref.document(id).update(
      #     {'rvs_goal_point': sample['rvs_goal_point_x']})

  start_instruction = sample['rvs_start_point_x']
  start_verification = sample['rvs_start_point_y']
  if start_instruction != start_verification:
    point_start_y = util.point_from_list_coord_yx(start_verification)
    point_start_x = util.point_from_list_coord_yx(start_instruction)

    dist = round(util.get_distance_between_points(point_start_y, point_start_x))

    # Start points that round to zero distance are treated as equal.
    if dist != 0:
      print (f"start point - key: {sample['key_x']}| {sample['rvs_start_point_x']}!= {sample['rvs_start_point_y']}")

      print (f"dist: {dist}")


# Schema check: report unexpected fields, instructions containing `content`,
# and values whose type differs from the expected one.
list_instruction_all_x = [x.to_dict() for x in list_instruction]
# Expected field name -> expected Python type of its value.
dict_col = {'assignmentId': str, 'content': str, 'date_finish': str, 'date_start': str, 'hit_id': str, 
'work_id': str, 'rvs_sample_number': str, 'rvs_path': str, 'rvs_goal_point': list, 'rvs_start_point': list, 
'task': int, 'key': str, 'valid': bool, 'verified_n': int, 'region': str, 'review': str}
should_be_coulmns = set(dict_col.keys())

for sample in list_instruction_all_x:
  # Fields present on the document but absent from the expected schema.
  # (The former "'review' in difference" exemption could never trigger,
  # because 'review' is itself part of the schema.)
  unexpected_fields = set(sample.keys()).difference(should_be_coulmns)
  if unexpected_fields:
    print (unexpected_fields)
  if content in sample['content']:
    print (sample)

  for key, value in sample.items():
    expected_type = dict_col.get(key)
    if expected_type is None:
      # Unexpected field: already reported above, and it has no expected
      # type to compare against (a direct dict_col[key] would raise KeyError).
      continue
    if not isinstance(value, expected_type):
      print (f"key: {key}. value: {value} {type(value)}")


  
# calc distance
# Recompute the distance between predicted and true goal for every
# verification and report records whose stored 'dist_m' is missing or differs.
# All Firestore updates in this cell are commented out.

list_verification_x = [x.to_dict() for x in list_verification]
if list_verification_x:
  # Guarded so that an empty collection does not raise IndexError.
  print (list_verification_x[0].keys())
for sample in list_verification_x: 
  prediction = util.point_from_list_coord_yx(sample['predict_goal_point'])
  if 'rvs_goal_point' not in sample:
    if 'key_instruction' in sample:
      # df_instruction is the un-merged instruction frame, so its columns are
      # 'key' and 'rvs_goal_point'. The '_x' suffixed names exist only on the
      # merged frame and would raise KeyError here.
      rvs_goal_point = df_instruction.loc[
        df_instruction['key']==sample['key_instruction'], 'rvs_goal_point']
      print ("rvs_goal_point: ", rvs_goal_point)
    else:
      # No link to an instruction: try to find it by path and sample number.
      found_matches = df_instruction[
        (
          df_instruction['rvs_path']==str(sample['rvs_path'])) & (
            df_instruction['rvs_sample_number']==str(sample['rvs_sample_number']))]
      assert found_matches.shape[0] <= 1, (
        f"expected at most one matching instruction, found {found_matches.shape[0]}")
      if found_matches.shape[0]==0:
        print (f"No such instruction: {sample['assignmentId']}| {sample['rvs_sample_number']}")
      else:
        print (f"one connection: {found_matches['key'].iloc[0]}")
        doc_id = sample['key']
        # verify_ref.document(doc_id).update(
        #   {'key_instruction': found_matches['key'].iloc[0]})

    # print (f"NO rvs_goal_point. \n assignmentId: {sample['assignmentId']}")
    continue 
  point_goal = util.point_from_list_coord_yx(sample['rvs_goal_point'])

  dist_true = round(util.get_distance_between_points(prediction, point_goal))
  if 'dist_m' not in sample:
    print (f"No dist_m. \n assignmentId: {sample['assignmentId']}")
    doc_id = sample['key']
    # verify_ref.document(doc_id).update(
    #       {'dist_m': dist_true})
    continue
  dist_m = sample['dist_m']

  if dist_true != dist_m:
    print (f"dist_true: {dist_true} | dist_m: {dist_m}")



goal point - key: A320QA9HJFUOZO3UQVX1UPFSI75D8YT1UOUDLFGSZ2082| [40.7440722, -73.9746397]!= [40.7559416, -74.0021163]
start point - key: A320QA9HJFUOZO3UQVX1UPFSI75D8YT1UOUDLFGSZ2082| [40.736282972305226, -73.98244241767452]!= [40.747489, -73.9971782]
dist: 1759.0
dict_keys(['predict_goal_point', 'key', 'rvs_goal_point', 'rvs_sample_number', 'key_instruction', 'work_id', 'task', 'assignmentId', 'hit_id', 'date_start', 'dist_m', 'rvs_path', 'date_finish', 'rvs_start_point'])


In [11]:
#Update valid instructions
# UPDATES DATA: sets 'valid' to True on every instruction that has at least
# one row in `all_valid_instructions`.
# The key lookup is built once as a set. Building the list inside the
# comprehension condition re-created it for every instruction document.
valid_keys = set(all_valid_instructions['key_x'].tolist())
valid_instructions_tmp = [
  instruction_key
  for instruction_key in (doc.to_dict()['key'] for doc in list_instruction)
  if instruction_key in valid_keys]

# The loop variable is not called `id`, so the builtin id() stays usable.
for instruction_key in valid_instructions_tmp:
  instructions_ref.document(instruction_key).update(
    {'valid': True})

In [13]:
#How much was not verified at all
# Number of instructions whose 'verified_n' counter is still 0.
never_verified = df_instruction['verified_n'] == 0
len(df_instruction[never_verified])

692

In [30]:
# Check number of completed assignments per HIT

hit_id = "3G57RS03HH660KUZQ700B8H6X6O25J"
# Collect the document keys of every instruction recorded under this HIT.
assignments = []
for doc in list_instruction:
  record = doc.to_dict()
  if record['hit_id'] == hit_id:
    assignments.append(record['key'])

print(f"number of assignments for hit id {hit_id}: {len(assignments)}")


number of assignments for hit id 3G57RS03HH660KUZQ700B8H6X6O25J: 8


In [34]:
# visualize geosample with instruction

sample = 0
# Look up one instruction by its document key.
target_key = 'A3774HPOUKYTX736GJS3V78VR4MDHREOPJEMUCJINGJF1'
matching_rows = df_instruction[df_instruction['key'] == target_key]
instruction = matching_rows.iloc[0]
# The stored path carries an "/app_instructor/" segment that is stripped
# before the path is resolved against the parent directory.
path_geodata = instruction['rvs_path']
relative_geodata = path_geodata.replace("/app_instructor/", "")
final_path_geodata = os.path.join(parent_dir, relative_geodata)
# Only the first entry returned for this file is used; its first element is
# the map that is displayed below.
first_entry = visualize.get_maps_and_instructions(final_path_geodata)[0]
map_osm, _, _, _, _ = first_entry
print (instruction['content'])
map_osm


Go up the street you are on making a left at canal street. When you hit Canal street pharmacy go right. Keep going until you hit the bench at Columbus park.


In [39]:
# Save instructions to data/human
# The target file lives under the parent of the current working directory.
current_dir = os.getcwd()
parent_dir = Path(current_dir).parent.absolute()
path_instructions = os.path.join(parent_dir, "data/human/instructions.json")

print (path_instructions)
# `all_data` is the full instruction frame built in the RVS_excellent cell;
# lines=False writes a single JSON document rather than JSON Lines.
all_data.to_json(path_or_buf=path_instructions, lines=False)

/home/onlp_gcp_biu/tzuf/cabby/app/data/human/instructions.json
