In [None]:
########## Parse all the .txt and .ann files from the Brat output and store the results in a pickle file to speed up dataset creation.
########## For details of the functions, refer to the "Bert Dataset v1" notebook.

In [1]:
import csv, requests, json, collections, os
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import re
from numpy.random import randint
regex = re.compile('[^a-zA-Z0-9-_]')
import xlwt 
from xlwt import Workbook
import pandas as pd
from os import listdir
from os.path import isfile, join
# from nltk.tokenize import word_tokenize
# from nltk import pos_tag
import stanza
import pickle
nlp = stanza.Pipeline('en',verbose=False, processors='tokenize,pos,lemma')

In [2]:
# Properties that can be annotated for each entity category.
category_properties = {
    'Hardware-Devices': ['notRelevant','isPowered','isConnected','isSetup','isUsed'],
    'Software-Device-Drivers': ['notRelevant','isInstalled','isRelatedDeviceConnected','isSettingsChanged'],
    'Software-OS-Related': ['notRelevant','isOpened','isSettingsChanged'],
    'Software-Other': ['notRelevant','isInstalled','isOpened','isSettingsChanged'],
    'Hardware-Other': [''],
}

# Flag per property. parse_annotations() copies the annotated value of a
# True-flagged property for each step on its own, and accumulates the
# transitions of a False-flagged property into a running state.
property_event = {
    'notRelevant': True,
    'isPowered': False,
    'isConnected': False,
    'isSetup': True,
    'isUsed': True,
    'isInstalled': False,
    'isRelatedDeviceConnected': False,
    'isSettingsChanged': True,
    'isOpened': False,
}

# Annotation values, keyed by the property_event flag.
property_values = {
    True: ['true','false'],
    False: ['f->t','t->f','noChange','start-T','start-F'],
}

In [3]:
# Working state for the document currently being parsed. parse_annotations()
# declares some of these names global and rebinds them on every call; the
# helper functions read them as module-level globals.
step_details = list()
mentions = dict()
entities = dict()
ent_variants = dict()
ent_verb_spans = dict()
merged_ents = dict()
step_wise_properties = dict()

# Parsed results of every document, keyed by file name (without extension).
all_annotations = dict()

In [4]:
def get_line(offset):
    """Map a Brat character offset to a step number.

    Reads the module-level ``step_details`` list, whose records carry the
    Brat start offset of each paragraph at index 2. An offset that falls in
    paragraph ``k`` is reported as ``k-1``, so an offset inside the first
    paragraph yields ``-1``.

    Args:
        offset: Character offset, as an int or a numeric string.

    Returns:
        The paragraph index minus one.
    """
    offset = int(offset)
    for i, step in enumerate(step_details):
        # The first paragraph starting after the offset is i, so the offset
        # lies in paragraph i-1, which is reported as (i-1)-1.
        if offset < step[2]:
            return i-2
    # No later paragraph exists: the offset lies in the last paragraph
    # (index len-1). The same "index minus one" convention gives len-2;
    # returning len-1 here would point past the last step.
    return len(step_details)-2
def clean_text(ent_text):
    """Normalise a text span to its space-joined, lower-cased lemmas.

    The span is run through the module-level ``nlp`` pipeline. The lemma of
    every word of every sentence is lower-cased and stripped, and the lemmas
    are joined with single spaces.
    """
    doc = nlp(ent_text)
    normalised = []
    for sentence in doc.sentences:
        for word in sentence.words:
            normalised.append(word.lemma.lower().strip())
    return ' '.join(normalised)
def get_base_ent(text):
    """Follow parent links in ``entities`` from ``text`` to its base entity.

    ``entities[text][0]`` holds the parent text. The walk stops at an entry
    whose parent is the empty string or the entry itself.
    """
    parent = entities[text][0]
    while parent != '' and parent != text:
        text = parent
        parent = entities[text][0]
    return text


def get_text(mention):
    """Return the base-entity text for a mention id.

    The mention's own text is the second-to-last field of its record in
    ``mentions``; it is resolved with ``get_base_ent`` so that both functions
    share a single implementation of the parent-chain walk.
    """
    return get_base_ent(mentions[mention][-2])
def state_change(new_state, prev_state):
    """Label the transition between two consecutive state strings.

    Returns 'True' or 'False' when the state changed to that value, and the
    string 'None' when nothing changed or the new state is neither of them.
    """
    if new_state == prev_state:
        return 'None'
    for label in ('True', 'False'):
        if new_state == label:
            return label
    return 'None'
def get_indexes(sent_tokens, ent_text,verb=False):
    """Build a '0'/'1' string flag per token of a sentence.

    With ``verb=True`` a token is flagged when its ``pos`` equals 'VERB'.
    Otherwise every surface variant listed in ``ent_variants[ent_text]`` is
    run through ``nlp`` and each window of the sentence whose normalised
    lemmas equal the variant's lemmas is flagged.

    Args:
        sent_tokens: Sequence of word objects exposing ``lemma`` and ``pos``.
        ent_text: Key into the module-level ``ent_variants`` mapping.
        verb: Select verb tagging instead of entity tagging.

    Returns:
        A list of '0'/'1' strings, one per token.
    """
    if verb:
        return ['1' if word.pos=='VERB' else '0' for word in sent_tokens]

    flags = ['0'] * len(sent_tokens)
    for variant in ent_variants[ent_text]:
        variant_words = [word for sentence in nlp(variant).sentences for word in sentence.words]
        width = len(variant_words)
        for begin in range(len(sent_tokens) - width + 1):
            window = sent_tokens[begin:begin + width]
            if all(v.lemma.lower().strip() == s.lemma.lower().strip()
                   for v, s in zip(variant_words, window)):
                flags[begin:begin + width] = ['1'] * width
    return flags
def process_sent(sent:str,ent_text:str):
    """Tokenise a step and tag its entity and verb tokens.

    Args:
        sent: Text of the step, passed to the module-level ``nlp`` pipeline.
        ent_text: Entity key forwarded to ``get_indexes``.

    Returns:
        ``[lemmas, ent_spans, verb_spans]``: the normalised lemma of every
        word, followed by the two flag lists produced by ``get_indexes``
        (entity tagging first, then verb tagging).
    """
    words = []
    for sentence in nlp(sent).sentences:
        words.extend(sentence.words)
    lemmas = [word.lemma.lower().strip() for word in words]
    return [
        lemmas,
        get_indexes(words, ent_text),
        get_indexes(words, ent_text, verb=True),
    ]

In [5]:
def parse_annotations(out_file,doc_id=1,folder='config/',file='monitor',filter_prop=''):
    """Parse one Brat document and store the result in ``all_annotations[file]``.

    Reads ``folder+file+'.txt'`` (paragraphs separated by blank lines) and
    ``folder+file+'.ann'`` (Brat standoff annotations). Mentions are resolved
    to base entities through the equivalence ('*') and relation ('R') lines,
    and the annotated attributes are expanded into one value per paragraph
    for every property of every entity.

    Args:
        out_file: Not used by this function; kept so existing callers keep
            working.
        doc_id: Not used by this function; kept for the same reason.
        folder: Directory prefix, concatenated as-is (include the trailing
            separator).
        file: File name without extension; also the key under which the
            result is stored in ``all_annotations``.
        filter_prop: When non-empty, only this property is kept per entity.

    Side effects:
        Rebinds the module-level ``entities``, ``mentions``, ``step_details``
        and ``ent_variants`` (the helper functions read them) and adds one
        entry to ``all_annotations``.

    Any exception is caught and reported together with the document path, so
    a single malformed document does not abort a batch run. In that case no
    entry is stored for the document.
    """
    global entities, mentions, step_details, ent_variants, all_annotations
    try:
        config_file = folder+file+'.ann'
        text_file = folder+file+'.txt'
        # Context managers close both handles even when parsing fails below.
        with open(config_file,'r',encoding='utf-8') as ann_handle:
            annotations = ann_handle.readlines()
        with open(text_file,'r',encoding='utf-8') as text_handle:
            raw_text = text_handle.read().strip()
        steps = raw_text.split('\n\n')

        # One record per paragraph: [index, text, start offset in Brat
        # coordinates, start offset in raw_text, length, newline count].
        step_details = []
        start = 0
        start2 = 0
        for i, step in enumerate(steps):
            length = len(step)
            newlines = step.count('\n')
            step_details.append([i,step,start,start2,length,newlines])
            start+=length+newlines+4 ### accounting for '\n' diff in Brat
            start2+=2+length
        num_steps = len(step_details) - 1

        entities = {} # cleaned text -> [parent text ('' if none), [mention ids]]
        mentions = {} # mention id -> [ent_id, line_no, ent_cat, start, end, ent_text, properties]
        for line in annotations:
            line=line.strip()
            if not line:
                # A blank line used to raise IndexError on line[0] and
                # discard the whole document.
                continue
            if(line[0]=='T'): ### entities
                ent_id, ent_cat, ent_text = line.split('\t')
                ent_text = clean_text(ent_text)
                ent_cat, ent_start, ent_end = ent_cat.split(' ')
                line_no = get_line(ent_start)
                ent_start,ent_end,line_no = int(ent_start),int(ent_end),int(line_no)
                mentions[ent_id] = [ent_id, line_no,ent_cat,ent_start,ent_end,ent_text,{}]
                if(ent_text not in entities):
                    entities[ent_text]=['',[]]
                entities[ent_text][1].append(ent_id)
            elif(line[0]=='A'): ### attributes
                atr_id, atr_type = line.split('\t')
                split = atr_type.split(' ')
                if(len(split)==2):
                    # Attribute without an explicit value: stored as 'true'.
                    split.append('true')
                atr_type, atr_ent, atr_val = split
                mentions[atr_ent][-1][atr_type]=atr_val
            elif(line[0]=='*'): ### equiv
                _, rel_type = line.split('\t')
                members = rel_type.split(' ')[1:]
                # Link every further member to the first one; previously only
                # the first two members of an equivalence line were linked.
                for other_id in members[1:]:
                    text1, text2 = get_text(members[0]), get_text(other_id)
                    if(text1==text2):
                        continue
                    # The shorter text becomes the base (ties favour text1).
                    base, other = text2, text1
                    if(len(text1)<=len(text2)):
                        base, other = text1, text2
                    entities[other][0]=base
            elif(line[0]=='R'): ### relation- subpart
                rel_id, rel_type = line.split('\t')
                rel_type, sub, main = rel_type.split(' ')
                sub, main = sub.split(':')[1],main.split(':')[1]
                sub, main = get_text(sub), get_text(main)
                entities[sub][0] = main
            else:
                print(line)

        merged_ents = {} ### merge entities with base-subpart-equiv
        ent_variants = {}
        for ent in entities:
            base_ent = get_base_ent(ent)
            if(base_ent not in merged_ents):
                merged_ents[base_ent] = {}
                ent_variants[base_ent] = set()
            ent_variants[base_ent].add(ent)
            for mention_id in entities[ent][1]:
                mention = mentions[mention_id]
                line_no = mention[1]
                if(line_no not in merged_ents[base_ent]):
                    merged_ents[base_ent][line_no] = mention
                else:
                    # NOTE: the stored record is the same list object as in
                    # ``mentions``, so this also extends that mention's
                    # property dict.
                    merged_ents[base_ent][line_no][-1].update(mention[-1])

        # Drop base entities for which no mention carries any attribute.
        clean_ents = []
        for ent in merged_ents:
            if(all(merged_ents[ent][line_no][-1]=={} for line_no in merged_ents[ent])):
                clean_ents.append(ent)
        for ent in clean_ents:
            del merged_ents[ent]

        step_wise_properties = {}

        for ent in merged_ents:
            mention = mentions[entities[ent][1][0]]
            category = mention[2]
            if(category == 'Hardware-Other'):
                continue
            properties = category_properties[category]
            ent_properties = {}
            for prop in properties:
                # prop_val[0] is the state before the first step;
                # prop_val[k+1] is the state after step k.
                prop_val = ['']*(num_steps+1)
                prop_val[0]='False'
                if(property_event[prop]):
                    # The annotated value applies to that step only.
                    for step_no in range(0,num_steps):
                        if(step_no in merged_ents[ent] and prop in merged_ents[ent][step_no][-1]):
                            val = merged_ents[ent][step_no][-1][prop].capitalize()
                            prop_val[step_no+1] = val
                        else:
                            prop_val[step_no+1] = 'False'
                else:
                    # Transitions accumulate: unannotated steps carry the
                    # previous state forward.
                    for step_no in range(0,num_steps):
                        if(step_no in merged_ents[ent] and prop in merged_ents[ent][step_no][-1]):
                            val = merged_ents[ent][step_no][-1][prop]
                            if(val=='f->t' and prop_val[step_no]=='False'):
                                prop_val[step_no+1] = 'True'
                            elif(val=='t->f' and prop_val[step_no]=='True'):
                                prop_val[step_no+1] = 'False'
                            elif(val=='t->f' and all(earlier=='False' for earlier in prop_val[:step_no])):
                                # First transition is t->f: the property must
                                # have been true from the start.
                                for i in range(step_no+1):
                                    prop_val[i] = 'True'
                                prop_val[step_no+1] = 'False'
                            elif(val=='start-T'):
                                # True from the start up to and including this
                                # step. The loop used to write index
                                # step_no+1 on every iteration, leaving the
                                # earlier states untouched.
                                for i in range(step_no+2):
                                    prop_val[i] = 'True'
                            elif(val=='start-F'):
                                for i in range(step_no+2):
                                    prop_val[i] = 'False'
                            else:
                                prop_val[step_no+1] = prop_val[step_no]
                        else:
                            prop_val[step_no+1] = prop_val[step_no]
                passes_filter = (filter_prop == '' or prop == filter_prop)
                if passes_filter and not all(state=='False' for state in prop_val):
                    ent_properties[prop]=prop_val
            step_wise_properties[ent]=ent_properties

        ent_verb_spans = {}
        for ent in step_wise_properties:
            ent_verb_spans[ent]={}
            if not step_wise_properties[ent]:
                # Entities without any kept property get no spans.
                continue
            # Run the NLP pipeline once per step; it was previously re-run
            # for every property and the repeated results were discarded.
            for step_no in range(len(steps)-1):
                sent_lemmas, ent_spans, verb_spans = process_sent(steps[step_no+1],ent)
                ent_verb_spans[ent][step_no] = (sent_lemmas, ent_spans, verb_spans)

        all_annotations[file] = {}
        all_annotations[file]['steps'] = steps
        all_annotations[file]['step_details'] = step_details
        all_annotations[file]['mentions'] = mentions
        all_annotations[file]['entities'] = entities
        all_annotations[file]['ent_variants'] = ent_variants
        all_annotations[file]['ent_verb_spans'] = ent_verb_spans
        all_annotations[file]['merged_ents'] = merged_ents
        all_annotations[file]['step_wise_properties'] = step_wise_properties
    except Exception as e:
        # Best-effort batch processing: say which document failed, then move on.
        print('parse_annotations failed for '+folder+file+': '+repr(e))

In [6]:
# parse_annotations()
# f.close()

In [7]:
def parse_prop(filter_prop='isOpened', seed=None):
    """Parse every annotated document under ``./annotations/``.

    Each sub-folder of ``./annotations/`` is scanned for ``.txt`` files. Every
    document draws a number in [0, 100) that selects the train (<80),
    dev (<90) or test output handle, and is then passed to
    ``parse_annotations``.

    Args:
        filter_prop: Property name forwarded to ``parse_annotations``; it is
            also embedded in the three output file names.
        seed: Optional seed for the split. With ``None`` (the default) the
            module-level ``randint`` is used as before; with a value the
            draws come from a dedicated ``RandomState`` so the split can be
            reproduced.

    The three TSV files are opened for writing (and therefore created or
    truncated) on every call and closed on exit, including on errors.
    """
    data_folder = './annotations/'
    out_prefix = './annotations/prolocal_annotations_'+filter_prop
    if seed is None:
        draw = randint
    else:
        from numpy.random import RandomState
        draw = RandomState(seed).randint

    with open(out_prefix+'_train.tsv','w',encoding='utf-8') as train_f, \
         open(out_prefix+'_dev.tsv','w',encoding='utf-8') as dev_f, \
         open(out_prefix+'_test.tsv','w',encoding='utf-8') as test_f:
        count = 1
        # Sorted listings make the traversal order (and hence a seeded split)
        # independent of the file system's listing order.
        subfolders = sorted(fol for fol in listdir(data_folder) if not isfile(join(data_folder, fol)))
        for folder in subfolders:
            folder = data_folder + folder+'/'
            onlyfiles = sorted(f[:-4] for f in listdir(folder) if f.endswith('.txt'))
            for file in onlyfiles:
                rand = draw(0,100)
                print(count, folder, file,rand)
                if(rand<80):
                    target = train_f
                elif(rand<90):
                    target = dev_f
                else:
                    target = test_f
                parse_annotations(out_file=target,doc_id=count, folder=folder, file=file, filter_prop=filter_prop)
                count+=1

In [8]:
# An empty filter keeps every property of every entity.
parse_prop(filter_prop='')

1 ./annotations/Controllers/ 199_1___Test_Your_PS4_Controller___Steps 79
2 ./annotations/Controllers/ 200_1___Connect_an_Xbox_One_Controller_to_an_Xbox_One___Connecting_Wirelessly 70
3 ./annotations/Controllers/ 201_1___Disable_the_Green_Light_on_a_Thrustmaster_T.16000M_Joystick___Steps 37
4 ./annotations/Controllers/ 202_1___Hack_an_Xbox_Controller_Into_a_PC_Gamepad___Steps 3
5 ./annotations/Controllers/ 203_1___Connect_a_Razer_Controller_to_a_PC___Connecting_Wirelessly 13
6 ./annotations/Controllers/ 204_1___Use_the_Razer_Hydra___Installing_the_Hydra 52
7 ./annotations/Controllers/ 204_2___Use_the_Razer_Hydra___Using_the_Hydra 50
8 ./annotations/Headphones/ 161_1___Repair_Dodgy_or_Broken_Headphones___Finding_the_Problem 23
9 ./annotations/Headphones/ 161_2___Repair_Dodgy_or_Broken_Headphones___Fixing_the_Cable 28
10 ./annotations/Headphones/ 161_3___Repair_Dodgy_or_Broken_Headphones___Fixing_a_Broken_Plug 19
11 ./annotations/Headphones/ 162_1___Fix_Earbuds___Soldering_a_Broken_Connec

89 ./annotations/Keyboards/ 134_1___Unlock_a_Keypad___Unlocking_Keypads_on_Mac_OS_X 80
90 ./annotations/Keyboards/ 135_1___Clean_Under_Laptop_Keyboard_Keys___Putty 31
91 ./annotations/Keyboards/ 137_1___Cut_the_Number_Pad_off_a_Keyboard___Steps 88
92 ./annotations/Keyboards/ 138_1___Bind_a_Razer_Keyboard___Creating_the_key_bind 99
93 ./annotations/Keyboards/ 139_1___Troubleshoot_Mouse_Problems_with_the_Keyboard___Steps 27
94 ./annotations/Keyboards/ 140_1___Edit_Gboard_Keyboard_Settings___Using_the_Gboard_App 88
95 ./annotations/Keyboards/ 140_2___Edit_Gboard_Keyboard_Settings___Changing_Keyboard_Order_and_Text_Replacement 11
96 ./annotations/Keyboards/ 141_1___Clean_a_Keyboard_in_a_Dishwasher___Steps 13
97 ./annotations/Keyboards/ 142_1___Change_the_Default_Numlock_State___Steps 72
98 ./annotations/Keyboards/ 143_1___Start_Screensaver_with_a_Keyboard_Shortcut_on_Mac___Creating_the_Action 88
99 ./annotations/Keyboards/ 143_2___Start_Screensaver_with_a_Keyboard_Shortcut_on_Mac___Creatin

182 ./annotations/Monitors/ 11_1___Add_an_Additional_Monitor_to_Your_Computer___Steps 74
183 ./annotations/Monitors/ 12_1___Degauss_a_Computer_Monitor___Soldering_Gun_Method 59
184 ./annotations/Monitors/ 12_2___Degauss_a_Computer_Monitor___Rigged_Drill_Method 20
185 ./annotations/Monitors/ 13_1___Discharge_a_CRT_Monitor___Steps 92
186 ./annotations/Monitors/ 14_1___Make_a_Monitor_256_Color___Program_Specific_Change 20
187 ./annotations/Monitors/ 15_1___Improve_Image_Quality_on_an_LCD_Monitor___Steps 83
188 ./annotations/Monitors/ 16_1___Clean_a_Flat_Panel_Monitor___Steps 16
189 ./annotations/Monitors/ 17_1___Extend_a_Netbook_to_an_External_Monitor___Steps 79
190 ./annotations/Monitors/ 1_1___Connect_a_Laptop_to_a_Monitor___Connecting_the_Monitor 85
191 ./annotations/Monitors/ 1_2___Connect_a_Laptop_to_a_Monitor___Detecting_a_Display_in_Windows 18
192 ./annotations/Monitors/ 1_3___Connect_a_Laptop_to_a_Monitor___Detecting_a_Display_in_MacOS 26
193 ./annotations/Monitors/ 1_4___Connect_

275 ./annotations/OSX/ 1152_1___Show_Hidden_Files_and_Folders_on_a_Mac___Showing_Hidden_Files 22
276 ./annotations/OSX/ 1152_2___Show_Hidden_Files_and_Folders_on_a_Mac___Making_Hidden_Files_Visible 88
277 ./annotations/OSX/ 1153_1___Resize_Pictures_for_Macs___Resizing_an_Image_in_Preview 30
278 ./annotations/OSX/ 1153_2___Resize_Pictures_for_Macs___Cropping_an_Image_in_Preview 39
279 ./annotations/OSX/ 1154_1___Stop_an_Application_from_Opening_at_Startup_With_Mac_OS_X___Steps 30
280 ./annotations/Printers/ 18_1___Scan_a_Document_on_a_Canon_Printer___Preparing_to_Scan 47
281 ./annotations/Printers/ 18_2___Scan_a_Document_on_a_Canon_Printer___Scanning_on_Windows 30
282 ./annotations/Printers/ 18_3___Scan_a_Document_on_a_Canon_Printer___Scanning_on_Mac 96
283 ./annotations/Printers/ 19_1___Set_up_Your_Laptop_to_Print_Wirelessly___Sharing_a_Printer_Between_Windows_Computers 74
284 ./annotations/Printers/ 19_2___Set_up_Your_Laptop_to_Print_Wirelessly___Sharing_a_Printer_Between_Mac_Computer

362 ./annotations/Ubuntu/ 647_4___Install_Ubuntu_on_VirtualBox___Setting_up_Ubuntu 38
363 ./annotations/Ubuntu/ 648_1___Format_a_Hard_Drive_Using_Ubuntu___Performing_a_Quick_Format 24
364 ./annotations/Ubuntu/ 648_2___Format_a_Hard_Drive_Using_Ubuntu___Using_GParted 62
365 ./annotations/Ubuntu/ 649_1___Install_Windows_from_Ubuntu___Creating_a_Primary_NTFS_Partition_for_Windows 31
366 ./annotations/Ubuntu/ 649_2___Install_Windows_from_Ubuntu___Creating_a_Windows_10_Install_Drive_in_Ubuntu 52
367 ./annotations/Ubuntu/ 649_3___Install_Windows_from_Ubuntu___Running_the_Windows_Installer 16
368 ./annotations/Ubuntu/ 649_4___Install_Windows_from_Ubuntu___Setting_Up_Dual_Boot 22
369 ./annotations/Ubuntu/ 650_1___Format_a_USB_Flash_Drive_in_Ubuntu___Using_the_Disks_Utility 75
370 ./annotations/Ubuntu/ 650_2___Format_a_USB_Flash_Drive_in_Ubuntu___Using_the_Terminal 14
371 ./annotations/Ubuntu/ 651_1___Become_Root_in_Ubuntu___Running_Root_Commands_with_Sudo 14
372 ./annotations/Ubuntu/ 651_2___B

450 ./annotations/Windows/ 271_6___Change_or_Create_Desktop_Icons_for_Windows___Removing_Arrows_from_Shortcut_Icons 88
451 ./annotations/Windows/ 272_1___Hide_the_Windows_Taskbar___Windows_10 25
452 ./annotations/Windows/ 272_2___Hide_the_Windows_Taskbar___Troubleshooting 77
453 ./annotations/Windows/ 272_3___Hide_the_Windows_Taskbar___Troubleshooting_Windows_10 2
454 ./annotations/Windows/ 273_1___Block_a_Program_with_Windows_Firewall___Blocking_a_Program 52
455 ./annotations/Windows/ 273_2___Block_a_Program_with_Windows_Firewall___Temporarily_Disabling_a_Program 87
456 ./annotations/Windows/ 274_1__FindaUsersSIDonWindowsSteps 91
457 ./annotations/Windows/ 275_1___See_Active_Network_Connections_Windows___Accessing_the_Network_and_Sharing_Menu_in_Windows_7_through_10 89
458 ./annotations/Windows/ 275_2___See_Active_Network_Connections_Windows___Using_the_Network_Connections_Folder_in_Windows_7 42
459 ./annotations/Windows/ 275_3___See_Active_Network_Connections_Windows___Using_the_Nets

In [9]:
# Persist the parsed annotations. The context manager flushes and closes the
# file as soon as the dump finishes instead of leaving the handle open.
with open('./annotations/all_files_v1.pkl','wb') as pkl_file:
    pickle.dump(all_annotations, pkl_file)

In [10]:
# parse_annotations(open('temp.txt','w',encoding='utf-8'),169,'./annotations/Ubuntu/','654_2___Set_up_an_FTP_Server_in_Ubuntu_Linux___Configuring_the_FTP_Server','')

In [11]:
# Spot-check the parsed result of one document; keys are file names without extension.
all_annotations['199_1___Test_Your_PS4_Controller___Steps']

{'steps': ['Test Your PS4 Controller\nhttps://www.wikihow.com/Test-Your-PS4-Controller',
  '0 - Steps',
  "1 - Connect your PS4 controller to your computer.\nConnect your PS4 controller to your computer. Using a USB to Micro-USB cable, connect your controller to a USB port on your computer. You can usually find these ports on the front or back of your computer tower, on the back of your monitor (if you're using an all-in-one), and along the sides of laptops. The Micro-USB cable plugs in at the top of your PS4 controller.\n- ;",
  "2 - Right-click the Start logo .\nRight-click the Start logo . You'll see this logo in the bottom left corner of your screen if you haven't moved the location of the taskbar.\n- A menu will pop up from your mouse.",
  '3 - Type "Control Panel.\nType "Control Panel." As you type, the search window will show you results.',
  '4 - Click Control Panel to launch the app.\nClick Control Panel to launch the app. You should see this listed as an app under "Best Resul