In [1]:
from selenium.webdriver.common.by import By
from SimulatorCommunicator import setup, switchToIframe, scrollToTheBottom, updateURL, clickElement, takeScreenshot
from PIL import Image

import networkx as nx
import random
import matplotlib.pyplot as plt
import re
import json
import base64
import time
import requests
import io
import itertools
import pandas as pd
import os

In [2]:
def getAllClickables(driver):
    """Collect a description of every visible, enabled ``<a>`` element on the page.

    Args:
        driver: Selenium WebDriver whose current context is the page to scan.

    Returns:
        list[dict]: one dict per link with the keys ``x``, ``y``, ``width``,
        ``height`` (taken from ``element.rect``), ``outer_html`` (base64 of the
        element's outerHTML), ``href`` (``get_dom_attribute('href')`` or the
        string ``'None'``), ``href_full`` (``get_attribute('href')`` or
        ``'None'``), ``class``, ``text`` (de-duplicated, non-empty text lines)
        and ``img_<i>`` (base64 content of the i-th nested ``<img>`` that has
        a ``src``).
    """
    elements = []

    for element in driver.find_elements(By.TAG_NAME, 'a'):
        # excluding all the hidden links
        if not (element.is_displayed() and element.is_enabled()):
            continue

        # every `.rect` / attribute read is a separate WebDriver command,
        # so each value is fetched once and reused below
        rect = element.rect
        if not rect:
            continue
        href = element.get_dom_attribute('href')
        href_full = element.get_attribute('href')

        data = {}
        data['x'] = rect['x']
        data['y'] = rect['y']
        data['width'] = rect['width']
        data['height'] = rect['height']
        data['outer_html'] = base64Encode(bytes(element.get_attribute('outerHTML'), "utf-8"))
        data['href'] = href if href else 'None'
        data['href_full'] = href_full if href_full else 'None'
        data['class'] = element.get_attribute('class')

        # text lines of every <div> found below the link's parent node, plus the link's own text
        nearby_divs = element.find_elements(By.XPATH, '..//div')
        texts = list(itertools.chain.from_iterable(div.text.split('\n') for div in nearby_divs))
        texts.append(element.text)
        data['text'] = list(set(text for text in texts if len(text) > 0))

        # getting closer image(s) if available
        for i, img in enumerate(element.find_elements(By.XPATH, ".//img")):
            src = img.get_attribute('src')
            if not src:
                # no URL to download; skip it instead of failing the whole scan
                continue
            data['img_'+str(i)] = imgURL2B64(src)

        elements.append(data)
    return elements

# to get the URL of current page on iframe
def getCurrntURLIFrame(driver):
    """Return ``location.href`` of the driver's current document with the
    local dev-server origin stripped out."""
    local_origin = 'http://localhost:4200'
    page_url = driver.execute_script('return location.href')
    return page_url.replace(local_origin, "")

# download image from URL and convert to base64
def imgURL2B64(imgURL, timeout=30):
    """Download the resource at ``imgURL`` and return its bytes base64-encoded.

    Args:
        imgURL: URL of the image to fetch.
        timeout: seconds passed to ``requests.get``. Without a timeout requests
            waits indefinitely, so one unresponsive host would stall the crawl.

    Returns:
        str: base64 text of the response body. The HTTP status is not checked,
        so an error page is encoded like any other body.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    # downloading image
    response = requests.get(imgURL, timeout=timeout)
    # encoding
    return base64Encode(response.content)

# convert bytes string to b64 
def base64Encode(data):
    """Base64-encode a bytes-like object and return the result as a ``str``."""
    encoded_bytes = base64.b64encode(data)
    return str(encoded_bytes, "utf-8")

# show image from base64
def imgFromB64(imgb64):
    """Decode a base64 image string and open it with PIL's ``Image.show()``."""
    # reconstructing the image by decoding
    raw_bytes = base64.b64decode(imgb64)
    picture = Image.open(io.BytesIO(raw_bytes))
    picture.show()

# format data to store
def formatData(chosenLink, max_bottom=865, max_right=812):
    """Mark a link as a click action and clamp its position to a bounding box.

    If the element's bottom edge (``y + height``) exceeds ``max_bottom``, the
    overshoot is stored in ``y_offset`` and subtracted from ``y``; the same is
    done horizontally with ``max_right`` / ``x_offset``. Otherwise the offsets
    are 0 and the coordinates are left as they are.

    Args:
        chosenLink: dict with at least ``x``, ``y``, ``width`` and ``height``.
            It is modified IN PLACE; pass a copy to keep the original.
        max_bottom: largest allowed ``y + height`` (default 865, the value
            previously hard-coded; presumably the visible height in pixels,
            TODO confirm).
        max_right: largest allowed ``x + width`` (default 812, previously
            hard-coded; presumably the visible width in pixels, TODO confirm).

    Returns:
        dict: the same ``chosenLink`` object with ``type`` set to ``"click"``
        and ``x``, ``y``, ``x_offset``, ``y_offset`` updated.
    """
    y = chosenLink['y']
    y_offset = 0
    x = chosenLink['x']
    x_offset = 0

    if chosenLink['y'] + chosenLink['height'] > max_bottom:
        y_offset = chosenLink['y'] + chosenLink['height'] - max_bottom
        y = chosenLink['y'] - y_offset

    if chosenLink['x'] + chosenLink['width'] > max_right:
        x_offset = chosenLink['x'] + chosenLink['width'] - max_right
        x = chosenLink['x'] - x_offset

    # new attributes to the chosen link
    chosenLink['type'] = "click"
    chosenLink['y'] = y
    chosenLink['y_offset'] = y_offset
    chosenLink['x'] = x
    chosenLink['x_offset'] = x_offset
    return chosenLink

# retrieves all the text on the page
def getTextOnPage(driver):
    """Return the distinct non-empty text lines found in the page's ``<div>`` elements.

    The result is built from a set, so its order is not defined.
    """
    unique_lines = set()
    for div in driver.find_elements(By.TAG_NAME, 'div'):
        for line in div.text.split('\n'):
            if len(line) > 0:
                unique_lines.add(line)
    return list(unique_lines)

def formatDataForPlayer(inputJson):
    """Reduce a recorded sequence to the fields stored in the player file.

    Args:
        inputJson: dict with ``url`` and ``actions``; every action except the
            last must carry a ``clicked`` dict holding the copied fields.

    Returns:
        dict: ``{'url': ..., 'actions': [...]}`` where each action keeps only
        the click fields listed below. The last input action is dropped.
    """
    copied_fields = (
        'type', 'x', 'y', 'height', 'width',
        'x_offset', 'y_offset', 'href', 'href_full', 'outer_html',
    )

    player_data = {'url': inputJson['url']}
    player_data['actions'] = [
        {field: recorded['clicked'][field] for field in copied_fields}
        for recorded in inputJson['actions'][:-1]
    ]
    return player_data

In [3]:
# scores one link tuple against one reference tuple
def isSimilar(query, reference, score):
    """Tell whether two link tuples agree on at least ``score`` of their first three fields.

    Position 0 is the relative url, position 1 the absolute url and
    position 2 the class.
    """
    matches = 0
    for position in (0, 1, 2):
        if query[position] == reference[position]:
            matches += 1
    return matches >= score

# checks one link tuple against a list of reference tuples
def checkIfIn(query, urlList, score=2):
    """Return True if ``query`` is similar (per ``isSimilar``) to any entry of ``urlList``."""
    return any(isSimilar(query, known, score) for known in urlList)

In [19]:
# Top-level menu links. Every entry is a (href, href_full without the crawl base, class)
# triple, the same shape callers build from a clickable before calling checkIfIn.
# collectData() uses l1[1:2] (the furniture entry) to pick the first click and sets
# menuState to 'l1' when the chosen link matches this table.
l1 = [
     # entries carrying the 'itemLink' class
     ('index.html', 'www.manks.com/en/index.html', 'itemLink'),
     ('furniture/index.html', 'www.manks.com/en/furniture/index.html', 'itemLink'),
     ('manks-antiques/index.html', 'www.manks.com/en/manks-antiques/index.html', 'itemLink'),
     ('lighting/index.html', 'www.manks.com/en/lighting/index.html', 'itemLink'),
     ('tableware/index.html', 'www.manks.com/en/tableware/index.html', 'itemLink'),
     ('gifts-decorative/index.html', 'www.manks.com/en/gifts-decorative/index.html', 'itemLink'),
     ('healthy-living/index.html', 'www.manks.com/en/healthy-living/index.html', 'itemLink'),
     ('latest-arrivals/index.html', 'www.manks.com/en/latest-arrivals/index.html', 'itemLink'),
     ('mater-collection/index.html', 'www.manks.com/en/mater-collection/index.html', 'itemLink'),
     ('fredericia-collection/index.html', 'www.manks.com/en/fredericia-collection/index.html', 'itemLink'),
     ('manks-end-of-spring-flash-sale-21-22-may-2022/index.html', 'www.manks.com/en/manks-end-of-spring-flash-sale-21-22-may-2022/index.html', 'itemLink'),

     # entries with an empty class
     ('manks-end-of-spring-flash-sale-21-22-may-2022/index.html', 'www.manks.com/en/manks-end-of-spring-flash-sale-21-22-may-2022/index.html', ''),
     ('search/pastor/index.html', 'www.manks.com/en/search/pastor/index.html', ''),
     ('furniture/index.html', 'www.manks.com/en/furniture/index.html', ''),
     ('lighting/index.html', 'www.manks.com/en/lighting/index.html', ''),
     ('tableware/index.html', 'www.manks.com/en/tableware/index.html', ''),
     ('gifts-decorative/index.html', 'www.manks.com/en/gifts-decorative/index.html', ''),
     ('healthy-living/index.html', 'www.manks.com/en/healthy-living/index.html', ''),
     ('manks-antiques/index.html', 'www.manks.com/en/manks-antiques/index.html', ''),
     ('latest-arrivals/index.html', 'www.manks.com/en/latest-arrivals/index.html', ''),
]

# Second-level (sub-category) links, grouped by their parent category.
# Same (href, href_full without the crawl base, class) triple layout as l1.
# Note that 'outdoor/index.html' occurs under both furniture and lighting; only
# the second field tells the two apart.
l2 = [
     # furniture
     ('chairs/index.html', 'www.manks.com/en/furniture/chairs/index.html', ''),
     ('lounge-chairs/index.html', 'www.manks.com/en/furniture/lounge-chairs/index.html', ''),
     ('sofas/index.html', 'www.manks.com/en/furniture/sofas/index.html', ''),
     ('tables-desks/index.html', 'www.manks.com/en/furniture/tables-desks/index.html', ''),
     ('stools-benches/index.html', 'www.manks.com/en/furniture/stools-benches/index.html', ''),
     ('shelving-storage/index.html', 'www.manks.com/en/furniture/shelving-storage/index.html', ''),
     ('accessories/index.html', 'www.manks.com/en/furniture/accessories/index.html', ''),
     ('outdoor/index.html', 'www.manks.com/en/furniture/outdoor/index.html', ''),

     # lighting
     ('table/index.html', 'www.manks.com/en/lighting/table/index.html', ''),
     ('pendants/index.html', 'www.manks.com/en/lighting/pendants/index.html', ''),
     ('floor/index.html', 'www.manks.com/en/lighting/floor/index.html', ''),
     ('wall-ceiling/index.html', 'www.manks.com/en/lighting/wall-ceiling/index.html', ''),
     ('outdoor/index.html', 'www.manks.com/en/lighting/outdoor/index.html', ''),

     # tableware
     ('glass/index.html', 'www.manks.com/en/tableware/glass/index.html', ''),
     ('chinaware/index.html', 'www.manks.com/en/tableware/chinaware/index.html', ''),
     ('kitchen/index.html', 'www.manks.com/en/tableware/kitchen/index.html', ''),
     ('flatware/index.html', 'www.manks.com/en/tableware/flatware/index.html', ''),

     # gifts and decorative
     ('toikka-birds/index.html', 'www.manks.com/en/gifts-decorative/toikka-birds/index.html', ''),
     ('vase-bowl/index.html', 'www.manks.com/en/gifts-decorative/vase-bowl/index.html', ''),
     ('candle-votive/index.html', 'www.manks.com/en/gifts-decorative/candle-votive/index.html', ''),
     ('sculpture-figurine/index.html', 'www.manks.com/en/gifts-decorative/sculpture-figurine/index.html', ''),
     ('painting-wall-hanging/index.html', 'www.manks.com/en/gifts-decorative/painting-wall-hanging/index.html', ''),
     ('other-gift-decorative/index.html', 'www.manks.com/en/gifts-decorative/other-gift-decorative/index.html', '')
]

# Third-level links. Empty for this site; l3Handle() slices this list, so every
# slice it returns is currently empty.
l3 = [
     
]

# Language-flag and currency links. collectData() concatenates this table with
# `eliminate` and `pageBottomURLs` and drops every candidate that matches it.
hardToEliminate = [
     ('index.html', 'www.manks.com/en/index.html', 'flag en'),
     ('https://www.manks.com/us/', 'https://www.manks.com/us/', 'flag us'),
     ('session/currency/twd/index.html', 'www.manks.com/en/session/currency/twd/index.html', ''),
     ('session/currency/hkd/index.html', 'www.manks.com/en/session/currency/hkd/index.html', 'active'),
]

# Links that are never candidates for a click (currency, cart, account, home,
# external and blog links). Same triple layout as l1; filtered out on every step
# by collectData().
eliminate = [
     ('session/currency/cny/index.html', 'www.manks.com/en/session/currency/cny/index.html', ''),
     ('session/currency/mop/index.html', 'www.manks.com/en/session/currency/mop/index.html', ''),
     ('cart/index.html', 'www.manks.com/en/cart/index.html', 'cart'),
     ('account/index.html', 'www.manks.com/en/account/index.html', 'my-account'),
     ('index.html', 'www.manks.com/en/index.html', ''),

     ('http://www.manksquarters.com/main/mqdeal.asp', 'http://www.manksquarters.com/main/mqdeal.asp', ''),
     ('blogs/stories/index.html', 'www.manks.com/en/blogs/stories/index.html', 'itemLink'),
     ('blogs/relive-the-home-of-borge-mogensen/index.html', 'www.manks.com/en/blogs/relive-the-home-of-borge-mogensen/index.html', 'itemLink'),
]

# Footer links (social icons, service pages, account pages, payment providers).
# Same (href, href_full without the crawl base, class) triple layout as l1.
# Candidates matching this table are filtered out on every step. Each link is
# listed once: checkIfIn() stops at the first match, so repeated entries add nothing.
pageBottomURLs = [
     # social icons
     ('https://www.facebook.com/ManksLtd/', 'https://www.facebook.com/ManksLtd/', 'social-icon facebook'),
     ('https://twitter.com/manks_hk', 'https://twitter.com/manks_hk', 'social-icon twitter'),
     ('https://www.pinterest.com/manks_hk/', 'https://www.pinterest.com/manks_hk/', 'social-icon pinterest'),
     ('https://www.youtube.com/channel/UCJqW32INbVB3jTH_DUl-2lw', 'https://www.youtube.com/channel/UCJqW32INbVB3jTH_DUl-2lw', 'social-icon youtube'),
     ('https://www.instagram.com/manks_hk/', 'https://www.instagram.com/manks_hk/', 'social-icon instagram'),

     # service pages
     ('service/index.html', 'www.manks.com/en/service/index.html', ''),
     ('service/where-are-we/index.html', 'www.manks.com/en/service/where-are-we/index.html', ''),
     ('service/about/index.html', 'www.manks.com/en/service/about/index.html', ''),
     ('service/shipping-returns/index.html', 'www.manks.com/en/service/shipping-returns/index.html', ''),
     ('service/payment-methods/index.html', 'www.manks.com/en/service/payment-methods/index.html', ''),
     ('service/general-terms-conditions/index.html', 'www.manks.com/en/service/general-terms-conditions/index.html', ''),
     ('service/privacy-policy/index.html', 'www.manks.com/en/service/privacy-policy/index.html', ''),
     ('service/disclaimer/index.html', 'www.manks.com/en/service/disclaimer/index.html', ''),
     ('service/career/index.html', 'www.manks.com/en/service/career/index.html', ''),

     # collection, tags and account pages
     ('collection/index.html', 'www.manks.com/en/collection/index.html', ''),
     ('collection/index8676.html?sort=newest', 'www.manks.com/en/collection/index8676.html?sort=newest', ''),
     ('collection/offers/index.html', 'www.manks.com/en/collection/offers/index.html', ''),
     ('tags/index.html', 'www.manks.com/en/tags/index.html', ''),
     ('account/index.html', 'www.manks.com/en/account/index.html', ''),
     ('account/orders/index.html', 'www.manks.com/en/account/orders/index.html', ''),
     ('account/wishlist/index.html', 'www.manks.com/en/account/wishlist/index.html', ''),

     # contact, payment providers and platform link
     ('mailto:info@manks.com', 'mailto:info@manks.com', ''),
     ('https://www.visa.com.hk/', 'https://www.visa.com.hk/', ''),
     ('https://www.mastercard.com.hk/', 'https://www.mastercard.com.hk/', ''),
     ('https://www.americanexpress.com/hk/', 'https://www.americanexpress.com/hk/', ''),
     ('https://www.alipay.hk/', 'https://www.alipay.hk/', ''),
     ('https://pay.wechat.com/en_hk/index.shtml', 'https://pay.wechat.com/en_hk/index.shtml', ''),
     ('http://www.lightspeedhq.com/', 'http://www.lightspeedhq.com/', ''),
]

# Links a collector may be forced to click first. Empty here; in this notebook it
# is only referenced by the commented-out collector cell.
forceClick = [

]

def l3Handle(l2):
     """Return the slice of the module-level ``l3`` table that belongs to a clicked link.

     Args:
          l2: dict of the clicked second-level link; only ``l2['href']`` is read.
               (The parameter shadows the module-level ``l2`` list inside this function.)

     Returns:
          list: ``l3[:7]`` for "womens", ``l3[7:12]`` for "kids", ``l3[12:]`` for
          "mens", or an empty list when the href names none of them, so the
          result can always be iterated by ``checkIfIn``.
     """
     href = l2['href']
     # "womens" has to be tested before "mens", which is a substring of it
     if "womens" in href:
          return l3[:7]
     elif "kids" in href:
          return l3[7:12]
     elif "mens" in href:
          return l3[12:]
     # unknown category: no third-level links (previously fell through to None)
     return []

def matchSimilarLinks(candidate_clickables):
     """Collapse consecutive pairs of candidates into one entry, unless the page
     still offers second-level menu links or href-less clickables.

     The list is returned unchanged when any candidate matches the ``l2`` table
     (score 2) or has ``href == 'None'`` with a class other than ``'zoom first'``.
     Otherwise candidates are paired as (0, 1), (2, 3), ...; the first of each
     pair is kept and its ``text`` is overwritten IN PLACE with the second's
     text. A trailing unpaired candidate is dropped.
     """
     base = "http://localhost:4200/assets/crawled/manks/"

     def _matches_l2(link):
          link_key = (link['href'], link['href_full'].replace(base,""), link['class'])
          return checkIfIn(link_key, l2, 2)

     def _is_hrefless_control(link):
          return link['href'] == 'None' and link['class'] != 'zoom first'

     if any(map(_matches_l2, candidate_clickables)) or any(map(_is_hrefless_control, candidate_clickables)):
          return candidate_clickables

     # add text of second one to the first one
     merged = []
     firsts = candidate_clickables[::2]
     seconds = candidate_clickables[1::2]
     for keeper, partner in zip(firsts, seconds):
          keeper['text'] = partner['text']
          merged.append(keeper)
     return merged

def checkVisitedNones(candidate_clickables, trackNones):
     """Drop href-less candidates (``href == 'None'``) that sit at an already clicked position.

     Args:
          candidate_clickables: list of clickable dicts with ``href``, ``x``, ``y``.
          trackNones: list of ``{'x': ..., 'y': ...}`` positions clicked earlier.

     Returns:
          list: the remaining candidates; the input list itself when
          ``trackNones`` is empty.
     """
     remaining = candidate_clickables
     for visited_spot in trackNones:
          kept = []
          for link in remaining:
               already_clicked = (
                    link['href'] == 'None'
                    and link['x'] == visited_spot['x']
                    and link['y'] == visited_spot['y']
               )
               if not already_clicked:
                    kept.append(link)
          remaining = kept
     return remaining


In [6]:
# # removing stepitr/code used to generate lot of data start from middle button
# base = "http://localhost:4200/assets/crawled/sna/www.sarahandabraham.com/index.html"

# def collectData():
#     stepitr = 3
#     dpoint=1
#     no_products = 0 # flag to identify if all the products are visited
#     selected_products = []
#     while dpoint <= 500: # no. of recordings needed from one seq. count
#         driver = setup(9222, True)
#         switchToIframe(driver)

#         menuState = 'non'
#         visited = [] # to store visited links in one sequence
#         data = {} # to collect data in one sequence
#         data['url'] = "assets/crawled/sna/www.sarahandabraham.com/index.html"
#         actions = [] # to store clicks

#         try:
#             # steps = random.choice([x for x  in range(3,8)]) # generating number of steps
#             step=0
#             while step <= stepitr: # going through steps in a single data point, adding one more to include the details of the last page
#             # for step in range(stepitr + 1):
#                 scrollToTheBottom(driver)

#                 candidate_clickables = getAllClickables(driver)
#                 forRecords = candidate_clickables.copy()

#                 # remove everytime
#                 candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), pageBottomURLs, 2)]
                
#                 # eliminating already visited ones
#                 candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited, 2)]

#                 # keep only varients if at least 2 more steps aren't available
#                 if step != 0 and any(x in actions[-1]['clicked']['href'] for x in ['products', 'variant']) and step + 2 >= stepitr:
#                     candidate_clickables = [clickable for clickable in candidate_clickables if '?variant' in clickable['href']]
#                     step = step-1 if len(candidate_clickables) >= 2 else step
#                     step = stepitr if len(actions) == 6 and len(candidate_clickables) > 0 else step
                
#                 if step==1 and actions[-1]['clicked']['href'] in "/index.html": # to click on a product in the 1st step
#                     candidate_clickables = [clickable for clickable in candidate_clickables if '/products' in clickable['href']]

#                 # adding constraints to click a button
#                 if dpoint <= 1000 and step == 0:
#                     clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), forceClick, 3)]
#                 else:
#                     if menuState == 'non':
#                     # eliminating unwanted urls
#                         clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate, 2)]
#                     elif menuState == 'l1':
#                         clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l2)]
#                     elif menuState == 'l2':
#                         clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l3Handle(actions[-1]['clicked']))]

#                 # choosing a clickable randomly
#                 if step < stepitr:
#                     # removing already visited products
#                     if step==1 and actions[-1]['clicked']['href'] in "/index.html" and len(selected_products)>0:
#                         clickables = [clickable for clickable in clickables if clickable['href_full'] not in selected_products]
#                         no_products = no_products+1 if len(clickables)==0 else 0 # to identify if all the products were visited

#                     random.seed()
#                     random.shuffle(clickables)
#                     random.shuffle(clickables)
#                     chosenLink = random.choice(clickables)

#                     if step==1 and '/products' in chosenLink['href']:
#                         selected_products.append(chosenLink['href_full'])
                    
#                     # assigning the menu state
#                     if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
#                         menuState = 'l1'
#                     elif checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l2[1:]):
#                         menuState = 'l2'
#                     else:
#                         menuState = 'non'
                    
#                     # adding to the visited list
#                     visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))

#                 actions.append({
#                     'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
#                     'candidates': [formatData(clickable.copy()) for clickable in forRecords],
#                     'screenshot': takeScreenshot(driver),
#                     'full_url': getCurrntURLIFrame(driver),
#                     'text': getTextOnPage(driver)
#                     })

#                 # print(chosenLink['href_full'])
#                 if step < stepitr:
#                     clickElement(driver, chosenLink['x'], chosenLink['y'])
#                 step += 1

#             data['actions'] = actions
#             data['nClicks'] = len(actions)-1

#             # saving to a json file for ML
#             fname = 'R_' + str(int(time.time())) + '_' + str(len(actions)-1)
#             with open('data/' + fname + '.json', 'w') as f:
#                 json.dump(data, f)

#             # saving to a json file for player
#             with open('data_player/' + fname + '.json', 'w') as f:
#                 json.dump(formatDataForPlayer(data), f)
            
#             dpoint+=1 # increasing the iteration number
#             print(fname, [action['clicked']['href'] for action in actions[:-1]])

#             # break
#         except Exception as e:
#             print("error", e)
#             if no_products == 3:
#                 selected_products.clear()
#         #     # print('\033[91m', 'step_itr:', stepitr, 'dpoint:', dpoint)
#         #     # print(chosenLink['href_full'], '\n')
#         #     # if actions.__len__() > 0:
#         #     #     print([action['href'] for action in actions])
#         #     # print('\033[0m', "-----------------------------------------------------")
# collectData()

In [24]:
# start from menu
# URL prefix of the locally served crawl; it is stripped from each clickable's
# href_full before the link is compared against the link tables.
base = "http://localhost:4200/assets/crawled/manks/"

def collectData() -> None:
    """Record random click sequences on the crawled manks site and save them as JSON.

    Repeats until 500 sequences have been saved. For each sequence a driver is
    obtained through ``setup`` and switched to the iframe, then a step loop runs:
    scroll to the bottom, collect all clickables, filter them with the link
    tables and the visited history, pick one candidate at random, record the
    page state and click the pick. The final loop pass records the page state
    only (its ``clicked`` entry is ``None``).

    Each finished sequence is written twice under the same file name:
    ``data/<name>.json`` (full record) and ``data_player/<name>.json`` (reduced
    by ``formatDataForPlayer``).

    Any ``Exception`` raised while recording (for example ``random.choice`` on an
    empty candidate list) discards the current sequence: the error is printed
    and the outer loop starts over without counting it. ``selected_products``
    is kept across sequences and is cleared in that error path once
    ``no_products`` has reached 3.
    """
    stepitr = 4  # step index at which the loop only records and no longer clicks
    dpoint=1  # 1-based counter of sequences saved so far
    no_products = 0 # flag to identify if all the products are visited
    selected_products = []  # href_full of links remembered as already chosen, shared across sequences
    while dpoint <= 500: # no. of recordings needed from one seq. count
        driver = setup(9222, 'assets/crawled/manks/index.html', True)
        switchToIframe(driver)

        menuState = 'non'  # 'l1' / 'l2' / 'non', depending on which table the last chosen link matched
        visited = [] # to store visited links in one sequence
        data = {} # to collect data in one sequence
        data['url'] = "assets/crawled/manks/index.html"
        actions = [] # to store clicks
        nextTrack = False  # True when the last chosen link had 'page' in its full URL
        nextActiveinDataPoint = False  # True once such a 'page' link was chosen in this sequence

        trackNones = []  # positions of already clicked href-less elements
        try:
            step=0
            while step <= stepitr: # going through steps in a single data point, adding one more to include the details of the last page

                scrollToTheBottom(driver)
                candidate_clickables = getAllClickables(driver)
                # unfiltered list, stored as 'candidates' in the recorded action
                forRecords = candidate_clickables.copy()

                # remove everytime
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate+pageBottomURLs+hardToEliminate, 2)]
                candidate_clickables = [clickable for clickable in candidate_clickables if "flag" not in clickable['class']]
                candidate_clickables = [clickable for clickable in candidate_clickables if "currency" not in clickable['href_full']]

                # eliminating already visited ones
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited, 2)]
                # removing none links
                candidate_clickables = checkVisitedNones(candidate_clickables, trackNones)

                # adding constraints to click a button
                if step == 0:
                    # l1[1:2] holds only the 'furniture/index.html' entry, so the first click is that menu item
                    candidate_clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1[1:2], 2)]
                else:
                    # after the first click, top-level menu links are no longer candidates
                    candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1, 2)]

                if menuState == 'l1' or  menuState == 'l2' or nextTrack:
                    candidate_clickables = matchSimilarLinks(candidate_clickables)
                
                # identifying if in a target page
                if any(clickable['href'] == 'None' for clickable in candidate_clickables):
                    # keep only the href-less clickables, except those with class 'zoom first'
                    candidate_clickables = [clickable for clickable in candidate_clickables if clickable['href'] == 'None' and clickable['class'] != 'zoom first']
                    maxItr =  5 if nextActiveinDataPoint else 4
                    # hold the step counter back (net zero after the += 1 below) while
                    # more than one such clickable is left and the action budget allows
                    step = step-1 if len(candidate_clickables) > 1 and len(actions) <= maxItr else step
                
                # no href-less clickable among the candidates yet: also hold the step
                # counter back, as long as at most 6 actions were recorded
                if step >= 3 and len(actions) <= 6 and not any(clickable['href'] == 'None' for clickable in candidate_clickables):
                    step-=1

                # choosing a clickable randomly
                if step < stepitr:
                    # removing already visited products
                    if len(actions) > 0 and len(selected_products)>0:
                        candidate_clickables = [clickable for clickable in candidate_clickables if clickable['href_full'] not in selected_products]
                        no_products = no_products+1 if len(candidate_clickables)==0 else no_products # to identify if all the products were visited

                    random.seed()
                    random.shuffle(candidate_clickables)
                    # raises IndexError on an empty list; handled by the except below
                    chosenLink = random.choice(candidate_clickables)

                    # a link whose full URL names none of these categories (and is not
                    # 'None') is remembered so later sequences do not choose it again
                    if step >= 1 and not any(x in chosenLink['href_full'] for x in ["furniture", "lighting", "tableware", "gifts-decorative", "healthy-living", "None"]):
                    # if step >= 1 and menuState == "non":
                        selected_products.append(chosenLink['href_full'])

                    # assigning the menu state
                    if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
                        menuState = 'l1'
                    elif checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l2):
                        menuState = 'l2'
                    else:
                        menuState = 'non'
                    
                    # adding to the visited list
                    if chosenLink['href'] == 'None':
                        # href-less elements cannot be told apart by URL; remember their position
                        trackNones.append({'x':chosenLink['x'], 'y': chosenLink['y']})
                    else:
                        visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))
                    
                    # presumably a pagination link (TODO confirm): it does not use up a step
                    if 'page' in chosenLink['href_full']:
                        nextTrack = True
                        nextActiveinDataPoint = True
                        step-=1
                    else:
                        nextTrack = False

                # record the page state; formatData works on copies, so chosenLink
                # and forRecords keep their original coordinates
                actions.append({
                    'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
                    'candidates': [formatData(clickable.copy()) for clickable in forRecords],
                    'screenshot': takeScreenshot(driver),
                    'full_url': getCurrntURLIFrame(driver),
                    'text': getTextOnPage(driver)
                    })

                # print(chosenLink['href_full'])
                if step < stepitr:
                    clickElement(driver, chosenLink['x'], chosenLink['y'])
                step += 1

            data['actions'] = actions
            # the last action is the record-only pass, hence the -1
            data['nClicks'] = len(actions)-1

            # saving to a json file for ML
            fname = 'R_' + str(int(time.time())) + '_MA_' + str(len(actions)-1)
            with open('data/' + fname + '.json', 'w') as f:
                json.dump(data, f)

            # saving to a json file for player
            with open('data_player/' + fname + '.json', 'w') as f:
                json.dump(formatDataForPlayer(data), f)
            
            dpoint+=1 # increasing the iteration number
            print(fname, [action['clicked']['href'] for action in actions[:-1]])
            
            # break
        except Exception as e:
            # the failed sequence is dropped (dpoint is not increased) and the loop retries
            print("error", e)
            if no_products >= 3:
                selected_products = []
                no_products = 0
# run the collector; it loops until 500 sequences are saved (interrupt the kernel to stop early)
collectData()

R_1674965067_MA_7 ['furniture/index.html', 'tables-desks/index.html', '../../accent-oval-lounge-table.html', 'None', 'None', 'None', 'None']


Traceback (most recent call last):
  File "_pydevd_bundle/pydevd_cython.pyx", line 1078, in _pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch
  File "_pydevd_bundle/pydevd_cython.pyx", line 297, in _pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend
  File "/Users/dileepa/opt/miniconda3/envs/wvln/lib/python3.9/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 1976, in do_wait_suspend
    keep_suspended = self._do_wait_suspend(thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)
  File "/Users/dileepa/opt/miniconda3/envs/wvln/lib/python3.9/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 2011, in _do_wait_suspend
    time.sleep(0.01)
KeyboardInterrupt


KeyboardInterrupt: 

In [120]:
driver = setup(9222, 'assets/crawled/manks/index.html', False)
switchToIframe(driver)
# elements_a = driver.find_elements(By.TAG_NAME, 'a')

# List the (href, href_full without base, class) triple of every clickable on the
# page that is not covered by the l1 or pageBottomURLs tables.
candidate_clickables = getAllClickables(driver)
# candidate_clickables = [(clickable['href'], clickable['href_full'].replace(base,""), clickable['class']) for clickable in getAllClickables(driver)]
candidate_clickables = [
    link_key
    for link_key in (
        (clickable['href'], clickable['href_full'].replace(base,""), clickable['class'])
        for clickable in candidate_clickables
    )
    if not checkIfIn(link_key, l1+pageBottomURLs, 2)
]


# data = {
#     'href':'assets/crawled/sna/www.sarahandabraham.com/index.html',
#     'href_full':'assets/crawled/sna/www.sarahandabraham.com/index.html',
#     'x': 0,
#     'y': 0}
candidate_clickables

[('../../go/category/10637206/index.html',
  'www.manks.com/en/go/category/10637206/index.html',
  'flag en'),
 ('https://www.manks.com/us/go/category/10637206',
  'https://www.manks.com/us/go/category/10637206',
  'flag us'),
 ('../../session/currency/cny/index.html',
  'www.manks.com/en/session/currency/cny/index.html',
  ''),
 ('../../session/currency/hkd/index.html',
  'www.manks.com/en/session/currency/hkd/index.html',
  ''),
 ('../../session/currency/mop/index.html',
  'www.manks.com/en/session/currency/mop/index.html',
  ''),
 ('../../session/currency/twd/index.html',
  'www.manks.com/en/session/currency/twd/index.html',
  'active'),
 ('../../cart/index.html', 'www.manks.com/en/cart/index.html', 'cart'),
 ('../../account/index.html',
  'www.manks.com/en/account/index.html',
  'my-account'),
 ('../../index.html', 'www.manks.com/en/index.html', ''),
 ('../../blogs/stories/index.html',
  'www.manks.com/en/blogs/stories/index.html',
  'itemLink'),
 ('../../blogs/relive-the-home-of-b

# To show an image

In [130]:
# to load an image
def imgFromB64(imgb64):
    """Decode a base64 image string and display it via PIL's ``Image.show()``.

    NOTE: a function with this name is also defined in the helper cell near the
    top of the notebook; running this cell rebinds the name.
    """
    # reconstructing the image by decoding
    decoded = base64.b64decode(imgb64)
    stream = io.BytesIO(decoded)
    Image.open(stream).show()

# open one saved recording and display the screenshot stored with its second action (index 1)
with open('data/R_1674899284_MA_4.json', 'r') as f:
    imgFromB64(json.load(f)['actions'][1]['screenshot'])
    # alternative: show the first image captured for candidate 31 of the same action
    # imgFromB64(json.load(f)['actions'][1]['candidates'][31]['img_0'])

# Unused

In [73]:
def collectData(n_data):
    """Scratch helper: open the simulator and click once at (414, 3000).

    NOTE: this reuses the name of the zero-argument ``collectData`` defined in an
    earlier cell; executing this cell replaces that function.

    Args:
        n_data: number of iterations requested; at most one click is performed,
            and none when ``range(n_data)`` is empty.
    """
    driver = setup()
    switchToIframe(driver)

    # the slice limits the loop to a single pass
    for _ in range(n_data)[:1]:
        # scrollToTheBottom(driver)
        # clickables = getAllClickables(driver)
        # chosenLink = random.choice(clickables[10:])
        # print(chosenLink)

        clickElement(driver, 414, 3000)

        

# collectData(5)

# To delete paths

In [100]:
# Load the manual review sheet. As the preview below shows, it has a File column
# (recording id without the 'R_' prefix) and Good / Bad columns holding 1.0 where marked.
feedback = pd.read_csv("Data Quality Checker.csv")
feedback.head()

Unnamed: 0,File,Good,Bad
0,1671539062_3,1.0,
1,1671539104_3,1.0,
2,1671539149_3,,1.0
3,1671539201_3,1.0,
4,1671539251_3,1.0,


In [101]:
# deleting bad ones
# Work on a copy so the File column of `feedback` keeps its original values.
feedbackBad = feedback.copy(deep=True)
feedbackBad.File = feedbackBad.File.map(lambda x: 'R_'+ x + '.json')
bad = feedbackBad.loc[feedbackBad.Bad == 1].File.tolist()

# NOTE(review): `videos` is defined but never used, so the matching files under
# data_player/ are not removed. Confirm whether that is intended.
videos = 'data_player/'
mlData = 'data/'
for (dirpath, dirnames, filenames) in os.walk(mlData):
    for filename in filenames:
        if filename in bad:
            # os.walk returns sub-directory paths without a trailing separator,
            # so the path has to be joined rather than concatenated
            os.remove(os.path.join(dirpath, filename))
            # print(dirpath, filename)

In [102]:
# adding good ones to a csv
# The prefixed names are derived without overwriting feedback.File, so this cell
# can be re-run (and the delete cell re-run after it) without stacking 'R_' prefixes.
good = feedback.loc[feedback.Good == 1].File.map(lambda x: 'R_'+ x).tolist()
pd.DataFrame({'token':good}).to_csv('tokens.csv', index=False)