In [1]:
# !pip install selenium
# !pip install networkx
# !pip install matplotlib
# !pip install webdriver-manager
# !pip install pandas

In [2]:
from selenium.webdriver.common.by import By
from SimulatorCommunicator import setup, switchToIframe, scrollToTheBottom, updateURL, clickElement, takeScreenshot
from PIL import Image

import networkx as nx
import random
import matplotlib.pyplot as plt
import re
import json
import base64
import time
import requests
import io
import itertools
import pandas as pd
import os

In [3]:
def getAllClickables(driver):
    """Describe every visible, enabled ``<a>`` element on the current page.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        A list with one dict per link, holding:
          * ``x`` / ``y``: centre of the element (rect origin plus half its size),
          * ``width`` / ``height``: size taken from the element rect,
          * ``outer_html``: base64 of the element's outerHTML (utf-8),
          * ``href``: the href exactly as written in the markup (may be None),
          * ``href_full``: the href as reported by ``get_attribute``,
          * ``class``: the class attribute,
          * ``text``: unique non-empty text lines found around the link,
          * ``img_<i>``: base64 of the i-th nested image, when it has a ``src``.
    """
    elements = []

    for element in driver.find_elements(By.TAG_NAME, 'a'):
        # excluding all the hidden links
        if not (element.is_displayed() and element.is_enabled()):
            continue

        # Every access to ``rect`` asks the browser again, so read it once
        # instead of once per field.
        rect = element.rect
        if len(rect) == 0:
            continue

        data = {}
        data['x'] = rect['x'] + (rect['width']/2)
        data['y'] = rect['y'] + (rect['height']/2)
        data['width'] = rect['width']
        data['height'] = rect['height']
        data['outer_html'] = base64Encode(bytes(element.get_attribute('outerHTML'), "utf-8"))
        data['href'] = element.get_dom_attribute('href')
        data['href_full'] = element.get_attribute('href')
        data['class'] = element.get_attribute('class')

        # '..//div' starts from the link's parent, so this also picks up text of
        # divs that sit next to the link, not only inside it.
        texts = list(itertools.chain(*[div.text.split('\n') for div in element.find_elements(By.XPATH, '..//div')]))
        texts.append(element.text)
        data['text'] = list(set([text for text in texts if len(text) > 0]))

        # getting closer image(s) if available
        for i, img in enumerate(element.find_elements(By.XPATH, ".//img")):
            src = img.get_attribute('src')
            if not src:
                # nothing to download; the key index still follows the image position
                continue
            data['img_'+str(i)] = imgURL2B64(src)

        elements.append(data)
    return elements

def getCurrntURLIFrame(driver):
    """Return the URL of the page the driver is currently focused on.

    The value of ``location.href`` is read through JavaScript and every
    occurrence of the local server origin ``http://localhost:4200`` is removed,
    leaving a path such as ``/assets/...``.
    """
    return driver.execute_script('return location.href').replace('http://localhost:4200', "")

def imgURL2B64(imgURL, timeout=30):
    """Download an image and return its bytes as a base64 string.

    Args:
        imgURL: URL of the image to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            download would freeze the whole crawl.

    Returns:
        The response body encoded as base64 text. The HTTP status is not
        checked, so an error page is encoded just like an image would be.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    # downloading image
    response = requests.get(imgURL, timeout=timeout)
    # encoding
    return base64Encode(response.content)

def base64Encode(data):
    """Encode a bytes-like object as base64 and return it as a ``str``."""
    encoded = base64.b64encode(data)
    return str(encoded, "utf-8")

def imgFromB64(imgb64):
    """Decode a base64-encoded image and open it with PIL's ``Image.show``."""
    # rebuild the raw image bytes and wrap them in a file-like object for PIL
    buffer = io.BytesIO(base64.b64decode(imgb64))
    picture = Image.open(buffer)
    picture.show()

def formatData(chosenLink, viewportHeight=865):
    """Turn a clickable description into a recorded "click" action.

    The dict is updated in place and also returned, so pass a copy when the
    original must stay untouched.

    Args:
        chosenLink: dict with at least ``'y'`` and ``'height'``.
        viewportHeight: vertical limit, in the same units as ``'y'``, beyond
            which the link counts as off-screen (presumably the visible height
            of the simulated device -- TODO confirm). Defaults to 865.

    Returns:
        ``chosenLink`` with these keys set:
          * ``type``: always ``"click"``,
          * ``y_offset``: how far ``y + height`` exceeds ``viewportHeight``
            (0 when it does not),
          * ``y``: the original ``y`` minus ``y_offset``,
          * ``x_offset``: always 0.
    """
    y = chosenLink['y']
    y_offset = 0

    # NOTE(review): getAllClickables stores the vertical *centre* in 'y', so
    # y + height reaches half a height below the element's bottom edge --
    # confirm that margin is intended.
    bottom = y + chosenLink['height']
    if bottom > viewportHeight:
        y_offset = bottom - viewportHeight
        y = y - y_offset

    # new attributes to the chosen link
    chosenLink['type'] = "click"
    chosenLink['y'] = y
    chosenLink['x_offset'] = 0
    chosenLink['y_offset'] = y_offset

    return chosenLink

def getTextOnPage(driver):
    """Return the distinct non-empty text lines found in the page's ``<div>`` elements.

    Each div's text is split on newlines; duplicates are dropped, so the order
    of the returned list is arbitrary.
    """
    uniqueLines = set()
    for div in driver.find_elements(By.TAG_NAME, 'div'):
        uniqueLines.update(line for line in div.text.split('\n') if line)
    return list(uniqueLines)

def formatDataForPlayer(inputJson):
    """Reduce a recorded sequence to the fields the player needs.

    Args:
        inputJson: dict with ``'url'`` and ``'actions'``; every action except
            the last must carry a ``'clicked'`` dict.

    Returns:
        ``{'url': ..., 'actions': [...]}`` where each action keeps only the
        click type, geometry, offsets, hrefs and outer HTML of the clicked
        link. The last recorded action is left out (it has no click).
    """
    playerFields = (
        'type', 'x', 'y', 'height', 'width',
        'x_offset', 'y_offset', 'href', 'href_full', 'outer_html',
    )
    return {
        'url': inputJson['url'],
        'actions': [
            {field: recorded['clicked'][field] for field in playerFields}
            for recorded in inputJson['actions'][:-1]
        ],
    }

In [4]:
# Scratch check: base64-encode a sample anchor's outer HTML the same way
# getAllClickables() fills its 'outer_html' field (utf-8 bytes -> base64 text).
x = """<a href="collections/womens-new-arrivals.html" class="button
                        button-medium
                        button-white
                        txt-size-2
                        txt-tracked-two-point">
                 Shop New Arrivals
              </a>"""
base64.b64encode(bytes(x, "utf-8")).decode("utf-8")

'PGEgaHJlZj0iY29sbGVjdGlvbnMvd29tZW5zLW5ldy1hcnJpdmFscy5odG1sIiBjbGFzcz0iYnV0dG9uCiAgICAgICAgICAgICAgICAgICAgICAgIGJ1dHRvbi1tZWRpdW0KICAgICAgICAgICAgICAgICAgICAgICAgYnV0dG9uLXdoaXRlCiAgICAgICAgICAgICAgICAgICAgICAgIHR4dC1zaXplLTIKICAgICAgICAgICAgICAgICAgICAgICAgdHh0LXRyYWNrZWQtdHdvLXBvaW50Ij4KICAgICAgICAgICAgICAgICBTaG9wIE5ldyBBcnJpdmFscwogICAgICAgICAgICAgIDwvYT4='

In [5]:
def isSimilar(query, reference, score):
    """Tell whether two link triples agree on at least ``score`` fields.

    Both arguments are indexable with positions 0 (relative url),
    1 (absolute url) and 2 (class); each equal position counts one point.
    """
    matches = sum(1 for position in (0, 1, 2) if query[position] == reference[position])
    return matches >= score

def checkIfIn(query, urlList, score=2):
    """Return True when ``query`` is similar to any entry of ``urlList``.

    Similarity is decided by ``isSimilar`` with the given ``score``
    (minimum number of matching fields, 2 by default).
    """
    return any(isSimilar(query, candidate, score) for candidate in urlList)

In [75]:
# Links that put collectData() into menu state 'l1' (presumably the button that
# opens the navigation drawer, judging by its class -- TODO confirm).
# Every entry in these link lists is a triple
# (relative href, href with the `base` prefix removed, class attribute),
# i.e. the three positions isSimilar() compares.
l1 = [
     ('#', 'index.html#', 'nav-link\n                js-top-drawer-click')
]

# Links collectData() may choose while the menu state is 'l1'.
# Commented-out entries are categories that are currently switched off.
l2 = [
     # ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', 'topnav-mobile-link'),
     # ('collections/womens.html', 'collections/womens.html', 'topnav-mobile-link'), 
     ('collections/kids.html', 'collections/kids.html', 'topnav-mobile-link'),
     # ('collections/mens.html', 'collections/mens.html', 'topnav-mobile-link'),
]

# Sub-category links offered while the menu state is 'l2'.
# l3Handle() slices this list by position: the first 7 entries are the womens
# group, the next 5 the kids group and the remainder the mens group, so keep
# the grouping and the counts in sync with l3Handle() when editing.
l3 = [
     ('collections/womens.html', 'collections/womens.html', 'topnav-mobile-link'),
     ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', 'topnav-mobile-link'),
     ('collections/womens-socks.html', 'collections/womens-socks.html', 'topnav-mobile-link'),
     ('collections/womens-tights-and-leggings.html', 'collections/womens-tights-and-leggings.html', 'topnav-mobile-link'),
     ('collections/womens-clothing.html', 'collections/womens-clothing.html', 'topnav-mobile-link'),
     ('collections/womens-bags.html', 'collections/womens-bags.html', 'topnav-mobile-link'),
     ('collections/womens-sale.html', 'collections/womens-sale.html', 'topnav-mobile-link'),

     ('collections/kids.html', 'collections/kids.html', 'topnav-mobile-link'),
     ('collections/kids-new-arrivals.html', 'collections/kids-new-arrivals.html', 'topnav-mobile-link'),
     ('collections/kids-socks.html', 'collections/kids-socks.html', 'topnav-mobile-link'),
     ('collections/kids-tights-and-leggings.html', 'collections/kids-tights-and-leggings.html', 'topnav-mobile-link'),
     ('collections/kids-sale.html', 'collections/kids-sale.html', 'topnav-mobile-link'),

     ('collections/mens.html', 'collections/mens.html', 'topnav-mobile-link'),
     ('collections/mens-new-arrivals-1.html', 'collections/mens-new-arrivals-1.html', 'topnav-mobile-link'),
     ('collections/mens-socks.html', 'collections/mens-socks.html', 'topnav-mobile-link'),
     ('collections/mens-sale.html', 'collections/mens-sale.html', 'topnav-mobile-link'),
]

# Links collectData() drops from the random choice while no menu is open
# (menu state 'non'). A candidate is dropped when at least two of its three
# fields match an entry here.
eliminate = [
     ('collections/mens.html', 'collections/mens.html', 'topnav-mobile-link'),
     ('collections/womens-tights-and-leggings.html', 'collections/womens-tights-and-leggings.html', 'subnav-link\n                        '),
     ('collections/womens-bags.html', 'collections/womens-bags.html', 'subnav-link\n                        '),
     ('collections/womens-gift-card.html', 'collections/womens-gift-card.html', 'subnav-link\n                        '),
     ('collections/womens-sale.html', 'collections/womens-sale.html', 'subnav-link\n                        '),
     ('collections/womens-sporty-series.html', 'collections/womens-sporty-series.html', 'subnav-link\n                        '),
     ('collections/womens-conversational-crews.html', 'collections/womens-conversational-crews.html', 'subnav-link\n                        '),
     ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', ''),
     ('collections/kids-gift-card.html', 'collections/kids-gift-card.html', 'subnav-link\n                        '),
     ('collections/kids-sale.html', 'collections/kids-sale.html', 'subnav-link\n                        '),
     ('collections/kids-new-arrivals.html', 'collections/kids-new-arrivals.html', ''),
     ('collections/mens-new-arrivals-1.html', 'collections/mens-new-arrivals-1.html', ''),
     ('https://instagram.com/hanselfrombasel/', 'https://instagram.com/hanselfrombasel/', 'icon-instagram\n                    footer-link\n                    footer-link-social'),
     ('https://www.facebook.com/HanselfromBasel', 'https://www.facebook.com/HanselfromBasel', 'icon-facebook\n                    footer-link\n                    footer-link-social'),
     ('https://www.pinterest.com/hanselfrombasel/', 'https://www.pinterest.com/hanselfrombasel/', 'icon-pinterest\n                    footer-link\n                    footer-link-social'),

     ('cart.html', 'cart.html', 'nav-link\n                nav-link-mobile\n                js-drawer-toggle'),
     ('pages/shipping.html', 'pages/shipping.html', ''),
     ('account/login.html#sign-in', 'account/login.html#sign-in', 'nav-link'),
     ('index.html', 'index.html', 'nav-lockup'),
     ('search.html', 'search.html', 'nav-link\n                nav-link-mobile'),
     ('womens-sale.html', 'http://localhost:4200/assets/crawled/hansel/hanselfrombasel.com/collections/womens-sale.html', 'subnav-link\n                        \n                          is-active\n                        '),
]

# Footer links. collectData() removes these from the candidates at every step
# (a candidate is removed when at least two of its three fields match).
pageBottomURLs = [
     # page bottom urls
     ('pages/contact.html', 'pages/contact.html', 'footer-link'),
     ('pages/shipping.html', 'pages/shipping.html', 'footer-link'),
     ('pages/returns.html', 'pages/returns.html', 'footer-link'),
     ('pages/size-guide.html', 'pages/size-guide.html', 'footer-link'),
     ('pages/care-instructions.html', 'pages/care-instructions.html', 'footer-link'),
     ('pages/promotion-details.html', 'pages/promotion-details.html', 'footer-link'),
     ('pages/ccpa-opt-out.html', 'pages/ccpa-opt-out.html', 'footer-link'),
     ('policies/privacy-policy.html', 'policies/privacy-policy.html', 'footer-link'),
     ('pages/korean-buying-guide.html', 'pages/korean-buying-guide.html', 'footer-link'),
     ('https://instagram.com/hanselfrombasel/', 'https://instagram.com/hanselfrombasel/', 'icon-instagram\n                    footer-link\n                    footer-link-social'),
     ('https://www.facebook.com/HanselfromBasel', 'https://www.facebook.com/HanselfromBasel', 'icon-facebook\n                    footer-link\n                    footer-link-social'),
     ('https://www.pinterest.com/hanselfrombasel/', 'https://www.pinterest.com/hanselfrombasel/', 'icon-pinterest\n                    footer-link\n                    footer-link-social'),
     ('pages/wholesale.html', 'pages/wholesale.html', 'footer-link'),
     ('pages/press.html', 'pages/press.html', 'footer-link'),
     ('pages/our-story.html', 'pages/our-story.html', 'footer-link'),
     ('pages/stockists.html', 'pages/stockists.html', 'footer-link'),
]

# The only links collectData() may choose on the very first step (step == 0);
# all three fields must match (score 3). The commented entry is an alternative
# starting point that is currently switched off.
forceClick = [
     # ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', 'button\n                        button-medium\n                        button-white\n                        txt-size-2\n                        txt-tracked-two-point'),
     ('#', 'index.html#', 'nav-link\n                js-top-drawer-click')
]

def l3Handle(l2):
    """Return the third-level menu links that belong to a second-level link.

    Args:
        l2: the clicked second-level link (a dict with an ``'href'`` key).
            Note that inside this function the name hides the module-level
            ``l2`` list.

    Returns:
        The slice of the module-level ``l3`` list for the matching category:
        the first 7 entries for womens, the next 5 for kids, the rest for
        mens. An empty list when the href names none of them, so callers can
        always iterate over the result.
    """
    href = l2['href']
    # "womens" has to be tested before "mens": every womens href also contains "mens".
    if "womens" in href:
        return l3[:7]
    if "kids" in href:
        return l3[7:12]
    if "mens" in href:
        return l3[12:]
    # Unknown category: previously this fell through and returned None, which
    # made checkIfIn() fail with a TypeError when it tried to iterate over it.
    return []

In [9]:
# # base code
# base = "http://localhost:4200/assets/crawled/hansel/hanselfrombasel.com/"

# def collectData():
#     for stepitr in range(3,8): #3,16
#         dpoint=1
#         while dpoint <= 1: # no. of recordings needed from one seq. count
#             driver = setup(True)
#             switchToIframe(driver)

#             menuState = 'non'
#             visited = [] # to store visited links in one sequence
#             data = {} # to collect data in one sequence
#             data['url'] = "assets/crawled/hansel/hanselfrombasel.com/index.html"
#             actions = [] # to store clicks

#             try:
#                 # steps = random.choice([x for x  in range(3,8)]) # generating number of steps
#                 for step in range(stepitr): # going through steps in a single data point
#                     scrollToTheBottom(driver)

#                      # adding constraints to click a button
#                     if dpoint <= 4 and step == 0:
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1, 3)]
#                     elif dpoint <= 8 and step == 0:
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), forceClick, 3)]
                    
#                     else:
#                         if menuState == 'non':
#                         # eliminating unwanted urls
#                             clickables = [clickable for clickable in getAllClickables(driver) if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate, 2)]
#                         elif menuState == 'l1':
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l2)]
#                         elif menuState == 'l2':
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l3Handle(actions[-1]))]

#                     # eliminating already visited ones
#                     clickables = [clickable for clickable in clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited)]

#                     # choosing a clickable randomly
#                     chosenLink = random.choice(clickables[:])

#                     if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
#                         menuState = 'l1'
#                     elif checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l2[1:]):
#                         menuState = 'l2'
#                     else:
#                         menuState = 'non'

#                     # adding to the visited list
#                     visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))

#                     # print(chosenLink['href_full'])
#                     clickElement(driver, chosenLink['x'], chosenLink['y'])
#                     actions.append(formatData(chosenLink))
                
#                 data['actions'] = actions
#                 # saving to a json file
#                 fname = str(int(time.time())) + '_' + str(stepitr)
#                 with open('data/R_' + fname + '.json', 'w') as f:
#                     json.dump(data, f)
                
#                 dpoint+=1 # increasing the iteration number
#                 print(fname, [action['href'] for action in actions])

#                 # break
#             except Exception as e:
#                 pass
#                 # print('\033[91m', 'step_itr:', stepitr, 'dpoint:', dpoint)
#                 # print(chosenLink['href_full'], '\n')
#                 # if actions.__len__() > 0:
#                 #     print([action['href'] for action in actions])
#                 # print('\033[0m', "-----------------------------------------------------")

# collectData()

In [None]:
# Variant without the outer stepitr loop; used to generate a lot of data,
# starting from the middle button.
# `base` is the URL prefix of the locally served crawl. collectData() strips it
# from each link's absolute href before comparing against the link lists.
base = "http://localhost:4200/assets/crawled/hansel/hanselfrombasel.com/"

def collectData():
    """Record random click sequences on the crawled site and save them as JSON.

    Runs until 500 recordings have been saved. For each recording a driver is
    obtained with ``setup(9222, True)`` and switched to the iframe; then up to
    ``stepitr`` links are clicked, with one extra pass that only records the
    final page. Every pass stores the clicked link, all candidate links, a
    screenshot, the current URL and the page text.

    Each finished recording is written twice under the same file name
    ``R_<unix time>_<number of clicks>.json``: the full data in ``data/`` and
    the reduced player format in ``data_player/``.

    Any exception during a recording is printed and the recording is started
    again; the recording counter only advances after both files are written.

    NOTE: a later cell defines another function with this same name; whichever
    cell ran last is the one in effect.
    NOTE(review): the driver obtained for each recording is never closed in
    this function -- confirm that setup() reuses an existing browser session.
    """
    def linkKey(link):
        # (relative href, href with the `base` prefix removed, class): the
        # triple that checkIfIn()/isSimilar() compare against the link lists.
        return (link['href'], link['href_full'].replace(base, ""), link['class'])

    # Make sure the output folders exist. Without them every open() below
    # raises, dpoint never advances and the outer loop would retry forever.
    os.makedirs('data', exist_ok=True)
    os.makedirs('data_player', exist_ok=True)

    stepitr = 3
    dpoint=1
    no_products = 0 # flag to identify if all the products are visited
    selected_products = []
    while dpoint <= 500: # no. of recordings needed from one seq. count
        driver = setup(9222, True)
        switchToIframe(driver)

        menuState = 'non'
        visited = [] # to store visited links in one sequence
        data = {} # to collect data in one sequence
        data['url'] = "assets/crawled/hansel/hanselfrombasel.com/index.html"
        actions = [] # to store clicks

        try:
            step=0
            while step <= stepitr: # going through steps in a single data point, adding one more to include the details of the last page
                scrollToTheBottom(driver)

                candidate_clickables = getAllClickables(driver)
                forRecords = candidate_clickables.copy()

                # remove everytime
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn(linkKey(clickable), pageBottomURLs, 2)]

                # eliminating already visited ones
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn(linkKey(clickable), visited, 2)]

                # keep only variants if at least 2 more steps aren't available
                if step != 0 and any(x in actions[-1]['clicked']['href'] for x in ['products', 'variant']) and step + 2 >= stepitr:
                    candidate_clickables = [clickable for clickable in candidate_clickables if '?variant' in clickable['href']]
                    # stay on this step while at least two variants remain ...
                    step = step-1 if len(candidate_clickables) >= 2 else step
                    # ... but finish once six actions have been recorded
                    step = stepitr if len(actions) == 6 and len(candidate_clickables) > 0 else step

                # NOTE(review): `href in "/index.html"` is true when the clicked href is a
                # substring of "/index.html" (e.g. 'index.html' or ''), not when the href
                # contains it -- confirm which direction was intended.
                if step==1 and actions[-1]['clicked']['href'] in "/index.html": # to click on a product in the 1st step
                    candidate_clickables = [clickable for clickable in candidate_clickables if '/products' in clickable['href']]

                # adding constraints to click a button
                if dpoint <= 1000 and step == 0:
                    clickables = [clickable for clickable in candidate_clickables if checkIfIn(linkKey(clickable), forceClick, 3)]
                else:
                    if menuState == 'non':
                        # eliminating unwanted urls
                        clickables = [clickable for clickable in candidate_clickables if not checkIfIn(linkKey(clickable), eliminate, 2)]
                    elif menuState == 'l1':
                        clickables = [clickable for clickable in candidate_clickables if checkIfIn(linkKey(clickable), l2)]
                    elif menuState == 'l2':
                        clickables = [clickable for clickable in candidate_clickables if checkIfIn(linkKey(clickable), l3Handle(actions[-1]['clicked']))]

                # choosing a clickable randomly
                if step < stepitr:
                    # removing already visited products
                    if step==1 and actions[-1]['clicked']['href'] in "/index.html" and len(selected_products)>0:
                        clickables = [clickable for clickable in clickables if clickable['href_full'] not in selected_products]
                        no_products = no_products+1 if len(clickables)==0 else 0 # to identify if all the products were visited

                    random.seed()
                    random.shuffle(clickables)
                    random.shuffle(clickables)
                    # raises IndexError when nothing is left to click; handled by the except below
                    chosenLink = random.choice(clickables)

                    if step==1 and '/products' in chosenLink['href']:
                        selected_products.append(chosenLink['href_full'])

                    chosenKey = linkKey(chosenLink)

                    # assigning the menu state
                    # NOTE(review): l2[1:] skips the first l2 entry; with a single active
                    # entry in l2 the slice is empty and state 'l2' is never reached here.
                    if checkIfIn(chosenKey, l1):
                        menuState = 'l1'
                    elif checkIfIn(chosenKey, l2[1:]):
                        menuState = 'l2'
                    else:
                        menuState = 'non'

                    # adding to the visited list
                    visited.append(chosenKey)

                # the final pass (step == stepitr) records the page without a click
                actions.append({
                    'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
                    'candidates': [formatData(clickable.copy()) for clickable in forRecords],
                    'screenshot': takeScreenshot(driver),
                    'full_url': getCurrntURLIFrame(driver),
                    'text': getTextOnPage(driver)
                    })

                if step < stepitr:
                    clickElement(driver, chosenLink['x'], chosenLink['y'])
                step += 1

            data['actions'] = actions
            data['nClicks'] = len(actions)-1

            # saving to a json file for ML
            fname = 'R_' + str(int(time.time())) + '_' + str(len(actions)-1)
            with open('data/' + fname + '.json', 'w') as f:
                json.dump(data, f)

            # saving to a json file for player
            with open('data_player/' + fname + '.json', 'w') as f:
                json.dump(formatDataForPlayer(data), f)

            dpoint+=1 # increasing the iteration number
            print(fname, [action['clicked']['href'] for action in actions[:-1]])

        except Exception as e:
            # best effort: report the problem and start this recording again
            print("error", e)
            if no_products >= 3:
                # every product has been used: start over with a clean slate.
                # The counter is reset too; otherwise it stays at 3 and each later
                # error would wipe selected_products again.
                selected_products.clear()
                no_products = 0
# Long-running: returns only after 500 recordings have been saved.
collectData()

In [83]:
# start from menu
# `base` is the URL prefix of the locally served crawl. collectData() strips it
# from each link's absolute href before comparing against the link lists.
base = "http://localhost:4200/assets/crawled/hansel/hanselfrombasel.com/"

def collectData():
    """Record random click sequences that start from the menu and save them as JSON.

    Runs until 500 recordings have been saved. For each recording a driver is
    obtained with ``setup(9222, True)`` and switched to the iframe; then links
    are clicked step by step (``stepitr`` is 4, and a step is repeated in the
    cases commented below), with one extra pass that only records the final
    page. Every pass stores the clicked link, all candidate links, a
    screenshot, the current URL and the page text.

    Each finished recording is written twice under the same file name
    ``R_<unix time>_<number of clicks>.json``: the full data in ``data/`` and
    the reduced player format in ``data_player/``.

    Any exception during a recording is printed and the recording is started
    again; the recording counter only advances after both files are written.

    NOTE: an earlier cell defines another function with this same name;
    whichever cell ran last is the one in effect.
    NOTE(review): the driver obtained for each recording is never closed in
    this function -- confirm that setup() reuses an existing browser session.
    """
    def linkKey(link):
        # (relative href, href with the `base` prefix removed, class): the
        # triple that checkIfIn()/isSimilar() compare against the link lists.
        return (link['href'], link['href_full'].replace(base, ""), link['class'])

    # Make sure the output folders exist. Without them every open() below
    # raises, dpoint never advances and the outer loop would retry forever.
    os.makedirs('data', exist_ok=True)
    os.makedirs('data_player', exist_ok=True)

    stepitr = 4
    dpoint=1
    no_products = 0 # flag to identify if all the products are visited
    selected_products = []
    while dpoint <= 500: # no. of recordings needed from one seq. count
        driver = setup(9222, True)
        switchToIframe(driver)

        menuState = 'non'
        visited = [] # to store visited links in one sequence
        data = {} # to collect data in one sequence
        data['url'] = "assets/crawled/hansel/hanselfrombasel.com/index.html"
        actions = [] # to store clicks

        try:
            step=0
            while step <= stepitr: # going through steps in a single data point, adding one more to include the details of the last page
                scrollToTheBottom(driver)

                candidate_clickables = getAllClickables(driver)
                forRecords = candidate_clickables.copy()

                # remove everytime
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn(linkKey(clickable), pageBottomURLs, 2)]
                # removing invisible links (centre above y = 74)
                candidate_clickables = [clickable for clickable in candidate_clickables if not clickable['y']<74]

                # eliminating already visited ones
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn(linkKey(clickable), visited, 2)]

                # keep only variants if at least 2 more steps aren't available
                if step != 0 and any(x in actions[-1]['clicked']['href'] for x in ['products', 'variant']) and step + 2 >= stepitr:
                    candidate_clickables = [clickable for clickable in candidate_clickables if '?variant' in clickable['href']]
                    # stay on this step while at least two variants remain ...
                    step = step-1 if len(candidate_clickables) >= 2 else step
                    # ... but finish once six actions have been recorded
                    step = stepitr if len(actions) == 6 and len(candidate_clickables) > 0 else step

                # if it's 4th step and still not in a product page add one more step
                if step>=3 and len(actions)<6 and not any(x in actions[-1]['clicked']['href'] for x in ['products', 'variant']):
                    step-=1

                # adding constraints to click a button
                if dpoint <= 1000 and step == 0:
                    candidate_clickables = [clickable for clickable in candidate_clickables if checkIfIn(linkKey(clickable), forceClick, 3)]
                else:
                    if menuState == 'non':
                        # eliminating unwanted urls, including every menu link
                        blocked = eliminate+l1+l2+l3
                        candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn(linkKey(clickable), blocked, 2)]
                    elif menuState == 'l1':
                        candidate_clickables = [clickable for clickable in candidate_clickables if checkIfIn(linkKey(clickable), l2, 3)]
                    elif menuState == 'l2':
                        candidate_clickables = [clickable for clickable in candidate_clickables if checkIfIn(linkKey(clickable), l3Handle(actions[-1]['clicked']), 3)]

                # choosing a clickable randomly
                if step < stepitr:
                    # removing already visited products
                    # NOTE(review): `href in "/index.html"` is true when the clicked href is a
                    # substring of "/index.html" (e.g. 'index.html' or ''), not when the href
                    # contains it -- confirm which direction was intended.
                    if step!=0 and actions[-1]['clicked']['href'] in "/index.html" and len(selected_products)>0:
                        candidate_clickables = [clickable for clickable in candidate_clickables if clickable['href_full'] not in selected_products]
                        no_products = no_products+1 if len(candidate_clickables)==0 else no_products # to identify if all the products were visited

                    random.seed()
                    random.shuffle(candidate_clickables)
                    # raises IndexError when nothing is left to click; handled by the except below
                    chosenLink = random.choice(candidate_clickables)

                    if '/products' in chosenLink['href'] and '?variant' not in chosenLink['href']:
                        selected_products.append(chosenLink['href_full'])

                    chosenKey = linkKey(chosenLink)

                    # assigning the menu state
                    if checkIfIn(chosenKey, l1):
                        menuState = 'l1'
                    elif checkIfIn(chosenKey, l2):
                        menuState = 'l2'
                    else:
                        menuState = 'non'

                    # adding to the visited list
                    visited.append(chosenKey)

                # the final pass (step == stepitr) records the page without a click
                actions.append({
                    'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
                    'candidates': [formatData(clickable.copy()) for clickable in forRecords],
                    'screenshot': takeScreenshot(driver),
                    'full_url': getCurrntURLIFrame(driver),
                    'text': getTextOnPage(driver)
                    })

                if step < stepitr:
                    clickElement(driver, chosenLink['x'], chosenLink['y'])
                step += 1

            data['actions'] = actions
            data['nClicks'] = len(actions)-1

            # saving to a json file for ML
            fname = 'R_' + str(int(time.time())) + '_' + str(len(actions)-1)
            with open('data/' + fname + '.json', 'w') as f:
                json.dump(data, f)

            # saving to a json file for player
            with open('data_player/' + fname + '.json', 'w') as f:
                json.dump(formatDataForPlayer(data), f)

            dpoint+=1 # increasing the iteration number
            print(fname, [action['clicked']['href'] for action in actions[:-1]])

        except Exception as e:
            # best effort: report the problem and start this recording again
            print("error", e)
            if no_products >= 3:
                # every product has been used: start over with a clean slate
                selected_products = []
                no_products = 0
# Long-running: returns only after 500 recordings have been saved.
collectData()

R_1674629267_6 ['#', 'collections/kids.html', 'collections/kids-socks.html', 'kids-socks/products/mini-dalmatian-crew.html', 'mini-dalmatian-crewb771.html?variant=19054881221', 'mini-dalmatian-crew3e77.html?variant=19632431877']
R_1674629329_6 ['#', 'collections/kids.html', 'collections/kids-new-arrivals.html', 'kids-new-arrivals/products/yachtsy-stripe-legging.html', 'yachtsy-stripe-leggingb94b.html?variant=1271671220', 'yachtsy-stripe-legging09a5.html?variant=1271671236']
error list index out of range
R_1674629438_6 ['#', 'collections/kids.html', 'collections/kids-sale.html', 'kids-sale/products/watermelon-crew.html', 'watermelon-crew2f61.html?variant=31730531336239', 'watermelon-crewc7e1.html?variant=39308128878639']
R_1674629498_6 ['#', 'collections/kids.html', 'collections/kids-sale.html', 'kids-sale/products/watermelon-crew.html', 'watermelon-crewc7e1.html?variant=39308128878639', 'watermelon-crew2f61.html?variant=31730531336239']
R_1674629556_6 ['#', 'collections/kids.html', 'co

KeyboardInterrupt: 

In [None]:
# generates only 3 steps
# base = "http://localhost:4200/assets/crawled/hansel/hanselfrombasel.com/"

# def collectData():
#     for stepitr in range(3, 4): #3,16
#         dpoint=1
#         while dpoint <= 100: # no. of recordings needed from one seq. count
#             driver = setup(True)
#             switchToIframe(driver)

#             menuState = 'non'
#             visited = [] # to store visited links in one sequence
#             data = {} # to collect data in one sequence
#             data['url'] = "assets/crawled/hansel/hanselfrombasel.com/index.html"
#             data['nClicks'] = stepitr
#             actions = [] # to store clicks

#             try:
#                 # steps = random.choice([x for x  in range(3,8)]) # generating number of steps
#                 for step in range(stepitr + 1): # going through steps in a single data point, adding one more to include the details of the last page
#                     scrollToTheBottom(driver)

#                     candidate_clickables = getAllClickables(driver)
#                     forRecords = candidate_clickables.copy()
#                     # print(candidate_clickables)
                    
#                     # if step <= 4:
#                     # remove everytime
#                     candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), pageBottomURLs, 2)]
                    
#                     # keep only variants if at least 2 more steps aren't available
#                     if step != 0 and 'products' in actions[-1]['clicked']['href'] and step + 2 >= stepitr:
#                         candidate_clickables = [clickable for clickable in candidate_clickables if 'variant' in clickable['href']]

#                     # adding constraints to click a button
#                     if dpoint <= 0 and step == 0:
#                         clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1, 3)]
#                     elif dpoint <= 100 and step == 0:
#                         clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), forceClick, 3)]
                    
#                     else:
#                         if menuState == 'non':
#                         # eliminating unwanted urls
#                             clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate, 2)]
#                         elif menuState == 'l1':
#                             clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l2)]
#                         elif menuState == 'l2':
#                             clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l3Handle(actions[-1]['clicked']))]
                
#                     # eliminating already visited ones
#                     clickables = [clickable for clickable in clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited)]

#                     # choosing a clickable randomly
#                     chosenLink = random.choice(clickables[:])
                    
#                     # assigning the menu state
#                     if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
#                         menuState = 'l1'
#                     elif checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l2[1:]):
#                         menuState = 'l2'
#                     else:
#                         menuState = 'non'
                    
#                     # adding to the visited list
#                     visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))

#                     actions.append({
#                         'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
#                         'candidates': [formatData(clickable.copy()) for clickable in forRecords],
#                         'screenshot': takeScreenshot(driver),
#                         'full_url': getCurrntURLIFrame(driver),
#                         'text': getTextOnPage(driver)
#                         })

#                     # print(chosenLink['href_full'])
#                     if step < stepitr:
#                         clickElement(driver, chosenLink['x'], chosenLink['y'])

#                 data['actions'] = actions

#                 # saving to a json file for ML
#                 fname = 'R_' + str(int(time.time())) + '_' + str(stepitr)
#                 with open('data/' + fname + '.json', 'w') as f:
#                     json.dump(data, f)
        
#                 # saving to a json file for player
#                 with open('data_player/' + fname + '.json', 'w') as f:
#                     json.dump(formatDataForPlayer(data), f)
                
#                 dpoint+=1 # increasing the iteration number
#                 print(fname, [action['clicked']['href'] for action in actions[:-1]])

#                 # break
#             except Exception as e:
#                 print("error", e)
#                 # print('\033[91m', 'step_itr:', stepitr, 'dpoint:', dpoint)
#                 # print(chosenLink['href_full'], '\n')
#                 # if actions.__len__() > 0:
#                 #     print([action['href'] for action in actions])
#                 # print('\033[0m', "-----------------------------------------------------")
# collectData()

In [None]:
# creating the driver and calling switchToIframe on it before querying the page
driver = setup(9222, False)
switchToIframe(driver)
# elements_a = driver.find_elements(By.TAG_NAME, 'a')

# x = getAllClickables(driver)
# reducing every clickable to an (href, href_full with `base` stripped out, class) triple
candidate_clickables = list(map(
    lambda click: (click['href'], click['href_full'].replace(base, ""), click['class']),
    getAllClickables(driver),
))
# candidate_clickables = [clickable for clickable in getAllClickables(driver) if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), pageBottomURLs, 2)]
# [(click['href'], click['href_full'].replace(base,""), click['class']) for click in candidate_clickables]

# data = {
#     'href':'assets/crawled/hansel/hanselfrombasel.com/index.html',
#     'href_full':'assets/crawled/hansel/hanselfrombasel.com/index.html',
#     'x': 0,
#     'y': 0}

# To show an image

In [47]:
# to load an image
def imgFromB64(imgb64):
    """Decode a base64-encoded image and display it.

    imgb64: base64 payload (anything base64.b64decode accepts).
    Returns None; the decoded bytes are opened with PIL's Image.open and
    shown through Image.show().
    """
    # turning the base64 payload back into raw image bytes
    raw_bytes = base64.b64decode(imgb64)
    # wrapping the bytes in an in-memory file so PIL can read them
    buffer = io.BytesIO(raw_bytes)
    picture = Image.open(buffer)
    picture.show()

# with open('data/R_1674410870_6.json', 'r') as f:
#     imgFromB64(json.load(f)['actions'][6]['screenshot'])

# Unused

In [73]:
def collectData(n_data):
    """Scratch collector: issues at most one fixed-position click.

    Calls setup() and switchToIframe() on the new driver, then, if
    range(n_data) is non-empty, calls clickElement once at (414, 3000).
    Returns None.
    """
    driver = setup()
    switchToIframe(driver)

    # only a single click is ever issued, however large n_data is;
    # range() is still built so a non-integer n_data raises the same TypeError
    if range(n_data):
        # scrollToTheBottom(driver)
        # clickables = getAllClickables(driver)
        # chosenLink = random.choice(clickables[10:])
        # print(chosenLink)

        clickElement(driver, 414, 3000)

        

# collectData(5)

# To delete paths

In [100]:
# loading the "Data Quality Checker" sheet (File / Good / Bad columns) and previewing its first rows
feedback = pd.read_csv(filepath_or_buffer="Data Quality Checker.csv")
feedback.head(5)

Unnamed: 0,File,Good,Bad
0,1671539062_3,1.0,
1,1671539104_3,1.0,
2,1671539149_3,,1.0
3,1671539201_3,1.0,
4,1671539251_3,1.0,


In [101]:
# deleting bad ones
# Builds the JSON file names of every row marked Bad == 1 in `feedback` and
# removes each matching file found anywhere under the ML data directory.
feedbackBad = feedback.copy(deep=True)
# prefixing only when needed: if `feedback.File` has already been prefixed with 'R_'
# by a later cell, re-running this one would otherwise build 'R_R_...' names that match nothing
feedbackBad.File = feedbackBad.File.map(lambda x: (x if x.startswith('R_') else 'R_' + x) + '.json')
bad = feedbackBad.loc[feedbackBad.Bad == 1].File.tolist()

videos = 'data_player/'  # NOTE(review): defined but never walked here, so the player copies are not deleted; confirm intent
mlData = 'data/'
for (dirpath, dirnames, filenames) in os.walk(mlData):
    for filename in filenames:
        if filename in bad:
            # os.walk yields sub-directory paths without a trailing separator,
            # so the path is joined rather than concatenated
            os.remove(os.path.join(dirpath, filename))
            # print(dirpath, filename)

In [102]:
# adding good ones to a csv
# Prefixes the File tokens with 'R_' (in place on `feedback`) and writes the ones
# rated Good == 1 to tokens.csv under a single 'token' column.
# The prefix is added only when missing, so re-running this cell does not yield 'R_R_...' tokens.
feedback.File = feedback.File.map(lambda x: x if x.startswith('R_') else 'R_' + x)
good = feedback.loc[feedback.Good == 1].File.tolist()
pd.DataFrame({'token':good}).to_csv('tokens.csv', index=False)