In [29]:
# !pip install selenium
# !pip install networkx
# !pip install matplotlib
# !pip install webdriver-manager

In [1]:
from selenium.webdriver.common.by import By
from SimulatorCommunicator import setup, switchToIframe, scrollToTheBottom, updateURL, clickElement, takeScreenshot
from PIL import Image

import networkx as nx
import random
import matplotlib.pyplot as plt
import re
import json
import base64
import time
import requests
import io
import itertools

In [60]:
def getAllClickables(driver):
    """Collect geometry, markup and content data for every visible, enabled <a> element.

    Args:
        driver: Selenium-style driver exposing ``find_elements``.

    Returns:
        list[dict]: one dict per link with the keys ``x``, ``y``, ``width``,
        ``height`` (from the element rect), ``outer_html`` (base64 of the
        element's outerHTML), ``href`` (DOM attribute), ``href_full``
        (``get_attribute('href')``), ``class``, ``text`` (lines of text taken
        from the link's descendant <div> elements) and ``img_<i>`` (base64 of
        the i-th descendant image, when it has a ``src``).
    """
    elements = []

    for element in driver.find_elements(By.TAG_NAME, 'a'):
        # excluding all the hidden links
        if not (element.is_displayed() and element.is_enabled()):
            continue

        # `rect` is fetched from the browser on every access, so read it once
        # instead of once per coordinate.
        rect = element.rect
        if len(rect) == 0:
            continue

        data = {}
        data['x'] = rect['x']
        data['y'] = rect['y']
        data['width'] = rect['width']
        data['height'] = rect['height']
        data['outer_html'] = base64Encode(bytes(element.get_attribute('outerHTML'), "utf-8"))
        data['href'] = element.get_dom_attribute('href')
        data['href_full'] = element.get_attribute('href')
        data['class'] = element.get_attribute('class')

        # text lines of the nested <div>s; each div's text is read a single time
        text_lines = []
        for div in element.find_elements(By.XPATH, './/div'):
            div_text = div.text
            if len(div_text) > 0:
                text_lines.extend(div_text.split('\n'))
        data['text'] = text_lines

        # getting closer image(s) if available
        for i, img in enumerate(element.find_elements(By.XPATH, ".//img")):
            src = img.get_attribute('src')
            # an <img> without a src has nothing to download; the index is kept
            # so the img_<i> keys still reflect the image's position in the link
            if src:
                data['img_' + str(i)] = imgURL2B64(src)

        elements.append(data)
    return elements

# to get the URL of current page on iframe
def getCurrntURLIFrame(driver, origin='http://localhost:4200'):
    """Return the current page URL with the leading server origin removed.

    Args:
        driver: object exposing ``execute_script``; the script
            ``return window.location.href`` is evaluated in the current context.
        origin: scheme + host (+ port) prefix to strip. Defaults to the
            local development server used throughout this notebook.

    Returns:
        str: the URL without ``origin`` when it starts with it, otherwise the
        URL unchanged.
    """
    url = driver.execute_script('return window.location.href')
    # Strip the origin only as a prefix: a plain str.replace would also remove
    # it from query strings or fragments that happen to embed the same origin.
    if url.startswith(origin):
        return url[len(origin):]
    return url

# download image from URL and convert to base64
def imgURL2B64(imgURL, timeout=30):
    """Download the resource at ``imgURL`` and return its bytes base64-encoded.

    Args:
        imgURL: URL of the image to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            image host would hang the whole crawl.

    Returns:
        str: base64 text of the response body.

    Raises:
        requests.exceptions.RequestException: on connection problems or timeout.

    Note:
        The HTTP status is not checked; the body of an error response is
        encoded as-is.
    """
    # downloading image
    img_data = requests.get(imgURL, timeout=timeout).content
    # encoding
    return base64Encode(img_data)

# convert bytes string to b64 
def base64Encode(data):
    """Base64-encode a bytes-like object and return the result as a str."""
    encoded_bytes = base64.b64encode(data)
    return str(encoded_bytes, "utf-8")

# show image from base64
def imgFromB64(imgb64):
    """Decode a base64-encoded image and display it through PIL's ``Image.show()``."""
    # turn the base64 text back into the raw image bytes
    raw_bytes = base64.b64decode(imgb64)
    # PIL reads from a file-like object, so wrap the bytes in an in-memory buffer
    buffer = io.BytesIO(raw_bytes)
    picture = Image.open(buffer)
    picture.show()

# format data to store
def formatData(chosenLink, viewport_height=720):
    """Return a click-action record built from a link dict, leaving the input untouched.

    The record is a shallow copy of ``chosenLink`` with ``type`` set to
    ``"click"``, ``x_offset`` set to 0 and ``y`` / ``y_offset`` recomputed from
    the link's ``y`` and ``height``.

    The input is deliberately not modified: the same link dict can be formatted
    more than once (e.g. as the clicked link and again as one of the
    candidates), and mutating it in place would shift ``y`` on every call.

    Args:
        chosenLink: dict with at least the numeric keys ``y`` and ``height``.
        viewport_height: vertical threshold (in the same units as ``y``) above
            which a scroll offset is recorded. Defaults to 720.

    Returns:
        dict: the formatted copy.
    """
    bottom = chosenLink['y'] + chosenLink['height']

    if bottom > viewport_height:
        # NOTE(review): this expression evaluates to viewport_height + height;
        # kept exactly as originally written - confirm that is the intended y.
        y = chosenLink['y'] - (chosenLink['y'] - viewport_height - chosenLink['height'])
        y_offset = bottom - viewport_height
    else:
        y = bottom
        y_offset = 0

    # new attributes to the chosen link (written to a copy)
    formatted = dict(chosenLink)
    formatted['type'] = "click"
    formatted['y'] = y
    formatted['x_offset'] = 0
    formatted['y_offset'] = y_offset

    return formatted

# retrieves all the text on the page
def getTextOnPage(driver):
    """Return the distinct non-empty text lines of all <div> elements on the page.

    The result is a list built from a set, so its order is not meaningful.
    """
    unique_lines = set()
    for div in driver.find_elements(By.TAG_NAME, 'div'):
        # a div's text may span several lines; keep each non-empty line once
        for line in div.text.split('\n'):
            if len(line) > 0:
                unique_lines.add(line)
    return list(unique_lines)

In [3]:
# Scratch check: base64-encode the outerHTML of a sample anchor, the same way
# link markup is stored in the 'outer_html' field.
x = """<a href="collections/womens-new-arrivals.html" class="button
                        button-medium
                        button-white
                        txt-size-2
                        txt-tracked-two-point">
                 Shop New Arrivals
              </a>"""
base64.b64encode(x.encode("utf-8")).decode("utf-8")

'PGEgaHJlZj0iY29sbGVjdGlvbnMvd29tZW5zLW5ldy1hcnJpdmFscy5odG1sIiBjbGFzcz0iYnV0dG9uCiAgICAgICAgICAgICAgICAgICAgICAgIGJ1dHRvbi1tZWRpdW0KICAgICAgICAgICAgICAgICAgICAgICAgYnV0dG9uLXdoaXRlCiAgICAgICAgICAgICAgICAgICAgICAgIHR4dC1zaXplLTIKICAgICAgICAgICAgICAgICAgICAgICAgdHh0LXRyYWNrZWQtdHdvLXBvaW50Ij4KICAgICAgICAgICAgICAgICBTaG9wIE5ldyBBcnJpdmFscwogICAgICAgICAgICAgIDwvYT4='

In [3]:
# scores one link tuple against a single reference tuple (field-by-field match count)
def isSimilar(query, reference, score):
    """Tell whether two link tuples agree on at least ``score`` of their first three fields.

    Field 0 is the relative url, field 1 the absolute url and field 2 the
    class; every field that is equal in both tuples counts as one point.

    Returns:
        bool: True when the number of matching fields is >= ``score``.
    """
    matching_fields = sum(
        1 for position in (0, 1, 2) if query[position] == reference[position]
    )
    return matching_fields >= score

# checks one link tuple against a list of reference tuples
def checkIfIn(query, urlList, score=2):
    """Return True if ``query`` is similar (per ``isSimilar``) to any entry of ``urlList``.

    Args:
        query: link tuple to look for.
        urlList: iterable of reference link tuples.
        score: minimum number of matching fields passed on to ``isSimilar``.
    """
    return any(isSimilar(query, candidate, score) for candidate in urlList)

In [4]:
# Reference link for the first menu level. Entries are tuples of
# (href, href_full with `base` stripped, class), the layout collectData builds
# for every candidate link. collectData restricts step 0 to this list (all three
# fields must match) while dpoint <= 4, and sets menuState to 'l1' after a
# matching link is chosen.
l1 = [
     ('#', 'index.html#', 'nav-link\n                js-top-drawer-click')
]

# Links offered while menuState == 'l1' (same tuple layout as l1).
# Choosing a link that matches l2[1:] - every entry except the first - moves
# menuState to 'l2' in collectData.
l2 = [
     ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', 'topnav-mobile-link'),
     ('collections/womens.html', 'collections/womens.html', 'topnav-mobile-link'), 
     ('collections/kids.html', 'collections/kids.html', 'topnav-mobile-link'),
     ('collections/mens.html', 'collections/mens.html', 'topnav-mobile-link'),
]

# Links offered while menuState == 'l2' (same tuple layout as l1).
# l3Handle slices this list by position: [:7] for womens, [7:12] for kids and
# [12:] for mens, so the order and size of the three groups below must stay in
# sync with those indices.
l3 = [
     ('collections/womens.html', 'collections/womens.html', 'topnav-mobile-link'),
     ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', 'topnav-mobile-link'),
     ('collections/womens-socks.html', 'collections/womens-socks.html', 'topnav-mobile-link'),
     ('collections/womens-tights-and-leggings.html', 'collections/womens-tights-and-leggings.html', 'topnav-mobile-link'),
     ('collections/womens-clothing.html', 'collections/womens-clothing.html', 'topnav-mobile-link'),
     ('collections/womens-bags.html', 'collections/womens-bags.html', 'topnav-mobile-link'),
     ('collections/womens-sale.html', 'collections/womens-sale.html', 'topnav-mobile-link'),

     ('collections/kids.html', 'collections/kids.html', 'topnav-mobile-link'),
     ('collections/kids-new-arrivals.html', 'collections/kids-new-arrivals.html', 'topnav-mobile-link'),
     ('collections/kids-socks.html', 'collections/kids-socks.html', 'topnav-mobile-link'),
     ('collections/kids-tights-and-leggings.html', 'collections/kids-tights-and-leggings.html', 'topnav-mobile-link'),
     ('collections/kids-sale.html', 'collections/kids-sale.html', 'topnav-mobile-link'),

     ('collections/mens.html', 'collections/mens.html', 'topnav-mobile-link'),
     ('collections/mens-new-arrivals-1.html', 'collections/mens-new-arrivals-1.html', 'topnav-mobile-link'),
     ('collections/mens-socks.html', 'collections/mens-socks.html', 'topnav-mobile-link'),
     ('collections/mens-sale.html', 'collections/mens-sale.html', 'topnav-mobile-link'),
]

# Links removed from the candidates while menuState == 'non' (same tuple layout
# as l1). collectData matches against this list with score 2, i.e. a candidate
# is dropped when any two of its three fields equal an entry here.
eliminate = [
     ('collections/mens.html', 'collections/mens.html', 'topnav-mobile-link'),
     ('collections/womens-tights-and-leggings.html', 'collections/womens-tights-and-leggings.html', 'subnav-link\n                        '),
     ('collections/womens-bags.html', 'collections/womens-bags.html', 'subnav-link\n                        '),
     ('collections/womens-gift-card.html', 'collections/womens-gift-card.html', 'subnav-link\n                        '),
     ('collections/womens-sale.html', 'collections/womens-sale.html', 'subnav-link\n                        '),
     ('collections/womens-sporty-series.html', 'collections/womens-sporty-series.html', 'subnav-link\n                        '),
     ('collections/womens-conversational-crews.html', 'collections/womens-conversational-crews.html', 'subnav-link\n                        '),
     ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', ''),
     ('collections/kids-gift-card.html', 'collections/kids-gift-card.html', 'subnav-link\n                        '),
     ('collections/kids-sale.html', 'collections/kids-sale.html', 'subnav-link\n                        '),
     ('collections/kids-new-arrivals.html', 'collections/kids-new-arrivals.html', ''),
     ('collections/mens-new-arrivals-1.html', 'collections/mens-new-arrivals-1.html', ''),
     ('https://instagram.com/hanselfrombasel/', 'https://instagram.com/hanselfrombasel/', 'icon-instagram\n                    footer-link\n                    footer-link-social'),
     ('https://www.facebook.com/HanselfromBasel', 'https://www.facebook.com/HanselfromBasel', 'icon-facebook\n                    footer-link\n                    footer-link-social'),
     ('https://www.pinterest.com/hanselfrombasel/', 'https://www.pinterest.com/hanselfrombasel/', 'icon-pinterest\n                    footer-link\n                    footer-link-social'),

     ('cart.html', 'cart.html', 'nav-link\n                nav-link-mobile\n                js-drawer-toggle'),
]

# Link that collectData restricts step 0 to when 4 < dpoint <= 8 (same tuple
# layout as l1; all three fields must match).
forceClick = [
     ('collections/womens-new-arrivals.html', 'collections/womens-new-arrivals.html', 'button\n                        button-medium\n                        button-white\n                        txt-size-2\n                        txt-tracked-two-point'),
]

def l3Handle(l2):
    """Return the slice of ``l3`` that belongs to the category of a clicked link.

    Args:
        l2: link dict with an ``href`` key (the previously clicked link).

    Returns:
        list: ``l3[:7]`` for a womens link, ``l3[7:12]`` for a kids link,
        ``l3[12:]`` for a mens link, and an empty list when the href is missing
        or matches no category, so callers can always iterate the result.
    """
    href = l2['href'] or ""

    # "womens" has to be tested before "mens": the latter is a substring of
    # the former and would otherwise capture every womens link.
    if "womens" in href:
        return l3[:7]
    if "kids" in href:
        return l3[7:12]
    if "mens" in href:
        return l3[12:]

    # no known category: an empty list instead of an implicit None
    return []

     

In [5]:
# # base code
# base = "http://localhost:4200/assets/crawled/hansel/hanselfrombasel.com/"

# def collectData():
#     for stepitr in range(3,8): #3,16
#         dpoint=1
#         while dpoint <= 1: # no. of recordings needed from one seq. count
#             driver = setup(True)
#             switchToIframe(driver)

#             menuState = 'non'
#             visited = [] # to store visited links in one sequence
#             data = {} # to collect data in one sequence
#             data['url'] = "assets/crawled/hansel/hanselfrombasel.com/index.html"
#             actions = [] # to store clicks

#             try:
#                 # steps = random.choice([x for x  in range(3,8)]) # generating number of steps
#                 for step in range(stepitr): # going through steps in a single data point
#                     scrollToTheBottom(driver)

#                      # adding constraints to click a button
#                     if dpoint <= 4 and step == 0:
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1, 3)]
#                     elif dpoint <= 8 and step == 0:
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), forceClick, 3)]
                    
#                     else:
#                         if menuState == 'non':
#                         # eliminating unwanted urls
#                             clickables = [clickable for clickable in getAllClickables(driver) if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate, 2)]
#                         elif menuState == 'l1':
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l2)]
#                         elif menuState == 'l2':
#                             clickables = [clickable for clickable in getAllClickables(driver) if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l3Handle(actions[-1]))]

#                     # eliminating already visited ones
#                     clickables = [clickable for clickable in clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited)]

#                     # choosing a clickable randomly
#                     chosenLink = random.choice(clickables[:])

#                     if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
#                         menuState = 'l1'
#                     elif checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l2[1:]):
#                         menuState = 'l2'
#                     else:
#                         menuState = 'non'

#                     # adding to the visited list
#                     visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))

#                     # print(chosenLink['href_full'])
#                     clickElement(driver, chosenLink['x'], chosenLink['y'])
#                     actions.append(formatData(chosenLink))
                
#                 data['actions'] = actions
#                 # saving to a json file
#                 fname = str(int(time.time())) + '_' + str(stepitr)
#                 with open('data/R_' + fname + '.json', 'w') as f:
#                     json.dump(data, f)
                
#                 dpoint+=1 # increasing the iteration number
#                 print(fname, [action['href'] for action in actions])

#                 # break
#             except Exception as e:
#                 pass
#                 # print('\033[91m', 'step_itr:', stepitr, 'dpoint:', dpoint)
#                 # print(chosenLink['href_full'], '\n')
#                 # if actions.__len__() > 0:
#                 #     print([action['href'] for action in actions])
#                 # print('\033[0m', "-----------------------------------------------------")

# collectData()

In [6]:
base = "http://localhost:4200/assets/crawled/hansel/hanselfrombasel.com/"

def _linkKey(link):
    """Build the (href, href_full relative to ``base``, class) tuple used to compare links."""
    # href_full may be missing for an anchor without an href; treat it as ""
    # instead of failing on None.replace(...)
    return (link['href'], (link['href_full'] or "").replace(base, ""), link['class'])


def collectData():
    """Record random click sequences and save each one as ``data/R_<timestamp>_<steps>.json``.

    For every sequence a fresh browser session is opened. At each step the
    visible links are collected, filtered according to the current menu state
    (``'non'``, ``'l1'`` or ``'l2'``) and the links already visited, and one of
    the remaining links is chosen at random and clicked. Each step stores the
    clicked link, all candidate links, a screenshot, the current URL and the
    page text.

    Raises:
        IndexError: from ``random.choice`` when no eligible link is left.
    """
    for stepitr in range(2, 3): #3,16
        dpoint = 1
        while dpoint <= 1: # no. of recordings needed from one seq. count
            driver = setup(True)
            try:
                switchToIframe(driver)

                menuState = 'non'
                visited = [] # to store visited links in one sequence
                data = {} # to collect data in one sequence
                data['url'] = "assets/crawled/hansel/hanselfrombasel.com/index.html"
                actions = [] # to store clicks

                for step in range(stepitr): # going through steps in a single data point
                    scrollToTheBottom(driver)

                    candidate_clickables = getAllClickables(driver)

                    # adding constraints to click a button
                    if dpoint <= 4 and step == 0:
                        clickables = [c for c in candidate_clickables if checkIfIn(_linkKey(c), l1, 3)]
                    elif dpoint <= 8 and step == 0:
                        clickables = [c for c in candidate_clickables if checkIfIn(_linkKey(c), forceClick, 3)]
                    elif menuState == 'non':
                        # eliminating unwanted urls
                        clickables = [c for c in candidate_clickables if not checkIfIn(_linkKey(c), eliminate, 2)]
                    elif menuState == 'l1':
                        clickables = [c for c in candidate_clickables if checkIfIn(_linkKey(c), l2)]
                    elif menuState == 'l2':
                        # Each action wraps the link under 'clicked'; the link
                        # itself (not the wrapper) carries the 'href' that
                        # l3Handle inspects.
                        submenu = l3Handle(actions[-1]['clicked']) or []
                        clickables = [c for c in candidate_clickables if checkIfIn(_linkKey(c), submenu)]

                    # eliminating already visited ones
                    clickables = [c for c in clickables if not checkIfIn(_linkKey(c), visited)]

                    # choosing a clickable randomly
                    chosenLink = random.choice(clickables)
                    chosenKey = _linkKey(chosenLink)

                    # assigning the menu state
                    if checkIfIn(chosenKey, l1):
                        menuState = 'l1'
                    elif checkIfIn(chosenKey, l2[1:]):
                        menuState = 'l2'
                    else:
                        menuState = 'non'

                    # adding to the visited list
                    visited.append(chosenKey)

                    clickElement(driver, chosenLink['x'], chosenLink['y'])

                    # Format copies: the chosen link is also one of the
                    # candidates, so formatting the shared dict would apply the
                    # y adjustment to it twice.
                    actions.append({
                        'clicked': formatData(dict(chosenLink)),
                        'candidates': [formatData(dict(c)) for c in candidate_clickables],
                        'screenshot': takeScreenshot(driver),
                        'full_url': getCurrntURLIFrame(driver),
                        'text': getTextOnPage(driver)
                        })

                data['actions'] = actions

                # saving to a json file
                fname = str(int(time.time())) + '_' + str(stepitr)
                with open('data/R_' + fname + '.json', 'w') as f:
                    json.dump(data, f)

                dpoint += 1 # increasing the iteration number
            finally:
                # Close the browser even when a step fails, so repeated runs do
                # not pile up sessions.
                # NOTE(review): assumes setup() returns a Selenium WebDriver - confirm.
                driver.quit()
# collectData()

In [61]:
# Ad-hoc check: open a browser session, switch into the iframe and display the
# distinct text lines found on the loaded page (output shown below).
# NOTE(review): collectData calls setup(True) while this cell uses setup(False);
# the meaning of the flag is defined in SimulatorCommunicator - confirm there.
driver = setup(False)
switchToIframe(driver)
getTextOnPage(driver)

# x = getAllClickables(driver)
# [(click['href'], click['href_full'].replace(base,""), click['class']) for click in getAllClickables(driver)]

# data = {
#     'href':'assets/crawled/hansel/hanselfrombasel.com/index.html',
#     'href_full':'assets/crawled/hansel/hanselfrombasel.com/index.html',
#     'x': 0,
#     'y': 0}


['Stockists',
 'Queen Of Hearts Crew',
 'Umbrella Short Crew',
 'Bemberg Rib Short Crew',
 'Trouser Crew',
 '$24 $16.80',
 'Seltzer Short Crew',
 'Kismet Crew',
 '$41',
 "Men's",
 'Wilhelm Crew',
 'LOADING MORE',
 'ASSISTANCE',
 "Women's New Arrivals",
 'ABOUT US',
 '$14',
 'Half + Half Crew',
 'Sustainability Rules Crew',
 'Jane Crew',
 'Wiggly Sheer Short Crew',
 'Martina Pant',
 'Anise Sheer Crew',
 '© Hansel from Basel 2022',
 'Amalfi Crew',
 'Lucid Crew',
 'Swirl Crew',
 'Fundamental Liner 2pr Pack',
 'Rosie Crew',
 'RESTOCKED',
 'Size Guide',
 'Penny Apron Dress',
 '$232',
 'FREE SHIPPING ON US ORDERS $75+',
 'Margaret Dress',
 '$43',
 'Sci Fi Sporty Tall Crew',
 'Conversational',
 '$36',
 '카드에 연결된 주소입니다',
 'Ulla Crew',
 'Press',
 'Cecilia Crew',
 'SALE',
 'Utility Slub Crew',
 'Baby Strawberry Sheer Short Crew',
 'Collegiate Hearts Crew',
 '$21 $16.80',
 'Phys Ed Crew',
 'Bandana Patched Crew',
 '$23',
 'ONLY A FEW LEFT!',
 'Granny Crew',
 'Facebook',
 'Rainbow Brite Crew',
 'MO

# Unused

In [73]:
def collectData(n_data):
    """Scratch variant: open a session and click the fixed point (414, 3000) at most once.

    NOTE: this reuses the name ``collectData``; running this cell replaces the
    zero-argument collector defined earlier in the notebook.

    Args:
        n_data: number of iterations requested; the loop stops after the first
            one, so any value >= 1 results in a single click and 0 in none.
    """
    session = setup()
    switchToIframe(session)

    for _ in range(n_data):
        # one click is all this stub performs, so leave after the first pass
        clickElement(session, 414, 3000)
        return

        

# collectData(5)