In [1]:
from selenium.webdriver.common.by import By
from SimulatorCommunicator import setup, switchToIframe, scrollToTheBottom, updateURL, clickElement, takeScreenshot
from PIL import Image

import networkx as nx
import random
import matplotlib.pyplot as plt
import re
import json
import base64
import time
import requests
import io
import itertools
import pandas as pd
import os

In [2]:
def getAllClickables(driver):
    """Collect every displayed, enabled ``<a>`` element of the current document.

    Args:
        driver: object exposing Selenium's ``find_elements`` (a WebDriver
            already focused on the document to scan).

    Returns:
        list[dict]: one dict per link with the keys
            ``x``, ``y``, ``width``, ``height`` - taken from the element rect,
            ``outer_html`` - base64 of the element's ``outerHTML``,
            ``href`` - the ``href`` DOM attribute, or the string ``'None'``,
            ``href_full`` - ``get_attribute('href')``, or the string ``'None'``,
            ``class`` - the ``class`` attribute,
            ``text`` - distinct non-empty text lines (unordered),
            ``img_<i>`` - base64 image bytes for the i-th nested ``<img>``
            that has a ``src``.
    """
    elements = []

    for element in driver.find_elements(By.TAG_NAME, 'a'):
        # excluding all the hidden links
        if not (element.is_displayed() and element.is_enabled()):
            continue

        # Every attribute/rect access is a round trip to the browser, so each
        # value is fetched once and reused.
        rect = element.rect
        if len(rect) == 0:
            continue

        data = {}
        data['x'] = rect['x']
        data['y'] = rect['y']
        data['width'] = rect['width']
        data['height'] = rect['height']
        data['outer_html'] = base64Encode(bytes(element.get_attribute('outerHTML'), "utf-8"))
        # Missing hrefs are stored as the literal string 'None' (the lookup
        # tables used for filtering contain ('None', 'None', ...) entries).
        data['href'] = element.get_dom_attribute('href') or 'None'
        data['href_full'] = element.get_attribute('href') or 'None'
        data['class'] = element.get_attribute('class')

        # '..//div' starts at the link's parent, so text of <div>s next to the
        # link is collected as well as text of <div>s inside it.
        texts = [
            line
            for div in element.find_elements(By.XPATH, '..//div')
            for line in div.text.split('\n')
        ]
        texts.append(element.text)
        data['text'] = list({text for text in texts if len(text) > 0})

        # getting closer image(s) if available
        for i, img in enumerate(element.find_elements(By.XPATH, ".//img")):
            src = img.get_attribute('src')
            if not src:
                # An <img> without a src cannot be downloaded; skip it instead
                # of letting the download helper fail and abort the whole scan.
                continue
            data['img_' + str(i)] = imgURL2B64(src)

        elements.append(data)
    return elements

def getCurrntURLIFrame(driver):
    """Return the URL of the document the driver is focused on, without the dev-server origin.

    The address is read with JavaScript (``location.href``); meant to be called
    after the driver has been switched into the iframe. Every occurrence of
    'http://localhost:4200' is stripped from the result.
    """
    origin = 'http://localhost:4200'
    return driver.execute_script('return location.href').replace(origin, "")

def imgURL2B64(imgURL, timeout=30):
    """Download an image and return its bytes encoded as base64 text.

    Args:
        imgURL: address of the image to fetch.
        timeout: seconds to wait for the server. ``requests`` applies no
            timeout unless one is given, so a single stalled connection would
            otherwise block the crawl indefinitely.

    Returns:
        str: base64 encoding of the raw response body. The HTTP status code is
        not inspected, so the body of an error response is encoded as-is.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # downloading image
    response = requests.get(imgURL, timeout=timeout)
    # encoding
    return base64Encode(response.content)

def base64Encode(data):
    """Encode a bytes-like object as base64 and hand it back as a ``str``."""
    encoded = base64.b64encode(data)
    return str(encoded, "utf-8")

def imgFromB64(imgb64):
    """Decode a base64-encoded image and display it with PIL's ``Image.show()``."""
    # reconstructing the image by decoding
    raw = base64.b64decode(imgb64)
    picture = Image.open(io.BytesIO(raw))
    picture.show()

def formatData(chosenLink, max_bottom=865, max_right=812):
    """Tag a link dict as a click action and clamp its position to a bounding box.

    If the link's bottom edge (``y + height``) lies beyond ``max_bottom``, the
    overshoot is stored in ``y_offset`` and subtracted from ``y``; the same is
    done horizontally with ``max_right`` / ``x_offset``. Otherwise the offsets
    are 0 and the coordinates are left as they are.
    NOTE(review): the offsets presumably describe how far the page has to be
    scrolled for the element to fit the recorded viewport - confirm against
    the player that consumes them.

    Args:
        chosenLink: dict with at least ``x``, ``y``, ``width`` and ``height``.
            It is modified in place (callers pass a copy when the original
            must stay untouched).
        max_bottom: largest allowed bottom edge; defaults to the previously
            hard-coded 865.
        max_right: largest allowed right edge; defaults to the previously
            hard-coded 812.

    Returns:
        dict: the same ``chosenLink`` object, with ``type`` set to ``"click"``,
        ``x``/``y`` adjusted and ``y_offset``/``x_offset`` added.
    """
    y_offset = 0
    x_offset = 0

    bottom = chosenLink['y'] + chosenLink['height']
    if bottom > max_bottom:
        y_offset = bottom - max_bottom
        chosenLink['y'] = chosenLink['y'] - y_offset

    right = chosenLink['x'] + chosenLink['width']
    if right > max_right:
        x_offset = right - max_right
        chosenLink['x'] = chosenLink['x'] - x_offset

    # new attributes to the chosen link (insertion order kept: type, y_offset,
    # x_offset - it determines the key order of the dumped JSON)
    chosenLink['type'] = "click"
    chosenLink['y_offset'] = y_offset
    chosenLink['x_offset'] = x_offset
    return chosenLink

def getTextOnPage(driver):
    """Return the distinct non-empty text lines of all ``<div>`` elements.

    Each div's text is split on newlines; duplicates are removed through a
    set, so the order of the returned list is not meaningful.
    """
    lines = []
    for div in driver.find_elements(By.TAG_NAME, 'div'):
        lines.extend(line for line in div.text.split('\n') if len(line) > 0)
    return list(set(lines))

def formatDataForPlayer(inputJson):
    """Build the reduced recording that is written for the player.

    Keeps the start ``url`` and, for every action except the last one, only
    the geometry/link fields of the clicked element. The last action is
    skipped: in the recordings produced by this notebook it describes the
    final page and its ``clicked`` entry is ``None``.

    Args:
        inputJson: dict with ``url`` and ``actions``; each action (but the
            last) must hold a ``clicked`` dict with the copied keys.

    Returns:
        dict: ``{'url': ..., 'actions': [ {...}, ... ]}``.
    """
    copied_keys = (
        'type', 'x', 'y', 'height', 'width',
        'x_offset', 'y_offset', 'href', 'href_full', 'outer_html',
    )

    data = {'url': inputJson['url']}
    data['actions'] = [
        {key: recorded['clicked'][key] for key in copied_keys}
        for recorded in inputJson['actions'][:-1]
    ]
    return data

In [3]:
def isSimilar(query, reference, score):
    """Tell whether two link descriptors agree on at least ``score`` fields.

    Both arguments are indexable with three entries that are compared
    position by position: relative URL, absolute URL and class.
    """
    matching_fields = sum(1 for position in range(3) if query[position] == reference[position])
    return matching_fields >= score

def checkIfIn(query, urlList, score=2):
    """Return True if ``query`` is similar to at least one entry of ``urlList``.

    Similarity is decided by ``isSimilar`` with the given ``score`` (minimum
    number of matching fields, 2 by default).
    """
    return any(isSimilar(query, candidate, score) for candidate in urlList)

In [4]:
# Level-1 menu entries (a chosen link found here puts collectData into
# menuState 'l1'). Each tuple has the layout compared by isSimilar/checkIfIn:
#   (href attribute, href_full with `base` removed, class attribute)
l1 = [
     ('premium-collection/index.html', 'theevolutionstore.com/premium-collection/index.html', 'hasSub sf-with-ul'),
     ('preserved-mounted-specimens/index.html', 'theevolutionstore.com/preserved-mounted-specimens/index.html', 'hasSub sf-with-ul'),
     ('skulls-skeletons-anatomy-more/index.html', 'theevolutionstore.com/skulls-skeletons-anatomy-more/index.html', 'hasSub sf-with-ul'),
     ('meteorites-fossils-minerals/index.html', 'theevolutionstore.com/meteorites-fossils-minerals/index.html', 'hasSub sf-with-ul'),
     ('home-decor-art/index.html', 'theevolutionstore.com/home-decor-art/index.html', 'hasSub sf-with-ul'),
     ('jewelry-clothing-accessories/index.html', 'theevolutionstore.com/jewelry-clothing-accessories/index.html', 'hasSub sf-with-ul'),
     ('puzzles-toys-edibles/index.html', 'theevolutionstore.com/puzzles-toys-edibles/index.html', 'hasSub sf-with-ul'),
]

# Level-2 (sub-category) entries, grouped by their level-1 parent. Same tuple
# layout as the other lookup lists: (href, href_full without `base`, class).
l2 = [
     # premium collection
     ('entomology/index.html', 'theevolutionstore.com/premium-collection/entomology/index.html', ''),
     ('taxidermy-specimens/index.html', 'theevolutionstore.com/premium-collection/taxidermy-specimens/index.html', ''),
     ('fossils/index.html', 'theevolutionstore.com/premium-collection/fossils/index.html', ''),
     ('home-decor/index.html', 'theevolutionstore.com/premium-collection/home-decor/index.html', ''),
     ('minerals-meteorites/index.html', 'theevolutionstore.com/premium-collection/minerals-meteorites/index.html', ''),

     # preserved and mounted specimens
     ('preserved-specimens/index.html', 'theevolutionstore.com/preserved-mounted-specimens/preserved-specimens/index.html', ''),
     ('sea-life/index.html', 'theevolutionstore.com/preserved-mounted-specimens/sea-life/index.html', ''),
     ('specimens-in-resin/index.html', 'theevolutionstore.com/preserved-mounted-specimens/specimens-in-resin/index.html', ''),
     ('taxidermy/index.html', 'theevolutionstore.com/preserved-mounted-specimens/taxidermy/index.html', ''),
     ('framed-unframed-insects/index.html', 'theevolutionstore.com/preserved-mounted-specimens/framed-unframed-insects/index.html', ''),

     # skulls, skeletons, anatomy and more
     ('replica-human-bone/index.html', 'theevolutionstore.com/skulls-skeletons-anatomy-more/replica-human-bone/index.html', ''),
     ('anatomical-models/index.html', 'theevolutionstore.com/skulls-skeletons-anatomy-more/anatomical-models/index.html', ''),
     ('natural-animal-bone/index.html', 'theevolutionstore.com/skulls-skeletons-anatomy-more/natural-animal-bone/index.html', ''),
     ('replica-extinct-animal-bone/index.html', 'theevolutionstore.com/skulls-skeletons-anatomy-more/replica-extinct-animal-bone/index.html', ''),

     # meteorites, fossils and minerals
     ('meteorites/index.html', 'theevolutionstore.com/meteorites-fossils-minerals/meteorites/index.html', ''),
     ('minerals/index.html', 'theevolutionstore.com/meteorites-fossils-minerals/minerals/index.html', ''),
     ('fossils/index.html', 'theevolutionstore.com/meteorites-fossils-minerals/fossils/index.html', ''),

     # home decor and art (one entry points into preserved-mounted-specimens)
     ('desk-study/globes/index.html', 'theevolutionstore.com/home-decor-art/desk-study/globes/index.html', ''),
     ('../preserved-mounted-specimens/domes/index.html', 'theevolutionstore.com/preserved-mounted-specimens/domes/index.html', ''),
     ('skull-decor/index.html', 'theevolutionstore.com/home-decor-art/skull-decor/index.html', ''),
     ('tabletop-bath/index.html', 'theevolutionstore.com/home-decor-art/tabletop-bath/index.html', ''),
     ('desk-study/index.html', 'theevolutionstore.com/home-decor-art/desk-study/index.html', ''),
     ('posters-prints/index.html', 'theevolutionstore.com/home-decor-art/posters-prints/index.html', ''),
     ('rugs/index.html', 'theevolutionstore.com/home-decor-art/rugs/index.html', ''),
     ('art-artifacts/index.html', 'theevolutionstore.com/home-decor-art/art-artifacts/index.html', ''),

     # jewelry, clothing and accessories
     ('jewelry/index.html', 'theevolutionstore.com/jewelry-clothing-accessories/jewelry/index.html', ''),
     ('clothing-accessories/index.html', 'theevolutionstore.com/jewelry-clothing-accessories/clothing-accessories/index.html', ''),

     # puzzles, toys and edibles
     ('puzzles-models/index.html', 'theevolutionstore.com/puzzles-toys-edibles/puzzles-models/index.html', ''),
     ('candies-snacks/index.html', 'theevolutionstore.com/puzzles-toys-edibles/candies-snacks/index.html', ''),
     ('other-toys/index.html', 'theevolutionstore.com/puzzles-toys-edibles/other-toys/index.html', ''),
]

# Level-3 entries: empty here. The list is only sliced by l3Handle.
l3 = [
     
]

# Additional links to drop on every step; concatenated with `eliminate` and
# `pageBottomURLs` in collectData. Currently empty.
hardToEliminate = [

]

# Links that are removed from the candidates on every step of collectData.
# Tuple layout: (href, href_full without `base`, class). They are matched via
# checkIfIn with score 2, i.e. an entry applies when at least two of its three
# fields equal those of a link. Several ('None', 'None', '') rows are repeated;
# the repetition does not change the matching result.
eliminate = [
     ('about-us/index.html', 'theevolutionstore.com/about-us/index.html', ''),
     ('contact-us/index.html', 'theevolutionstore.com/contact-us/index.html', ''),
     ('international-shipping/index.html', 'theevolutionstore.com/international-shipping/index.html', ''),
     ('rentals/index.html', 'theevolutionstore.com/rentals/index.html', ''),
     ('tel:212.343.1114', 'tel:212.343.1114', ''),
     ('logine568.html', 'theevolutionstore.com/logine568.html', ''),
     ('login.html', 'theevolutionstore.com/login.html', 'Signin'),
     ('login830b.html', 'theevolutionstore.com/login830b.html', ''),
     ('cart.html', 'theevolutionstore.com/cart.html', ''),
     ('index.html', 'theevolutionstore.com/index.html', ''),
     ('anatomical-snap-together-kit-heart-original/index.html', 'theevolutionstore.com/anatomical-snap-together-kit-heart-original/index.html', ''),
     # links without an href (getAllClickables stores the string 'None')
     ('None', 'None', ''),
     ('None', 'None', ''),
     ('None', 'None', ''),
     ('None', 'None', 'flex-active'),
     ('None', 'None', ''),
     ('preserved-mounted-specimens/framed-unframed-insects/index.html', 'theevolutionstore.com/preserved-mounted-specimens/framed-unframed-insects/index.html', ''),
     ('home-decor-art/posters-prints/seba-prints/index.html', 'theevolutionstore.com/home-decor-art/posters-prints/seba-prints/index.html', ''),
     # in-page tab anchors of the landing page
     ('#Tablist1', 'theevolutionstore.com/index.html#Tablist1', ''),
     ('#Tablist2', 'theevolutionstore.com/index.html#Tablist2', ''),
]

# External, tel: and mailto: links; like `eliminate`, they are removed from the
# candidates on every step of collectData. Tuple layout: (href, href_full, class).
pageBottomURLs = [
     ('https://us13.campaign-archive.com/?u=764e88655f97f7c006065cd5a&id=bb1dab1df4', 'https://us13.campaign-archive.com/?u=764e88655f97f7c006065cd5a&id=bb1dab1df4', ''),
     ('https://us13.campaign-archive.com/?u=764e88655f97f7c006065cd5a&id=0b9a1e760f', 'https://us13.campaign-archive.com/?u=764e88655f97f7c006065cd5a&id=0b9a1e760f', ''),
     ('https://www.facebook.com/theevolutionstore', 'https://www.facebook.com/theevolutionstore', ''),
     ('http://instagram.com/theevolutionstore', 'http://instagram.com/theevolutionstore', 'icon-social icon-social-theme icon-instagram'),
     ('https://www.facebook.com/theevolutionstore', 'https://www.facebook.com/theevolutionstore', 'icon-social icon-social-theme icon-facebook'),
     ('https://www.tiktok.com/@theevolutionstore', 'https://www.tiktok.com/@theevolutionstore', 'icon-social icon-social-theme icon-tumblr'),
     ('https://www.youtube.com/channel/UCfaEOsX8SRegQH9KICuNepQ', 'https://www.youtube.com/channel/UCfaEOsX8SRegQH9KICuNepQ', 'icon-social icon-social-theme icon-youtube'),
     ('https://twitter.com/theevolutionnyc', 'https://twitter.com/theevolutionnyc', 'icon-social icon-social-theme icon-twitter'),
     ('https://www.pinterest.com/evolutionstore/', 'https://www.pinterest.com/evolutionstore/', 'icon-social icon-social-theme icon-pinterest'),
     ('https://www.google.com/maps/place/687+Broadway,+New+York,+NY+10012/@40.7290403,-73.9946526,17.47z/data=!4m5!3m4!1s0x89c2599aa0dbbc01:0x40850771c5f6edb!8m2!3d40.7281384!4d-73.9947995', 'https://www.google.com/maps/place/687+Broadway,+New+York,+NY+10012/@40.7290403,-73.9946526,17.47z/data=!4m5!3m4!1s0x89c2599aa0dbbc01:0x40850771c5f6edb!8m2!3d40.7281384!4d-73.9947995', ''),
     ('tel:212.343.1114', 'tel:212.343.1114', ''),
     ('mailto:Info@TheEvolutionStore.com', 'mailto:Info@TheEvolutionStore.com', '')
]

# Empty, and not referenced by any of the code in this cell or in collectData.
forceClick = [

]

def l3Handle(l2):
    """Return the slice of the global ``l3`` list that belongs to a level-2 link.

    The link's ``href`` is searched for a keyword: "womens" -> first 7 entries,
    "kids" -> entries 7..11, "mens" -> everything from index 12. "womens" has
    to be tested before "mens" because it contains that word. Returns ``None``
    when no keyword is present.
    """
    href = l2['href']
    if "womens" in href:
        return l3[:7]
    if "kids" in href:
        return l3[7:12]
    if "mens" in href:
        return l3[12:]
    return None

def matchSimilarLinks(candidate_clickables):
    """Reduce a list of links to 'pname'-paired links and pagination links.

    A link whose class is ``'pname'`` supplies the text for the link directly
    before it: that preceding link gets the text and is added to the result
    (the ``'pname'`` link itself is not). Links whose ``href`` contains
    ``"page="`` are added unchanged.

    Args:
        candidate_clickables: list of link dicts with ``class``, ``text`` and
            ``href``. The dict preceding a ``'pname'`` link is updated in place.

    Returns:
        list[dict]: the selected links, in page order.
    """
    f_list = []
    for i, clickable in enumerate(candidate_clickables):
        # add text of second one to the first one
        # (i > 0: a 'pname' link in first position has no predecessor; index
        # i-1 == -1 would silently pair it with the LAST link of the list)
        if clickable['class'] == 'pname' and i > 0:
            previous = candidate_clickables[i - 1]
            previous['text'] = clickable['text']
            f_list.append(previous)

        # also adding links to navigate between pages
        if "page=" in clickable['href']:
            f_list.append(clickable)
    return f_list
     
def checkVisitedTargets(candidate_clickables, trackTargetLinks):
    """Drop the links whose (x, y) position equals that of an already tracked target.

    With no tracked targets the input list itself is returned; otherwise a new
    filtered list is built.
    """
    if not trackTargetLinks:
        return candidate_clickables

    def _was_clicked(link):
        return any(link['x'] == seen['x'] and link['y'] == seen['y'] for seen in trackTargetLinks)

    return [link for link in candidate_clickables if not _was_clicked(link)]

def checkIfTargetPage(candidate_clickables):
    """Detect a target page by its "#" tab links.

    A link counts when its ``href`` is ``"#"`` and its ``text`` contains one of
    "Description", "Details" or "Reviews" (``in`` test on whatever ``text``
    holds). The page is a target page when at least two links count.

    Returns:
        tuple(bool, list): the verdict and the counted links.
    """
    tab_labels = ["Description", "Details", "Reviews"]
    tab_links = [
        link
        for link in candidate_clickables
        if any(label in link['text'] for label in tab_labels) and link['href'] == "#"
    ]
    return len(tab_links) >= 2, tab_links

In [7]:
# start from menu
# Prefix stripped from each link's href_full before it is compared with the
# lookup lists (l1, l2, eliminate, ...).
base = "http://localhost:4200/assets/crawled/es/"

def collectData():
    """Record random click sequences through the crawled site and save them as JSON.

    Keeps producing data points until 85 have been written. For each data
    point a driver is obtained from ``setup``, the links of the current page
    are filtered according to the menu state, one of them is chosen at random
    and clicked, and a snapshot (chosen link, all candidate links, screenshot,
    URL, page text) is appended to ``actions``. The last recorded action of a
    sequence has ``clicked`` set to ``None``.

    Each finished sequence is written twice: the full record to
    ``data/<name>.json`` and the reduced player record (formatDataForPlayer)
    to ``data_player/<name>.json``.

    Any exception inside a sequence is printed and the sequence is attempted
    again; the data point counter only advances after a successful save.

    NOTE(review): no quit()/close() is called on ``driver`` in this function -
    confirm whether ``setup`` reuses or cleans up the browser.
    """
    stepitr = 3  # nominal number of steps per data point (step is adjusted below)
    dpoint=1
    no_products = 0 # flag to identify if all the products are visited
    selected_products = []  # href_full of the non-menu, non-pagination links chosen so far, kept across data points
    while dpoint <= 85: # no. of recordings needed from one seq. count
        driver = setup(9228, 'assets/crawled/es/index.html', True)
        switchToIframe(driver)

        menuState = 'non'  # 'l1' / 'l2' when the last chosen link was found in l1 / l2, else 'non'
        visited = [] # to store visited links in one sequence
        data = {} # to collect data in one sequence
        data['url'] = "assets/crawled/es/index.html"
        actions = [] # to store clicks
        nextTrack = False  # True right after a "page=" link was chosen
        nextActiveinDataPoint = False  # True once any "page=" link was chosen in this data point

        trackTargetLinks = []  # (x, y) of the links already chosen on a target page
        try:
            step=0
            while step <= stepitr: # going through steps in a single data point, adding one more to include the details of the last page

                scrollToTheBottom(driver)
                candidate_clickables = getAllClickables(driver)
                # unfiltered snapshot; stored as 'candidates' of this action
                forRecords = candidate_clickables.copy()

                # remove everytime
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate+pageBottomURLs+hardToEliminate, 2)]

                # eliminating already visited ones
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited, 2)]

                # adding constraints to click a button
                if step == 0:
                    # first click: only the 7th l1 entry (puzzles-toys-edibles) is allowed
                    candidate_clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1[6:7], 2)]
                else:
                    # afterwards: never click a level-1 menu link again
                    candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1, 2)]

                # True for 6..10 out of 1..10, i.e. in half of the draws
                move2lvl2 = random.randint(1,10) > 5
                if menuState == 'l1' and move2lvl2:
                    # descend into a level-2 category
                    candidate_clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l2, 2)]
                if (menuState == 'l1' and not move2lvl2) or menuState == 'l2' or nextTrack:
                    # keep only the links paired with a 'pname' link and the "page=" links
                    candidate_clickables = matchSimilarLinks(candidate_clickables.copy())

                # identifying if in a target page
                isTargetPage, lst = checkIfTargetPage(candidate_clickables.copy())
                if isTargetPage:                    
                    # keep only the tab links that were not chosen yet (compared by x/y position)
                    candidate_clickables = checkVisitedTargets(lst, trackTargetLinks)
                    # maxItr =  4 if nextActiveinDataPoint else 3
                    # hold the step counter back (cancels the increment at the end of the loop)
                    # while more than two tab links are left and at most 3 actions are recorded
                    step = step-1 if len(candidate_clickables) > 2 and len(actions) <= 3 else step
                
                # to force to go to next page
                # (only if no "page=" link was chosen yet in this data point)
                if ((menuState=='l1' and not move2lvl2) or (menuState=='l2')) and (not nextActiveinDataPoint and any("page=" in clickable["href_full"] for clickable in candidate_clickables)):
                    candidate_clickables = [clickable for clickable in candidate_clickables if "page=" in clickable['href_full']]

                # if still not in a target page
                # drop the pagination links and grant one more step, as long as at most 6 actions are recorded
                if step >= 3 and len(actions) <= 6 and not isTargetPage:
                    candidate_clickables = [clickable for clickable in candidate_clickables if not "page=" in clickable['href_full']]
                    step-=1

                # choosing a clickable randomly
                if step < stepitr:
                    # removing already visited products
                    if len(actions) > 0 and len(selected_products)>0 and not isTargetPage:
                        candidate_clickables = [clickable for clickable in candidate_clickables if clickable['href_full'] not in selected_products]
                        no_products = no_products+1 if len(candidate_clickables)==0 else no_products # to identify if all the products were visited

                    # seed() without an argument: the draws are not reproducible between runs
                    random.seed()
                    random.shuffle(candidate_clickables)
                    # raises IndexError when no candidate is left; handled by the except below
                    chosenLink = random.choice(candidate_clickables)

                    # remember the chosen link as a selected product when it is neither a menu link,
                    # a target-page tab nor a pagination link
                    if step >= 1 and not checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1+l2) and not isTargetPage and not "page=" in chosenLink['href']:
                        selected_products.append(chosenLink['href_full'])

                    # assigning the menu state
                    if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
                        menuState = 'l1'
                    elif checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l2):
                        menuState = 'l2'
                    else:
                        menuState = 'non'
                    
                    # adding to the visited list
                    if isTargetPage:
                        trackTargetLinks.append({'x':chosenLink['x'], 'y': chosenLink['y']})
                    else:
                        visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))
                    
                    if 'page=' in chosenLink['href_full']:
                        nextTrack = True
                        nextActiveinDataPoint = True
                        # a pagination click does not consume a step while at most 5 actions are recorded
                        step = step-1 if len(actions) <= 5 else step
                    else:
                        nextTrack = False

                # snapshot of the current page; 'clicked' is None on the final iteration (nothing is clicked there)
                actions.append({
                    'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
                    'candidates': [formatData(clickable.copy()) for clickable in forRecords],
                    'screenshot': takeScreenshot(driver),
                    'full_url': getCurrntURLIFrame(driver),
                    'text': getTextOnPage(driver)
                    })

                # print(chosenLink['href_full'])
                if step < stepitr:
                    # click at the link's own rect position (formatData above only touched copies)
                    clickElement(driver, chosenLink['x'], chosenLink['y'])
                step += 1

            data['actions'] = actions
            data['nClicks'] = len(actions)-1

            # saving to a json file for ML
            # file name: R_<unix time>_ES_<number of clicks>
            fname = 'R_' + str(int(time.time())) + '_ES_' + str(len(actions)-1)
            with open('data/' + fname + '.json', 'w') as f:
                json.dump(data, f)

            # # saving to a json file for player
            with open('data_player/' + fname + '.json', 'w') as f:
                json.dump(formatDataForPlayer(data), f)
            
            dpoint+=1 # increasing the iteration number
            # progress line: file name and the hrefs of all actions but the last one
            print(fname, [action['clicked']['href'] for action in actions[:-1]])
            
            # break
        except Exception as e:
            # the sequence is discarded (dpoint is not increased) and attempted again
            print("error", e)
            # after the product filter emptied the candidates three times, forget the selected products
            if no_products >= 3:
                selected_products = []
                no_products = 0
collectData()

R_1675220935_ES_5 ['puzzles-toys-edibles/index.html', 'index36c2.html?sort=featured&page=3', '../metal-scorpion-kit/index.html', '#', '#']
R_1675221022_ES_5 ['puzzles-toys-edibles/index.html', 'indexf1d4.html?sort=featured&page=2', 'index1bb6.html?sort=featured&page=1', '../tiny-stegosaurus/index.html', '#']


KeyboardInterrupt: 

In [None]:
# Inspection cell: open the landing page and list the links that are left after
# dropping the `eliminate`, `pageBottomURLs` and `l1` entries. Each result is
# (text, href, href_full without `base`, class).
driver = setup(9222, 'assets/crawled/es/index.html', False)
switchToIframe(driver)

candidate_clickables = [
    (link['text'], link['href'], link['href_full'].replace(base, ""), link['class'])
    for link in getAllClickables(driver)
    if not checkIfIn(
        (link['href'], link['href_full'].replace(base, ""), link['class']),
        eliminate + pageBottomURLs + l1,
        2,
    )
]

candidate_clickables

# To show an image

In [108]:
# to load an image
def imgFromB64(imgb64):
    """Decode a base64-encoded image and display it with PIL's ``Image.show()``.

    NOTE(review): a function with the same name and body is also defined among
    the helper functions earlier in the notebook; this copy re-binds the name.
    """
    decoded = base64.b64decode(imgb64)
    picture = Image.open(io.BytesIO(decoded))
    picture.show()

# Show the screenshot stored with the second action of one saved recording.
with open('data/R_1674999707_ES_6.json', 'r') as f:
    imgFromB64(json.load(f)['actions'][1]['screenshot'])

# Unused

In [73]:
def collectData(n_data):
    """Leftover experiment: open the simulator and click once at (414, 3000).

    The loop is left after its first pass, so at most one click is issued (none
    when ``n_data`` is 0 or negative).
    NOTE(review): running this cell re-binds the name ``collectData`` and
    thereby replaces the recorder defined earlier in the notebook.
    """
    driver = setup()
    switchToIframe(driver)

    for _ in range(n_data):
        clickElement(driver, 414, 3000)
        break

# To delete paths

In [100]:
# Load the manual review sheet. The cells below use its `File`, `Good` and
# `Bad` columns (a value of 1 marks the verdict for a recording).
feedback = pd.read_csv("Data Quality Checker.csv")
feedback.head()

Unnamed: 0,File,Good,Bad
0,1671539062_3,1.0,
1,1671539104_3,1.0,
2,1671539149_3,,1.0
3,1671539201_3,1.0,
4,1671539251_3,1.0,


In [101]:
# deleting bad ones
# Work on a copy so that `feedback` itself stays untouched by this cell.
feedbackBad = feedback.copy(deep=True)
# File name of a recording: 'R_<File>.json'. The prefix is only added when it is
# missing, so the cell also works after `feedback.File` has been prefixed by
# the token-export cell.
feedbackBad.File = feedbackBad.File.map(lambda x: (x if x.startswith('R_') else 'R_' + x) + '.json')
bad = feedbackBad.loc[feedbackBad.Bad == 1].File.tolist()

videos = 'data_player/'  # not used below: only the files under `mlData` are deleted
mlData = 'data/'
for (dirpath, dirnames, filenames) in os.walk(mlData):
    for filename in filenames:
        if filename in bad:
            # os.walk reports sub-directories without a trailing separator, so
            # the path has to be joined instead of concatenated.
            os.remove(os.path.join(dirpath, filename))
            # print(dirpath, filename)

In [102]:
# adding good ones to a csv
# `feedback.File` is prefixed in place (as before), but only where the prefix is
# still missing: running the cell a second time no longer yields 'R_R_...' tokens.
feedback.File = feedback.File.map(lambda x: x if x.startswith('R_') else 'R_' + x)
good = feedback.loc[feedback.Good == 1].File.tolist()
pd.DataFrame({'token':good}).to_csv('tokens.csv', index=False)