In [8]:
from selenium.webdriver.common.by import By
from SimulatorCommunicator import setup, switchToIframe, scrollToTheBottom, updateURL, clickElement, takeScreenshot
from PIL import Image

import networkx as nx
import random
import matplotlib.pyplot as plt
import re
import json
import base64
import time
import requests
import io
import itertools
import pandas as pd
import os

In [9]:
def getAllClickables(driver):
    """Describe every visible, enabled <a> element of the current page.

    Args:
        driver: Selenium WebDriver whose current context is the page to scan.

    Returns:
        list[dict]: one record per link with the keys
            'x', 'y', 'width', 'height' - values of the element's `rect`
            'outer_html' - base64 text of the element's outerHTML
            'href'       - result of get_dom_attribute('href')
            'href_full'  - result of get_attribute('href')
            'class'      - result of get_attribute('class')
            'text'       - unique non-empty text lines, in first-seen order
            'img_<i>'    - base64 content of the i-th nested <img> that has a src
    """
    elements = []

    for element in driver.find_elements(By.TAG_NAME, 'a'):
        # excluding all the hidden links
        if not (element.is_displayed() and element.is_enabled()):
            continue

        # `rect` is a property that asks the browser on every access, so read
        # it once instead of once per coordinate.
        rect = element.rect
        if len(rect) == 0:
            continue

        data = {}
        data['x'] = rect['x']
        data['y'] = rect['y']
        data['width'] = rect['width']
        data['height'] = rect['height']
        data['outer_html'] = base64Encode(bytes(element.get_attribute('outerHTML'), "utf-8"))
        data['href'] = element.get_dom_attribute('href')
        data['href_full'] = element.get_attribute('href')
        data['class'] = element.get_attribute('class')

        # '..//div' starts at the link's parent, so text of <div>s next to the
        # link is collected as well.
        # NOTE(review): presumably intended to capture captions around the link - confirm.
        texts = [
            line
            for div in element.find_elements(By.XPATH, '..//div')
            for line in div.text.split('\n')
        ]
        texts.append(element.text)
        # dict.fromkeys removes duplicates but keeps first-seen order; a set
        # would order the strings differently from one interpreter run to the next.
        data['text'] = list(dict.fromkeys(text for text in texts if len(text) > 0))

        # getting closer image(s) if available
        for i, img in enumerate(element.find_elements(By.XPATH, ".//img")):
            src = img.get_attribute('src')
            if not src:
                # nothing to download; without this guard a single <img>
                # lacking a src would abort the scan of the whole page
                continue
            data['img_' + str(i)] = imgURL2B64(src)

        elements.append(data)
    return elements

# URL of the page currently loaded in the driver's active context (the iframe)
def getCurrntURLIFrame(driver):
    """Return `location.href` of the current page without the simulator origin.

    The script 'return location.href' is evaluated through the driver and every
    occurrence of 'http://localhost:4200' is removed from the result.
    """
    simulator_origin = 'http://localhost:4200'
    current_href = driver.execute_script('return location.href')
    return current_href.replace(simulator_origin, "")

# download image from URL and convert to base64
def imgURL2B64(imgURL, timeout=30):
    """Download `imgURL` and return the response body as base64 text.

    Args:
        imgURL: address of the image to fetch.
        timeout: seconds passed to `requests.get`; a `requests` timeout
            exception is raised when the server does not answer in time.

    Returns:
        str: base64 encoding of the downloaded bytes.

    NOTE(review): the HTTP status is not checked, so the body of an error
    response is encoded like any other content.
    """
    # requests applies no timeout by default; without one a stalled server
    # would block the crawl indefinitely.
    img_data = requests.get(imgURL, timeout=timeout).content
    # encoding
    return base64Encode(img_data)

# bytes -> base64 text
def base64Encode(data):
    """Base64-encode a bytes-like object and return the result as a `str`."""
    encoded_bytes = base64.b64encode(data)
    return str(encoded_bytes, "utf-8")

# display a base64-encoded image
def imgFromB64(imgb64):
    """Decode `imgb64` from base64, open the bytes with PIL and call `show()`."""
    raw_bytes = base64.b64decode(imgb64)
    picture = Image.open(io.BytesIO(raw_bytes))
    picture.show()

# format data to store
def formatData(chosenLink, viewport_height=865, viewport_width=812):
    """Turn a clickable record into a click action that fits inside the viewport.

    When the element's bottom edge (y + height) lies beyond `viewport_height`,
    the overshoot is stored in 'y_offset' and subtracted from 'y'; the same is
    done horizontally with `viewport_width`, 'x_offset' and 'x'.

    Args:
        chosenLink: dict with at least 'x', 'y', 'width' and 'height'.
            It is modified in place (callers in this notebook pass a copy).
        viewport_height: vertical limit, 865 by default.
        viewport_width: horizontal limit, 812 by default.

    Returns:
        dict: the same `chosenLink` object with 'type' set to "click",
        adjusted 'x'/'y' and the new 'x_offset'/'y_offset' keys.
    """
    y = chosenLink['y']
    y_offset = 0
    x = chosenLink['x']
    x_offset = 0

    bottom = chosenLink['y'] + chosenLink['height']
    if bottom > viewport_height:
        y_offset = bottom - viewport_height
        y = chosenLink['y'] - y_offset

    right = chosenLink['x'] + chosenLink['width']
    if right > viewport_width:
        x_offset = right - viewport_width
        x = chosenLink['x'] - x_offset

    # new attributes to the chosen link (assignment order fixes the key order
    # of the stored JSON)
    chosenLink['type'] = "click"
    chosenLink['y'] = y
    chosenLink['y_offset'] = y_offset
    chosenLink['x'] = x
    chosenLink['x_offset'] = x_offset
    return chosenLink

# retrieves all the text on the page
def getTextOnPage(driver):
    """Return the unique non-empty text lines of every <div> on the page.

    Args:
        driver: Selenium WebDriver whose current context is the page to read.

    Returns:
        list[str]: each distinct line once, in the order it is first met
        while walking the <div> elements returned by the driver.
    """
    lines = (
        line
        for div in driver.find_elements(By.TAG_NAME, 'div')
        for line in div.text.split('\n')
        if len(line) > 0
    )
    # dict.fromkeys de-duplicates and keeps first-seen order; going through a
    # set would order the strings differently between interpreter runs.
    return list(dict.fromkeys(lines))

def formatDataForPlayer(inputJson):
    """Reduce a recorded session to the fields copied for the player file.

    Args:
        inputJson: dict with 'url' and 'actions'; every action except the
            last must carry a 'clicked' dict holding the copied fields.

    Returns:
        dict: {'url': ..., 'actions': [...]} where each action keeps only
        type, position, size, offsets, both hrefs and the outer HTML of the
        clicked element. The last input action is skipped.
    """
    copied_fields = (
        'type', 'x', 'y', 'height', 'width',
        'x_offset', 'y_offset', 'href', 'href_full', 'outer_html',
    )

    page_url = inputJson['url']
    replay_steps = [
        {field: recorded['clicked'][field] for field in copied_fields}
        for recorded in inputJson['actions'][:-1]
    ]
    return {'url': page_url, 'actions': replay_steps}

In [10]:
# compares one link triple with one reference triple
def isSimilar(query, reference, score):
    """Tell whether two link triples agree in at least `score` positions.

    Position 0 holds the relative url, position 1 the absolute url and
    position 2 the class; each equal position counts one point.

    Returns:
        bool: True when the number of equal positions is >= `score`.
    """
    matching_fields = sum(
        1 for position in (0, 1, 2) if query[position] == reference[position]
    )
    return matching_fields >= score

# does any entry of urlList resemble the query?
def checkIfIn(query, urlList, score=2):
    """Return True when `query` is similar to at least one entry of `urlList`.

    Similarity is decided by `isSimilar(query, entry, score)`; the scan stops
    at the first match. An empty `urlList` yields False.
    """
    return any(isSimilar(query, candidate, score) for candidate in urlList)

In [11]:
# Reference lists used to filter clickables. Every entry is a triple laid out
# like the tuples that collectData builds for each link:
#   (href, href_full with `base` stripped, class)
# and is compared field by field through checkIfIn / isSimilar.

# Links allowed as the first click of a recording (collectData, step 0).
# The commented-out entries are currently excluded from that choice.
l1 = [
     ('collections/placemats.html', 'collections/placemats.html', ''),
     ('collections/notepads.html', 'collections/notepads.html', ''),
     ('collections/note-cards.html', 'collections/note-cards.html', ''),
     # ('collections/gift-labels.html', 'collections/gift-labels.html', ''),
     # ('collections/name-labels.html', 'collections/name-labels.html', ''),
     # ('collections/return-address-labels.html', 'collections/return-address-labels.html', ''),
     # ('collections/throw-pillows.html', 'collections/throw-pillows.html', ''),
     # ('collections/blankets.html', 'collections/blankets.html', ''),
     # ('collections/tote-bags.html', 'collections/tote-bags.html', ''),
     # ('collections/ceramic-mugs.html', 'collections/ceramic-mugs.html', ''),
     # ('collections/camp-mugs.html', 'collections/camp-mugs.html', ''),
     # ('collections/pillowcases.html', 'collections/pillowcases.html', ''),
     # ('collections/puzzles.html', 'collections/puzzles.html', ''),
     # ('collections/chore-charts.html', 'collections/chore-charts.html', ''),
     # ('collections/teacher-gifts.html', 'collections/teacher-gifts.html', ''),
]

# Second-level list: empty here and only referenced by commented-out code.
l2 = [

]

# Third-level list: empty here; l3Handle returns slices of it.
l3 = [

]

# Links that collectData removes from the candidates at every step.
eliminate = [
     ('account/login.html', 'account/login.html', ''),
     ('account/register.html', 'account/register.html', ''),
     ('cart.html', 'cart.html', ''),
     ('index.html', 'index.html', ''),
     ('https://www.facebook.com/sarahandabraham', 'https://www.facebook.com/sarahandabraham', 'facebook'),
     ('https://twitter.com/sarahandabraham', 'https://twitter.com/sarahandabraham', 'twitter'),
     ('http://www.pinterest.com/sarahandabraham/', 'http://www.pinterest.com/sarahandabraham/', 'pinterest'),
     ('https://www.instagram.com/sarahandabraham/', 'https://www.instagram.com/sarahandabraham/', 'instagram'),
     ('pages/about-us.html', 'pages/about-us.html', ''),
     ('pages/faq.html', 'pages/faq.html', ''),
     ('pages/contact.html', 'pages/contact.html', ''),
     ('http://sarahandabraham.com/blog/', 'http://sarahandabraham.com/blog/', ''),
     ('collections/all.html', 'collections/all.html', ''),
     ('http://www.aeolidia.com/', 'http://www.aeolidia.com/', ''),
     ('', '', 'bx-prev'),
     ('', '', 'bx-next')
]

# Empty here and only referenced by commented-out code.
pageBottomURLs = [
     
]

# Empty here and only referenced by commented-out code.
forceClick = [

]

def l3Handle(l2):
    """Pick the slice of the global `l3` list that belongs to a clicked link.

    Args:
        l2: dict of the clicked link; only its 'href' value is read.

    Returns:
        list: `l3[:7]` when the href contains "womens", `l3[7:12]` for
        "kids", `l3[12:]` for "mens", and an empty list when none of the
        three words occurs.
    """
    href = l2['href']
    # "womens" has to be tested before "mens": the latter is a substring of it.
    if "womens" in href:
        return l3[:7]
    if "kids" in href:
        return l3[7:12]
    if "mens" in href:
        return l3[12:]
    # No category matched: hand back an empty list instead of falling through
    # to None, which checkIfIn could not iterate over.
    return []

In [12]:
# # removing stepitr/code used to generate lot of data start from middle button
# base = "http://localhost:4200/assets/crawled/sna/www.sarahandabraham.com/index.html"

# def collectData():
#     stepitr = 3
#     dpoint=1
#     no_products = 0 # flag to identify if all the products are visited
#     selected_products = []
#     while dpoint <= 500: # no. of recordings needed from one seq. count
#         driver = setup(9222, True)
#         switchToIframe(driver)

#         menuState = 'non'
#         visited = [] # to store visited links in one sequence
#         data = {} # to collect data in one sequence
#         data['url'] = "assets/crawled/sna/www.sarahandabraham.com/index.html"
#         actions = [] # to store clicks

#         try:
#             # steps = random.choice([x for x  in range(3,8)]) # generating number of steps
#             step=0
#             while step <= stepitr: # going through steps in a single data point, adding one more to include the details of the last page
#             # for step in range(stepitr + 1):
#                 scrollToTheBottom(driver)

#                 candidate_clickables = getAllClickables(driver)
#                 forRecords = candidate_clickables.copy()

#                 # remove everytime
#                 candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), pageBottomURLs, 2)]
                
#                 # eliminating already visited ones
#                 candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited, 2)]

#                 # keep only varients if at least 2 more steps aren't available
#                 if step != 0 and any(x in actions[-1]['clicked']['href'] for x in ['products', 'variant']) and step + 2 >= stepitr:
#                     candidate_clickables = [clickable for clickable in candidate_clickables if '?variant' in clickable['href']]
#                     step = step-1 if len(candidate_clickables) >= 2 else step
#                     step = stepitr if len(actions) == 6 and len(candidate_clickables) > 0 else step
                
#                 if step==1 and actions[-1]['clicked']['href'] in "/index.html": # to click on a product in the 1st step
#                     candidate_clickables = [clickable for clickable in candidate_clickables if '/products' in clickable['href']]

#                 # adding constraints to click a button
#                 if dpoint <= 1000 and step == 0:
#                     clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), forceClick, 3)]
#                 else:
#                     if menuState == 'non':
#                     # eliminating unwanted urls
#                         clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate, 2)]
#                     elif menuState == 'l1':
#                         clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l2)]
#                     elif menuState == 'l2':
#                         clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l3Handle(actions[-1]['clicked']))]

#                 # choosing a clickable randomly
#                 if step < stepitr:
#                     # removing already visited products
#                     if step==1 and actions[-1]['clicked']['href'] in "/index.html" and len(selected_products)>0:
#                         clickables = [clickable for clickable in clickables if clickable['href_full'] not in selected_products]
#                         no_products = no_products+1 if len(clickables)==0 else 0 # to identify if all the products were visited

#                     random.seed()
#                     random.shuffle(clickables)
#                     random.shuffle(clickables)
#                     chosenLink = random.choice(clickables)

#                     if step==1 and '/products' in chosenLink['href']:
#                         selected_products.append(chosenLink['href_full'])
                    
#                     # assigning the menu state
#                     if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
#                         menuState = 'l1'
#                     elif checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l2[1:]):
#                         menuState = 'l2'
#                     else:
#                         menuState = 'non'
                    
#                     # adding to the visited list
#                     visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))

#                 actions.append({
#                     'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
#                     'candidates': [formatData(clickable.copy()) for clickable in forRecords],
#                     'screenshot': takeScreenshot(driver),
#                     'full_url': getCurrntURLIFrame(driver),
#                     'text': getTextOnPage(driver)
#                     })

#                 # print(chosenLink['href_full'])
#                 if step < stepitr:
#                     clickElement(driver, chosenLink['x'], chosenLink['y'])
#                 step += 1

#             data['actions'] = actions
#             data['nClicks'] = len(actions)-1

#             # saving to a json file for ML
#             fname = 'R_' + str(int(time.time())) + '_' + str(len(actions)-1)
#             with open('data/' + fname + '.json', 'w') as f:
#                 json.dump(data, f)

#             # saving to a json file for player
#             with open('data_player/' + fname + '.json', 'w') as f:
#                 json.dump(formatDataForPlayer(data), f)
            
#             dpoint+=1 # increasing the iteration number
#             print(fname, [action['clicked']['href'] for action in actions[:-1]])

#             # break
#         except Exception as e:
#             print("error", e)
#             if no_products == 3:
#                 selected_products.clear()
#         #     # print('\033[91m', 'step_itr:', stepitr, 'dpoint:', dpoint)
#         #     # print(chosenLink['href_full'], '\n')
#         #     # if actions.__len__() > 0:
#         #     #     print([action['href'] for action in actions])
#         #     # print('\033[0m', "-----------------------------------------------------")
# collectData()

In [13]:
# start from menu
# Prefix removed from every absolute href so that it can be compared with the
# relative entries of the l1 / eliminate lists.
base = "http://localhost:4200/assets/crawled/sna/www.sarahandabraham.com/"

def collectData():
    """Record random click-through sessions on the crawled site and save them as JSON.

    The outer loop runs until 500 sessions have been saved (`dpoint` advances
    only after both output files were written). Each session opens the start
    page through `setup`, switches into the iframe and then repeats: scroll to
    the bottom, list the clickables, filter them, choose one at random, store
    a record of the page (clicked element, all candidates, screenshot, URL,
    page text) and click. The last record of a session has 'clicked' set to
    None and describes the page reached by the final click.

    Output files, with fname = 'R_<unix time>_SA_<number of clicks>':
        data/<fname>.json         - the complete session
        data_player/<fname>.json  - the reduced form from formatDataForPlayer

    Any exception raised inside a session is caught and printed; the outer
    loop then starts over with a new session.
    """
    stepitr = 2  # index of the step that normally records the final page
    dpoint=1
    no_products = 0 # flag to identify if all the products are visited
    selected_products = []  # product URLs chosen at step 1; kept across sessions
    while dpoint <= 500: # no. of recordings needed from one seq. count
        driver = setup(9222, 'assets/crawled/sna/www.sarahandabraham.com/index.html', True)
        switchToIframe(driver)

        # NOTE(review): menuState is assigned below but never read in this function.
        menuState = 'non'
        visited = [] # to store visited links in one sequence
        data = {} # to collect data in one sequence
        data['url'] = "assets/crawled/sna/www.sarahandabraham.com/index.html"
        actions = [] # to store clicks

        try:
            step=0
            while step <= stepitr: # going through steps in a single data point, adding one more to include the details of the last page

                scrollToTheBottom(driver)
                candidate_clickables = getAllClickables(driver)
                # unfiltered snapshot: this is what gets stored as 'candidates'
                forRecords = candidate_clickables.copy()

                # remove everytime
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), eliminate, 2)]

                # eliminating already visited ones
                candidate_clickables = [clickable for clickable in candidate_clickables if not checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), visited, 2)]

                # keep only variants if at least 2 more steps aren't available
                # (applies once two records exist, the previous click went to a
                # 'products' URL and the current step is one of the last two)
                if step != 0 and len(actions)>=2 and any(x in actions[-1]['clicked']['href_full'] for x in ['products']) and step + 1 >= stepitr:
                    candidate_clickables = [clickable for clickable in candidate_clickables if 'products' in clickable['href_full']]
                    # rewinding `step` keeps the loop going, so the session is
                    # extended while unvisited 'products' links remain
                    step = step-1 if len(candidate_clickables) >= 1 else step
                    # cap: with six records stored, jump to the final step so
                    # this iteration records the last page and the loop ends
                    step = stepitr if len(actions) == 6 and len(candidate_clickables) > 0 else step 

                # adding constraints to click a button
                if step == 0:
                    # first click: only links with x < 116 that match an entry of l1
                    # NOTE(review): presumably x < 116 selects the left-hand menu column - confirm
                    candidate_clickables = [clickable for clickable in candidate_clickables if clickable['x'] < 116]
                    candidate_clickables = [clickable for clickable in candidate_clickables if checkIfIn((clickable['href'], clickable['href_full'].replace(base,""), clickable['class']), l1, 2)]
                elif len(actions) >= 1:
                    # later clicks: only links whose absolute URL contains 'products'
                    candidate_clickables = [clickable for clickable in candidate_clickables if 'products' in clickable['href_full']]


                # choosing a clickable randomly
                if step < stepitr:
                    # removing already visited products
                    if step==1 and len(actions) == 1 and len(selected_products)>0:
                        candidate_clickables = [clickable for clickable in candidate_clickables if clickable['href_full'] not in selected_products]
                        no_products = no_products+1 if len(candidate_clickables)==0 else no_products # to identify if all the products were visited

                    random.seed()
                    random.shuffle(candidate_clickables)
                    # random.choice raises IndexError on an empty list; that is
                    # caught by the except below and ends the session
                    chosenLink = random.choice(candidate_clickables)

                    if step == 1 and '/products' in chosenLink['href_full']:
                        selected_products.append(chosenLink['href_full'])
                    
                    # assigning the menu state
                    if checkIfIn((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']), l1):
                        menuState = 'l1'
                    else:
                        menuState = 'non'
                    
                    # adding to the visited list
                    visited.append((chosenLink['href'], chosenLink['href_full'].replace(base,""), chosenLink['class']))

                # one record per page; on the final step nothing is clicked,
                # so 'clicked' is None
                actions.append({
                    'clicked': formatData(chosenLink.copy()) if step < stepitr else None,
                    'candidates': [formatData(clickable.copy()) for clickable in forRecords],
                    'screenshot': takeScreenshot(driver),
                    'full_url': getCurrntURLIFrame(driver),
                    'text': getTextOnPage(driver)
                    })

                # print(chosenLink['href_full'])
                if step < stepitr:
                    # clicks at the element's original coordinates (the record
                    # above was built from a copy)
                    clickElement(driver, chosenLink['x'], chosenLink['y'])
                step += 1

            data['actions'] = actions
            data['nClicks'] = len(actions)-1  # the last record holds no click

            # saving to a json file for ML
            fname = 'R_' + str(int(time.time())) + '_SA_' + str(len(actions)-1)
            with open('data/' + fname + '.json', 'w') as f:
                json.dump(data, f)

            # saving to a json file for player
            with open('data_player/' + fname + '.json', 'w') as f:
                json.dump(formatDataForPlayer(data), f)
            
            dpoint+=1 # increasing the iteration number
            print(fname, [action['clicked']['href'] for action in actions[:-1]])
            
            # break
        except Exception as e:
            print("error", e)
            # after three sessions that found no unvisited product, forget the
            # chosen products so that they can be picked again
            if no_products >= 3:
                selected_products = []
                no_products = 0
collectData()

R_1674843575_SA_2 ['collections/notepads.html', 'notepads/products/personalized-green-chevron-notepad.html']
R_1674843612_SA_3 ['collections/placemats.html', 'placemats/products/olliegraphic-placemat-superhero-girl.html', '#']
R_1674843658_SA_5 ['collections/note-cards.html', 'note-cards/products/personalized-note-cards-bird.html', '#1', '#3', '#2']
R_1674843699_SA_4 ['collections/placemats.html', 'placemats/products/personalized-kids-placemat-t-rex.html', '#1', '#2']
R_1674843724_SA_2 ['collections/notepads.html', 'notepads/products/a-note-from-mom-floral-bouquet.html']
R_1674843761_SA_4 ['collections/placemats.html', 'placemats/products/personalized-kids-placemat-horse-and-cow.html', '#1', '#2']
R_1674843785_SA_2 ['collections/notepads.html', 'notepads/products/handwriting-practice-notepad-green.html']
R_1674843807_SA_2 ['collections/notepads.html', 'notepads/products/personalized-teacher-notepad-blue-school-uniforms.html']
R_1674843841_SA_4 ['collections/note-cards.html', 'note-card

In [None]:
# Scratch cell: open the start page and list, for every clickable found on it,
# the triple (href, href_full with `base` stripped, class).
driver = setup(9222, 'assets/crawled/sna/www.sarahandabraham.com/index.html', False)
switchToIframe(driver)

candidate_clickables = list(map(
    lambda click: (click['href'], click['href_full'].replace(base,""), click['class']),
    getAllClickables(driver),
))

# last expression of the cell, so the notebook displays the list
candidate_clickables

# To show an image

In [15]:
# to load an image
# NOTE(review): this re-declares the imgFromB64 helper that an earlier cell of
# this notebook already defines with the same behaviour.
def imgFromB64(imgb64):
    """Decode a base64 image string and display it through PIL's `show()`."""
    decoded_stream = io.BytesIO(base64.b64decode(imgb64))
    Image.open(decoded_stream).show()

# display the screenshot stored in the second action record of one sample file
with open('data/R_1674857010_SA_2.json', 'r') as f:
    imgFromB64(json.loads(f.read())['actions'][1]['screenshot'])
    # imgFromB64(json.load(f)['actions'][1]['candidates'][31]['img_0'])

# Unused

In [73]:
def collectData(n_data):
    """Unused experiment: open the simulator and click once at (414, 3000).

    The click is issued only when `range(n_data)` is non-empty, and never more
    than once, whatever the value of `n_data`.

    NOTE(review): this reuses the name `collectData`, so running the cell
    replaces the zero-argument recorder defined earlier in the notebook.
    """
    driver = setup()
    switchToIframe(driver)

    # A non-empty range is truthy; this mirrors a loop that leaves after its
    # first pass.
    if range(n_data):
        clickElement(driver, 414, 3000)

        

# collectData(5)

# To delete paths

In [100]:
# Manual review sheet: one row per recording (`File`), flagged with 1.0 in
# either the `Good` or the `Bad` column.
feedback = pd.read_csv("Data Quality Checker.csv")
feedback.head()

Unnamed: 0,File,Good,Bad
0,1671539062_3,1.0,
1,1671539104_3,1.0,
2,1671539149_3,,1.0
3,1671539201_3,1.0,
4,1671539251_3,1.0,


In [101]:
# deleting bad ones
# Work on a deep copy so that the `feedback` frame keeps its original names.
feedbackBad = feedback.copy(deep=True)
feedbackBad.File = feedbackBad.File.map(lambda x: 'R_'+ x + '.json')
bad = feedbackBad.loc[feedbackBad.Bad == 1].File.tolist()

# NOTE(review): `videos` is defined but never walked, so the player files of
# bad recordings stay in data_player/ - confirm whether that is intended.
videos = 'data_player/'
mlData = 'data/'
for (dirpath, dirnames, filenames) in os.walk(mlData):
    for filename in filenames:
        if filename in bad:
            # os.path.join inserts the separator that plain concatenation
            # misses for sub-directories (os.walk yields them without a
            # trailing slash).
            os.remove(os.path.join(dirpath, filename))
            # print(dirpath, filename)

In [102]:
# adding good ones to a csv
# The prefixed names are built from the selected rows only; `feedback.File`
# itself is left untouched, so re-running this cell cannot stack a second
# 'R_' in front of the names.
good = feedback.loc[feedback.Good == 1, 'File'].map(lambda x: 'R_'+ x).tolist()
pd.DataFrame({'token':good}).to_csv('tokens.csv', index=False)