# Import package

In [3]:
%time
import time
# create a browser instance
from selenium import webdriver
# emulate keyboard inputs
from selenium.webdriver.common.keys import Keys
# creating a single browser instance
import selenium.webdriver.firefox.service as service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementNotInteractableException
# WebDriverWait and EC to allow waiting for element to load on page
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# module to search for elements using xpaths
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
# exception handling
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# quick clicking and scrolling
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# searching of html with "find()"
from bs4 import BeautifulSoup
import pandas as pd
import sys
import math
import os                       # file saving
import datetime
import re
# for launching vlc
import subprocess
from subprocess import PIPE
import shlex
import os.path
import unidecode                # to remove accents

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


# Selenium Bot Class

### make sure that "chromedriver" and "geckodriver" are in this directory

In [4]:
%time
class selenium_bot():
    """
    Interactable bot, that parses outlook files

    Wraps a selenium web driver (Firefox or Chrome) and offers helpers to
    - wait for, click and fill elements located by xpath
    - extract html tags / text from the page with BeautifulSoup
    - save the collected "pandas_out" DataFrame to ./output
    - convert between date strings, [year, month, day] arrays and datetimes

    "entry_list" and "pandas_out" are NOT created by this class: a subclass
    has to define them before "reset()" or "save_data()" are called.
    """
    def __init__(self, browser, timeout, save_period, url, page_loaded_xpath):
        """
        __ Parameters __
        [str] browser: "Firefox" or "Chrome" (case insensitive). any value
                       other than "firefox" starts Chrome
        [float] timeout: how long to wait for responses from webpage
        [float] save_period: time in seconds to create backup of parsed data.
                             only stored on the instance by this class
        [str] url: url bot starts off at
        [str] page_loaded_xpath: xpath to indicate that page has loaded

        __ Description __
        sets up selenium bot
        """

        self.browser = browser.lower()
        self.timeout = timeout
        self.url = url
        self.page_loaded_xpath = page_loaded_xpath
        self.save_period = save_period

        # 1 - setup browser
        print("==> setup_browser start")
        if self.browser == "firefox":
            self.driver = self.__setup_firefox()
        else:
            self.driver = self.__setup_chrome()
        self.driver.maximize_window()

        # 2 - supporting parameters for the future
        # waiter, to wait for contents to load. call the "waiter.until(function)" method
        # created before the page is loaded, so that it exists even if the load below fails
        self.WebDriverWaiter = WebDriverWait(self.driver, self.timeout)

        # 3 - load page
        self.driver.get(self.url)

        print("==> setup_browser end\n")

    def __setup_firefox(self):
        """
        __ Description __
        open up a firefox driver. "geckodriver" must be in the working directory

        __ Returns __
        driver handle
        """

        # 1 - create a browser instance
        print("  > Starting new Firefox server")
        browser = webdriver.Firefox(
            executable_path='./geckodriver')

        return browser

    def __setup_chrome(self):
        """
        __ Description __
        open up a chrome driver. "chromedriver" must be in the working directory

        __ Returns __
        driver handle
        """

        # 1 - set capabilities
        capabilities = {'chromeOptions':
                        {
                            'useAutomationExtension': False,
                            'args': ['--disable-extensions']}
                        }

        # 2 - set options for chrome
        chrome_options = Options()
        # bypasses "allow google to use microphone popup box"
        chrome_options.add_argument("--use-fake-ui-for-media-stream")
        chrome_options.add_experimental_option("prefs", {
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        })

        # 3 - create a browser instance with defined options
        print("  > Starting new Chrome server")
        browser = webdriver.Chrome(executable_path="./chromedriver",
                                   desired_capabilities=capabilities,
                                   options=chrome_options)
        return browser

    def supp_extract_html(self, soup, html_tags_array):
        """
        __ Parameters __
        [soup] soup: html to extract from formatted with BeautifulSoup
        [arr] html_tags_array: array of the form

                                    [["div", {"role": "option"}],
                                    ["div", {"aria-label": "Reading Pane"}],
                                    ...]

        which specifies the name ("div", "span") and attributes ({"id": ["test1", "test2"], "aria-label": "pane"})
        from outer to inner tags, iteratively going down specificity levels

        __ Description __
        iterates through the supplied "soup" html looking for tags whose parents match all the supplied "html_tags".
        every entry but the last is resolved with "find" (first match only),
        the last entry is resolved with "find_all"

        __ Return __
        [htmltag1, htmltag2, htmltag3]: array of html tags that fit the search requirement
        "": when "soup" is None, no tags were supplied or one of the outer tags is missing
            (a message naming the missing tag is printed)
        """

        # 0 - nothing to search in, or nothing to search for
        if soup is None or not html_tags_array:
            print("The page does not have the html element:\n\t%s"
                  % (html_tags_array,))
            return ""

        # 1 - walk down through every outer structure
        current_structure = soup
        for tag in html_tags_array[:-1]:
            name = tag[0]
            attrs = tag[1]
            current_structure = current_structure.find(name, attrs=attrs)

            if current_structure is None:
                # report exactly the tag that could not be found
                print("The page does not have the html element:\n\t[%s, %s]"
                      % (name, attrs))
                return ""

        # 2 - extract all matches from the lowest structure
        return current_structure.find_all(
            html_tags_array[-1][0], attrs=html_tags_array[-1][1])

    def supp_extract_text(self, soup, html_tags_array):
        """
        __ Parameters __
        [soup] soup: html to extract from formatted with BeautifulSoup
        html_tags_array: array of the form

        [["div", {"role": "option"}],
        ["div", {"aria-label": "Reading Pane"}],
        ...]

        which specifies the name ("div", "span") and attributes ({"id": ["test1", "test2"], "aria-label": "pane"})
        from outer to inner tags, iteratively going down specificity levels

        __ Description __
        iterates through the supplied "soup" html looking for tags whose parents match all the supplied "html_tags"
        then a text array is extracted from these tags

        __ Return __
        [array] stripped text of every matching tag in the inner structure.
                empty if nothing matched
        """

        html_structure = self.supp_extract_html(soup, html_tags_array)

        # 1 - take all of the tags found and extract text
        # (a failed search returns "", which simply yields an empty array here)
        return [tag.get_text().strip() for tag in html_structure]

    def supp_write_to_element(self, element_xpath, fill_value):
        """
        __ Parameters __
        [str] element_xpath: element to look for e.g. //div[@id=|password|]
        [str] fill_value: what to write in the form

        __ Description __
        waits for the element to load and enters the "fill_value" into it

        __ Return __
        True: if the value was typed in
        False: if the element could not be found

        __ Raises __
        TimeoutException: if the element does not load within the timeout
        """
        self.supp_wait_for_xpath(element_xpath, "input_box")

        # a missing element raises (it is never returned as a falsy value)
        try:
            element = self.driver.find_element(By.XPATH, element_xpath)
        except NoSuchElementException:
            print("**> Element with xpath %s does not exist" % element_xpath)
            return False

        element.send_keys(fill_value)
        return True

    def supp_wait_for_xpath(self, xpath, description):
        """
        __ Parameters __
        [str] xpath: xpath to wait for
        [str] description: the object that is trying to be located. will be printed to console.
                           "NA" to skip

        __ Description __
        pauses the browser until "xpath" is loaded on the page

        __ Raises __
        TimeoutException: if "xpath" is not present within "self.timeout"
        """

        if description != "NA":
            print("  > Waiting for \"%s\" to load" %(description))

        # "%s" so that fractional timeouts are reported without truncation
        self.WebDriverWaiter.until(
            EC.presence_of_element_located(
                (By.XPATH, xpath)),
            message="Did not find %s within the timeout time you set of %s" % (xpath, self.timeout)
        )

    def supp_click(self, xpath):
        """
        __ Parameters __
        [str] xpath: xpath of object to click

        __ Description __
        clicks the element
        """
        self.driver.find_element(By.XPATH, xpath).click()

    def supp_load_soup(self):
        """
        Loads up a soup of all the html on the visible page
        __ Returns __
        Soup Object to search
        """
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        return soup

    def refresh(self):
        """
        __ Description __
        Reload page (the start url) and wait for "page_loaded_xpath" to appear
        """

        self.driver.get(self.url)
        self.supp_wait_for_xpath(self.page_loaded_xpath, "main page")

    def reset(self):
        """
        __ Description __
        clears the pandas_out array to the initial value
        (an empty DataFrame with the "entry_list" columns defined by the subclass)
        """

        self.pandas_out = pd.DataFrame(columns=self.entry_list)

    def save_data(self, file_name="pandas_out", ext="csv"):
        """
        __ Parameters __
        [str] file_name: the file to save to. an extension given by accident is cut off
        [str] ext: "pkl" to pickle the data, anything else writes a csv

        __ Description __
        Saves data accumulated in "pandas_out" to ./output/<file_name>.<csv or pkl>
        """

        # 1 - create output directory (no error if it is already there)
        os.makedirs("output", exist_ok=True)

        # 2 - cut any extension that was given by accident
        # only the last extension is removed, so dots inside the name survive
        base_name = os.path.splitext(file_name)[0]
        target = "./output/%s" % (base_name)

        # 3 - write in the requested format
        if ext.lstrip(".").lower() == "pkl":
            self.pandas_out.to_pickle("%s.pkl" % target)
        else:
            self.pandas_out.to_csv("%s.csv" % target)

    def date_from_string(self, date_string, today=None):
        """
        __ Parameters __
        [str] date_string: either day of week ("Monday" or "Mon") or "18 May 2019"
        [datetime.date] today: date to count weekdays back from. defaults to the current date

        __ Description __
        convert to an array numerical date values. if a weekday was supplied, find the nearest previous date
        ("today" itself is never returned: the same weekday gives the date one week ago)

        __ Return __
        [year, month, day] date: array of the date

        __ Raises __
        ValueError: if the string is neither a weekday nor of the form "18 May 2019"
        """

        weekday_list = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
                        "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

        if date_string in weekday_list:
            if today is None:
                today = datetime.date.today()

            # long and short names are 7 entries apart, so "% 7" gives the weekday number
            target_weekday = weekday_list.index(date_string) % 7

            # step back between 1 and 7 days to land on the requested weekday
            days_back = (today.weekday() - target_weekday - 1) % 7 + 1
            date = today - datetime.timedelta(days=days_back)
        else:
            date = datetime.datetime.strptime(date_string, '%d %B %Y')

        return [date.year, date.month, date.day]

    def string_from_date(self, date_array):
        """
        __ Parameters __
        [year, month, day] date: array of the date

        __ Description __
        converts the array to string representation "18 May 2019"
        (the day is zero padded, e.g. "08 May 2019")

        __ Return __
        [str] date_string
        """

        return self.datetime_from_date(date_array).strftime("%d %B %Y")

    def datetime_from_date(self, date_array):
        """
        __ Parameters __
        [year, month, day] date: array of the date

        __ Description__
        converts the array to a datetime object (at midnight)

        __ Return __
        [datetime] datetimeObject
        """
        return datetime.datetime(date_array[0], date_array[1], date_array[2])

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 4.05 µs


In [5]:
%time
class audioTranslator_bot(selenium_bot):
    """bot to translate audio files on google

    Plays audio files from the ./audio folder with vlc while google translate
    listens through its voice input, then stores the recognised text in the
    "pandas_out" DataFrame (columns: Language, AudioFile, Transcript).
    """

    def __init__(self, browser, timeout, save_period=5, url="https://translate.google.com/",
                 page_loaded_xpath="//div[@class='source-footer-wrap source-or-target-footer']", language="english"):
        """
        __ Parameters __
        [str] browser: "Firefox" or "Chrome"
        [float] timeout: how long to wait for timeouts on the page
        [int] save_period: how often to save an output file. converted to int and
                           stored on the instance
        [str] url: of page to visit
        [str] page_loaded_xpath: xpath to indicate that page has loaded
        [str] language: "english" or "chinese", language selected during setup

        __ Description __
        initialisation of web driver and translation variables.
        if the page does not load in time a message is printed and the bot is
        left without a configured page
        """

        # 0 - results table. created first, so that "reset" and "save_data"
        # can be used even if loading the page below times out
        self.entry_list = ["Language", "AudioFile", "Transcript"]
        self.pandas_out = pd.DataFrame(columns=self.entry_list)

        try:
            # 1 - setup driver
            super().__init__(browser, timeout, int(save_period), url, page_loaded_xpath)

            # 2 - setup the environment
            self.__setup(language)

        except TimeoutException:
            print(
                "**> Page failed to fully load. Increase timeout (currently %.1fs)" % (self.timeout))

    def __setup(self, language):
        """
        __ Parameters __
        [str] language: "english" or "chinese"

        __ Description __
        - select language
        - ask the user to route the system sound through Soundflower and
          block until ENTER is pressed
        """

        # 1 - set language
        print("\n==> __setup start")
        self.set_language(language)

        # 2 - change to soundflower
        # NOTE: the record button is not clicked here to request microphone access.
        # in Chrome the permission popup is bypassed by a launch argument
        print("✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠")
        print("  > Settings -> Sound -> Output: Soundflower (2ch)")
        print("  > Settings -> Sound -> Input: Soundflower (2ch)")
        print("✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠✠")
        print("**> SET THESE SETTINGS AND PRESS ENTER <**")
        input()

        print("==> __setup end")

    def set_language(self, language="english"):
        """
        __ Parameters __
        ["english" or "chinese"]: language (case insensitive).
                                  anything other than "english" selects chinese

        __ Description__
        Refresh page and set both the source and the target language
        """

        print("  > set_language start")
        self.language = language.lower()

        if self.language not in ("english", "chinese"):
            print("**> Unknown language \"%s\", chinese will be selected" % (self.language))

        ######################################## XPATHS
        # language code used inside the class names of the language list
        code = "en" if self.language == "english" else "zh-CN"
        from_menu_xp = "//div[@class='sl-wrap']/div[@aria-label='More']"
        from_item_xp = "//div[@class='language_list_item_icon sl_list_%s_checkmark']" % (code)
        to_menu_xp = "//div[@class='tl-wrap']/div[@aria-label='More']"
        to_item_xp = "//div[@class='language_list_item_icon tl_list_%s_checkmark']" % (code)
        ########################################

        # 1 - refresh page
        self.refresh()

        # 2 - set to translate
        print("  > Setting language to %s" % (self.language.upper()))

        self.supp_click(from_menu_xp)
        time.sleep(0.1)     # without this delay the next click is skipped
        self.supp_click(from_item_xp)
        self.supp_click(to_menu_xp)
        self.supp_click(to_item_xp)

        print("  > set_language end")

    def play_audio(self, audio="test.mp3", vlc_path="/usr/local/bin/vlc"):
        """
        __ Parameters __
        [string] audio: name of the .mp3 file to play. It should be in the /audio folder
        [string] vlc_path: location of the vlc executable

        __ Description __
        after checking that the audio file exists, play the audio file in vlc and quit.
        blocks until vlc has exited

        __ Return __
        True: if file was played
        False: if the file does not exist or vlc could not be started
        """

        # 1 - build full path
        audio_path = "audio/" + audio

        # 2 - check if file exists
        if not os.path.isfile(audio_path):
            print("\n**> Audio file \"%s\" does not exist" %(audio_path))
            return False

        print("\n  > Playing audio file:\t\"%s\"" %(audio_path))

        # 3 - open file with vlc and exit
        # arguments are passed as a list, so file names with spaces stay in one piece
        args = [vlc_path, "--no-loop", "--no-repeat", "--play-and-exit", audio_path]
        try:
            # output is discarded: waiting on an unread pipe could block forever
            subprocess.run(args, stdout=subprocess.DEVNULL)
        except OSError as error:
            print("\n**> Could not start vlc \"%s\": %s" % (vlc_path, error))
            return False

        print("  > File played")
        return True

    def transcript_audio(self, language, audio):
        """
        __ Parameters __
        ["english" or "chinese"] language: language to translate
        [str] audio: name of audio file located in the /audio folder

        __ Description __
        feeds the audio to google translate and writes output to a DataFrame and file
        (./output/parsed_audio.csv). if no translation appears within the timeout,
        the transcript is stored as "-no-audio-"

        __ Return __
        [ DataFrame ] of the [language, audioName, transcript] rows collected so far
        """

        ######################################## XPATHS
        recordButton_xp = "//div[@data-tooltip='Turn on voice input']/span[@class='jfk-button-img']"
        translation_xp = "//span[@class='tlid-translation translation']"
        english_transcript_tags = [["span", {"class": "tlid-translation translation"}]]
        chinese_transcript_tags = [["div",
                          {"class": "tlid-result-transliteration-container result-transliteration-container transliteration-container"}],
                         ["div",
                          {"class": "tlid-transliteration-content transliteration-content full"}]]
        ########################################

        print("\n==> transcript_audio start\t[%s]" % (audio))
        # 1 - set language
        self.set_language(language)

        # 2 - start the voice input and play the audio file
        self.supp_click(recordButton_xp)
        self.play_audio(audio)

        # 3 - wait for translation to load
        try:
            self.supp_wait_for_xpath(translation_xp, "translation box")
            time.sleep(1)

            # 4 - grab the transcript and tidy it up
            if self.language == "english":
                transcript_tags = english_transcript_tags
            else:
                transcript_tags = chinese_transcript_tags
            transcript = self.supp_extract_text(self.supp_load_soup(), transcript_tags)

            transcript = " ".join(transcript)
            transcript = self.convert_to_abc(transcript)
            print("  > Extracted: \"%s\"" % (transcript))

        except TimeoutException:
            # if translation is empty, the audio was not read in
            print("\n**> No Audio:\tTurn volume up or look at file %s" % (audio))
            transcript = "-no-audio-"

        # 5 - add to DataFrame
        to_write = {"Language": self.language,
                    "AudioFile": audio,
                    "Transcript": transcript}
        # "DataFrame.append" is gone from recent pandas, "concat" works in old and new versions
        self.pandas_out = pd.concat(
            [self.pandas_out, pd.DataFrame([to_write])], ignore_index=True)

        # 6 - save
        self.save_data(file_name="parsed_audio")

        print("==> transcript_audio end")
        return self.pandas_out

    def convert_to_abc(self, text_to_convert):
        """
        __ Parameters __
        [str] text_to_convert

        __ Description __
        converts the supplied string, word by word (split on single spaces), by
        - dropping words without any letter, digit or underscore
        - lowercasing
        - removing all accents

        __ Return __
        [str] converted words joined by single spaces
        """

        converted_list = [unidecode.unidecode(word.lower())
                          for word in text_to_convert.split(" ")
                          if re.search(r"\w", word)]

        return " ".join(converted_list)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [None]:
########################################
########################################
timeout=10                      # seconds to wait for page elements to load before quitting
browser="firefox"                # firefox or chrome
########################################
########################################
# start the browser on the default google translate url and run the setup,
# which blocks on input() until ENTER is pressed
running = audioTranslator_bot(browser, timeout)

==> setup_browser start
  > Starting new Firefox server


In [9]:
# transcribe the clips one after another. every call adds a row to
# running.pandas_out and saves it under the file name "parsed_audio"
running.transcript_audio("english", "test.mp3")
running.transcript_audio("chinese", "chinese.mp3")
running.transcript_audio("english", "test.mp3")

NameError: name 'running' is not defined