# Example implementations
Below are two implementations that use Selenium to scrape **Outlook** and **Skype**. The step-by-step structure is explained in the <a href="./adv04_selenium_skypeOutlook.pdf"><b>PDF Report</b></a>. This is just a demonstration of what full-fledged scraping looks like.

<img src="images_inkscape/structure_selenium.png" style="width: 500px;">

***
# Selenium class and imports

In [3]:
import time

# create a browser instance
from selenium import webdriver

# emulate keyboard inputs
from selenium.webdriver.common.keys import Keys

# creating a single browser instance
import selenium.webdriver.firefox.service as service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementNotInteractableException

# WebDriverWait and EC to allow waiting for element to load on page
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# module to search for elements using xpaths
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions

# exception handling
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# quick clicking and scrolling
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# searching of html with "find()"
from bs4 import BeautifulSoup
import pandas as pd
import sys
import math
import os                       # file saving
import re
import datetime

In [4]:
class selenium_bot():
    """
    Interactable bot, that parses outlook files
    """
    def __init__(self, browser, timeout, save_period, url, succesful_login_xpath):
        """
        __ Parameters __
        [str] browser: "Firefox" or "Chrome" (case-insensitive). Any other value
                       falls back to Chrome, which is what was always started before
        [float] timeout: how long to wait for responses from webpage
        [float] save_period: how often to create a backup of the parsed data.
                             Stored as-is; the unit is interpreted by the scraping code
        [str] url: url bot starts off at
        [str] succesful_login_xpath: xpath to indicate that page has loaded

        __ Description __
        sets up selenium bot: starts the requested browser, maximises the window
        and opens "url"
        """

        self.browser = browser.lower()
        self.timeout = timeout
        self.url = url
        self.succesful_login_xpath = succesful_login_xpath

        # 1 - setup browser (honour the "browser" argument instead of always using chrome)
        if self.browser == "firefox":
            self.driver = self.__setup_firefox()
        else:
            self.driver = self.__setup_chrome()
        self.driver.maximize_window()

        # 2 - load page
        self.driver.get(self.url)

        # 3 - supporting parameters for the future
        # waiter, to wait for contents to load. call the "waiter.until(function)" method
        self.WebDriverWaiter = WebDriverWait(self.driver, self.timeout)
        self.save_period = save_period

        print("==> setup_browser end\n")

    def __setup_firefox(self):
        """
        __ Description __
        starts a Firefox session through the geckodriver binary located in the
        current working directory

        __ Returns __
        driver handle
        """

        print("  > Starting new Firefox server")

        # the driver binary is expected next to the notebook / script
        return webdriver.Firefox(executable_path='./geckodriver')

    def __setup_chrome(self):
        """
        __ Description __
        starts a Chrome session through the chromedriver binary located in the
        current working directory

        __ Returns __
        driver handle
        """

        # 1 - capabilities: no automation extension, no other extensions
        chrome_capabilities = {
            'chromeOptions': {
                'useAutomationExtension': False,
                'args': ['--disable-extensions'],
            }
        }

        # 2 - download / safe-browsing preferences
        download_prefs = {
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
        }
        chrome_options = Options()
        chrome_options.add_experimental_option("prefs", download_prefs)

        # 3 - start the browser with the settings above
        print("  > Starting new Chrome server")
        return webdriver.Chrome(executable_path="./chromedriver",
                                desired_capabilities=chrome_capabilities,
                                options=chrome_options)
    
    def supp_extract_html(self, soup, html_tags_array):
        """
        __ Parameters __
        [soup] soup: html to extract from formatted with BeautifulSoup
        [arr] html_tags_array: array of the form
        
                                    [["div", {"role": "option"}], 
                                    ["div", {"aria-label": "Reading Pane"}], 
                                    ...]

        which specifies the name ("div", "span") and attributes ({"id": ["test1", "test2"], "aria-label": "pane"})
        from outer to inner tags, iteratively going down specificity levels

        __ Description __
        iterates through the supplied "soup" html looking for tags whose parrents match all the supplied "html_tags"

        __ Return __
        [htmltag1, htmltag2, htmltag3]: array of html tags that fit the search requirement
        """

        structure_depth  = len(html_tags_array)
        debug_counter = 0

        try:
            if(structure_depth != 1):
                # 1 - unpack the first structure
                current_structure = soup.find(
                    html_tags_array[0][0], attrs=html_tags_array[0][1])

                # 2 - unpack further structures until we get to the last one
                for i in range(1, structure_depth - 1):
                    debug_counter += 1
                    name = html_tags_array[i][0]
                    attrs = html_tags_array[i][1]
                    current_structure = current_structure.find(names, attrs=attrs)
                # 3 - extract all matches from the lowest structure
                current_structure = current_structure.find_all(
                    html_tags_array[-1][0], attrs=html_tags_array[-1][1])
            else:
                # 1 - in the special case that only one structure is specified
                current_structure = soup.find_all(
                    html_tags_array[0][0], attrs=html_tags_array[0][1])

            return current_structure
            
        except AttributeError:
            # Error when an entry is missing
            print("The page does not have the html element:\n\t[%s, %s]"
                  % (html_tags_array[debug_counter], html_tags_array[debug_counter]))
            
            return ""
        
    def supp_extract_text(self, soup, html_tags_array):
        """
        __ Parameters __
        [soup] soup: html to extract from formatted with BeautifulSoup
        html_tags_array: array of the form
        
        [["div", {"role": "option"}], 
        ["div", {"aria-label": "Reading Pane"}], 
        ...]

        which specifies the name ("div", "span") and attributes ({"id": ["test1", "test2"], "aria-label": "pane"})
        from outer to inner tags, iteratively going down specificity levels

        __ Description __
        iterates through the supplied "soup" html looking for tags whose parrents match all the supplied "html_tags"
        then a text array is extracted from this tag

        __ Return __
        [array] matching text in the innter structure
        """

        html_structure = self.supp_extract_html(soup, html_tags_array)
        
        # 1 - take all of the tags found and extract text
        array_to_return = [i.get_text().strip() for i in html_structure]
        
        return array_to_return
        
    def supp_write_to_element(self, element_xpath, fill_value):
        """
        __ Parameters __
        [str] element_xpath: element to look for e.g. //div[@id=|password|]
        [str] fill_value: what to write in the form

        __ Description __
        waits for the element to be present, then types "fill_value" into it

        __ Return __
        always True
        """
        # block until the element exists on the page
        self.supp_wait_for_xpath(element_xpath, "input_box")

        target = self.driver.find_element_by_xpath(element_xpath)
        if not target:
            print("**> Element with xpath %s does not exist" %element_xpath)
        else:
            target.send_keys(fill_value)

        return True

    def supp_wait_for_xpath(self, xpath, description):
        """
        __ Parameters __
        [str] xpath: xpath to wait for
        [str] description: human readable name of what is being waited for; it is
                           printed to console. Pass "NA" to print nothing

        __ Description __
        blocks until an element matching "xpath" is present on the page, using the
        waiter (and therefore the timeout) created in the constructor
        """

        announce = description != "NA"
        if announce:
            print('  > Waiting for "%s" to load' %(description))

        element_present = EC.presence_of_element_located((By.XPATH, xpath))
        timeout_message = "Did not find %s within the timeout time you set of %i"%(xpath, self.timeout)
        self.WebDriverWaiter.until(element_present, message=timeout_message)
        
    def supp_click(self, xpath):
        """
        __ Parameters __
        [str] xpath: xpath of object to click

        __ Description __
        looks the element up once, prints it to console and clicks it
        """
        # a single lookup: the element that is printed is exactly the one clicked,
        # and the page is not queried twice
        element = self.driver.find_element_by_xpath(xpath)
        print(element)
        element.click()
        
    def supp_load_soup(self):
        """
        __ Description __
        parses the driver's current page source with BeautifulSoup ('html.parser')

        __ Returns __
        Soup Object to search
        """
        return BeautifulSoup(self.driver.page_source, 'html.parser')
    
    def refresh(self):
        """
        __ Description __
        navigates back to the start url and waits until the "successful login"
        xpath is present again
        """

        start_url = self.url
        self.driver.get(start_url)

        loaded_marker = self.succesful_login_xpath
        self.supp_wait_for_xpath(loaded_marker, "main page")

    def reset(self):
        """
        __ Description __
        clears the pandas_out array to the initial value
        """

        self.pandas_scraped = pd.DataFrame(columns=self.pandas_columns)

    def save_data(self, file_name="pandas_out", ext="csv"):
        """
        __ Parameters __
        [str] file_name: the file to save to. provide .pkl or .csv extension
        
        __ Description __
        Saves data accumulated in "pandas_out" to output file
        """
        
        # 1 - create output directory
        if not os.path.exists("./output"):
            os.mkdir("output")

        # 2 - cut any extensions that were given by accident
        file_name = file_name.split(".")[0]
        file_name = "./output/%s" % (file_name)
        
        if(ext == "pkl"):
            self.pandas_scraped.to_pickle("%s.pkl" % file_name)
        else:
            self.pandas_scraped.to_csv("%s.csv" % file_name)

    def date_from_string(self, date_string):
        """
        __ Parameters __
        [str] date_string: either day of week or "18 May 2019"

        __ Description __
        convert to an array numerical date values. if a weekday was supplied, find the nearest previous date

        __ Return __
        [year, month, day] date: array of the date
        """

        weekday_list = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
                        "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]


        if (date_string in weekday_list):
            # 1 - set loop parameters
            date = datetime.date.today()
            date_shift = datetime.timedelta(days = 1)
            date_found = False

            # 2 - decrease date, until the weekday_list match
            while(not date_found):
                date = date - date_shift
                day_of_the_week_long = weekday_list[date.weekday()]
                day_of_the_week_short = weekday_list[date.weekday() + 7]
                if((day_of_the_week_long == date_string) or (day_of_the_week_short == date_string)):
                    date_found = True
        else:
            date = datetime.datetime.strptime(date_string, '%d %B %Y')

        date_array = [date.year, date.month, date.day]
        return date_array

    def string_from_date(self, date_array):
        """
        __ Parameters __
        [year, month, day] date: array of the date

        __ Description __
        converts the array to string representation "18 May 2019"

        __ Return __
        [str] date_string
        """

        date = datetime.datetime(date_array[0], date_array[1], date_array[2])
        return date.strftime("%d %B %Y")    

    def datetime_from_date(self, date_array):
        """
        __ Parameters __
        [year, month, day] date: array of the date

        __ Description __
        converts the array to a datetime object

        __ Return __
        [datetime] datetimeObject
        """
        return datetime.datetime(date_array[0], date_array[1], date_array[2])

# 📧 Outlook class

In [5]:
class outlook_bot(selenium_bot):
    """bot to extract email content from outlook
    """

    def __init__(self, browser, timeout, save_period=5,
                 url="https://mail.sinobestech.com.hk/owa",
                 succesful_login_xpath = "//div[@class = 'flex flexcolumn']"):
        """
        __ Parameters __
        [str] browser: "Firefox" or "Chrome"
        [float] timeout: how long to wait for timeouts on the page
        [int] save_period: during scraping of email, how often to save an output file. default every 5 emails
        [str] url: page the bot opens on start-up
        [str] succesful_login_xpath: xpath whose presence means the inbox has loaded

        __ Description__
        starts the web driver through the base class, then prepares the
        outlook-specific state
        """

        # 1 - setup driver (save_period is forced to a whole number of emails)
        emails_between_saves = int(save_period)
        super().__init__(browser, timeout, emails_between_saves, url, succesful_login_xpath)

        # 2 - setup outlook environment
        self.__setup_outlook()
        
    def __setup_outlook(self):
        """
        __ Description __
        Sets up supporting objects for outlook

        self.pandas_scraped: ouput dataframe with keys:
        [ "From", "Date", "Subject", "Content_Conversation", "Content_Forwarded"]
        """

        # 1 - pandas dataframe
        self.pandas_columns = [ "From", "Date", "Subject", "Content_Conversation", "Content_Forwarded"]
        self.pandas_scraped = pd.DataFrame(columns=self.pandas_columns)

        self.scrape_filters_set = False
        
        # 2 - debugging
        self.entry_missing_array = [0] * 5
        self.email_current = -1
        self.email_total = -1

    def outlook_login(self, outlook_id, password):
        """
        __ Parameters __
        [str] outlook_id: email to log on with
        [str] password:   password

        __ Description __
        fills in the credentials on the login page, presses the login button and
        waits until the "successful login" xpath is present
        """
        username_xp = "//input[@id='username']"
        password_xp = "//input[@id='password']"
        login_button_xp = "//div[@onclick='clkLgn()']"

        print("==> outlook_login start")

        # 1 - the login form has to be on screen first
        self.supp_wait_for_xpath(username_xp, "user_name_input_field")

        # 2 - type the credentials and press the login button
        self.supp_write_to_element(username_xp, outlook_id)
        self.supp_write_to_element(password_xp, password)
        print(type(self.driver.find_element_by_xpath(login_button_xp)))
        self.driver.find_element_by_xpath(login_button_xp).click()

        # 3 - the login only counts once the main page shows up
        self.supp_wait_for_xpath(self.succesful_login_xpath, "main_page")

        print("==> outlook_login end\n")

    def outlook_scrape_setup(self, date_min, date_max, only_unread=False, scan_min=0, scan_max=9999):
        """
        __ Parameters __
        [year, month, day] date_min/date_max:   date range to scrape
        [bool] only_unread:                     whether only unread emails should be scraped
        [int] scan_min/max:                     email range to scrape

        __ Description __
        sets values in preparation for scraping outlook
        """

        print("==> outlook_scrape_setup start")

        self.criteria = {}

        # 1 - set dates if supplied
        if(date_min):
            self.criteria['date_min'] = date_min
            print("  > Minimum date:\t",
                  datetime.datetime(date_min[0], date_min[1], date_min[2]).strftime("%A, %d %b %Y"))
        else:
            self.criteria['date_min'] = [1, 1, 1]
            print("  > No minimum date")

        if(date_max):
            self.criteria['date_max'] = date_max
            print("  > Maximal date:\t",
                  datetime.datetime(date_max[0], date_max[1], date_max[2]).strftime("%A, %d %b %Y"))
        else:
            self.criteria['date_max'] = [8888, 1, 1]  # highest possible date
            print("  > No maximal date")

        # 2 - read/unread
        self.criteria['only_unread'] = only_unread
        if(only_unread):
            print("  > Scraping only unread emails")

        # 3 -  min
        self.criteria['scan_min'] = scan_min
        self.criteria['scan_max'] = scan_max
        print(f"  > Email index start:\t{scan_min}\n  > Email index end:\t{scan_max}")

        self.scrape_filters_set = True

        print("==> outlook_scrape_setup end\n")

        
    def outlook_scrape(self, file_name="pandas_out", ext="csv"):
        """
        __ Parameters __
        [str] file_name:        file (inside ./output) to which the dataframe is saved
        [str] ext:              format to save as - pkl or csv

        __ Description __
        Iterates through the emails and parses out information into pandas dataframe.

        Each pass of the loop reads the emails currently visible in the inbox list,
        scrapes the ones not seen before that satisfy "self.criteria", then clicks the
        last new email so that the list scrolls further. The loop stops when
            - a pass finds no new emails,
            - more emails than "scan_max" have been seen, or
            - the visible elements are unchanged after clicking the last email.
        Emails that fail to load within the timeout are skipped.
        The dataframe is saved every "self.save_period" emails and once more at the end.
        """
        ########################################xpaths
        inbox_mail_L1_xp = "//div[@class = 'flex flexcolumn']"
        inbox_mail_L2_xp = "//div[@role ='option']"
        inbox_mail_soup = [["div", {"class": "flex flexcolumn"}],
                           ["div", {"role": "option"}]]
        ########################################

        print("==> outlook_scrape start")

        # 0 - prepare variables
        self.refresh()
        self.supp_wait_for_xpath(self.succesful_login_xpath, "NA")

        if not self.scrape_filters_set:
            self.outlook_scrape_setup(None, None)
        inbox_cycle = True
        email_loop_no = 0
        email_base_index = 0            # base index is required to stitch email numbers across different loops
        uniqueID_already_scraped = set()
        emails_scraped = 0              # emails actually stored in the dataframe

        while inbox_cycle:

            # 1 - [XPATH] extract unique tag and webelement of each email
            visible_webElements = self.driver.find_element_by_xpath(inbox_mail_L1_xp).find_elements_by_xpath(inbox_mail_L2_xp)
            visible_uniqueID = [element.text for element in visible_webElements]

            # 2 - [SOUP] extract metadata of visible mail
            soup = self.supp_load_soup()
            visible_metadata = [{"date": self.outlook_inbox_date(tag),
                                 "unread": self.outlook_inbox_unread(tag)}
                                for tag in self.supp_extract_html(soup, inbox_mail_soup)]

            # 3 - only keep unscraped mail i.e. uniqueID is not in "uniqueID_already_scraped" set
            emails_to_scrape = []
            for email_webElement, email_uniqueID, email_metadata in zip(visible_webElements, visible_uniqueID, visible_metadata):
                if email_uniqueID in uniqueID_already_scraped:
                    continue

                # 3a - number the email by its position among all new emails found so far
                email_metadata["email_no"] = email_base_index + len(emails_to_scrape)
                # 3b - store webelement and metadata for further extraction
                emails_to_scrape.append({"email_webElement": email_webElement,
                                         "email_metadata": email_metadata})
                # 3c - store the unique tag to prevent scraping it in the future
                uniqueID_already_scraped.add(email_uniqueID)

            print("  [Loop No.%i]:\t%i unique emails found in inbox so far" % (email_loop_no, len(uniqueID_already_scraped)))

            # nothing new on screen (empty inbox or end of the list): there is no
            # "last email" to click, so stop instead of indexing an empty list
            if not emails_to_scrape:
                break

            # 4 - iterate through only the new emails
            for email in emails_to_scrape:

                # a - check that email_metadata fulfills criteria
                if not self.criteria_check(email['email_metadata'], self.criteria):
                    continue

                print(self.outlook_scrape_print_progress(email['email_metadata']))
                email_content = self.outlook_scrape_email(email['email_webElement'])

                # b - a timed-out email yields None and is skipped
                if email_content is None:
                    continue

                # DataFrame.append no longer exists in pandas >= 2.0; concat works on all versions
                self.pandas_scraped = pd.concat(
                    [self.pandas_scraped, pd.DataFrame([email_content])],
                    ignore_index=True)
                emails_scraped += 1

                # c - write to file periodically during intense data writing
                emails_into_range = email['email_metadata']['email_no'] - self.criteria['scan_min'] + 1
                if self.save_period > 0 and emails_into_range % self.save_period == 0:
                    self.save_data(file_name, ext)

            # 5 - click on last mail (to scroll down)
            self.outlook_scrape_email(emails_to_scrape[-1]['email_webElement'])

            # a - set variables for next loop
            email_loop_no += 1
            email_base_index = len(uniqueID_already_scraped)

            # b - check against max emails scraped
            if email_base_index > self.criteria['scan_max']:
                # if we have passed the last requested email, stop
                inbox_cycle = False

            # c - check against no scrolling (same emails displayed)
            visible_idx_new = self.driver.find_element_by_xpath(inbox_mail_L1_xp).find_elements_by_xpath(inbox_mail_L2_xp)
            if visible_webElements == visible_idx_new:
                inbox_cycle = False

        # 6 - save data and exit
        print("==> Scraped %i emails that fit the criteria" % emails_scraped)
        self.save_data(file_name, ext)
        print("==> outlook_scrape end")

    def outlook_scrape_print_progress(self, email_metadata):
        """
        __ Parameters __
        [dict] email_metadata:          {"date":        [year, month, day] of email,
                                         "unread":      [bool] read status of email,
                                         "email_no":    [int] top email is 0}

        __ Description __
        generates string to print to console about the emial currently being extracted

        __ Returns __
        [str] string to print to console describing email being extracted
        """
        email_no = email_metadata['email_no']
        date = email_metadata['date']
        unread = email_metadata['unread']

        # 1 - read undread
        string_read = "\t[Read]"
        if(unread):
            string_read = "\t[Unread]"
            
        # 2 - date
        string_date = "\t[" + datetime.datetime(date[0], date[1], date[2]).strftime("%A, %d %b %Y") + "]"

        string_to_print = f"  > Scraping Email {email_no}" + string_read + string_date

        return string_to_print
        
    def outlook_scrape_email(self, email_webElement):
        """
        __ Parameters __
        [web_element] email_webElement:            element found with xPath  "self.driver.find_element_by_xpath(...)"

        __ Description __
        clicks the email, waits for its body (and any forwarded part) to load and
        extracts its fields from the page html

        __ Return __
        [dict]          {"From": e_from,
                        "Date": e_date,
                        "Subject": e_subject,
                        "Content_Conversation": e_content_conversation,
                        "Content_Forwarded": e_content_forwarded}
        None when the email did not load within the timeout
        """
        # 1 - open the email and wait for it to load
        try:
            email_webElement.click()
            self.supp_wait_for_xpath("//div[@id = 'Item.MessageUniqueBody']", "NA")
            self.WebDriverWaiter.until(wait_for_content_forwarded())
        except TimeoutException:
            print(
                "**> Email failed to load. Increase timeout (currently %.1fs)" % (self.timeout))
            return None

        # 2 - snapshot the page html; every field below is searched in this soup
        soup = self.supp_load_soup()
        reading_pane = ["div", {"aria-label": "Reading Pane"}]

        def joined_text(entry_index, tag_path):
            # record which entry is being extracted, then join all matching texts
            self.entry_current = entry_index
            return "".join(self.supp_extract_text(soup, tag_path))

        # a - subject
        e_subject = joined_text(0, [reading_pane,
                                    ["div", {'role': "heading", "aria-level": "2"}]])

        # b - from: keep only what precedes the "<address>" part
        sender_raw = joined_text(1, [["div", {"aria-label": "Persona card"}]])
        e_from = re.search("([^<]*)(.*)?", sender_raw).group(1).strip()

        # c - date and time
        e_date = joined_text(2, [["div", {"class": "_rp_f8"}],
                                 ["span", {"class": "allowTextSelection"}]])

        # d - content of the conversation itself
        e_content_conversation = joined_text(3, [reading_pane,
                                                 ["div", {"role": "document"}]])

        # e - content of the forwarded part
        e_content_forwarded = joined_text(4, [reading_pane,
                                              ["div", {"id": "Conversation.FossilizedTextBody"}]])

        # 3 - structure building and return
        return {"From": e_from,
                "Date": e_date,
                "Subject": e_subject,
                "Content_Conversation": e_content_conversation,
                "Content_Forwarded": e_content_forwarded}
    
    def outlook_inbox_date(self, inbox_tag):
        """
        __ Parameters __
        [soup] inbox_tag: a html tag of an particular email in the inbox

        __ Description __
        extracts a date of the email in the inbox column by searching the "inbox_tag"

        __ Returns __
        [day, month, year]
        """

        ########################################
        date_attr = {"class": ["_lvv_M"]}
        ########################################

        # 1 - extract date tag
        date_tag = inbox_tag.find(attrs=date_attr)
        date_inbox = date_tag.get_text()

        # 2 - split date put by slashes. this will work for old entries
        date_return = date_inbox.split("/")
        date_return = date_return[::-1] # reverse order so that [year, month, day]
        
        if(len(date_return) != 3):
            # 3 - for email sent this week, the first string is the day of the week, which is converted to [year, month, day]
            weekday = date_inbox.split(" ")[0]
            date_return = self.date_from_string(weekday)

        # 3 - convert to int
        date_return = [int(i) for i in date_return]
            
        return date_return

    def outlook_inbox_unread(self, inbox_tag):
        """
        __ Parameters __
        [soup] inbox_tag: a html tag of an particular email in the inbox

        __ Description __
        checks if email unread or not

        __ Returns __
        True if unread. False otherwise
        """

        # 1 - read email have a "_lvv_y_" tag
        mg = re.search("\s_lvv_y\s", str(inbox_tag))

        # 2 - check if match was found, indicating that email has been read
        if(mg):
            return False
        else:
            return True
        
    def criteria_check(self, email_metadata, criteria):
        """
        __ Parameters __
        [dict] email_metadata:  {'email_no':   	[int]  email number as it appears in inbox
                                 'date':       	[year, month, day] of email
                                 'unread':     	[bool]  read status of email}

        [dict] criteria:        {'scan_min':   [int] range to scrape (0 for top email)
                                 'scan_max':
                                 'date_min':    [int] date range to scrape
                                 'data_max':
                                 'only_unread': [bool] whether to scrape only unread emails}

        __ Description __
        checks whether email should be scraped based off it's email_metadata

        __ Return __
        True/False
        """
        # 1 - extract criteria info
        date_min = criteria['date_min']
        date_max = criteria['date_max']
        date_min = datetime.datetime(date_min[0], date_min[1], date_min[2])
        date_max = datetime.datetime(date_max[0], date_max[1], date_max[2])

        # 2 - extract email_metadata
        unread = email_metadata['unread']
        email_no = email_metadata['email_no']
        date = email_metadata['date']
        date = datetime.datetime(date[0],date[1],date[2])    

        return_val = False

        ########################################
        # ⦿ Perform check
        ########################################
        # 1 - check that email is within indicies
        if((criteria['scan_min'] <= email_no) and (email_no <= criteria['scan_max'])):
            
            # 2 - check date
            if ((date_min <= date) and (date <= date_max)):
                
                # 3 - if scraping only unread, check unread status
                if(criteria['only_unread']):
                    if(unread):
                        return_val = True
                    else:
                        return_val = False
                else:
                    return_val = True

        return return_val

In [6]:
class wait_for_content_forwarded():
    """Wait condition: a forwarded message (if the email has one) is loaded

    Evaluates to True when the email has no forwarded section, or when the
    body of the forwarded message is already present on the page

    To be used in the following way:
    WebDriverWait(driver, timeout).until(wait_for_content_forwarded())
    """

    def __call__(self, driver):
        """
        __ Parameters __
        driver: WebDriverWait.until(xxx) calls xxx with the 'driver' as the
        first argument

        __ Description __
        checks that any forwarded email is fully loaded

        a forwarded email has a non empty <div of forwarded email> in the
        following position:

        <div aria-label='Reading Pane'> .....
            <div>....</div>
            <div>        <---------- div[2]
                <div>...</div>
                etc. etc.
                <div of forwarded email> <----------- NON empty when there is forwarding
                <div>...</div>        <---------- div[last()]
            </div>
        </div>

        __ Return __
        True: nothing is being forwarded, or the forwarded email has loaded
        False: there is a forwarded email and its body has NOT loaded yet
        """
        forwarded_section_xp = "//div[@aria-label='Reading Pane']/div[2]/div[last()-1]/*"
        forwarded_body_xp = "//div[@id='Conversation.FossilizedTextBody']/div[1]"

        # 1 - an empty <div of forwarded email> means there is nothing to wait for
        try:
            driver.find_element_by_xpath(forwarded_section_xp)
        except NoSuchElementException:
            return True

        # 2 - otherwise the wait is over only once the forwarded body exists
        try:
            driver.find_element_by_xpath(forwarded_body_xp)
        except NoSuchElementException:
            return False
        return True

In [8]:
########################################
########################################
import getpass                  # only needed by this cell, hence imported here

# the password is taken from the environment (or prompted for without echo)
# so that it is never stored in the notebook itself
outlook_id = os.environ.get("OUTLOOK_ID", "programmer01@sbtgc.local")
password = os.environ.get("OUTLOOK_PASSWORD") or getpass.getpass("Outlook password: ")
timeout = 50                    # seconds to wait for page elements to load before quitting
browser = "chrome"              # "firefox" or "chrome"
########################################
########################################
outlook_class = outlook_bot(browser, timeout)
outlook_class.outlook_login(outlook_id, password)

  > Starting new Chrome server
==> setup_browser end

==> outlook_login start
  > Waiting for "user_name_input_field" to load
  > Waiting for "input_box" to load
  > Waiting for "input_box" to load
<class 'selenium.webdriver.remote.webelement.WebElement'>
  > Waiting for "main_page" to load
==> outlook_login end



In [9]:
########################################
########################################
# email index window (optional arguments, the setup can be called without them)
# the top email has an id=0, the second email id=1 etc.
id_min = 0                      # set 0 to include all emails
id_max = 1000                   # set to 1000 to include all emails
# date window, each bound is either None or a list such as [2018, 1, 1]
date_min = None
date_max = [2019, 5, 25]
unread_only = False
########################################
########################################
outlook_class.outlook_scrape_setup(
    date_min, date_max, unread_only, id_min, id_max
)
outlook_class.outlook_scrape("outlook", "csv")

==> outlook_scrape_setup start
  > No minimum date
  > Maximal date:	 Saturday, 25 May 2019
  > Email index start:	0
  > Email index end:	1000
==> outlook_scrape_setup end

==> outlook_scrape start
  > Waiting for "main page" to load
  [Loop No.0]:	25 unique emails found in inbox so far
  > Scraping Email 0	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 1	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 2	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 3	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 4	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 5	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 6	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 7	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 8	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 9	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 10	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 11	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 12	[Read]	[Friday, 12 Apr 2019]
  > Scraping Email 13	[Read]	[Friday, 12 Apr 2019]
  > Scra

In [None]:
# ☎ Skype class

In [11]:
class skype_bot(selenium_bot):
    """bot to extract email content from skpye
    """
    
    def __init__(
        self,
        browser,
        timeout,
        save_period=5,
        url="https://web.skype.com",
        succesful_login_xpath="//div[@role='group'][@aria-label='Conversations list']",
    ):
        """
        __ Parameters __
        [str] browser:                  "Firefox" or "Chrome"
        [float] timeout:                how long to wait for timeouts on the page
        [int] save_period:              passed on to selenium_bot.__init__ after conversion to int
        [str] url:                      of page to visit
        [str] succesful_login_xpath:    xpath to indicate that page has loaded

        __ Description __
        initialises the web driver through selenium_bot.__init__ and then the
        skype variables. A TimeoutException raised by either step is reported
        on stdout instead of being propagated
        """
        period = int(save_period)

        try:
            # 1 - setup driver, 2 - setup skype environment
            selenium_bot.__init__(self, browser, timeout, period, url, succesful_login_xpath)
            self.__setup()
        except TimeoutException:
            report = "**> Page failed to fully load. Increase timeout (currently %.1fs)"
            print(report % (self.timeout))
        
    def __setup(self):
        """
        __ Description __
        Creates the supporting objects for skype

        self.pandas_columns:     the column names [ "From", "Date", "Message"]
        self.pandas_scraped:     empty output dataframe with those columns
        self.scrape_filters_set: False until skype_scrape_setup has been run
        """
        # 1 - no filters have been supplied yet
        self.scrape_filters_set = False

        # 2 - empty dataframe that will hold the scraped messages
        columns = ["From", "Date", "Message"]
        self.pandas_columns = columns
        self.pandas_scraped = pd.DataFrame(columns=columns)

    def login(self, skype_id, password):
        """
        __ Parameters __
        [str] skype_id:         email to log on with
        [str] password:         password

        __ Description __
        logs into skype: fills in the email, then the password, waits for the
        page identified by self.succesful_login_xpath and finally tries to
        dismiss the "Got it!" popup
        """
        print("==> login start")
        ########################################xpaths
        skype_login_box_xp = "//input[@type='email']"
        skype_password_box_xp = "//input[@type='password']"
        skype_submit_button_xp = "//input[@id='idSIButton9']"
        skype_got_it_xp = "//div[@data-text-as-pseudo-element='Got it!']"
        ########################################

        # 1 - wait for email box
        self.supp_write_to_element(skype_login_box_xp, skype_id)
        self.driver.find_element_by_xpath(skype_submit_button_xp).click()
        time.sleep(3)           # <---------------------------------------- need to wait for password box to come up
        # 2 - wait for password
        self.supp_write_to_element(skype_password_box_xp, password)
        self.driver.find_element_by_xpath(skype_submit_button_xp).submit()

        # 3 - remove popups after page has loaded
        self.supp_wait_for_xpath(self.succesful_login_xpath, "page")
        time.sleep(2)           # <---------------------------------------- wait for the popup box to come up
        try:
            self.driver.find_element_by_xpath(skype_got_it_xp).click()
        except Exception:
            # dismissing the popup is best effort (it may not be shown at all).
            # "Exception" rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here
            pass

        print("==> login end\n")

        
    def skype_scrollCheck_date(self, critetia, current_values):
        """
        __ Parameters __
        [dict] critetia:       {"chats_to_scrape":      [1D-int] starting from 0,
                                "date_min":             [year,month,day]
                                "date_max":             [year,month,day]
                                "max_number_of_stalls": [int] before continuing}
        [dict] current_values  {"current_date":         date string of the oldest message
                                                        reached so far, or "Undefined",
                                "current_number_of_stalls": [int]}

        __ Description __
        checks if the filter-defined date has been reached, to determine if scrolling should continue.
        Only "date_min" and "max_number_of_stalls" are read from the criteria

        __ Return __
        True if scrolling should continue
        False if it should be stopped
        """
        # the parameter keeps its original (misspelled) name so that keyword
        # callers keep working; the body previously read an undefined "criteria"
        criteria = critetia

        # 1 - stop once the chat has stalled for the allowed number of scrolls
        max_stalls = criteria['max_number_of_stalls']
        if(current_values['current_number_of_stalls'] >= max_stalls):
            print("\n  > Chat stalled for the maximal user-defined number of scrolls: %i"
                  %(max_stalls))
            print("  > Stopping scrolling of chat")
            return False

        # 2 - continue scrolling if date is not defined
        current_date = current_values['current_date']
        if(current_date == "Undefined"):
            return True

        # 3 - compare the date reached so far in the chat with the filter date
        current_date = self.date_from_string(current_date) # convert the date from string to array
        current_date = datetime.datetime(current_date[0], current_date[1], current_date[2])
        # the key is 'date_min' (as stored by skype_scrape_setup), not 'min_date'
        criteria_min_date = criteria['date_min']
        criteria_min_date = datetime.datetime(criteria_min_date[0], criteria_min_date[1], criteria_min_date[2])

        if(criteria_min_date <= current_date):
            return True

        print("\n  > Chat scrolled past the user-defined date: %s [now at %s]"
              %(criteria_min_date.strftime("%d %B %Y"), current_date.strftime("%d %B %Y")))
        print("  > Stopping scrolling of chat")
        return False
   
    def skype_scrape_setup(self, chats_to_scrape, max_number_of_stalls, date_min, date_max):
        """
        __ Parameters __
        [1D-int] chats_to_scrape:       chat indexes in the inbox to scrape, 0 for top chat
        [int] max_number_of_stalls:     during scraping, scrolling leads to occasional pauses
                                        while the earlier content is loaded. during this time
                                        the chat page does not change. this specifies how many
                                        times to wait when this happens before exiting
        [year, month, day] date_min/max: date range to scrape for each chat e.g. [2018, 2, 1].
                                        None leaves that side of the range open

        __ Description __
        Initializes the dict "self.criteria" used by the scraping functions with the keys
        'chats_to_scrape', 'max_number_of_stalls', 'date_min' and 'date_max',
        and sets "self.scrape_filters_set" to True
        """

        print("==> skype_scrape_setup start")
        self.criteria = {}

        # 1 - chats to scrape
        self.criteria['chats_to_scrape'] = chats_to_scrape

        # 2 - number of stalls
        print("  > Maximal number of stalls:\t %i" %(max_number_of_stalls))
        self.criteria['max_number_of_stalls'] = max_number_of_stalls

        # 3 - set date if supplied
        if(date_min):
            self.criteria['date_min'] = date_min
            print("  > Minimum date:\t\t",
                  datetime.datetime(date_min[0], date_min[1], date_min[2]).strftime("%A, %d %b %Y"))
        else:
            self.criteria['date_min'] = [1, 1, 1]  # lowest date
            print("  > No minimum date")

        if(date_max):
            self.criteria['date_max'] = date_max
            print("  > Maximal date:\t\t",
                  datetime.datetime(date_max[0], date_max[1], date_max[2]).strftime("%A, %d %b %Y"))
        else:
            # far-future placeholder. This used to be written to 'date_min',
            # which clobbered the minimum and left 'date_max' undefined
            self.criteria['date_max'] = [8888, 1, 1]
            print("  > No maximal date")

        self.scrape_filters_set = True

        print("==> skype_scrape_setup end\n")

    def skype_scrape(self, ext="csv"):
        """
        __ Parameters __
        [str] ext: format to save as. pkl or csv

        __ Description __
        Iterates through chats in the Skype sidebar. Every chat whose index is listed in
        self.criteria['chats_to_scrape'] is opened, scraped with skype_scrape_chat and
        saved with save_data under the name "skype_<sender>". Requested indexes that do
        not exist in the sidebar are reported at the end
        """
        ######################################## XPATH of recent chats in the sidebar
        chats_in_sidebar_xp = "//div[@aria-label='Conversations list']/div/div[1]/div/div/*"
        chats_in_sidebar_sender_xp = "div/div/div[2]/div[1]/div"
        ########################################

        print("==> skype_scrape start")

        # 1 - set default scraping filters of scraping the full first chat
        if(not self.scrape_filters_set):
            self.skype_scrape_setup([0], 100, None, None)

        # 2 - extract all of the chats - go through the ones in the chats_to_scrape list
        #     (a copy is consumed, so that the stored filter is still intact
        #     when skype_scrape is run again)
        pending_chats = list(self.criteria['chats_to_scrape'])
        chats = self.driver.find_elements_by_xpath(chats_in_sidebar_xp)

        for i, chat in enumerate(chats):
            if(i not in pending_chats):
                continue
            pending_chats.remove(i)

            # 3 - get the sender
            sender = chat.find_element_by_xpath(chats_in_sidebar_sender_xp).get_attribute("data-text-as-pseudo-element")
            # replace dots, @ and spaces with underscores
            sender = re.sub(r"(\.|@|\s)", "_", sender.lower())
            self.chat_info = [sender]

            # 4 - click on each chat and scrape the content
            #     (the messages end up in the dataframe that save_data writes)
            chat.click()
            self.skype_scrape_chat()

            # 5 - save to file
            self.save_data("skype_%s" %(sender), ext)

        if(pending_chats):
            skipped_chats = ", ".join(repr(chat_no) for chat_no in pending_chats)
            print("\t*** Did not scrape non-existing chat(s): %s ***" %(skipped_chats))
        print("==> skype_scrape end")

    def skype_scrape_chat(self):
        """
        __ Description __
        scrolls the skype chat upwards, until the filters in "self.criteria" are satisfied
        e.g. reach 20th May 2018, or the chat stalls for too many scrolls

        while scrolling, all the messages visible in the chat are collected. They are
        then passed to skype_format_messages, which filters them and removes duplicates

        __ Return __
        [list] every (sender, date, message) tuple with a defined date that was collected
        while scrolling (unfiltered, overlapping scrolls may repeat messages)
        """

        ################################## XPATH of chat
        messages_xp = "//div[@style='position: relative; display: flex; flex-direction: row; flex-grow: 1; flex-shrink: 1; overflow: hidden; align-items: stretch; background-color: rgb(255, 255, 255);']/div/div[2]/div/div/div/div/div/div/div/div/div/div/div/div/div/div[2]/div[@role='region']"
        ########################################

        # 1 - click on the bottom of the chat after it has loaded
        self.supp_wait_for_xpath(messages_xp, "at_least_one_message_in_chat")
        all_messages = self.driver.find_elements_by_xpath(messages_xp)
        topMessage_ID_old = all_messages[0].id
        ActionChains(self.driver).move_to_element(all_messages[-1]).click().perform()

        # 2 - prepare variables for scraping
        scraped_messages = []           # cumulative array of all the scraped_messages
        oldest_date = "Undefined"
        continue_scroll = True
        scroll_stall = 0            # counter to check how long scrolling has been stalled for
        print("    ", end="")

        while (continue_scroll):

            # 3 - scroll the chat
            ActionChains(self.driver).send_keys(Keys.PAGE_UP).perform()

            # 4 - get all the messages in the current scope of the chat
            all_messages = self.driver.find_elements_by_xpath(messages_xp)
            topMessage_new = all_messages[0]
            topMessage_ID_new = topMessage_new.id

            if(topMessage_ID_new == topMessage_ID_old):
                # 5 - if the top of the chat has not updated, jitter the chat by clicking and scrolling up and down
                print("*", end = "")
                ActionChains(self.driver).move_to_element(topMessage_new).click().perform()
                scroll_stall += 1
                ActionChains(self.driver).send_keys(Keys.PAGE_DOWN).perform()

            else:
                # 6 - otherwise continue scrolling
                print(".", end = "")
                scroll_stall = 0

            # 7 - extract all visible messages and reverse the order so that they go NEW -> OLD
            # (instead of the OLD -> NEW that top down scraping gives)
            messages_to_add = self.skype_scrape_chat_visible()
            messages_to_add = messages_to_add[::-1]

            # 8 - store all messages with defined dates
            for i in messages_to_add:
                if(i[1] != "Undefined"):
                    scraped_messages.append(i)
                    oldest_date = i[1]

            # 9 - check if scroll conditions are still satisfied and repeat loop
            continue_scroll = self.skype_scrollCheck_date(self.criteria,
                                                          {"current_date": oldest_date,
                                                           "current_number_of_stalls": scroll_stall})

            # 10 - reset variables next loop
            topMessage_ID_old = topMessage_ID_new

        # 11 - store the filtered content to pandas DataFrame
        #      (on this instance - the call used to go through the notebook
        #      global "skype_class")
        self.skype_format_messages(scraped_messages, self.criteria)

        # 12 - notify about result of scrolling
        # scrolling moves back in time and is stopped by the 'date_min' filter,
        # so that is the date the chat has to reach (this used to read 'date_max')
        filter_target_date = self.criteria['date_min']
        # if we happen be without a defined date (early on in the scrolling), define is as today
        if(oldest_date != "Undefined"):
            oldest_date = self.date_from_string(oldest_date)
        else:
            today = datetime.datetime.today()
            oldest_date = [today.year, today.month, today.day]
        if (self.datetime_from_date(oldest_date) <= self.datetime_from_date(filter_target_date)):
            print("  ✔ Reached target date: %s\n"%(self.string_from_date(filter_target_date)))
        else:
            print("  ✘ DID NOT REACH TARGET DATE: %s/%s\n"
                  %(self.string_from_date(oldest_date), self.string_from_date(filter_target_date).upper()))

        return scraped_messages
        

    def skype_scrape_chat_visible(self):
        """
        __ Description __
        goes through the messages visible on the screen and extracts [sender, date, message]
        from the "aria-label" of each matching <div>

        an element whose label starts with a date is a date separator: it is not stored,
        but its date is given to the messages that follow it. Messages seen before the
        first separator get the date "Undefined"

        ⦿⦿⦿ date is of the form "09 March 2019" or "Monday" ⦿⦿⦿

        __ Return __
        [1D-(sender, date, message)]  array of tuples holding info on each message.
        """
        ######################################## Attributes and names to identify messages in the conversation
        soup_chat_message = [["div", {"role": ["region", "heading"], "tabindex": re.compile(r"(-1|0)"), "aria-label": re.compile(r".")}]]
        ########################################

        # patterns are raw strings (no invalid escape sequences) and are
        # compiled once, instead of once per message
        # a - date e.g. "09 March 2019" or "Monday"
        date_pattern = re.compile(
            r"((\d{2}\s(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4})|(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))")
        # b - sender comes before the main part of the message e.g. "YAU, shall we go ...."
        sender_pattern = re.compile(r"(^\w+(\s\w+)?)(,)")
        # c - message comes after the sender with a comma and before the time sent
        #     e.g. "yau, SHALL WE GO..., sent at 18:00"
        message_pattern = re.compile(r"(^\w+(\s\w+)?,)(.*)(, sent at \d{2}:\d{2})")

        # 1 - get html of the page and look for messages with beautiful soup
        soup = self.supp_load_soup()
        chatContent = self.supp_extract_html(soup, soup_chat_message)

        messages = []
        date_current = "Undefined"

        for element in chatContent:
            message_content = element["aria-label"]

            # 2 - a date separator only updates the running date
            date = date_pattern.match(message_content)
            if(date):
                date_current = date.group(1)
                continue

            # 3 - anything that does not look like "<sender>,<text>, sent at hh:mm"
            #     (or has an empty text) is skipped
            message = message_pattern.search(message_content)
            if((not message) or (not message.group(3))):
                continue

            sender = sender_pattern.search(message_content)
            sender = sender.group(1) if sender else None

            # 4 - store the message with the date of the last separator seen
            messages.append((sender, date_current, message.group(3)))

        return messages

    def skype_format_messages(self, messages_to_format, critetia):
        """
        __ Parameters __
        [arr] messages_to_format:   list of tuples of the form
                                    (sender, dateString, message)
        [dict] critetia:            filters, of which 'date_min' and 'date_max'
                                    ([year, month, day] each) are read

        __ Description __
        self.reset() is called and then the list of messages is scanned in reverse order and:
        - messages with an unassigned date ("Undefined") are given the last defined date
        - messages outside of the [date_min, date_max] range are removed
        - duplicate messages are removed
        the result is stored in self.pandas_scraped

        __ Return __
        [pd.DataFrame] self.pandas_scraped: dataFrame with columns "From", "Date", "Message"
        """
        # the parameter keeps its original (misspelled) name so that keyword
        # callers keep working; the body previously read an undefined "criteria"
        criteria = critetia

        self.reset()

        # setup dates ('date_min'/'date_max' are the keys stored by skype_scrape_setup)
        date_min = criteria['date_min']
        date_min = datetime.datetime(date_min[0], date_min[1], date_min[2])
        date_max = criteria['date_max']
        date_max = datetime.datetime(date_max[0], date_max[1], date_max[2])
        running_date = "Undefined" # date that keeps track of where we are in the messages

        rows_to_store = []

        # 1 - iterate the messages in reverse order, so that the oldest (with defined date)
        #     are on top, resolving any dates that were not extracted during scrolling
        #     (the reversed list used to be built but never iterated)
        for entry in messages_to_format[::-1]:

            date = entry[1]

            if(date == "Undefined"):
                # a - if a date was not defined, look at the previously defined date
                date = running_date

            else:
                # b - convert date to [year, month, day]
                date = self.date_from_string(date)
                running_date = date # store the running date, so that further dates can be inferred from it

            # 2 - keep messages with defined dates that fall in the defined filter region
            if(date == "Undefined"):
                continue

            date = self.datetime_from_date(date)
            if((date >= date_min) and (date <= date_max)):
                rows_to_store.append({"From": entry[0],
                                      "Date": date.strftime("%d %B %Y"),
                                      "Message": entry[2]})

        # 3 - add all the rows in one go (DataFrame.append was removed in pandas 2.0
        #     and copied the whole frame for every message)
        if(rows_to_store):
            new_rows = pd.DataFrame(rows_to_store, columns=["From", "Date", "Message"])
            self.pandas_scraped = pd.concat([self.pandas_scraped, new_rows], ignore_index=True)

        # 4 - remove duplicate entries due to scrolling overlap
        self.pandas_scraped = self.pandas_scraped.drop_duplicates()

        return self.pandas_scraped

In [12]:
class wait_for_chat_update():
    """Wait condition: the Skype chat has updated after scrolling has been performed

    To be used in the following way:
    WebDriverWait(driver, timeout).until(wait_for_chat_update(old_top_message_id))
    """

    def __init__(self, top_message_text_old):
        """
        __ Parameters __
        top_message_text_old: value that the ".id" of the top message had
        before scrolling (it is compared against the element id, not its text)
        """
        self.top_message_text_old = top_message_text_old

    def __call__(self, driver):
        """
        __ Parameters __
        driver: WebDriverWait.until(xxx) calls xxx with the 'driver' as the
        first argument

        __ Description __
        compares the id of the top message of the chat with the one stored
        before scrolling.
        if scrolling has stopped (end of conversation or loading) the id will remain the same

        __ Return __
        True: the top message has changed - the chat has updated
        False: the top message is still the same (or no message is present) - keep waiting
        """

        ########################################xpaths
        chatBox_xpath = "//div[@style='position: relative; display: flex; flex-direction: row; flex-grow: 1; flex-shrink: 1; overflow: hidden; align-items: stretch; background-color: rgb(255, 255, 255);']"
        # the leading "." keeps the search inside of the chat box: an xpath
        # starting with "//" is evaluated against the whole document, even
        # when it is called on an element
        messages_xpath = ".//div[@role='region']"
        ########################################

        chat_box = driver.find_element_by_xpath(chatBox_xpath)
        messages = chat_box.find_elements_by_xpath(messages_xpath)

        if(not messages):
            # nothing has loaded (yet): keep waiting instead of failing with an IndexError
            return False

        return messages[0].id != self.top_message_text_old

In [14]:
########################################
########################################
# browser settings
browser = "chrome"                # "firefox" or "chrome"
timeout = 10                      # seconds to wait for page elements to load
# account to log in with
skype_id = "🍄"
password = "🍄🍄"
########################################
########################################
skype_class = skype_bot(browser, timeout)
# the login step is left disabled in this cell:
#skype_class.login(skype_id, password)

  > Starting new Chrome server
==> setup_browser end



In [18]:
########################################
########################################
# date range, each bound is either None or a list such as [2018, 1, 1]
min_date = [2019, 2, 1]
max_date = [2019, 4, 20]
# how many times a stalled chat is retried while scrolling
max_number_of_stalls = 14
# chats to extract, given by index e.g. [0, 1, 2, 4, 6, 10]
chats_to_extract = [0, 1]
########################################
########################################
skype_class.skype_scrape_setup(
    chats_to_extract, max_number_of_stalls, min_date, max_date
)
skype_class.skype_scrape()

==> skype_scrape_setup start
  > Maximal number of stalls:	 14
  > Minimum date:		 Friday, 01 Feb 2019
  > Maximal date:		 Saturday, 20 Apr 2019
==> skype_scrape_setup end

==> skype_scrape start


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"div/div/div[2]/div[1]/div"}
  (Session info: chrome=77.0.3865.90)
