In [None]:
from selenium import webdriver
from time import sleep
import pandas as pd
import string
import bs4

def partial_txt(b, txt, p = True):
    '''Finds link elements whose visible text contains txt.

    b is a selenium webdriver (or element) object and txt is the partial link text.
    When p (for plural) is True, the default, a list of every match is returned;
    when p is False only the first match is returned.

    Uses the generic find_element(s)(by, value) locator API because the
    find_element(s)_by_partial_link_text helpers were removed in Selenium 4.3.
    The generic API is also available in Selenium 3.'''
    #Same string value as selenium's By.PARTIAL_LINK_TEXT constant.
    by = 'partial link text'
    return b.find_elements(by, txt) if p else b.find_element(by, txt)

def tag_name(b, tag, p = True):
    '''Finds elements by their tag name.

    b is a selenium webdriver or element object to search within and tag is the
    html tag name. When p (plural) is True, the default, a list of every match is
    returned; when p is False only the first match is returned.

    Uses the generic find_element(s)(by, value) locator API because the
    find_element(s)_by_tag_name helpers were removed in Selenium 4.3.
    The generic API is also available in Selenium 3.'''
    #Same string value as selenium's By.TAG_NAME constant.
    by = 'tag name'
    return b.find_elements(by, tag) if p else b.find_element(by, tag)

def class_name(b, tag, p = True):
    '''Finds elements by their class name.

    b is a selenium webdriver or element object to search within and tag is the
    class name to look for. When p (plural) is True, the default, a list of every
    match is returned; when p is False only the first match is returned.

    Uses the generic find_element(s)(by, value) locator API because the
    find_element(s)_by_class_name helpers were removed in Selenium 4.3.
    The generic API is also available in Selenium 3.'''
    #Same string value as selenium's By.CLASS_NAME constant.
    by = 'class name'
    return b.find_elements(by, tag) if p else b.find_element(by, tag)

def contribute(b):
    '''Reports whether the browser is sitting on the contribute pop-up, which prompts the
    user to contribute their salary. Only the part of b.current_url after the last '/' is
    inspected, case-insensitively, for the words 'contribute' or 'submit'. Returns a bool.'''
    last_segment = b.current_url.split('/')[-1].lower()
    return 'contribute' in last_segment or 'submit' in last_segment

def last_window(b):
    '''Makes the last entry of b.window_handles the active window of browser b.'''
    newest_handle = b.window_handles[-1]
    b.switch_to.window(newest_handle)

def splt_punct(w, p):
    '''Helper function to split column names on punctuation.

    w is the word and p is the collection of punctuation characters. The first character
    of w that is found in p becomes the separator and w is split on every occurrence of
    that character, returning a list. If no character of w is in p, w itself is returned.'''
    #Only the first punctuation character encountered in the word is used as the separator.
    separator = next((ch for ch in w if ch in p), None)
    if separator is None:
        return w
    return w.split(separator)

def iter_get_att(b, tag, **kwargs):
    '''Helper function that returns the first element with the given tag whose attributes
    equal the given values.

    b is the selenium object to search within, tag is the html tag name, and each keyword
    argument is an attribute name mapped to the value it must have. An empty or otherwise
    falsy expected value never matches. Returns None when no element satisfies every
    keyword argument; with no keyword arguments the first element found is returned.'''
    for candidate in tag_name(b, tag):
        #Every attribute is checked for every candidate before the verdict is taken.
        checks = [bool(wanted) and candidate.get_attribute(name) == wanted
                  for name, wanted in kwargs.items()]
        if all(checks):
            return candidate
    return None

##Haven't tested yet
def pagination_flow(b):
    '''Collects the pagination links shown on the current page.

    Looks up the <ul> whose class is exactly 'pagination', takes the first <a> inside each
    of its <li> items, and returns a dict with the keys 'left' (first link), 'right' (last
    link), 'min' (second link) and 'max' (second-to-last link).'''
    pager = iter_get_att(b, 'ul', **{'class': 'pagination'})
    links = [tag_name(item, 'a', p = False) for item in tag_name(pager, 'li')]
    return {
        'left': links[0],
        'right': links[-1],
        'min': links[1],
        'max': links[-2],
    }

def job_information(b, tag):
    '''Grabs the jobs listed in the dropdown on the current page.

    The first element with the given tag is clicked, its <option> children are read, and
    the element is clicked a second time. Returns a dict keyed by each option's text
    with the option's selenium object as the value.'''
    selector = tag_name(b, tag, p = False)
    selector.click()
    job_dict = {}
    for option in tag_name(selector, 'option'):
        job_dict[option.text] = option
    #Second click mirrors the first one made before reading the options.
    selector.click()
    return job_dict

def rows_pp(b, number = '100'):
    '''Selects the number of rows per page in the levels.fyi table. Default is '100'.

    The toggle is the first <span> of a <button> whose class is exactly
    'btn btn-default dropdown-toggle'; if several buttons qualify, the last one wins.
    After clicking it, the <li> items of every <ul> with class 'dropdown-menu' are indexed
    by their text and the entry matching number is clicked. Note: This is one function
    that can be vastly improved and/or can lean on other funcs (like iter_get_att).'''
    toggle = None
    for button in tag_name(b, 'button'):
        if button.get_attribute('class') != 'btn btn-default dropdown-toggle':
            continue
        spans = tag_name(button, 'span')
        if spans:
            toggle = spans[0]
    toggle.click()

    #Menu entries are looked up only after the toggle has been clicked.
    choices = {}
    for menu in tag_name(b, 'ul'):
        if menu.get_attribute('class') == 'dropdown-menu':
            for entry in tag_name(menu, 'li'):
                choices[entry.text] = entry
    choices[number].click()
    
def add_dict_list(d, k, v):
    '''Helper function that appends v to the list stored under key k of dictionary d.
    The key must already exist; the list is modified in place and nothing is returned.'''
    d[k].append(v)
    
def extract_table(rs, cnames, b):
    '''Extracts one page of the compensation table on the levels.fyi site into a DataFrame.

    rs is the list of beautiful soup <tr> rows of the table, with the column names at index 0.
    cnames is the list of column names; the code below appends to 'Company', 'Location',
    'Date', 'Level Name', 'Tag', 'Years at Company', 'Years of Experience', 'Base', 'Stock',
    'Bonus' and 'Total Compensation', so each of those must be present.
    b is the selenium browser object showing the same page.

    Returns a pandas DataFrame with the columns in cnames plus 'Gender', 'Race' and 'Academic'.
    Any row that does not split into the expected number of pieces raises (ValueError from the
    unpacking), and building the DataFrame is expected to fail when the collected columns end
    up with different lengths, so callers should be ready for exceptions.'''
    #One result list per column, plus the attributes not derived from the table itself.
    table = {name: [] for name in [*cnames, 'Gender', 'Race', 'Academic']}

    #Index 0 holds the column names, so data starts at index 1.
    for row in rs[1:]:
        cells = row.find_all('td')

        company, place_and_date = cells[1].text.split('\n')
        table['Company'].append(company)
        place, date = place_and_date.split(' | ')
        table['Location'].append(place)
        table['Date'].append(date)

        #Either piece may be an empty string, which is stored as-is.
        level, tag = cells[2].text.split('\n')
        table['Level Name'].append(level)
        table['Tag'].append(tag)

        at_company, experience = cells[3].text.split(' / ')
        table['Years at Company'].append(at_company)
        table['Years of Experience'].append(experience)

        has_breakdown = False
        for span in cells[-1].find_all('span'):
            text = span.text
            if not text:
                continue
            if '|' in text:
                has_breakdown = True
                base, stock, bonus = text.split(' | ')
                table['Base'].append(base)
                table['Stock'].append(stock)
                table['Bonus'].append(bonus)
            elif '$' in text:
                table['Total Compensation'].append(text)
        if not has_breakdown:
            #Keeps the breakdown columns filled when the row has no breakdown.
            for name in ('Base', 'Stock', 'Bonus'):
                table[name].append('')

    #Asking selenium to find the tables now to click on rows and reveal the extra data.
    live_tables = b.find_elements_by_tag_name('table')
    live_rows = live_tables[1].find_elements_by_tag_name('tr')
    #Row zero is the column names; rows without <td> cells are skipped.
    for live_row in live_rows[1:]:
        live_cells = live_row.find_elements_by_tag_name('td')
        if live_cells:
            live_cells[3].click()

    #Using beautiful soup again to grab the now-expanded data more quickly.
    expanded = bs4.BeautifulSoup(b.page_source, 'lxml').find_all('table')[1]

    #Gender if available. Values that parse as integers are ignored.
    for span in expanded.find_all('span', {'class': 'gender-info'}):
        label = span.text
        try:
            int(label)
        except ValueError:
            table['Gender'].append(label)

    #Race and Academics if available.
    detail_style = {'style': 'padding-left: 40px; padding-right: 40px; white-space: normal;'}
    for details in expanded.find_all('div', {'class': 'company-details'}):
        for paragraph in details.find_all('p', detail_style):
            txt = paragraph.text
            if 'Race' in txt:
                pieces = txt.split(':')
                #The value follows the first piece that ends with 'Race'.
                race_at = [i for i, piece in enumerate(pieces) if piece.endswith('Race')]
                value = pieces[race_at[0] + 1]
                if ',' in value:
                    value, _ = value.split(',')
                table['Race'].append(value.strip())
            else:
                table['Race'].append('')
            if 'Academic' in txt:
                table['Academic'].append(txt.split(':')[-1].strip())
            else:
                table['Academic'].append('')

    return pd.DataFrame(table)

In [None]:
#Opens a Firefox browser on the levels.fyi home page, moves to the View Salaries page,
#clears any checked geography filters, and builds the dictionary of jobs from the dropdown.
#The statements below drive the live browser, so their order matters.

#Establishing browser object.
levels_url = 'https://www.levels.fyi/'
b = webdriver.Firefox()
b.get(levels_url)

#Finding the View Salaries link to select it and begin looking for compensation data.
#Only one of these on the page, so setting the plural argument to False to get a single element.
sal = partial_txt(b, 'View Salaries', p = False)
sal.click()

#Making the last window handle the active window.
last_window(b)

#A popup will come up asking for salary. Sleeping to allow it to come up. If the url shows the
#popup, reloading the url without everything from the '#' onward to stick to grabbing data.
#20 seconds sleep seems to be enough.
sleep(20)
if contribute(b):
    b.get(b.current_url.split('#')[0])
    
#Since the url may have changed again, making the last window handle active once more.
last_window(b)

#------Eliminating geography filters. Only need to do this once------

#Finding the button that opens the dropdown used to eliminate any filters on geography.
#This is the first <button> whose attributes match both values below.
geog_dropd = {'data-toggle': 'dropdown', 'aria-haspopup': 'true'}
button = iter_get_att(b, 'button', **geog_dropd)
button.click()

#Getting menu object that identifies if any geography items are checked.
menu_drop = {'class': 'dropdown-menu dropdown-menu-right'}
menu = iter_get_att(b, 'ul', **menu_drop)

#Takes off any geographic filters in dropdown menu.
geog_filt = {'class': 'filter-by-region-menu'}
divs = iter_get_att(menu, 'li', **geog_filt)
#Can likely implement iter_get_att() again here, but not sure if really saves me space/time.
for d in tag_name(divs, 'div'):
    if d.get_attribute('class') == 'checkbox':
        inp = tag_name(d, 'input')
        #Clicking only the inputs that are currently checked so they become unchecked.
        if inp[0].get_attribute('checked'):
            inp[0].click()
#Clicking the same button again to close the menu.
button.click()

#----------Setting up list of available jobs in the dropdown----------May need to do more than once.

#Dictionary of job title text to its <option> element inside the first <select> on the page.
job_dict = job_information(b, 'select')

#-----Scraping every job in the dropdown, page by page-----
#
#Runs after job_dict has been built above. Results are collected in:
#  dfs          - list with one DataFrame per successfully extracted page.
#  errors       - set of (job title, zero-based page index) pairs that could not be extracted.
#  new_colnames - column names of the compensation table, derived once from the first page seen.

job_titles = list(job_dict.keys())

new_colnames = []

dfs = []

errors = set()

punct = list(string.punctuation)

#For each job in the dropdown. The page is expected to already show job jt at this point.
for job_index, jt in enumerate(job_titles):
    #The last job has no successor to switch to.
    is_last_job = job_index == len(job_titles) - 1

    #Some jobs don't have enough data. In this case, we'll skip them.
    no_data = tag_name(b, 'h3', p = False)
    if no_data.text == "Let's get paid fairly":
        if is_last_job:
            break
        #The page changed, so the dropdown elements have to be looked up again.
        job_dict = job_information(b, 'select')
        next_job = job_dict[job_titles[job_index + 1]]
        next_job.click()
        continue

    #Setting row number to 100 since when change jobs it reverts back to 10.
    rows_pp(b)

    #Obtaining pagination.
    pagination = pagination_flow(b)
    #Max number of pages per job in list.
    number_pages = int(pagination['max'].text)

    #Start with fewer amount of pages if want to test, e.g. range(0, 1).
    for p in range(0, number_pages):
        soup1 = bs4.BeautifulSoup(b.page_source, 'lxml')
        tab = soup1.find_all('table')
        #Multiple tables on the page. Index one is the one that has the data needed.
        rows = tab[1].find_all('tr')
        #Setting up column names once. First row at index zero is colnames.
        if not new_colnames:
            cn = [v for v in rows[0].text.split('\n') if v != '']
            #Headers containing punctuation come back from splt_punct as a list of pieces.
            colnames = [splt_punct(v, punct) for v in cn]
            for c in colnames:
                if isinstance(c, list):
                    new_colnames.extend(x.strip() for x in c)
                else:
                    new_colnames.append(c)
        #In case there is select missing data. Exception (rather than a bare except) keeps
        #Ctrl-C / KeyboardInterrupt able to stop a long scrape.
        try:
            df = extract_table(rows, new_colnames, b)
            #Adding job title to the df.
            df.loc[:, 'Job'] = jt
            dfs.append(df)
        except Exception:
            #Adding job title and page index to reference if cannot adequately collect data.
            errors.add((jt, p))

        #Next page.
        pagination['right'].click()
        #Resetting pagination since on a new page.
        pagination = pagination_flow(b)
    #After finish a job need to switch to next. Will be on new page, so need to get new element.
    if is_last_job:
        break
    job_dict = job_information(b, 'select')
    next_job = job_dict[job_titles[job_index + 1]]
    next_job.click()