In [1]:
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd 


In [2]:
# Start a Chrome session through the chromedriver binary at this path.
# NOTE(review): the path is machine-specific — point it at your own chromedriver.
driver = webdriver.Chrome('/usr/local/bin/chromedriver')

### Methods for finding a single element 

* find_element_by_id
* find_element_by_name
* find_element_by_xpath (*personal favorite*) 
* find_element_by_link_text
* find_element_by_partial_link_text
* find_element_by_tag_name
* find_element_by_class_name
* find_element_by_css_selector

### Methods for finding multiple elements
* find_elements_by_name
* find_elements_by_xpath
* find_elements_by_link_text
* find_elements_by_partial_link_text
* find_elements_by_tag_name
* find_elements_by_class_name
* find_elements_by_css_selector

From the [Selenium Python Docs](https://selenium-python.readthedocs.io/locating-elements.html "Selenium Docs") 

In [None]:
# Logging in sample 

def sign_in(username, password):
    """Log in to Instagram with the shared ``driver`` and dismiss the follow-up modal.

    Args:
        username: Text typed into the login form's ``username`` field.
        password: Text typed into the ``password`` field; ENTER is sent
            afterwards to submit the form.

    Returns nothing. A message is printed once the steps have run; it does
    not verify that the login actually succeeded.
    """
    driver.get('https://www.instagram.com/accounts/login/')

    user_input = driver.find_element_by_name('username')
    password_input = driver.find_element_by_name('password')
    user_input.send_keys(username)
    password_input.send_keys(password)
    password_input.send_keys(Keys.ENTER)
    # Fixed pause to give the post-login page time to render.
    time.sleep(2)

    # Check for a modal. find_elements_* returns an empty list when nothing
    # matches, whereas find_element_* raises NoSuchElementException, which
    # would abort the sign-in whenever no modal is shown.
    if driver.find_elements_by_xpath("//div[@role='presentation']"):
        not_now_button = driver.find_elements_by_xpath("//*[contains(text(), 'Not Now')]")
        if not_now_button:
            not_now_button[0].send_keys(Keys.ENTER)
    print("you should be signed in now!")


In [None]:
# Try / except is a scraper's friend: if the first class name is missing,
# report it and fall back to a second one.
try:
    post_list = driver.find_element_by_class_name('Ln-UN')
except Exception as first_error:
    print("post_list didn't work", first_error)
    try:
        post_list2 = driver.find_element_by_class_name('eLAPa')
    except Exception as second_error:
        print("NoSuchElementException", second_error)

# The same pattern with a wait: on failure, pause two seconds and look again.
try:
    element = driver.find_element_by_class_name('class-name')
except Exception as first_error:
    print("Looks like it can't find the element yet", first_error)
    time.sleep(2)
    try:
        element_two_seconds_later = driver.find_element_by_class_name('class-name')
    except Exception as second_error:
        print("NoSuchElementException", second_error)

            

In [None]:
# Whoa! What's that modal?
# Best-effort dismissal: if the button is missing or cannot be clicked,
# carry on without it.
try:
    modal_button = driver.find_element_by_class_name("button2")
    modal_button.click()
except Exception:
    # A bare ``except:`` would also swallow KeyboardInterrupt and SystemExit,
    # making the cell impossible to interrupt. ``Exception`` still covers
    # every Selenium error.
    pass

In [None]:
# Scroll down (with a test for a modal)

def scroll_down(times=4, pause=1):
    """Scroll to the bottom of the page repeatedly, clicking a modal button if found.

    Args:
        times: Number of scroll passes to make. Defaults to 4.
        pause: Seconds to sleep after each scroll. Defaults to 1.
    """
    for _ in range(times):
        try:
            modal_button = driver.find_element_by_class_name("button2")
            webdriver.ActionChains(driver).move_to_element(modal_button).click(modal_button).perform()
            # modal_button.click() also works
        except Exception:
            # No modal (or it could not be clicked): keep scrolling. Catching
            # Exception instead of using a bare except leaves
            # KeyboardInterrupt free to stop the loop.
            pass
        # Scroll to the bottom of the document.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

# "Load more stories"
def get_more(attempts=4, pause=.5):
    """Click the element whose text contains 'Load next Politics story'.

    Args:
        attempts: How many times to look for and click the element.
            Defaults to 4.
        pause: Seconds to sleep after a successful click. Defaults to 0.5.
            No pause happens when an attempt fails.
    """
    for _ in range(attempts):
        try:
            next_b = driver.find_element_by_xpath("//*[contains(text(), 'Load next Politics story')]")
            webdriver.ActionChains(driver).move_to_element(next_b).click(next_b).perform()
            time.sleep(pause)
        except Exception:
            # Element missing or not clickable on this attempt: try again.
            # Catching Exception instead of using a bare except leaves
            # KeyboardInterrupt free to stop the loop.
            pass

In [None]:
# Some other fun stuff: save a screenshot from the driver to 'google.png'
# (a relative path, so it lands in the working directory).
driver.get_screenshot_as_file('google.png')

In [5]:
# Explicit Waits 

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load a page, then wait up to 10 seconds for the element with id
# "myDynamicElement" to be present.
driver.get("http://somedomain/url_that_delays_loading")
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
finally:
    # NOTE(review): this quits the shared `driver` whether or not the wait
    # succeeds, so later cells that call `driver` need a fresh
    # webdriver.Chrome(...) session first.
    driver.quit()
    
# This waits up to 10 seconds before throwing a TimeoutException 
# unless it finds the element to return within 10 seconds. 
# WebDriverWait by default calls the ExpectedCondition every 
# 500 milliseconds until it returns successfully.
# Here are the available expected conditions:
# title_is
# title_contains
# presence_of_element_located
# visibility_of_element_located
# visibility_of
# presence_of_all_elements_located
# text_to_be_present_in_element
# text_to_be_present_in_element_value
# frame_to_be_available_and_switch_to_it
# invisibility_of_element_located
# element_to_be_clickable
# staleness_of
# element_to_be_selected
# element_located_to_be_selected
# element_selection_state_to_be
# element_located_selection_state_to_be
# alert_is_present

# -------------------------------------------------------------------

# wait = WebDriverWait(driver, 10)
# element = wait.until(EC.element_to_be_clickable((By.ID, 'someid')))

# -------------------------------------------------------------------

# Implicit Waits 

# driver.implicitly_wait(10) # seconds
# driver.get("http://somedomain/url_that_delays_loading")
# myDynamicElement = driver.find_element_by_id("myDynamicElement")

# An implicit wait tells WebDriver to poll the DOM for a certain amount 
# of time when trying to find any element (or elements) not immediately 
# available. The default setting is 0. Once set, the implicit wait is set 
# for the life of the WebDriver object.

# I tried this with varying degrees of success. I still ended up with  
# NoSuchElement errors and found more success with try / except and 
# being explicit about a wait time. 


In [None]:
# get_attribute reads a named attribute from an element that is already located.
element.get_attribute("attribute name")

# The same call chained onto an explicit wait: up to 20 seconds for the
# element with id "org" to become visible.
attribute_value = (
    WebDriverWait(driver, 20)
    .until(EC.visibility_of_element_located((By.ID, "org")))
    .get_attribute("attribute_name")
)

In [6]:
# Page to scrape: the tennisabstract.com player page for p=RogerFederer.
url = "http://www.tennisabstract.com/cgi-bin/player.cgi?p=RogerFederer"

In [9]:
# url = "https://www.theonion.com/c/news-in-brief"
# Point the browser at whatever `url` currently holds.
driver.get(url)


In [10]:
# Locate the element with id "tabHead".
# NOTE(review): presumably the head-to-head tab — confirm against the page.
head_tab = driver.find_element_by_id("tabHead")

In [11]:
head_tab

<selenium.webdriver.remote.webelement.WebElement (session="2838426271de1a5b659be54bcc723f11", element="0.899731229174775-1")>

In [12]:
# Click the "tabHead" element located earlier.
head_tab.click()

In [19]:
# Grab the element with id "matches"; later cells query its thead/tbody.
table = driver.find_element_by_id("matches")

In [25]:
# Narrow the search to the table's <tbody>.
body = table.find_element_by_css_selector('tbody')

In [60]:
# body.get_attribute('innerHTML')

In [26]:
# Every <tr> inside the tbody, as a list of elements.
rows = body.find_elements_by_css_selector('tr')

In [27]:
len(rows)

362

In [30]:
rows[0].get_attribute('innerHTML')

'<td class="NovakDjokovic" align="right"><span class="likelink h2hclick">47</span></td><td><a href="http://www.tennisabstract.com/cgi-bin/player.cgi?p=NovakDjokovic">Novak Djokovic</a><span> [SRB]</span></td><td align="right">22</td><td align="right">25</td><td align="right">46.8%</td><td align="right">24</td><td align="right">12</td><td align="right">12</td><td align="right">50.0%</td><td align="right" title="WIN: 2006 Monte Carlo Masters R64 (Clay), 6-3 2-6 6-3">17‑Apr‑2006</td><td align="right" title="LOSS: 2018 Paris Masters SF (Hard), 7-6(6) 5-7 7-6(3)">29‑Oct‑2018</td><td align="right">46</td><td align="right">1.02</td><td align="right">9.4%</td><td align="right">2.3%</td><td align="right">61.4%</td><td align="right">72.9%</td><td align="right">50.9%</td><td align="right">64.4%</td><td align="right">36.2%</td><td align="right" title="201/331">60.7%</td><td align="right" title="126/353">35.7%</td>'

In [61]:
# The <td> cells of the first body row.
row_data = rows[0].find_elements_by_css_selector('td')

In [63]:
# for e in row_data: 
#     print(e.text)

In [44]:
# Build the table as a list of lists: the header labels first, then one list
# of cell texts per body row. The list is rebuilt from scratch on every run,
# so this cell needs only `table` and `rows` from the cells above, and
# re-running it never appends duplicate rows.
header_cells = table.find_element_by_css_selector('thead').find_elements_by_css_selector('th')
mega_list = [[cell.text for cell in header_cells]]
for r in rows:
    row_data = r.find_elements_by_css_selector('td')
    row_list = [d.text for d in row_data]
    mega_list.append(row_list)

In [47]:
mega_list[10]

['19',
 'David Nalbandian [ARG]',
 '11',
 '8',
 '57.9%',
 '9',
 '4',
 '5',
 '44.4%',
 '15‑Apr‑2002',
 '20‑Jun‑2011',
 '19',
 '1.07',
 '9.8%',
 '2.6%',
 '58.5%',
 '74.4%',
 '44.3%',
 '61.9%',
 '40.9%',
 '54.7%',
 '47.6%']

In [32]:
len(row_data)

22

In [52]:
# The scraped header repeats the labels 'W' and 'L'. With duplicate column
# labels, federer_h2h['W'] would return two columns at once, so each repeat
# gets a '.1', '.2', ... suffix to keep every label unique.
h2h_label_counts = {}
h2h_columns = []
for label in mega_list[0]:
    repeats = h2h_label_counts.get(label, 0)
    h2h_columns.append(label if repeats == 0 else '{}.{}'.format(label, repeats))
    h2h_label_counts[label] = repeats + 1

# First entry of mega_list is the header; everything after it is data rows.
federer_h2h = pd.DataFrame(mega_list[1:], columns=h2h_columns)

In [67]:
# Show the rows whose 'DR' value is the literal string '-'.
federer_h2h[federer_h2h['DR'] == '-']

Unnamed: 0,H2Hs,Opponent,W,L,Win%,TB,W.1,L.1,TB%,First Match,...,DR,A%,DF%,1stIn,1st%,2nd%,SPW,RPW,BPSvd%,BPCnv%
119,4,Armando Brunold [SUI],3,1,75.0%,1,0,1,0.0%,23‑Aug‑1997,...,-,-,-,-,-,-,-,-,-,-
144,3,Yves Allegro [SUI],2,1,66.7%,2,1,1,50.0%,23‑Aug‑1997,...,-,-,-,-,-,-,-,-,-,-
220,2,Rodolphe Gilbert [FRA],2,0,100.0%,0,0,0,-,1‑Mar‑1999,...,-,-,-,-,-,-,-,-,-,-
222,2,Uros Vico [ITA],2,0,100.0%,0,0,0,-,17‑Oct‑1998,...,-,-,-,-,-,-,-,-,-,-
223,2,Daniele Ceraudo [ITA],2,0,100.0%,1,0,1,0.0%,17‑Oct‑1998,...,-,-,-,-,-,-,-,-,-,-
224,2,Alexandre Strambini [SUI],0,2,0.0%,0,0,0,-,18‑Oct‑1997,...,-,-,-,-,-,-,-,-,-,-
225,2,Agustin Garizzio [ARG],0,2,0.0%,0,0,0,-,23‑Aug‑1997,...,-,-,-,-,-,-,-,-,-,-
226,2,Manuel Jorquera [ITA],2,0,100.0%,0,0,0,-,23‑Aug‑1997,...,-,-,-,-,-,-,-,-,-,-
288,1,Alan Mackin [GBR],1,0,100.0%,0,0,0,-,23‑Sep‑2005,...,-,-,-,-,-,-,-,-,-,-
310,1,Todd Martin [USA],1,0,100.0%,1,1,0,100.0%,9‑Feb‑2001,...,-,-,-,-,-,-,-,-,-,-


In [33]:
# Print the text of each element in row_data, one per line.
for e in row_data: 
    print(e.text)

47
Novak Djokovic [SRB]
22
25
46.8%
24
12
12
50.0%
17‑Apr‑2006
29‑Oct‑2018
46
1.02
9.4%
2.3%
61.4%
72.9%
50.9%
64.4%
36.2%
60.7%
35.7%


In [64]:
# Pull the header cells (<th>) out of the table's <thead>.
header = table.find_element_by_css_selector('thead')
# rows = body.find_elements_by_css_selector('tr')
header_elements = header.find_elements_by_css_selector('th')

# get_attribute('innerHTML')
# Display how many header cells were found.
len(header_elements)

22

In [39]:
# Seed a dict keyed by header label, with every value set to the placeholder "XX".
# Repeated labels collapse into a single key.
hth_dict = {th.text: "XX" for th in header_elements}

hth_dict


{'H2Hs': 'XX',
 'Opponent': 'XX',
 'W': 'XX',
 'L': 'XX',
 'Win%': 'XX',
 'TB': 'XX',
 'TB%': 'XX',
 'First Match': 'XX',
 'Last Match': 'XX',
 'MS': 'XX',
 'DR': 'XX',
 'A%': 'XX',
 'DF%': 'XX',
 '1stIn': 'XX',
 '1st%': 'XX',
 '2nd%': 'XX',
 'SPW': 'XX',
 'RPW': 'XX',
 'BPSvd%': 'XX',
 'BPCnv%': 'XX'}

In [41]:
# Collect the header labels, in order, as plain strings.
header_test = [th.text for th in header_elements]
header_test

['H2Hs',
 'Opponent',
 'W',
 'L',
 'Win%',
 'TB',
 'W',
 'L',
 'TB%',
 'First Match',
 'Last Match',
 'MS',
 'DR',
 'A%',
 'DF%',
 '1stIn',
 '1st%',
 '2nd%',
 'SPW',
 'RPW',
 'BPSvd%',
 'BPCnv%']

In [42]:
# Start the list-of-rows with the header labels as its first entry.
mega_list = [header_test]

In [43]:
mega_list 

[['H2Hs',
  'Opponent',
  'W',
  'L',
  'Win%',
  'TB',
  'W',
  'L',
  'TB%',
  'First Match',
  'Last Match',
  'MS',
  'DR',
  'A%',
  'DF%',
  '1stIn',
  '1st%',
  '2nd%',
  'SPW',
  'RPW',
  'BPSvd%',
  'BPCnv%']]