In [26]:
# Import necessary packages
import json
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from bs4 import BeautifulSoup
import requests
import time

In [6]:
def advancePage(browser):
    # Extract the navigation bar at the bottom of the page for navigation
    navBar = browser.find_elements_by_class_name('btn-group')
    
    # Advance to the next page by hitting the next button
    navBar[len(navBar)-1].click()    

In [7]:
def cleanUpData(player):
    for data in player:
        if player[data] == None:
            pass
        elif data in ['weight','combineBench']:
            player[data] = player[data].split(' ')[0]
        elif data in ['wingspan','lengthArm','lengthHand','combineBroad','combineVert']:
            player[data] = player[data].split('"')[0]
        elif data in ['combineCone','combine10split','combine20split','combine40dash','combine20shuttle','combine60shuttle']:
            player[data] = player[data].split('s')[0]
    return player

In [8]:
def createPlayerDict():
    varNames = ['nameFirst', 'nameLast', 'draftYear', 'college', 'heightFeet', 'heightInches','weight', 'wingspan', 
                'lengthArm', 'lengthHand', 'combine10split', 'combine20split', 'combine40dash', 'combineBench', 
                'combineVert', 'combineBroad', 'combineCone', 'combine20shuttle', 'combine60shuttle']
    playerDict = {}
    for var in varNames:
        playerDict[var] = None
    return playerDict

In [9]:
def fractionCheck(text):
    # For reference: https://www.compart.com/en/unicode/decomposition/%3Cfraction%3E
    fractions = [u"⅒",u"⅑",u"⅛",u"⅐",u"⅙",u"⅕",u"¼",u"⅓",u"⅜",u"⅖",u"½",u"⅗",u"⅝",u"⅔",u"¾",u"⅘",u"⅚",u"⅞"]
    fraction_values = [u".1",u".111",u".125",u".143",u".167",u".2",u".25",u".333",u".375",u".4",u".5",u".6",u".625",
                       u".666",u".75",u".8",u".833",u".875"]
    #for fraction in fractions:
    for a, b in zip(fractions, fraction_values):
        if a in text:
            return text.replace(a, b).encode('ascii','ignore').strip()      
    return text.encode('ascii','ignore').strip()

In [10]:
def getVariableName(name):
    variableDict = {'Height':'height', 'Weight':'weight', 'Wingspan':'wingspan', 'Arm Length':'lengthArm',
                    'Hand Size':'lengthHand', '10 Yard Split':'combine10split', '20 Yard Split':'combine20split',
                    '40 Yard Dash':'combine40dash', 'Bench Press':'combineBench', 'Vertical Jump':'combineVert',
                    'Broad Jump':'combineBroad', '3-Cone Drill':'combineCone', '20 Yard Shuttle':'combine20shuttle',
                    '60 Yard Shuttle':'combine60shuttle'}
    return(variableDict[name])

In [11]:
def makeURL(position):
    #baseURL = 'https://www.mockdraftable.com/search?position=QB&beginYear=1999&endYear=2018&sort=DESC&page='
    URL = 'https://www.mockdraftable.com/search?position=' + position + '&beginYear=1999&endYear=2018&sort=DESC&page=1'
    return URL

In [12]:
def pageNumberStatus(soup):
    selectedButton = soup.find_all('button',{'class':'btn btn-secondary active'})[-1]
    lastButton = soup.find_all('button',{'class':'btn btn-secondary'})[-2]
    return [int(selectedButton.text), int(lastButton.text)]

In [13]:
def retrievePlayerInfo(soup):
    # Initialize a player dictionary
    playerDict = createPlayerDict()

    # Retrieve basic player information
    playerName = soup.find('div',{'class':'mb-0 mt-1 h3 align-bottom playerbar-name'}).text.encode('ascii','ignore').strip()
    playerDict['nameFirst'] = playerName.split(' ')[0]
    playerDict['nameLast'] = playerName.split(' ')[1]
    for dd in soup.find_all('dd'):
        playerDict['draftYear'] = soup.find('a',{'data-reactid':'162'}).text.encode('ascii','ignore').strip()
        playerDict['college'] = soup.find('dd',{'data-reactid':'170'}).text.encode('ascii','ignore').strip()

    # Retrieve player measurables
    playerInfo = {}
    measureablesTable = soup.find('tbody')
    rows = measureablesTable.find_all('tr')
    for row in rows:
        columns = row.find_all('td')
        keyName = getVariableName(str(columns[0].text))
        if keyName == 'height':
            playerDict['heightFeet'] = columns[1].text.encode('ascii', 'ignore').replace("\'","").split(' ')[0]
            playerDict['heightInches'] = columns[1].text.encode('ascii', 'ignore').replace("\'","").split(' ')[1].replace('"','')
        else:
            value = fractionCheck(columns[1].text)
            playerDict[keyName] = value

    # remove unnecessary formatting from measurable values
    playerDict = cleanUpData(playerDict)

In [14]:
def retrievePlayerURL(soup,linkList):
    playerLinks = soup.find_all('a', {'class':'list-group-item list-group-item-action justify-content-between d-flex'})
    for link in playerLinks:
        linkList.append('https://www.mockdraftable.com' +link['href'])
    return linkList

In [15]:
def soupifyURL(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content,'lxml')
    return soup

In [27]:
# establish default header information
headers = {"User-agent":
           "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"}

# create a list of all the positions we'll be scraping
positionList = ['QB','FB','HB','WR','TE','OT','OG','OC','ST','DT','DE','EDGE','ILB','OLB','SS','FS','CB']
player_URL_List = []

#  Use the instructions found here to install PhantomJS on Ubuntu:
#       https://www.vultr.com/docs/how-to-install-phantomjs-on-ubuntu-16-04

# Open a PhantomJS web browser and direct it to the DEA's dropbox search page
#browser = webdriver.PhantomJS()
binary = FirefoxBinary('C:\\Program Files\\Mozilla Firefox\\firefox.exe')
browser = webdriver.Firefox(firefox_binary=binary, executable_path=r'E:\\Projects\\geckodriver.exe')
#browser = webdriver.Firefox()
browser.implicitly_wait(100)

WebDriverException: Message: Process unexpectedly closed with status 0


In [28]:
# Iterate through every position we want to scrape
for position in positionList:
    # make the first version of the URL
    url = makeURL(position)
    browser.get(url)
    pageStatus = [0, 1]    
    print(position)
    
    # Iterate through every subsequent page in the position group
    while (pageStatus[0] < pageStatus[1]):
        soup = soupifyURL(browser.current_url)
        pageStatus = pageNumberStatus(soup)
        #print('Current Page: ' + str(pageStatus[0]) + ', Next Page: ' + str(pageStatus[1]))
        
        player_URL_List = retrievePlayerURL(soup, player_URL_List)

        # advance the page
        advancePage(browser)
        time.sleep(3)

QB


KeyboardInterrupt: 

In [182]:
len(player_URL_List)

6405

In [437]:
# NEED TO FIX THIS SECTION
# RUNNING INTO AN ISSUE WITH INCORRECTLY SCRAPING:  DRAFT YEAR and SCHOOL

# MAKE SURE TO ADD IN POSITION AND PLAYER LINK

soup = soupifyURL('https://www.mockdraftable.com//player/aaron-brooks?position=QB')
playerDict = createPlayerDict()
dd = soup.find_all('dd')
print(dd[0].text.split('\n')[0])
#layerDict['draftYear'] = dd[0].text.encode('ascii','ignore').strip()
#layerDict['college'] = dd[2].text.encode('ascii','ignore').strip()
#playerDict

# Retrieve basic player information
#playerName = soup.find('div',{'class':'mb-0 mt-1 h3 align-bottom playerbar-name'}).text.encode('ascii','ignore').strip()
#playerDict['nameFirst'] = playerName.split(' ')[0]
#playerDict['nameLast'] = playerName.split(' ')[1]
print(soup.find('a',{'data-reactid':'162'}))
#playerDict['draftYear'] = soup.find('a',{'data-reactid':'162'}).text.encode('ascii','ignore').strip()
#playerDict['college'] = soup.find('dd',{'data-reactid':'170'}).text.encode('ascii','ignore').strip()

1999
None


In [405]:
playerList = []
urlCount = 0
for url in player_URL_List:
    soup = soupifyURL(url)
    print(url)
    playerList.append(retrievePlayerInfo(soup))
    if (urlCount%100==0): print(urlCount)
    urlCount+=1

https://www.mockdraftable.com//player/aj-mccarron?position=QB
0
https://www.mockdraftable.com//player/aaron-brooks?position=QB


AttributeError: 'NoneType' object has no attribute 'text'