# Initial Data Collection

In [1]:
import pandas as pd
import time
from datetime import datetime
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [2]:
teams = pd.read_csv('Data/team_names.csv')

teams['abbr1'] = teams['abbr1'].str.lower()
teams['abbr2'] = teams['abbr2'].str.lower()

teams['nickname'] = teams.apply(lambda x: x['full_name'][x['full_name'].rfind(' ') + 1:], axis = 1)

teams['abbr1'][teams['abbr1'] == 'arz'] = 'crd'
teams['abbr1'][teams['abbr1'] == 'blt'] = 'rav'
teams['abbr1'][teams['abbr1'] == 'clv'] = 'cle'
teams['abbr1'][teams['abbr1'] == 'hst'] = 'htx'
teams['abbr1'][teams['abbr1'] == 'wsh'] = 'was'

positions = ['QB', 'RB', 'FB', 'WR1', 'WR2', 'TE', 'LT', 'LG', 'C ', 'RG', 'RT', 
             'LDE', 'LDT', 'RDT', 'RDE', 'LLB', 'MLB', 'RLB', 'LCB', 'RCB', 'SS', 'FS',
             'PR', 'KR']

yr_list = [2018, 2019, 2020, 2021]

In [3]:
game_dates = pd.read_csv('Data/spreadspoke_scores.csv')

game_dates['schedule_date'] = pd.to_datetime(game_dates['schedule_date'], format = '%m/%d/%Y')

game_dates = game_dates[game_dates['schedule_date'] >= '2018-06-01']

game_dates = game_dates[['schedule_date', 'team_home']]

In [4]:
team_dates = teams.merge(game_dates, left_on = 'full_name', right_on = 'team_home')

team_dates = team_dates[['abbr1', 'schedule_date', 'nickname']]

team_dates_dict = dict()

for t in team_dates['abbr1'].unique():
    
    team_dates_dict[t] = dict()
    
    team_dates_dict[t]['nickname'] = team_dates[team_dates['abbr1'] == t]['nickname'].unique()
    
    team_dates_dict[t]['dates'] = team_dates[team_dates['abbr1'] == t]['schedule_date'].unique()

In [5]:
def starting(abbreviation, team_dates):
        
    lineup_dict = dict()
    
    # Initialize the connection
    
    for dts in team_dates[abbreviation]['dates']:
    
        date = str(dts.astype('datetime64[D]'))
        
        print (dts)

        date = date[:4] + date[5:7] + date[8:10]

        url = 'https://www.pro-football-reference.com/boxscores/' + date + '0' + abbreviation + '.htm'

        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=chrome_options)
        driver.get(url)
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find location of starter data

        count = 0

        table_loc = 0

        for s in soup.find_all(class_ = 'content_grid'):

            if s.get_text().find(team_dates[abbreviation]['nickname'][0] +  ' Starters') != -1:

                table_loc = count

            else:

                count += 1

        # Extract player list

        start_ind = 0

        parse_obj = soup.find_all(class_ = 'content_grid')[table_loc].prettify()

        player_list = []

        start_player = 1

        while start_player != -1:

            start_player = parse_obj.find('.htm"')
            end_player = parse_obj.find('</a')

            player_name = parse_obj[start_player + 6: end_player]

            player_list.append(player_name)

            start_ind = end_player + 3

            parse_obj = parse_obj[start_ind:]
        
        lineup_dict[dts] = dict()
        
        lineup_dict[dts]['home'] = player_list[:22]
        
        lineup_dict[dts]['away'] = player_list[22:-1]
    
    return lineup_dict
    

In [None]:
lineups = dict()

for abr in team_dates_dict:
    
    lineups[abr] = starting(abr, team_dates_dict)

2018-09-09T00:00:00.000000000


  driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=chrome_options)


In [None]:
url = 'https://www.pro-football-reference.com/boxscores/201809090min.htm'

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=chrome_options)
driver.get(url)
time.sleep(20)

soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
count = 0

table_loc = 0

for s in soup.find_all(class_ = 'content_grid'):
    
    if s.get_text().find('Vikings Starters') != -1:
    
        table_loc = count
        
    else:
        
        count += 1
    

In [None]:
start_ind = 0

parse_obj = soup.find_all(class_ = 'content_grid')[table_loc].prettify()

player_list = []

start_player = 1

while start_player != -1:
    
    start_player = parse_obj.find('.htm"')
    end_player = parse_obj.find('</a')
            
    player_name = parse_obj[start_player + 6: end_player]
    
    player_list.append(player_name)
    
    start_ind = end_player + 3
        
    parse_obj = parse_obj[start_ind:]

In [None]:
player_list

# Misc

In [None]:
def starting(team, years, positions):
    
#     Initialize the connection
    
    url = 'https://www.pro-football-reference.com/teams/' + team + '/lineups.htm'

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=chrome_options)
    driver.get(url)
    time.sleep(20)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    page = soup.find(id = 'div_starting_lineups').get_text()
    
#     Find the parts of the text that are associated with each year
    
    yr_dict = dict()
    
    for yr in years:
    
        yr_dict[yr] = page.find(str(yr))
        
    table_dict = dict()

    for yr in years[::-1]:

        start_yr = years[yr]

        try:
            end_yr = years[yr-1]

        except:
            end_yr = 9999999999999

        table_dict[yr] = page[start_yr:end_yr]
        
#         Parse each year's table
        
    player_year = dict()
        
    for t in table_dict:
        
        player_info = table_dict[t].replace('*', '').replace('+', '')
                
        str_pos = []

        for p in positions:

            if p == 'WR1' and p != 'WR2':

                wr1 = player_info.find('WR')
                wr2 = player_info.rfind('WR')

                str_pos.append(wr1)
                str_pos.append(wr2)

            elif p != 'WR1' and p != 'WR2':

                str_pos.append(player_info.find(p))

        player_dict = dict()

        for p in range(len(off_str_pos)):

            if p < (len(off_str_pos) - 1):

                player_dict[positions[p]] = (player_info[str_pos[p]: str_pos[p + 1]])

                if positions[p] == 'WR1' or positions[p] == 'WR2':

                    player_dict[positions[p]] = player_dict[positions[p]].replace('WR', '').strip()

                else:

                    player_dict[positions[p]] = player_dict[positions[p]].replace(positions[p], '').strip()

            else:

                player_dict[positions[p]] = player_info[str_pos[p]:].replace(positions[p], '').strip()
                
        player_year[t] = player_dict
        
    return player_year


In [None]:
import requests

In [None]:
certificate_folder = '/usr/local/share/ca-certificates/'

!wget -P $certificate_folder https://artifactory.chrobinson.com/artifactory/automated-software-storage/ca-certificates/CHR_root.crt

In [None]:
# Initialize results dictionary

intermediate_dwell_dict = dict()

# Certificate location

certificate_location = certificate_folder + '/CHR_root.crt'

# Create list to loop through needs to be more than 1 port

unloc_list = ['USLAX', 'USLGB', 'USNYC']

for unloc in :

    new_api = requests.get(url = 'https://port-dwell-vis.api.chrazure.cloud/port_dwell/' + unloc, 
                           verify = certificate_location)
            
    intermediate_dwell_dict[unloc] =  new_api.json()
    
new_data = pd.DataFrame.from_dict(intermediate_dwell_dict).transpose()  # creates dataframe with specified data

new_data.to_csv(r"new_FILEPATH") #saves data to a csv to import to PBI Dashboard

In [None]:
!wget -P /usr/local/share/ca-certificates/ https://artifactory.chrobinson.com/artifactory/automated-software-storage/ca-certificates/CHR_root.crt

In [None]:
!wget -P /usr/local/share/ca-certificates/  https://artifactory.chrobinson.com/artifactory/automated-software-storage/ca-certificates/CHR_intermediate.crt

In [None]:
!apt-get -y install ca-certificates && apt-get clean && update-ca-certificates

In [None]:
# resp = requests.get('https://port-dwell-vis.api.chrazure.cloud/port_dwell/USLAX', 
#                      verify=['/usr/local/share/ca-certificates/CHR_intermediate.crt',
#                           '/usr/local/share/ca-certificates/CHR_root.crt'])

resp = requests.get('https://port-dwell-vis.api.chrazure.cloud/port_dwell/USLAX', 
                     verify='/usr/local/share/ca-certificates/CHR_root.crt')
