# NHL Roster Optimization
### This notebook will perform the following:
* Build a database of game and roster data from the past 10 years.
* Perform statistical analysis on player traits and their affect on the game results.
* Use machine learning to predict an "ideal" championship roster.
* "Ideal" roster will be made up of 20 synthetic players
* Synthetic team will be compared to an active team that is most closely coordinated to the synthetic team.

In [1]:
# Dependencies
## Data cleaning and plotting
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

## Database and webscraping
import pymongo
import os
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser

#### Setting up Mongo Database for NHL roster data

In [3]:
# Connect to MongoDB default port
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [8]:
# Defining database and collection
nhl_db = client.nhlstats_db
collection = nhl_db.articles

print(client.list_database_names())

['admin', 'config', 'local', 'nhlstats_db']


In [9]:
# Check if database exists
dblist = client.list_database_names()
if "nhlstats_db" in dblist:
    print('This database exists.')

This database exists.


#### Gather NHL historical data.

In [23]:
# Function to collect links for each season over the past decade.
base_url = 'https://www.hockey-reference.com/leagues'
season_prefix = 'https://www.hockey-reference.com'

# Using Splinter to navigate site and find image url 
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

browser.visit(base_url)
html = browser.html
nhl_soup = bs(html, 'html.parser')

##### Function to pull urls for each NHL season from the past 10 years, excluding the current season.

In [24]:
# Empty list to store season url's 
seasons = []

def get_season_urls(nhl_soup):
    """ Querying through html to find each season's url."""
    league_index = nhl_soup.find('div', {'id': 'all_league_index'})
    containers = league_index.find('div', {'class': 'table_outer_container'})
    tbody = containers.find('tbody')
    trs = tbody.find_all('tr')[:11]
    
    for tr in trs:
        """ Joining the url prefix with the desired url suffix."""
        season_url = season_prefix + tr.find('a')['href']
        seasons.append(season_url)
#     print(seasons)

season_links = get_season_urls(nhl_soup)
""" Assigning HTML links from seasons 09-10 to 18-19 """
desired_seasons = seasons[1:11:1]
desired_seasons

['https://www.hockey-reference.com/leagues/NHL_2019.html',
 'https://www.hockey-reference.com/leagues/NHL_2018.html',
 'https://www.hockey-reference.com/leagues/NHL_2017.html',
 'https://www.hockey-reference.com/leagues/NHL_2016.html',
 'https://www.hockey-reference.com/leagues/NHL_2015.html',
 'https://www.hockey-reference.com/leagues/NHL_2014.html',
 'https://www.hockey-reference.com/leagues/NHL_2013.html',
 'https://www.hockey-reference.com/leagues/NHL_2012.html',
 'https://www.hockey-reference.com/leagues/NHL_2011.html',
 'https://www.hockey-reference.com/leagues/NHL_2010.html']

##### Scan through each season and pull the link from every team during that season.

In [35]:
# Empty list to store team url's
teamYears = []

# Chromedriver
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# def get_teamSeason_urls(nhl_soup):
for link in desired_seasons:
    browser.visit(link)
    html = browser.html
    soup = bs(html, 'html.parser')

    """ Query the tbody """
    eas_clone = nhl_soup.find('div', {'id': 'div_standings_EAS_clone'})
    tbody = soup.find('tbody')

    """ Query the table """
    trs = tbody.find_all('tr', {'class', 'full_table'})
#     ths = trs.find_all('th')
#     a = trs.find('a')['href']

#       team_urls = teamYears.append(a)
    for tr in trs:
        """ Joining the url prefix with the desired url suffix."""
#         teamSeasons = desired_seasons + tr.find('a')['href']
#         seasons.append(teamSeasons)
        tr.find('a')['href']
#     print(seasons)
    print(tr)
        
# get_teamSeason_urls(nhl_soup)

<tr class="full_table" data-row="17"><th class="left" csk="72" data-stat="team_name" scope="row"><a href="/teams/NJD/2019.html">New Jersey Devils</a></th><td class="right" data-stat="games">82</td><td class="right" data-stat="wins">31</td><td class="right" data-stat="losses">41</td><td class="right" data-stat="losses_ot">10</td><td class="right" data-stat="points">72</td><td class="right" data-stat="points_pct">.439</td><td class="right" data-stat="goals">222</td><td class="right" data-stat="opp_goals">275</td><td class="right" data-stat="srs">-0.61</td><td class="right" data-stat="sos">0.04</td><td class="right" data-stat="points_pct_old">.384</td><td class="right" data-stat="ro_wins">28</td><td class="right" csk="65" data-stat="reg_rec">24-41-17</td><td class="right" data-stat="points_pct_reg">.396</td></tr>
<tr class="full_table" data-row="17"><th class="left" csk="77" data-stat="team_name" scope="row"><a href="/teams/NYR/2018.html">New York Rangers</a></th><td class="right" data-st