# Lacrosse Statistics

Lacrosse rankings are dominated by RPI, but that metric is outdated.
What if there were a better ranking system?

In [2]:
import requests as reqs
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime

In [4]:
def remove_record(str):
  """Strip a trailing parenthesised suffix (e.g. a record) from a string.

  Returns everything before the last ' (' in `str`; when `str` contains no
  ' (' it is returned unchanged.
  """
  matched = re.match(r'^(.*) \(', str)
  if matched is None:
    # nothing to strip
    return str
  return matched.group(1)


# Browser-like User-Agent header passed to the stats.ncaa.org, usila.org and
# insidelacrosse.com requests made later in this notebook.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

In [18]:
# getting laxELO data
#
# Scrapes https://lacrossereference.com/teams/a0001 ... a0075 and builds df_elo with
# the columns 'team', 'elo' and 'date': for every team one starting-ELO row dated
# 2022-01-28, followed by one row with the absolute ELO after each listed game.

# DataFrame.append was removed in pandas 2.0, so the per-team frames are collected
# in a list and concatenated once after the loop.
elo_frames = []

for i in range(1, 76):
  team_url = 'https://lacrossereference.com/teams/a00%02d' % (i)
  try:
    req = reqs.get(team_url)
    soup = BeautifulSoup(req.content)

    # Scraping the team's name first
    # the team name is in a div with a unique set of CSS classes so it can be easily pulled out with beautiful soup
    team_str = soup.find(class_='col-12 font-24 bold').get_text()
    # that div contains the team's record as well, so remove_record isolates the name
    team = remove_record(team_str)

    # Scraping ELO data
    # The ELO table is unfortunately not an HTML table, but we can grab the div by its unique ID
    elo_table_string = soup.find(id='elo_change_table').get_text()

    # the table lists a starting ELO, and then the change in ELO after each game
    # here we pull out the starting ELO by targeting its div
    starting_elo = int(soup.find_all(class_="no-padding dtop")[1].get_text())

    # thanks to beautiful soup the rows of the table are separated by 2 new lines and the columns by a single new line
    # splitting on rows and columns then throwing out rows/columns that do not contain data gives a 2D array that pandas can turn into a dataframe
    elo_table = pd.DataFrame([x.split('\n')[1:] for x in elo_table_string.split('\n\n')[3:-1]])

    # name columns, drop unnecessary ones and add a team name column
    elo_table.columns = ['opponent', 'score', 'date', 'elo']
    elo_table = elo_table.drop(['opponent', 'score'], axis=1)
    elo_table['team'] = team

    # convert the date column to datetimes (the page lists month and day only, so the year is added)
    elo_table['date'] = pd.to_datetime(elo_table['date'] + ' 2022', format='%b %d %Y')

    # the ELO column is the game to game change, so the running total on top of
    # the starting ELO turns it into an absolute ELO
    elo_table['elo'] = pd.to_numeric(elo_table['elo']).cumsum() + starting_elo

    # row for the initial elo on 1/28, the day before the first game of the season
    initial = pd.DataFrame([{
      'team': team,
      'elo': starting_elo,
      'date': pd.to_datetime('2022-01-28'),
    }])

    # the initial row goes first so each team's rows stay in chronological order
    elo_frames.append(initial)
    elo_frames.append(elo_table)

  except Exception as err:
    # best effort: skip teams whose page cannot be fetched or parsed, but show why
    print('Problem fetching data from: "%s" (%s: %s)' % (team_url, type(err).__name__, err))

# pd.concat raises on an empty list, so fall back to an empty frame
df_elo = pd.concat(elo_frames, ignore_index=True) if elo_frames else pd.DataFrame()

Problem fetching data from: "https://lacrossereference.com/teams/a0021"
Problem fetching data from: "https://lacrossereference.com/teams/a0024"


Checking those links, they appear to be for teams that were in D1 at one point but are no longer in D1 this year.

In [19]:
# games data
#
# Walks the stats.ncaa.org scoreboard day by day from 2022-01-29 until today and
# builds df_games with two rows per game (one from each team's point of view):
# 'date', 'opponent', 'outcome' (1 = win, 0 = loss) and 'team'.

# DataFrame.append was removed in pandas 2.0, so rows are collected as dicts and
# turned into a DataFrame once after the loop.
game_records = []

for date in pd.date_range('2022-01-29', datetime.now()):
  try:
    # the year comes from the date itself so that days after 2022 do not silently
    # re-fetch the 2022 scoreboard under a wrong date
    url = f'https://stats.ncaa.org/season_divisions/17822/scoreboards?utf8=%E2%9C%93&season_division_id=&game_date={date.month}%2F{date.day}%2F{date.year}&conference_id=0&tournament_id=&commit=Submit'

    req = reqs.get(url, headers=headers)
    games_table = pd.read_html(req.content)[0]

    # the table is read in groups of 5 rows per game: team 1 is taken from row i
    # and team 2 from row i + 3 (where the score sits in the 'Teams.1' column)
    for i in range(0, len(games_table), 5):
      team1 = remove_record(games_table.loc[i, 'Teams.1'])
      team2 = remove_record(games_table.loc[i + 3, 'Teams'])
      team1_score = int(games_table.loc[i, 'Score.1'])
      team2_score = int(games_table.loc[i + 3, 'Teams.1'])

      # let win = 1 and loss = 0 to make manipulation easy later
      team1_outcome = 1 if team1_score > team2_score else 0
      team2_outcome = 1 if team2_score > team1_score else 0

      game_records.append({
        'date': date,
        'opponent': team2,
        'outcome': team1_outcome,
        'team': team1,
      })
      game_records.append({
        'date': date,
        'opponent': team1,
        'outcome': team2_outcome,
        'team': team2,
      })

  except Exception as err:
    # best effort: a day whose page cannot be parsed is treated as a day without
    # games; the error is shown so real failures are not mistaken for empty days
    print('No games on ' + date.isoformat()[:10] + ' (%s: %s)' % (type(err).__name__, err))

# explicit columns keep the frame usable even when no game was found
df_games = pd.DataFrame(game_records, columns=['date', 'opponent', 'outcome', 'team'])




No games on 2022-01-30
No games on 2022-01-31
No games on 2022-02-01
No games on 2022-02-02
No games on 2022-02-03
No games on 2022-02-07
No games on 2022-02-08
No games on 2022-02-10
No games on 2022-02-14
No games on 2022-02-17
No games on 2022-02-19
No games on 2022-02-24
No games on 2022-02-25
No games on 2022-03-03
No games on 2022-03-07
No games on 2022-03-17
No games on 2022-03-21
No games on 2022-03-23
No games on 2022-03-24
No games on 2022-03-25
No games on 2022-03-28
No games on 2022-03-30
No games on 2022-03-31
No games on 2022-04-02
No games on 2022-04-04
No games on 2022-04-06
No games on 2022-04-07


In [71]:
# USILA
# rankings through week 9
#
# Downloads the weekly USILA coaches poll pages plus the preseason poll and builds
# df_usila with the columns 'team', 'rank' and 'date'.

# month number -> month name as it is spelled in the poll URLs
month_mappings = {
  2: 'feb',
  3: 'march',
  4: 'april'
}

# weekly poll dates starting 2022-02-07; weeks that fall outside 2022 or in a month
# without a known URL spelling are skipped instead of raising a KeyError
# NOTE(review): extend month_mappings if polls exist for later months
dates = [
  date for date in pd.date_range('2022-02-07', datetime.today(), freq='7D')
  if date.year == 2022 and date.month in month_mappings
]
urls = [f'https://usila.org/news/2022/{date.month}/{date.day}/mens-lacrosse-usila-dynamic-2022-mens-coaches-division-i-poll-week-of-{month_mappings[date.month]}-{date.day}.aspx' for date in dates]

# the preseason poll has its own URL and date
urls += ['https://usila.org/news/2022/2/1/mens-lacrosse-usila-dynamic-2022-mens-coaches-division-i-poll-preseason.aspx']
dates += [pd.to_datetime('2022-02-01')]

# DataFrame.append was removed in pandas 2.0, so the polls are collected in a list
# and concatenated once after the loop
usila_frames = []

for url, date in zip(urls, dates):

  req = reqs.get(url, headers=headers)

  df = pd.read_html(req.content)[0]

  # name the five columns of the poll table, then keep only team and rank
  df.columns = ['team', 'rank', 'record', 'points', 'last']
  df = df.drop(['record', 'points', 'last'], axis=1)

  # strip a trailing parenthesised suffix from the team name
  df['team'] = df['team'].apply(remove_record)

  df['date'] = date

  usila_frames.append(df)

# each poll keeps its own row index, as before; pd.concat raises on an empty list
df_usila = pd.concat(usila_frames) if usila_frames else pd.DataFrame()

df_usila

Unnamed: 0,team,rank,date
0,Virginia,1,2022-02-07
1,Duke,2,2022-02-07
2,Maryland,3,2022-02-07
3,Georgetown,4,2022-02-07
4,North Carolina,4,2022-02-07
...,...,...,...
15,Johns Hopkins,16,2022-02-01
16,Drexel,17,2022-02-01
17,Princeton,18,2022-02-01
18,Delaware,19,2022-02-01


In [19]:
# Inside Lacrosse

def get_date(str):
  """Translate a poll label into the date used for that poll.

  'Preseason' maps to 2022-01-24. Any other label must end in the week number N
  (presumably of the form 'Week N' - confirm against the site's dropdown) and
  maps to 2022-02-14 plus N weeks.

  Raises ValueError when the label is neither 'Preseason' nor ends in a number.
  """
  preseason_date = '2022-01-24'
  first_week = '2022-02-14'
  label = str.strip()
  if label == 'Preseason':
    return pd.to_datetime(preseason_date)
  # read the whole trailing number, not just the last character, so that
  # two-digit weeks ('Week 10') are handled correctly
  week_match = re.search(r'(\d+)$', label)
  if week_match is None:
    raise ValueError('cannot read a week number from poll label %r' % (str,))
  weeks_past = int(week_match.group(1))
  return pd.to_datetime(first_week) + pd.Timedelta(weeks_past, 'W')

# The polls overview page links to every weekly poll from a dropdown menu
x = reqs.get('https://www.insidelacrosse.com/league/di/polls/2022', headers=headers)
soup = BeautifulSoup(x.content)

# the second element with class 'dropdown-menu' is the one whose entries are read:
# each entry's href is the poll page and its text is the label handed to get_date
tag = soup.find_all(class_='dropdown-menu')
base_url = 'https://www.insidelacrosse.com'
urls_with_date = [((base_url + link['href']), get_date(link.text)) for link in tag[1].contents]

# DataFrame.append was removed in pandas 2.0, so the polls are collected in a list
# and concatenated once after the loop
ilax_frames = []

for url, date in urls_with_date:
  x = reqs.get(url, headers=headers)
  # the first table on the page is read; its columns sit under the
  # "Men's Division I Media Poll" header
  rankings = pd.read_html(x.content)[0]["Men's Division I Media Poll"]
  tidyer = pd.DataFrame()
  tidyer['team'] = rankings['Team'].apply(remove_record)
  tidyer['rank'] = rankings['Rank']
  tidyer['date'] = date

  ilax_frames.append(tidyer)

# each poll keeps its own row index, as before; pd.concat raises on an empty list
df_ilax = pd.concat(ilax_frames) if ilax_frames else pd.DataFrame()

df_ilax

Unnamed: 0,team,rank,date
0,Maryland,1,2022-04-11
1,Georgetown,2,2022-04-11
2,Princeton,3,2022-04-11
3,Rutgers,4,2022-04-11
4,Yale,5,2022-04-11
...,...,...,...
15,Delaware,16,2022-01-24
16,Vermont,17,2022-01-24
17,Drexel,18,2022-01-24
18,Johns Hopkins,19,2022-01-24


Unnamed: 0,index,team,rank,date
0,0,Maryland,1,2022-04-11
1,1,Georgetown,2,2022-04-11
2,2,Princeton,3,2022-04-11
3,3,Rutgers,4,2022-04-11
4,4,Yale,5,2022-04-11
...,...,...,...,...
175,15,Delaware,16,2022-01-24
176,16,Vermont,17,2022-01-24
177,17,Drexel,18,2022-01-24
178,18,Johns Hopkins,19,2022-01-24


In [None]:
# setting up to compare
def get_elo_before(team, date):
  """Return the 'elo' value of the last df_elo row for `team` dated before `date`.

  "Last" refers to row order in df_elo, not to the latest date. Raises
  IndexError when df_elo has no row for `team` strictly before `date`.
  """
  is_team = df_elo['team'] == team
  is_earlier = df_elo['date'] < date
  earlier_rows = df_elo[is_team & is_earlier]
  last_row = earlier_rows.iloc[-1]
  return last_row['elo']

def get_rpi_before(team, date):
  """Placeholder for an RPI lookup; not implemented yet, always returns None."""
  return None

# example team and cut-off date used for the comparison
team = 'Maryland'
date = datetime.today()

# every df_games row recorded from this team's point of view, and whom they played
games_played = df_games.loc[df_games['team'] == team]
opponents = games_played['opponent']

# outcome is coded 1 = win / 0 = loss in the games cell, so this sum is meant as the win count
games_played['outcome'].sum()