<a href="https://colab.research.google.com/github/frankwillard/NBA-Trade-Evaluation-Model/blob/main/Trade_Data_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import string
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import bs4
import re
from datetime import date

In [7]:
test_url = 'https://www.basketball-reference.com/leagues/NBA_2021_transactions.html'
test_html = urlopen(test_url)
test_soup = BeautifulSoup(test_html, features="lxml")

In [8]:
transactions_list = test_soup.find_all('ul', {'class' : 'page_index'})[0]

Trades that parser needs to learn to deal with:


Multiteam deals:

Ex: In a 3-team trade, the Brooklyn Nets traded Saddiq Bey, Jaylen Hands, Dzanan Musa and a 2021 2nd round draft pick (JT Thor was later selected) to the Detroit Pistons; the Brooklyn Nets traded Jay Scrubb to the Los Angeles Clippers; the Detroit Pistons traded Bruce Brown to the Brooklyn Nets; the Detroit Pistons traded Luke Kennard, Justin Patton, a 2023 2nd round draft pick, a 2024 2nd round draft pick, a 2025 2nd round draft pick and a 2026 2nd round draft pick to the Los Angeles Clippers; the Los Angeles Clippers traded Reggie Perry and Landry Shamet to the Brooklyn Nets; and the Los Angeles Clippers traded Rodney McGruder and cash to the Detroit Pistons. 2023 2nd-rd pick is POR own 2024 2nd-rd pick is DET's own 2025 2nd-rd pick is DET's own 2026 2nd-rd pick is DET's own 2021 2nd-rd pick is TOR own


Trades with extra info at end:

Ex: The Milwaukee Bucks traded a 2022 2nd round draft pick (Max Christie was later selected) and a 2026 2nd round draft pick to the Orlando Magic for a 2020 2nd round draft pick (Jordan Nwora was later selected). 2026 2nd-rd pick is MIL own 2022 2nd-rd pick is IND own and could have been either 2022, 2023, or 2024



In [9]:
def multiple_replace(dict, text):
  """
  Applies multiple replaces in string based on dictionary
  Args:
      dict ([type]): Dictionary with keys as phrase to be replaced, vals as phrase to replace key
      text ([type]): Text to apply string replaces to
  Returns:
      [type]: [description]
  """
  # Create a regular expression  from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text) 

In [10]:
def parse_larger_trades(trade_string, num_teams):

  #Does not work as regex can be superset and thus combine trades (first trade has team_1 traded __, final trade has to the team_x)

  #for m in re.finditer(basic_pattern, trade_string, re.S):
  #  print(m.group('team_1'))

  statements = trade_string[trade_string.find('the'):].split('; ')
  basic_pattern = re.compile(r'the (?P<team_1>.*) traded (?P<assets_1>.*) to the (?P<team_2>.*)')

  team_lost_assets = {}
  team_earned_assets = {}

  for statement in statements:
    #and the Portland Trail Blazers traded Mario Hezonja to the Memphis Grizzlies. 
    statement = statement.replace("and the", "the")

    #Naive way to overcome:
    #2023 2nd-rd pick is more favorable of HOU and less favorable of MEM and DAL picks 2025 2nd-rd pick is MEM own
    end_of_sentence = statement.find(".")
    if end_of_sentence > 1:
      statement = statement[:end_of_sentence]


    match = basic_pattern.match(statement)
    if match != None:
      team_1 = match.group('team_1')
      team_2 = match.group('team_2')
      assets_1 = match.group('assets_1')

      assets_1_list = assets_1.replace(' and', ',').split(', ')

      team_lost_assets[team_1] = team_lost_assets.get(team_1, []) + assets_1_list
      team_earned_assets[team_2] = team_earned_assets.get(team_2, []) + assets_1_list
    else:
      print("case not covered for statement: ")
      print(statement)

    return team_lost_assets, team_earned_assets


In [11]:
def parse_simpler_trades(trade_string):
  basic_pattern = re.compile(r'The (?P<team_1>.*) traded (?P<assets_1>.*) to the (?P<team_2>.*) for (?P<assets_2>.*).')
  match = basic_pattern.match(trade_string)

  team_1 = match.group('team_1')
  team_2 = match.group('team_2')
  assets_1 = match.group('assets_1')
  assets_2 = match.group('assets_2')

  team_lost_assets = {}
  team_earned_assets = {}

  assets_1_list = assets_1.replace(' and', ',').split(', ')
  assets_2_list = assets_2.replace(' and', ',').split(', ')

  team_lost_assets[team_1] = assets_1_list
  team_lost_assets[team_2] = assets_2_list
  team_earned_assets[team_2] = assets_1_list
  team_earned_assets[team_1] = assets_2_list

  return team_lost_assets, team_earned_assets 

In [14]:
def parse_trade(trade_string):

  #Regex group names

  print(trade_string)

  three_plus_team_pattern = re.compile(r"In a (?P<num_teams>\d)-team trade, ")

  if three_plus_team_pattern.match(trade_string) :
    num_teams = int(three_plus_team_pattern.match(trade_string).group('num_teams'))
    team_lost_assets, team_earned_assets = parse_larger_trades(trade_string, num_teams)
  else:
    team_lost_assets, team_earned_assets = parse_simpler_trades(trade_string)

  print('Lost assets: ')
  print(team_lost_assets)

  print('Earned assets: ')
  print(team_earned_assets)

  return team_lost_assets, team_earned_assets

In [16]:
j=0
for li in transactions_list.find_all('li'):
  date = li.find('span').get_text()
  trade_string = li.find('p').get_text()

  if j == 10:
    break

  if 'trade' in trade_string:
    j+=1
    parse_trade(trade_string)


The Los Angeles Lakers traded Danny Green and Jaden McDaniels to the Oklahoma City Thunder for Dennis Schroder.
Lost assets: 
{'Los Angeles Lakers': ['Danny Green', 'Jaden McDaniels'], 'Oklahoma City Thunder': ['Dennis Schroder']}
Earned assets: 
{'Oklahoma City Thunder': ['Danny Green', 'Jaden McDaniels'], 'Los Angeles Lakers': ['Dennis Schroder']}
The Charlotte Hornets traded a 2024 2nd round draft pick to the New Orleans Pelicans for Nick Richards. 2024 2nd -rd pick is CHO's own
Lost assets: 
{'Charlotte Hornets': ['a 2024 2nd round draft pick'], 'New Orleans Pelicans': ["Nick Richards. 2024 2nd -rd pick is CHO's ow"]}
Earned assets: 
{'New Orleans Pelicans': ['a 2024 2nd round draft pick'], 'Charlotte Hornets': ["Nick Richards. 2024 2nd -rd pick is CHO's ow"]}
The Cleveland Cavaliers traded Jordan Bell and Alfonzo McKinnie to the Los Angeles Lakers for JaVale McGee and a 2026 2nd round draft pick. 2026 2nd-rd pick is LAL own Los Angeles also received a trade exception.
Lost assets:

TypeError: ignored

Need to be able to parse multiple players for both teams

2 for 1:
The {Team 1} traded {Player 1} and {Player 2} to the {Team 2} for {Player 3}

3+ for 1:
The {Team 1} traded {Player 1}, {Player 2}, and {Player 3} to the {Team 2} for {Player 4}

1 for 2:
The {Team 1} traded {Player 1} to the {Team 2} for {Player 2} and {Player 3}

1 for 3+:
The {Team 1} traded {Player 1} to the {Team 2} for {Player 2}, {Player 3}, {Player 4}


--->

The _____ traded _________ to the _____ for ________

Replace "and" with "," and split the blanks into commas

Need to extract individual elements:

Team 1
Team 2
List of players/assets from team 1
List of players/assets from team 2

Need to then remove irrelevant assets

Multiple team trades

In [46]:
trade_string = "In a 3-team trade, the Boston Celtics traded Desmond Bane to the Memphis Grizzlies; the Boston Celtics traded Enes Kanter to the Portland Trail Blazers; the Memphis Grizzlies traded a 2023 2nd round draft pick and a 2025 2nd round draft pick to the Boston Celtics; the Memphis Grizzlies traded cash to the Portland Trail Blazers; and the Portland Trail Blazers traded Mario Hezonja to the Memphis Grizzlies. 2023 2nd-rd pick is more favorable of HOU and less favorable of MEM and DAL picks 2025 2nd-rd pick is MEM own"

In [47]:
statements = trade_string[trade_string.find('the'):].split('; ')

In [48]:
three_plus_team_pattern = re.compile(r"In a (?P<num_teams>\d)-team trade, ")
num_teams = int(three_plus_team_pattern.match(trade_string).group('num_teams'))

In [73]:
three_plus_team_pattern.match(trade_string)

<re.Match object; span=(0, 19), match='In a 3-team trade, '>

In [49]:
basic_pattern = re.compile(r'the (?P<team_1>.*) traded (?P<assets_1>.*) to the (?P<team_2>.*)')

In [68]:
team_lost_assets = {}
team_earned_assets = {}

In [69]:
for statement in statements:
  
  #and the Portland Trail Blazers traded Mario Hezonja to the Memphis Grizzlies. 
  statement = statement.replace("and the", "the")

  #Naive way to overcome:
  #2023 2nd-rd pick is more favorable of HOU and less favorable of MEM and DAL picks 2025 2nd-rd pick is MEM own
  end_of_sentence = statement.find(".")
  if end_of_sentence > 1:
    statement = statement[:end_of_sentence]


  match = basic_pattern.match(statement)
  if match != None:
    team_1 = match.group('team_1')
    team_2 = match.group('team_2')
    assets_1 = match.group('assets_1')

    assets_list = assets_1.replace(' and', ',').split(', ')

    team_lost_assets[team_1] = team_lost_assets.get(team_1, []) + assets_list
    team_earned_assets[team_2] = team_earned_assets.get(team_2, []) + assets_list
  else:
    print(statement)

In [70]:
team_lost_assets

{'Boston Celtics': ['Desmond Bane', 'Enes Kanter'],
 'Memphis Grizzlies': ['a 2023 2nd round draft pick',
  'a 2025 2nd round draft pick',
  'cash'],
 'Portland Trail Blazers': ['Mario Hezonja']}

In [72]:
team_earned_assets

{'Memphis Grizzlies': ['Desmond Bane', 'Mario Hezonja'],
 'Portland Trail Blazers': ['Enes Kanter', 'cash'],
 'Boston Celtics': ['a 2023 2nd round draft pick',
  'a 2025 2nd round draft pick']}

In [35]:
#match = basic_pattern.match(trade_string)

In [82]:

#Does not work as regex can be superset and thus combine trades (first trade has team_1 traded __, final trade has to the team_x)

#for m in re.finditer(basic_pattern, trade_string, re.S):
#  print(m.group('team_1'))