# NBA recommendation engine

### URLs
- https://github.com/swar/nba_api/tree/master/docs/nba_api/stats/endpoints
- https://github.com/seemethere/nba_py/wiki/stats.nba.com-Endpoint-Documentation#parameters-39
- https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/commonplayerinfo.md

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

In [2]:
from nba_api.stats.static import players
from nba_api.stats.endpoints import commonplayerinfo

## List of all active players

In [3]:
# get_players returns a list of dictionaries, each representing a player.
nba_players = pd.DataFrame(players.get_active_players())
print(f'Number of players fetched: {len(nba_players)}')

Number of players fetched: 519


In [4]:
nba_players

Unnamed: 0,id,full_name,first_name,last_name,is_active
0,203500,Steven Adams,Steven,Adams,True
1,1628389,Bam Adebayo,Bam,Adebayo,True
2,200746,LaMarcus Aldridge,LaMarcus,Aldridge,True
3,1629734,Kyle Alexander,Kyle,Alexander,True
4,1629638,Nickeil Alexander-Walker,Nickeil,Alexander-Walker,True
...,...,...,...,...,...
514,201152,Thaddeus Young,Thaddeus,Young,True
515,1629027,Trae Young,Trae,Young,True
516,203469,Cody Zeller,Cody,Zeller,True
517,1627790,Ante Zizic,Ante,Zizic,True


In [6]:
def adj_position(pos):
    if pos[0] in ['C', 'F', 'G']:
        return pos[0]
    else:
        print('Please enter a valid position.')
        pass

## Preparation of dataset (time-consuming, only executed once)

### Get all career data for all active players

In [None]:
from nba_api.stats.endpoints import playercareerstats
import time

stats = list()

for player in tqdm(nba_players['id']):
    time.sleep(.600)
    call_career = playercareerstats.PlayerCareerStats(player_id=player) 
    # how to select data more efficiently, i.e. only last season?
    stats.append(call_career.get_data_frames()[0])

In [None]:
stats_df = pd.concat(stats)
stats_df.to_csv('playercareerstats.csv', index = False)

### Subset only last season data & add positions of players

In [None]:
playercareerstats = pd.read_csv('playercareerstats.csv')

In [None]:
stats_lastSeason = playercareerstats[playercareerstats['SEASON_ID'] == '2020-21'].reset_index().drop(columns=['index'])

In [None]:
# add positions of players

import time
positions = []
for i in tqdm(range(stats_lastSeason.shape[0])):
    time.sleep(.600)
    positions.append(commonplayerinfo.CommonPlayerInfo(stats_lastSeason['PLAYER_ID'][i]).get_data_frames()[0]['POSITION'][0])
print(len(positions))

from collections import Counter
Counter([adj_position(position) for position in positions])

stats_lastSeason['POSITION'] = [adj_position(position) for position in positions]

## Read data

In [44]:
#stats_lastSeason.to_csv('stats_lastSeason.csv', index = False)
stats_lastSeason = pd.read_csv('stats_lastSeason.csv')

In [45]:
# handle case that players where active for multiple teams last season -> only keep "TOT" (total) column

stats_LS = []
for player_id in stats_lastSeason["PLAYER_ID"].unique():
    stats_player = stats_lastSeason[stats_lastSeason["PLAYER_ID"] == player_id]
    if stats_player.shape[0] == 1:
        stats_LS.append(stats_player)
    else:
        stats_LS.append(stats_player[stats_player["TEAM_ABBREVIATION"] == 'TOT'])
        
stats_lastSeason = pd.concat(stats_LS).reset_index(drop = True)

In [46]:
stats_lastSeason

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,POSITION
0,203500,2020-21,0,1610612740,NOP,27.0,58,58,1605.0,189,...,213,301,514,111,54,38,78,113,438,C
1,1628389,2020-21,0,1610612748,MIA,23.0,64,64,2143.0,456,...,142,431,573,346,75,66,169,145,1197,C
2,200746,2020-21,0,0,TOT,35.0,26,23,674.0,140,...,19,99,118,49,11,29,27,47,352,C
3,1629638,2020-21,0,1610612740,NOP,22.0,46,13,1007.0,192,...,13,131,144,102,47,22,69,88,508,G
4,1628960,2020-21,0,1610612763,MEM,25.0,50,38,1260.0,173,...,19,141,160,108,46,8,48,71,532,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,1626153,2020-21,0,0,TOT,29.0,63,39,1748.0,240,...,65,204,269,278,101,30,83,75,645,G
431,201152,2020-21,0,1610612741,CHI,33.0,68,23,1652.0,370,...,168,255,423,291,74,40,137,152,823,F
432,1629027,2020-21,0,1610612737,ATL,22.0,63,63,2125.0,487,...,38,207,245,594,53,12,261,111,1594,G
433,203469,2020-21,0,1610612766,CHA,28.0,48,21,1005.0,181,...,119,209,328,86,27,17,51,121,451,F


## Salaries

In [15]:
url = 'https://www.spotrac.com/nba/cap/'

r = requests.get(url, timeout=2.5)
r_html = r.text

soup = BeautifulSoup(r_html, 'html.parser')

salary_table = soup.find('table')

In [28]:
allr=[salary_table.find_all("td")[i].text.strip() for i in range(0,length)]


In [36]:
soup

<!DOCTYPE html >

<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en"> <![endif]-->
<!--[if IE 9]>    <html class="no-js ie9 oldie" lang="en"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<!-- start:global -->
<meta charset="utf-8"/>
<!-- end:global -->
<!-- start:page title -->
<title>NBA 2021-2022  Cap Tracker | Spotrac</title>
<!-- end:page title -->
<!-- start:meta info -->
<meta content="2021-2022,NBA, salary cap, space, team, rankings" name="keywords">
<meta content="A real-time look at the salary cap totals for each NBA team, including estimated cap space, positional spending, and cap types." name="description"/>
<meta content="NBA 2021-2022  Cap Tracker" property="og:title">
<meta content="Spotrac.com" property="og:site_name"/>
<meta content="https://d1dglpr230r57l.cloudfront.net/im

In [33]:
print(soup.find('class'))

None


In [34]:
df = pd.DataFrame(columns = ["Rank"])
n_col = 12

df.iloc[:,0] = [salary_table.find_all("td")[i].text.strip() for i in range(0,n_col-1)]
#for i in range(12):
    
#df = pd.DataFrame()

IndexError: iloc cannot enlarge its target object

In [29]:
allr

['1',
 'Oklahoma City ThunderOKC',
 '0.333',
 '15',
 '22.9',
 '$49,812,064',
 '$21,204,432',
 '$28,374,688',
 '$87,841,594',
 '$24,572,406',
 '$34,227,248',
 '',
 '2',
 'Memphis GrizzliesMEM',
 '0.600',
 '15',
 '23.7',
 '$97,935,345',
 '$39,210,321',
 '$16,157,252',
 '$115,761,775',
 '$-3,347,775',
 '$-1,678,597',
 '',
 '3',
 'New Orleans PelicansNOP',
 '0.091',
 '15',
 '24.6',
 '$114,929,039',
 '$55,467,800',
 '0 -',
 '$116,418,104',
 '$-4,004,104',
 '$-2,515,039',
 '',
 '4',
 'Charlotte HornetsCHA',
 '0.417',
 '15',
 '24.6',
 '$106,223,133',
 '$59,830,263',
 '$10,680,972',
 '$116,904,105',
 '$-4,490,105',
 '$-4,490,105',
 '',
 '5',
 'New York KnicksNYK',
 '0.636',
 '15',
 '25.9',
 '$111,024,770',
 '$52,367,977',
 '$6,431,666',
 '$117,456,436',
 '$-5,042,436',
 '$-5,320,205',
 '',
 '6',
 'San Antonio SpursSAS',
 '0.300',
 '15',
 '24.5',
 '$99,508,279',
 '$44,797,451',
 '$18,404,679',
 '$117,912,958',
 '$-5,498,958',
 '$-5,498,958',
 '',
 '7',
 'Detroit PistonsDET',
 '0.111',
 '16',
 '

In [16]:
length=len(salary_table.find_all("td"))

index = [salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]

column1=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column2=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]

In [27]:
[salary_table.find_all("td")[i].text.strip() for i in range(0,length,12)]

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30']

In [17]:
dict_df={'player_names':player_names,
        '2019/20':column1,
        '2020/21':column2,
        '2021/22':column3,
        '2022/23':column4,
        '2023/24':column5,
        '20124/25':column6}
        
df_salary=pd.DataFrame(dict_df)

In [21]:
salary_table

<table class="datatable rtable captracker">
<thead>
<tr>
<th class="center">Rank</th>
<th class="player">Team</th>
<th class="player">Win%</th>
<th class="center"><span class="info" title="Active Players">Signed</span></th>
<th class="center"><span class="info" title="Active Players">Avg Age</span></th>
<th class="center"><span class="info" title="The total cap $ for a team">Active  Cap</span></th>
<th class="center"><span class="info" title="The total cap $ for top 3 players for a team">Active Cap (Top 3)</span></th>
<th class="center"><span class="info" title="The total cap $ allocated to dead contracts">Dead Cap</span></th>
<th class="center"><span class="info" title="The total cap $ for a team">Total Cap</span></th>
<th class="center">Cap Space*</th>
<th class="center">Projected Practical<br/>Cap Space**</th>
<th class="center">Hard-Capped</th>
</tr>
</thead>
<tbody>
<tr class="parent">
<td class="center">1</td>
<td class="player noborderleft">
<a href="https://www.spotrac.com/redi

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

url='https://hoopshype.com/salaries/players/'


# for teams:
# https://www.spotrac.com/nba/cap/

In [2]:
r = requests.get(url, timeout=2.5)
r_html = r.text

soup = BeautifulSoup(r_html, 'html.parser')

salary_table = soup.find('table')

In [3]:
length=len(salary_table.find_all("td"))

player_names=[salary_table.find_all("td")[i].text.strip() for i in range(9,length,8)]

column1=[salary_table.find_all("td")[i].text.strip() for i in range(10,length,8)]
column2=[salary_table.find_all("td")[i].text.strip() for i in range(11,length,8)]
column3=[salary_table.find_all("td")[i].text.strip() for i in range(12,length,8)]
column4=[salary_table.find_all("td")[i].text.strip() for i in range(13,length,8)]
column5=[salary_table.find_all("td")[i].text.strip() for i in range(14,length,8)]
column6=[salary_table.find_all("td")[i].text.strip() for i in range(15,length,8)]

In [4]:
dict_df={'player_names':player_names,
        '2019/20':column1,
        '2020/21':column2,
        '2021/22':column3,
        '2022/23':column4,
        '2023/24':column5,
        '20124/25':column6}
        
df_salary=pd.DataFrame(dict_df)

In [5]:
df_salary

Unnamed: 0,player_names,2019/20,2020/21,2021/22,2022/23,2023/24,20124/25
0,Stephen Curry,"$45,780,966","$48,070,014","$51,915,615","$55,761,217","$59,606,817",$0
1,James Harden,"$44,310,840","$47,366,760",$0,$0,$0,$0
2,John Wall,"$44,310,840","$47,366,760",$0,$0,$0,$0
3,Russell Westbrook,"$44,211,146","$47,063,478",$0,$0,$0,$0
4,LeBron James,"$41,180,544","$44,474,988",$0,$0,$0,$0
...,...,...,...,...,...,...,...
529,Troy Williams,"$122,741",$0,$0,$0,$0,$0
530,Jabari Parker,"$100,000",$0,$0,$0,$0,$0
531,Mamadi Diakite,"$100,000",$0,$0,$0,$0,$0
532,Demetrius Jackson,"$92,857","$92,857","$92,857",$0,$0,$0


In [6]:
df_salary = df_salary.rename(columns={"20124/25": "2024/25"})

In [7]:
salary_years = ['2019/20', '2020/21', '2021/22', '2022/23', '2023/24', '2024/25']

In [9]:
for col in salary_years:
    df_salary[col] = df_salary[col].apply(lambda x: x.replace('$', '').replace(',', '')).astype(float)
    
# + also discount as in scratchpad.ipynb to get NPV?

In [10]:
df_salary

Unnamed: 0,player_names,2019/20,2020/21,2021/22,2022/23,2023/24,2024/25
0,Stephen Curry,45780966.0,48070014.0,51915615.0,55761217.0,59606817.0,0.0
1,James Harden,44310840.0,47366760.0,0.0,0.0,0.0,0.0
2,John Wall,44310840.0,47366760.0,0.0,0.0,0.0,0.0
3,Russell Westbrook,44211146.0,47063478.0,0.0,0.0,0.0,0.0
4,LeBron James,41180544.0,44474988.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
529,Troy Williams,122741.0,0.0,0.0,0.0,0.0,0.0
530,Jabari Parker,100000.0,0.0,0.0,0.0,0.0,0.0
531,Mamadi Diakite,100000.0,0.0,0.0,0.0,0.0,0.0
532,Demetrius Jackson,92857.0,92857.0,92857.0,0.0,0.0,0.0


## Class definition

In [42]:
class RecommendationEngine:
    def __init__(self, replacing_player):
        self.player_name = replacing_player        
        try:
            self.player_id = nba_players[nba_players["full_name"] == replacing_player]['id'].iloc[0]
        except KeyError:
            print("Please provide the full name of a valid active NBA player.")
        self.position = adj_position(commonplayerinfo.CommonPlayerInfo(self.player_id).get_data_frames()[0]['POSITION'][0])
            
    def recommend(self):
        stats = stats_lastSeason[(stats_lastSeason['POSITION'] == self.position)].drop(columns=['POSITION'])
        stats_repl_player = stats[stats['PLAYER_ID'] == self.player_id].iloc[:,5:]
        stats_num = stats.iloc[:,5:]
        
        if stats_repl_player.shape[0] != 0:
        
            # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
            model = NearestNeighbors(n_jobs = -1).fit(stats_num)

            res_players = model.kneighbors(stats_repl_player, return_distance = False)[0]
            res_player_0, res_player_1 = res_players[0], res_players[1]
            if stats.reset_index()['PLAYER_ID'][res_player_0] == self.player_id:
                print("Clustering worked.")
                
            res_player_id = stats.reset_index()['PLAYER_ID'][res_player_1]
            rec_player = nba_players[nba_players['id'] == res_player_id]['full_name'].iloc[0]

            print('Input Player:', self.player_name,'\nRecommended Player:', rec_player)
            return rec_player
        
        print("No data available for this player in the last season")
        pass
    
    def player_salary(self.)

In [13]:
def player_salary(player_name):
    return df_salary[df_salary['player_names'] == player_name]

def team_salary(team_name):
    

In [14]:
player_salary("Stephen Curry")

Unnamed: 0,player_names,2019/20,2020/21,2021/22,2022/23,2023/24,2024/25
0,Stephen Curry,45780966.0,48070014.0,51915615.0,55761217.0,59606817.0,0.0


## Exemplary execution

In [43]:
r1 = RecommendationEngine("Joel Embiid").recommend()

Clustering worked.
Input Player: Joel Embiid 
Recommended Player: Karl-Anthony Towns


# TO-DO

- duplicate values of individual players in data for last season?
- Total-Zeile rausschmeißen?
- nur effiziente, alternativer pick
- welcher am besten mit anderen 4 Spielern passt —> machen evtl mehr punkte, assists als typischer spieler auf der position (10 most similar players) —> brauchen als neuen spieler jmd der mehr assists macht
- Salary zu NBA Daten mergen
- Fragen für Jonathan 
- more complex neighbor model
- how to treat players with no data in last season
- models vorher berechnen & nur most similar player rausgeben?
- visualization of distance of single player?

In [None]:
stats_lastSeason.columns # -> adjust data by dividing by number of games?