In [1]:
# Add a check if we run in google colab or locally in jupyter notebook
run_in_colab = False
if 'google.colab' in str(get_ipython()):
    run_in_colab = True
    print('Running on Colab')
else:
    print('Running locally on Jupyter')

Running on Colab


# New Section

In [2]:
# Mount drive in google colab
if run_in_colab:
    from google.colab import drive
    drive.mount('/content/drive')
else:  # Set local path 
    data_path = "/Users/elkysandor/Desktop/huji yr3/static lab/"

Mounted at /content/drive


In [3]:
if run_in_colab:
    from google.colab import files
    uploaded = files.upload()

Saving nba_data_20.csv to nba_data_20.csv
Saving nba_data_21.csv to nba_data_21.csv


In [4]:
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import plotly.graph_objects as go
import numpy as np  # a module for working with numerical array 
import pandas as pd  # a module for working with data-frames
import statsmodels.api as sm  # a module for statistical modelling (e.g. regression analysis)
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from scipy.optimize import nnls
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
import datetime
import scipy
import time
import sys
import io

  import pandas.util.testing as tm


In [5]:


def date_to_datetime(df):
  df.rename(columns={'game date': 'date'},inplace=True)
  df.date = pd.to_datetime(df.date,dayfirst=True)

def design_data_frame(df):
  to_drop = ["team name","game visitor_team_score","team division", "game_id",
             "minute_in_game", "player_fouls", "game home_team_id", "game home_team_score",
             "game period","game postseason","game season","game status","game time",
             "game visitor_team_id","player height_feet","player height_inches","player team_id",
             "player height_feet","player height_inches","player team_id","player weight_pounds",
             "team id","team full_name","team city","team abbreviation","team conference","player"
             ,"game id","player_defensive_rebounds","player_offensive_rebounds","player_field_three_pointer_percentage",
             "player_field_three_pointers_attempted","player_free_throw_percentage","player_field_goals_percentage",
             "player first_name","player last_name"]
  df["name"] = df["player first_name"].astype(str) + " " + df["player last_name"].astype(str)
  df.drop(columns = to_drop,inplace=True)  

def create_time_interval(weeks):
  week1 = pd.interval_range(start=pd.Timestamp(weeks[0]),end=pd.Timestamp(weeks[1]),periods=1)
  before_all_star = pd.interval_range(start=pd.Timestamp(weeks[1]),end=pd.Timestamp(weeks[2]),freq="7D")
  all_star_week = pd.interval_range(start=pd.Timestamp(weeks[2]),end=pd.Timestamp(weeks[3]),periods=1)
  after_all_star = pd.interval_range(start=pd.Timestamp(weeks[3]),end=pd.Timestamp(weeks[4]),freq="7D")
  all_intervals = pd.IntervalIndex([interval for weeks in [week1,before_all_star,all_star_week,after_all_star] for interval in weeks])
  return all_intervals


def fantasy_weeks(df, intervals):
  df["fantasy_week"] = pd.cut(df.date,intervals)
  bucket_dict = {week:int(val+1) for val,week in enumerate(df.fantasy_week.unique())}
  df["week_bucket"] = df.fantasy_week.map(bucket_dict)
  df.dropna(subset=["fantasy_week"],inplace=True)

def calculate_data(df):
  to_filter = ["date", "player id", "fantasy_week"]
  numeric_buckets = df.loc[:,~(df.columns.isin(to_filter))]
  agg_df = numeric_buckets.groupby(by = ["week_bucket","name"]).agg(sum)
  agg_df["fg_perc"] = agg_df[["player_field_goal_attempts","player_field_goal_made"]].apply(lambda x : (x.player_field_goal_made/x.player_field_goal_attempts)*100 if x.player_field_goal_attempts != 0 else 0,axis=1)
  agg_df["ft_perc"] = agg_df[["player_free_throws_attempted","player_free_throws_made"]].apply(lambda x : (x.player_free_throws_made/x.player_free_throws_attempted)*100 if x.player_free_throws_attempted != 0 else 0,axis=1)
  # agg_df.drop(columns=["player_field_goal_attempts", "player_free_throws_attempted"],inplace=True) 
  return agg_df

def cdf_per_col(col):
  func_col = col.groupby(level=0).agg(ECDF)
  return pd.Series([func_col.loc[i](col.loc[i,:]).round(4) for i in range(1,24)])


def precentages_dat(df,cdf_df):
  for i,j in [("fg_perc","player_field_goal_attempts"),("ft_perc","player_free_throws_attempted")]:
    perc_rating = cdf_df[i] * df[j].values
    avg =(perc_rating.loc[df[j]!=0]).median()
    perc_rating.loc[df[j]==0] = avg
    col_name = '_'.join(j.split("_")[1:3]) + "_rating"
    df[col_name] = perc_rating
  return df

def normalized_df(df,drop_normalized=None):
  to_normalize = df.groupby(level=0).max()
  df.turnover = to_normalize.turnover-df.turnover
  normalized = df.div(to_normalize,level = 0)
  normalized = normalized*100
  wnated_norm = normalized.drop(columns = drop_normalized)
  return wnated_norm

def by_time_rating(df,period : str):
  df_mean = df.mean(axis=1)
  if period == "W" or period == "w":
    final = df_mean.groupby(level=[0,1]).agg(np.mean).sort_values(ascending=False)
  elif  period == "Y" or period == "y":  
    final = df_mean.groupby(level=1).agg(np.mean).sort_values(ascending=False)
  else :
    print("please choose y for year or w for week")
    raise ValueError()   
  return final


def pipeline(df,time ="y"):
  agg_stat = calculate_data(df)
  perc_cdf = agg_stat[["fg_perc","ft_perc"]].apply(cdf_per_col,axis=0)
  perc_cdf_expand = perc_cdf.explode(list(perc_cdf.columns)).set_index(agg_stat.index.get_level_values(1),append=True)
  new_idx = [(i+1,name)for i,name in perc_cdf_expand.index]
  perc_cdf_expand.index = pd.MultiIndex.from_tuples(new_idx)
  final_agg = precentages_dat(agg_stat,perc_cdf_expand)
  norm_final_agg = normalized_df(final_agg, ["fg_perc","player_field_goal_attempts","ft_perc","player_free_throws_attempted","player_field_goal_made","player_free_throws_made"])
  final_rating = by_time_rating(norm_final_agg,time)
  return norm_final_agg , final_rating


def all_player_stats(df):
  part_a = df.groupby(level=1).agg(np.mean)
  part_b = df[['field_goal_rating','free_throws_rating']].groupby(level=1).agg(np.mean)
  player_stats = pd.concat([part_a,part_b],axis=1)
  return (player_stats / player_stats.max(axis=0)) * 100

def create_spider_plot(df,name):
  categories = (df.loc[name].index).str.replace("player_", "")
  fig = go.Scatterpolar(name=name,r=df.loc[name],theta=categories)
  player_mean = df.loc[name].mean()
  # fig.update_layout(title=name + f"<br><sup>{round(player_mean,2)} </sup> ")
  # fig.layout.update(polar=dict(radialaxis=dict(visible=True,range=[0, 1])))
  return fig

In [6]:
df_2021_nba = pd.read_csv(io.BytesIO(uploaded['nba_data_21.csv']),  encoding = 'iso-8859-8')
df_2020_nba = pd.read_csv(io.BytesIO(uploaded['nba_data_20.csv']),  encoding = 'iso-8859-8')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
key_dates_2021 = ["2021-10-18", "2021-10-24", "2022-02-13", "2022-02-27", "2022-04-03"]
# key_dates_2020 = ["2020-12-21", "2020-12-27", "2021-02-28", "2021-03-14", "2021-05-09"]
date_to_datetime(df_2021_nba)
# date_to_datetime(df_2020_nba)
weeks_2021 = create_time_interval(key_dates_2021) # dates for 2021 - 23 weeks
# weeks_2020 = create_time_interval(key_dates_2020) # dates for 2020 - 19 weeks
fantasy_weeks(df_2021_nba, weeks_2021)
# fantasy_weeks(df_2020_nba, weeks_2020)

In [8]:
guards_2021 = df_2021_nba[(df_2021_nba["player position"] == "G") | (df_2021_nba["player position"] == "F-G") | (df_2021_nba["player position"] == "G-F") & (df_2021_nba["player position"].notnull())]
forwards_2021 = df_2021_nba[(df_2021_nba["player position"] != "G") & (df_2021_nba["player position"] != "C") & (df_2021_nba["player position"].notnull())]
centers_2021 = df_2021_nba[(df_2021_nba["player position"] == "C") | (df_2021_nba["player position"] == "F-C") | (df_2021_nba["player position"] == "C-F") & (df_2021_nba["player position"].notnull())]
design_data_frame(df_2021_nba)
design_data_frame(guards_2021)
design_data_frame(forwards_2021)
design_data_frame(centers_2021)
# design_data_frame(df_2020_nba)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
norm_final_rating_2021 , final_rating_2021 = pipeline(df_2021_nba)
norm_final_rating_g_2021 , final_rating_g_2021 = pipeline(guards_2021)
norm_final_rating_f_2021 , final_rating_f_2021 = pipeline(forwards_2021)
norm_final_rating_c_2021 , final_rating_c_2021 = pipeline(centers_2021)
# norm_final_rating_2020 , final_rating_2020 = pipeline(df_2020_nba)

In [30]:
# norm_final_rating_2021.groupby(level=1).agg(np.mean)
season_final_df = all_player_stats(norm_final_rating_2021)
top_10 = season_final_df.mean(axis=1).nlargest(10)
sub_plots(top_10,2,5,season_final_df)

In [None]:
# season_final_df.loc["Jayson Tatum"].index

In [14]:
def sub_plots(df,nrow,ncol,full_df):
  fig = make_subplots(rows=nrow, cols=ncol,specs=[[{'type': 'polar'}]*ncol]*nrow,vertical_spacing = 0.2,shared_xaxes="all")
  count = 0
  for i in range(1,ncol+1):
    for j in range(1,nrow+1):
      fig.add_trace(create_spider_plot(full_df,df.index[count]),row=j,col=i)
      count+=1
  fig.update_traces(fill='toself')    
  fig.show()

In [29]:
fig = px.line_polar(all_player_stats(norm_final_rating_2021).loc["LeBron James"].reset_index(),r='LeBron James',theta='index',range_r=[0,100], line_close=True)
a = all_player_stats(norm_final_rating_2021).loc["LeBron James"].mean()
fig.update_layout(title = "LeBron James" + f"<br><sup>rating : {round(a,2)} </sup> ")
fig.update_traces(fill='toself')
fig.show()


In [None]:
# (norm_final_rating_2021.loc[1,"Devin Booker"].reset_index()).index())

single player simulation

In [23]:
agg_stat = calculate_data(df_2021_nba)
agg_stat = agg_stat.drop(columns = ["player_field_goal_made", "player_free_throws_made", "player_free_throws_attempted", "player_field_goal_attempts"])
agg_stat.reset_index(level='name', inplace=True)
cartesian_product_game_sim = agg_stat.assign(key=1).merge(agg_stat.assign(key=1), how='left', on = ['key', 'week_bucket'])
cartesian_product_game_sim = cartesian_product_game_sim.loc[cartesian_product_game_sim["name_x"] != cartesian_product_game_sim["name_y"]]

In [25]:
def individual_fantasy(df):
  df["assists_x_won"] = df[["player_assists_x","player_assists_y"]].apply(lambda x : 1 if x.player_assists_x > x.player_assists_y else 0 if x.player_assists_x < x.player_assists_y else 0.5,axis=1)
  df["blocks_x_won"] = df[["player_blocks_x","player_blocks_y"]].apply(lambda x : 1 if x.player_blocks_x > x.player_blocks_y else 0 if x.player_blocks_x < x.player_blocks_y else 0.5,axis=1)
  df["three_points_x_won"] = df[["player_field_three_pointers_made_x","player_field_three_pointers_made_y"]].apply(lambda x : 1 if x.player_field_three_pointers_made_x > x.player_field_three_pointers_made_y else 0 if x.player_field_three_pointers_made_x < x.player_field_three_pointers_made_y else 0.5,axis=1)
  df["points_x_won"] = df[["player_points_x","player_points_y"]].apply(lambda x : 1 if x.player_points_x > x.player_points_y else 0 if x.player_points_x < x.player_points_y else 0.5,axis=1)
  df["rebounds_x_won"] = df[["player_rebounds_x","player_rebounds_y"]].apply(lambda x : 1 if x.player_rebounds_x > x.player_rebounds_y else 0 if x.player_rebounds_x < x.player_rebounds_y else 0.5,axis=1)
  df["steals_x_won"] = df[["player_steals_x","player_steals_y"]].apply(lambda x : 1 if x.player_steals_x > x.player_steals_y else 0 if x.player_steals_x < x.player_steals_y else 0.5,axis=1)
  df["turnover_x_won"] = df[["turnover_x","turnover_y"]].apply(lambda x : 1 if x.turnover_x < x.turnover_y else 0 if x.turnover_x > x.turnover_y else 0.5,axis=1)
  df["fg_perc_x_won"] = df[["fg_perc_x","fg_perc_y"]].apply(lambda x : 1 if x.fg_perc_x > x.fg_perc_y else 0 if x.fg_perc_x < x.fg_perc_y else 0.5,axis=1)
  df["ft_x_won"] = df[["ft_perc_x","ft_perc_y"]].apply(lambda x : 1 if x.ft_perc_x > x.ft_perc_y else 0 if x.ft_perc_x < x.ft_perc_y else 0.5,axis=1)
  return df

In [26]:
individual_sim = individual_fantasy(cartesian_product_game_sim)

In [27]:
individual_sim = individual_sim.get(['name_x','assists_x_won','blocks_x_won', 'three_points_x_won','points_x_won','rebounds_x_won','steals_x_won','turnover_x_won','fg_perc_x_won','ft_x_won'])
individual_sim.set_index('name_x',inplace=True)
individual_sim = individual_sim.sum(axis=1)
individual_fantasy_final = individual_sim.groupby(level =0).agg(np.sum).sort_values(ascending=False)
individual_fantasy_final.nlargest(20)

name_x
Nikola Jokic             99095.5
Miles Bridges            98131.5
Karl-Anthony Towns       97668.0
Giannis Antetokounmpo    96966.5
Desmond Bane             96964.5
Jayson Tatum             96688.0
Mikal Bridges            96301.5
Tyrese Haliburton        95964.5
LaMelo Ball              95409.5
Joel Embiid              94577.0
Dejounte Murray          94350.0
Jaren Jackson Jr.        94246.5
LeBron James             93556.5
Nikola Vucevic           93503.0
Franz Wagner             93203.0
Jonas Valanciunas        92654.0
Anthony Edwards          92500.0
Derrick White            92439.5
Terry Rozier             92417.0
Dorian Finney-Smith      92359.5
dtype: float64

In [28]:
ecdf_df = agg_stat.apply(cdf_per_col,axis=0)
ecdf_nba = ecdf_df.explode(list(ecdf_df.columns)).set_index(agg_stat.index.get_level_values(1),append=True)
ecdf_nba = ecdf_nba.astype(float)
# ecdf_nba["player_free_throws_made_weight"] = pd.cut(ecdf_nba.player_free_throws_made,4)
# ecdf_nba["free_throw"] = 
groupby_ft = (ecdf_nba.player_free_throws_made.groupby(level=0)).quantile([0.25,.5,.75,1])
# md = pd.qcut(groupby_ft,4,labels=[0,0.3,0.65,1]).astype(float)
# fg = pd.qcut(groupby_ft,4,labels=[0,0.3,0.65,1]).astype(int)
# ecdf_nba["free_throw"] = ecdf_nba.ft_perc*0.7 + md*0.3
# ecdf_nba["field_goal"] = ecdf_nba.fg_perc*0.7 +fg*0.3
groupby_ft

IndexingError: ignored

In [None]:
ecdf_nba.astype(np.float)

In [None]:
ecdf = ECDF([1, 2, 3, 4])
ecdf([1,2,3,4])

In [None]:
import plotly
print(plotly.__version__)