In [8]:
# dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sqlalchemy import create_engine, inspect, func, distinct
import sqlite3
from sqlite3 import Error

In [9]:
def zscore(df, sample_size=130, min_games=10):
    """Compute 9-category z-scores and rank players by their average z-score.

    Every z-score is measured against a reference group of "top" players:
    the rows with ``TOP == 1`` when ``df`` already has a ``TOP`` column
    (e.g. predicted by the logistic model), otherwise the ``sample_size``
    rows with the highest ``VORP``. In the second case a ``TOP`` column is
    added (1 for the selected rows, 0 for the rest).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Player, G, FG, FGA, FT, FTA, 3P, PTS, TRB, AST, STL,
        BLK and TOV, plus VORP when there is no TOP column. The frame passed
        in is left untouched; a new frame is returned.
    sample_size : int
        How many players to pick by VORP. Ignored when ``TOP`` is present.
    min_games : int
        Rows whose ``G`` is below this value are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        Input rows plus adj_FG, adj_FT, the nine z-score columns and zAVG,
        sorted by zAVG descending. The index is the 1-based position in that
        ordering and is assigned *before* the games filter, so the index of
        the returned rows can have gaps.

    Raises
    ------
    ValueError
        If the reference group of top players is empty.
    """
    if "TOP" in df.columns:
        # Copy so the new columns are not written into the caller's frame.
        df = df.copy()
        # Explicit copy: assigning columns to a filtered slice is what
        # triggers pandas' SettingWithCopyWarning.
        top_players = df[df["TOP"] == 1].copy()
    else:
        top_players = df.sort_values("VORP", ascending=False).head(sample_size).copy()
        top_players["TOP"] = 1
        # Outer merge on Player: every row sharing a flagged Player name gets
        # TOP=1; fillna(0) turns the missing flags (and any other NaN) into 0.
        df = pd.merge(top_players[["Player", "TOP"]], df, on="Player", how="outer").fillna(0)

    # Use the number of players actually selected, so the per-player averages
    # below stay correct even when fewer than `sample_size` rows exist.
    sample_size = len(top_players)
    if sample_size == 0:
        raise ValueError("zscore needs at least one top player to build the reference group")

    # calculate adjusted percentages: each player's makes/attempts are padded
    # with 10x the average top player's makes/attempts before dividing
    padding = {
        col: 10 * (top_players[col].sum() / sample_size)
        for col in ("FG", "FGA", "FT", "FTA")
    }
    for frame in (top_players, df):
        frame["adj_FG"] = (padding["FG"] + frame["FG"]) / (padding["FGA"] + frame["FGA"])
        frame["adj_FT"] = (padding["FT"] + frame["FT"]) / (padding["FTA"] + frame["FTA"])

    # calculate z-scores against the top players' mean and standard deviation
    higher_is_better = {
        "zFG": "adj_FG",
        "zFT": "adj_FT",
        "z3P": "3P",
        "zPTS": "PTS",
        "zREB": "TRB",
        "zAST": "AST",
        "zSTL": "STL",
        "zBLK": "BLK",
    }
    for z_col, stat in higher_is_better.items():
        df[z_col] = (df[stat] - top_players[stat].mean()) / top_players[stat].std()
    # turnovers are inverted: fewer turnovers give a higher z-score
    df["zTOV"] = (top_players["TOV"].mean() - df["TOV"]) / top_players["TOV"].std()

    # average z-score; summed column by column so NaN propagates as before
    z_columns = list(higher_is_better) + ["zTOV"]
    z_total = df[z_columns[0]]
    for z_col in z_columns[1:]:
        z_total = z_total + df[z_col]
    df["zAVG"] = z_total / 9

    # rank by avg z-score
    df = df.sort_values("zAVG", ascending=False).reset_index(drop=True)
    df.index += 1

    # exclude players with less than a set amount of games (default 10);
    # copy so callers can add columns to the result without a warning
    df = df[df["G"] >= min_games].copy()

    return df

In [10]:
def log_regression(season, roster_size = 13, num_teams = 10, min_games = 10):
    """Train a top-player classifier on one season and rank the following one.

    The model is fitted on ``Resources/{season}_{season+1}.csv``, where the
    ``TOP`` label comes from ``zscore`` (top ``roster_size * num_teams``
    players by VORP). It then predicts ``TOP`` for the rows of
    ``Resources/{season+1}_{season+2}.csv``, and ``zscore`` is re-run with the
    predicted top players as the reference group. The ranking is written to
    ``Resources/log_rank_{season+2}_{season+3}.csv``.

    Parameters
    ----------
    season (int): the first season is used to train the model to predict top
        players for the next year
    roster_size (int): number of players per team in the league
    num_teams (int): number of teams in the fantasy league
    min_games (int): minimum number of games to include player on chart

    Returns
    -------
    pandas.DataFrame
        The ranked next-season frame, with the index named "Rank".
    """
    # columns that are identifiers/labels rather than model features
    non_feature_cols = ["Player", "TOP", "Rookie", "Pos", "G", "VORP"]
    sample_size = roster_size*num_teams

    # The training frame keeps zscore's default games threshold so the fitted
    # model does not change with the chart threshold.
    train_df = zscore(pd.read_csv(f"Resources/{season}_{season+1}.csv"), sample_size)

    # Assign X (data) and y (target)
    X = train_df.drop(non_feature_cols, axis=1)
    y = train_df["TOP"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    print(f"Training Data Score: {classifier.score(X_train, y_train)}")
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

    predictions = classifier.predict(X_test)
    print(f"First 10 Predictions:   {predictions[:10]}")
    print(f"First 10 Actual labels: {y_test[:10].tolist()}")

    next_season = f"{season+1}_{season+2}"
    next_df = pd.read_csv(f"Resources/{next_season}.csv")

    # First pass builds the same feature columns the model was trained on.
    # min_games is forwarded here and below; previously it was accepted by
    # this function but never reached zscore, so it had no effect.
    next_df = zscore(next_df, sample_size, min_games)
    X_next = next_df.drop(non_feature_cols, axis=1).fillna(0)

    # assign() returns a new frame instead of writing into a filtered slice
    next_df = next_df.assign(TOP=classifier.predict(X_next))
    print(f"z-score calculated with {next_df['TOP'].sum()} top players")

    # Second pass: z-scores relative to the predicted top players.
    next_df = zscore(next_df, sample_size, min_games)
    next_df.index.name = "Rank"
    next_df.to_csv(f"Resources/log_rank_{season+2}_{season+3}.csv", index=True)

    return next_df

In [11]:
# fit on Resources/2016_2017.csv and keep the ranked frame that comes back
df = log_regression(season=2016)

Training Data Score: 0.924924924924925
Testing Data Score: 0.8571428571428571
First 10 Predictions:   [1. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
First 10 Actual labels: [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0]
z-score calculated with 135.0 top players


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [12]:
# connect to the project's SQLite database
engine = create_engine("sqlite:///db/NBA_Data.sqlite")
conn = engine.connect()

# list existing tables through the inspector; Engine.table_names() is
# deprecated in SQLAlchemy 1.4 and removed in 2.0
inspect(engine).get_table_names()

['NBA_Fantasy_Draft', 'Team_Locations', 'Team_Schedule', 'ranked_2016_2017']

In [13]:
# preview the top of the ranking rather than rendering the whole frame
df.head(10)

Unnamed: 0_level_0,Player,TOP,Rookie,Pos,G,FG,FGA,FGP,3P,FT,...,zFG,zFT,z3P,zPTS,zREB,zAST,zSTL,zBLK,zTOV,zAVG
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Anthony Davis,1.0,0,PF,75,10.4,19.5,0.534,0.7,6.6,...,1.820339,0.852195,-0.723882,2.377167,1.908915,-0.499300,1.081903,3.692706,-0.394284,1.123973
2,Stephen Curry,1.0,0,PG,51,8.4,16.9,0.495,4.2,5.5,...,0.609262,2.366854,2.965809,2.073149,-0.298081,1.294404,1.317479,-0.975405,-1.336379,0.890788
3,Kevin Durant,1.0,0,PF,68,9.3,18.0,0.516,2.5,5.3,...,1.214337,1.828760,1.173673,2.073149,0.327235,0.963985,-0.802702,2.136669,-1.336379,0.842081
4,James Harden,1.0,0,SG,72,9.0,20.1,0.449,3.7,8.7,...,-0.865352,1.910227,2.438710,2.788485,-0.187731,2.568879,1.788630,-0.002882,-2.985044,0.828214
5,Karl-Anthony Towns,1.0,0,C,82,7.8,14.3,0.545,1.5,4.2,...,1.674235,1.003822,0.119476,1.161096,2.350314,-0.452097,-0.567127,1.358651,-0.040999,0.734152
6,LeBron James,1.0,0,PF,82,10.5,19.3,0.542,1.8,4.7,...,2.134680,-1.033864,0.435735,2.269866,0.989334,2.710487,0.846327,0.386128,-2.749521,0.665464
7,Giannis Antetokounmpo,1.0,0,PF,75,9.9,18.7,0.529,0.6,6.5,...,1.638007,-0.396109,-0.829302,2.162566,1.504299,0.680769,1.081903,1.358651,-1.336379,0.651601
8,Chris Paul,1.0,0,PG,58,6.3,13.8,0.460,2.5,3.5,...,-0.424076,1.495919,1.173673,0.678244,-0.187731,2.144054,1.553055,-0.975405,-0.394284,0.562605
9,Damian Lillard,1.0,0,PG,73,8.5,19.4,0.439,3.1,6.8,...,-1.137876,2.599171,1.806192,2.162566,-0.518780,1.530418,0.139600,-0.586395,-1.100855,0.543782
10,Jimmy Butler,1.0,0,SG,59,7.4,15.6,0.474,1.2,6.2,...,-0.014328,1.459529,-0.196784,1.322046,-0.224514,0.727971,2.259782,-0.586395,0.076763,0.536008


In [14]:
# load final df into sqlite database
# engine.begin() opens a transaction that is committed on success, rolled back
# on error, and whose connection is released when the block exits
with engine.begin() as connection:
    df.to_sql('NBA_Fantasy_Draft', connection, if_exists='replace', index=True)