In [10]:
# dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sqlalchemy import create_engine, inspect, func, distinct
import sqlite3
from sqlite3 import Error

In [11]:
engine = create_engine("sqlite:///db/NBA_Data.sqlite")
conn = engine.connect()

engine.table_names()

['NBA_Fantasy_Draft',
 'Team_Locations',
 'Team_Schedule',
 'season_2016_2017',
 'season_2017_2018']

In [12]:
# this function calculates the z-scores for 9-Categories and ranks players based on average z-score
# the input df has a column TOP if top players have been predicted by the logistic model
# otherwise, the top players will be ranked by VORP
def zscore(df, sample_size=130, min_games=10):
    
    if "TOP" in df.columns:
        sample_size = df["TOP"].sum()
        top_players = df[df["TOP"]==1]
    else:
        top_players = df.sort_values("VORP", ascending = False).head(sample_size)
        top_players["TOP"] = 1
        df = pd.merge(top_players[["Player", "TOP"]], df, on="Player", how="outer").fillna(0)

    # calculate adjusted percentages
    top_players['adj_FG'] = (10*(top_players['FG'].sum() / sample_size) + top_players["FG"]) / (10*(top_players['FGA'].sum() / sample_size) + top_players["FGA"])
    top_players['adj_FT'] = (10*(top_players['FT'].sum() / sample_size) + top_players["FT"]) / (10*(top_players['FTA'].sum() / sample_size) + top_players["FTA"])
    df['adj_FG'] = (10*(top_players['FG'].sum() / sample_size) + df["FG"]) / (10*(top_players['FGA'].sum() / sample_size) + df["FGA"])
    df['adj_FT'] = (10*(top_players['FT'].sum() / sample_size) + df["FT"]) / (10*(top_players['FTA'].sum() / sample_size) + df["FTA"])

    # calculate z-scores and average z-score
    df["zFG"] = (df["adj_FG"] - top_players["adj_FG"].mean()) / top_players["adj_FG"].std()
    df["zFT"] = (df["adj_FT"] - top_players["adj_FT"].mean()) / top_players["adj_FT"].std()
    df["z3P"] = (df["3P"] - top_players["3P"].mean()) / top_players["3P"].std()
    df["zPTS"] = (df["PTS"] - top_players["PTS"].mean()) / top_players["PTS"].std()
    df["zREB"] = (df["TRB"] - top_players["TRB"].mean()) / top_players["TRB"].std()
    df["zAST"] = (df["AST"] - top_players["AST"].mean()) / top_players["AST"].std()
    df["zSTL"] = (df["STL"] - top_players["STL"].mean()) / top_players["STL"].std()
    df["zBLK"] = (df["BLK"] - top_players["BLK"].mean()) / top_players["BLK"].std()
    df["zTOV"] = (top_players["TOV"].mean() - df["TOV"]) / top_players["TOV"].std()
    df["zAVG"] = (df["zFG"] + df["zFT"] + df["z3P"] + df["zPTS"] + df["zREB"] + df["zAST"] + df["zSTL"] + df["zBLK"] + df["zTOV"]) / 9

    # rank by avg z-score
    df = df.sort_values("zAVG", ascending = False).reset_index(drop=True)
    df.index += 1
    
    # exclude players with less than a set amount of games (default 10)
    df = df[df.G >= min_games]
    
    return df;

In [13]:
# season (int): the first season is used to train the model to predict top players for the next year
# roster_size (int): number of players per team in the league
# num_teams (int): number of teams in the fantasy league
# min_games (int): minimum number of games to include player on chart
def log_regression(season, roster_size = 13, num_teams = 10, min_games = 10):
    
    
    df = pd.read_sql(f'select * from season_{season}_{season+1}', conn)
    
    #file_to_load = f"Resources/{season}_{season+1}.csv"
    #df = pd.read_csv(file_to_load)
    
    sample_size = roster_size*num_teams

    df = zscore(df, sample_size)
    
    # Assign X (data) and y (target)
    X = df.drop(["Player", "TOP", "Rookie", "Pos", "G", "VORP"], axis=1)
    y = df["TOP"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
    
    classifier = LogisticRegression()
    
    classifier.fit(X_train, y_train)
    
    print(f"Training Data Score: {classifier.score(X_train, y_train)}")
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
    
    predictions = classifier.predict(X_test)
    print(f"First 10 Predictions:   {predictions[:10]}")
    print(f"First 10 Actual labels: {y_test[:10].tolist()}")
    
    #next_season = f"{season+1}_{season+2}"

    #file_to_load = f"Resources/{next_season}.csv"
    #next_df = pd.read_csv(file_to_load)
    
    next_df = pd.read_sql(f'select * from season_{season+1}_{season+2}', conn)
    
    next_df = zscore(next_df, sample_size)
    X = next_df.drop(["TOP","Player", "Rookie", "Pos", "G", "VORP"], axis=1).fillna(0)
    predictions = classifier.predict(X)
    
    next_df["TOP"] = predictions
    print(f"z-score calculated with {next_df['TOP'].sum()} top players")
    next_df = zscore(next_df, sample_size)
    next_df.index.name = "Rank"
    next_df.to_csv(f"Resources/log_rank_{season+2}_{season+3}.csv", index=True)
    
    return next_df;

In [14]:
df = log_regression(2016)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Training Data Score: 0.990990990990991
Testing Data Score: 0.9642857142857143
First 10 Predictions:   [1. 0. 0. 0. 1. 0. 1. 0. 0. 1.]
First 10 Actual labels: [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0]
z-score calculated with 134.0 top players


In [15]:
df

Unnamed: 0_level_0,Player,TOP,index,Rookie,Pos,G,FG,FGA,FGP,3P,...,zFG,zFT,z3P,zPTS,zREB,zAST,zSTL,zBLK,zTOV,zAVG
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Anthony Davis,1.0,9,0,PF,75,10.4,19.5,0.534,0.7,...,1.728791,0.991207,-0.656770,2.368601,1.945985,-0.421970,1.172352,3.945797,-0.431358,1.182515
2,Stephen Curry,1.0,13,0,PG,51,8.4,16.9,0.495,4.2,...,0.537940,2.484834,2.979889,2.075166,-0.287841,1.362641,1.405778,-1.003377,-1.327513,0.914169
3,Kevin Durant,1.0,11,0,PF,68,9.3,18.0,0.516,2.5,...,1.133751,1.943925,1.213512,2.075166,0.345076,1.033897,-0.695050,2.296072,-1.327513,0.890982
4,James Harden,1.0,1,0,SG,72,9.0,20.1,0.449,3.7,...,-0.933575,2.079362,2.460367,2.765600,-0.176150,2.630655,1.872628,0.027701,-2.895784,0.870089
5,Karl-Anthony Towns,1.0,5,0,C,82,7.8,14.3,0.545,1.5,...,1.603487,1.099759,0.174467,1.194863,2.392750,-0.375007,-0.461625,1.471210,-0.095300,0.778289
6,LeBron James,1.0,0,0,PF,82,10.5,19.3,0.542,1.8,...,2.040931,-0.924674,0.486180,2.265035,1.015224,2.771545,0.938927,0.440132,-2.671745,0.706839
7,Giannis Antetokounmpo,1.0,6,0,PF,75,9.9,18.7,0.529,0.6,...,1.551048,-0.256095,-0.760674,2.161470,1.536450,0.752116,1.172352,1.471210,-1.327513,0.700040
8,DeMarcus Cousins,1.0,21,0,C,48,8.5,18.0,0.470,2.2,...,-0.146089,-0.683202,0.901798,1.868036,2.616132,1.033897,1.405778,1.883641,-3.567900,0.590232
9,Chris Paul,1.0,14,0,PG,58,6.3,13.8,0.460,2.5,...,-0.476010,1.577440,1.213512,0.728820,-0.176150,2.207983,1.639203,-1.003377,-0.431358,0.586674
10,Damian Lillard,1.0,3,0,PG,73,8.5,19.4,0.439,3.1,...,-1.201445,2.738466,1.836939,2.161470,-0.511224,1.597459,0.238651,-0.590946,-1.103474,0.573988


In [16]:
# load final df into sqlite database
df.to_sql('NBA_Fantasy_Draft', conn, if_exists='replace', index=True)