In [9]:
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import Team
from basketball_reference_web_scraper.data import OutputType
import pandas as pd
import datetime as datetime
from datetime import datetime, timedelta
from dateutil.parser import parse
from pulp import LpVariable, LpProblem, LpMaximize
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import Pool, CatBoostRegressor
import xgboost as xgb
import os.path
from os import path
import pytz
import seaborn as sns
from matplotlib import pyplot
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import requests
import joblib
import tempfile
import tensorflow as tf
from tensorflow.keras import datasets, layers, models

CURRENT_YEAR = 2021

months_and_years = [(4, 2021), (3, 2021), (2, 2021), (1, 2021), (12, 2020), (8, 2020), (11, 2019), (12, 2019), (1, 2020), (2, 2020), (3, 2020), (7, 2020), (11, 2018), (12, 2018), (1, 2019), (2, 2019), (3, 2019), (4, 2019), (11, 2017), (12, 2017), (1, 2018), (2, 2018), (3, 2018), (4, 2018), (11, 2016), (12, 2016), (1, 2017), (2, 2017), (3, 2017), (4, 2017)]
input_statistics = ["name", "team", "date", "location", "opponent", "made_field_goals", "made_two_point_field_goals", "attempted_two_point_field_goals", "attempted_field_goals", "made_three_point_field_goals", "attempted_three_point_field_goals", "attempted_free_throws", "made_free_throws", "offensive_rebounds", "defensive_rebounds", "assists", "blocks", "turnovers", "steals", "seconds_played", "Opponent Defensive Rating", "Opponent Turnover %", 'Team Defensive Rating', 'Team Pace', 'Team Turnover %', 'Opponent Pace']
cols_to_average = ["seconds_played", "made_field_goals", "attempted_field_goals", "made_three_point_field_goals", "attempted_three_point_field_goals", "attempted_free_throws", "made_free_throws", "offensive_rebounds", "defensive_rebounds", "assists", "blocks", "turnovers", "steals", "game_score", "Opponent Defensive Rating", "Opponent Turnover %", 'Team Defensive Rating', 'Team Pace', 'Team Turnover %', 'Opponent Pace', "attempted_two_point_field_goals", "made_two_point_field_goals", "is_win"]
output_statistics = ["name", "team", "date", "location", "opponent", "minutes", "made_two_point_field_goals", "made_three_point_field_goals", "made_free_throws", "rebounds", "assists", "blocks", "steals", "turnovers", "recent_average", "10_game_average", "3_game_average", "10_3_ratio", "10_3_difference", "hot", "cold", "fantasy_points"]
all_abbrv = {'ATLANTA HAWKS':'ATL', 'BOSTON CELTICS':'BOS', 'BROOKLYN NETS':'BRO', 'CHARLOTTE HORNETS':'CHA', 'CHICAGO BULLS':'CHI', 'CLEVELAND CAVALIERS':'CLE', 'DALLAS MAVERICKS':'DAL',
            'DENVER NUGGETS':'DEN', 'DETROIT PISTONS':'DET', 'GOLDEN STATE WARRIORS':'GSW', 'HOUSTON ROCKETS':'HOU', 'INDIANA PACERS':'IND', 'LOS ANGELES CLIPPERS':'LAC', 'LOS ANGELES LAKERS':'LAL',
            'MEMPHIS GRIZZLIES':'MEM', 'MIAMI HEAT':'MIA', 'MILWAUKEE BUCKS':'MIL', 'MINNESOTA TIMBERWOLVES':'MIN', 'NEW ORLEANS PELICANS':'NOP', 'NEW YORK KNICKS':'NYK', 'OKLAHOMA CITY THUNDER':'OKL', 'ORLANDO MAGIC':'ORL',
            'PHILADELPHIA 76ERS':'PHI', 'PHOENIX SUNS':'PHX', 'PORTLAND TRAIL BLAZERS':'POR', 'SACRAMENTO KINGS':'SAC', 'SAN ANTONIO SPURS':'SAS', 'TORONTO RAPTORS':'TOR', 'UTAH JAZZ':'UTA', 'WASHINGTON WIZARDS':'WAS'}
betting_dictionary = {'ATLANTA HAWKS':'Hawks', 'BOSTON CELTICS':'Celtics', 'BROOKLYN NETS':'Nets', 'CHARLOTTE HORNETS':'Hornets', 'CHICAGO BULLS':'Bulls', 'CLEVELAND CAVALIERS':'Cavaliers', 'DALLAS MAVERICKS':'Mavericks',
            'DENVER NUGGETS':'Nuggets', 'DETROIT PISTONS':'Pistons', 'GOLDEN STATE WARRIORS':'Warriors', 'HOUSTON ROCKETS':'Rockets', 'INDIANA PACERS':'Pacers', 'LOS ANGELES CLIPPERS':'Clippers', 'LOS ANGELES LAKERS':'Lakers',
            'MEMPHIS GRIZZLIES':'Grizzlies', 'MIAMI HEAT':'Heat', 'MILWAUKEE BUCKS':'Bucks', 'MINNESOTA TIMBERWOLVES':'Timberwolves', 'NEW ORLEANS PELICANS':'Pelicans', 'NEW YORK KNICKS':'Knicks', 'OKLAHOMA CITY THUNDER':'Thunder', 'ORLANDO MAGIC':'Magic',
            'PHILADELPHIA 76ERS':'Seventysixers', 'PHOENIX SUNS':'Suns', 'PORTLAND TRAIL BLAZERS':'Trailblazers', 'SACRAMENTO KINGS':'Kings', 'SAN ANTONIO SPURS':'Spurs', 'TORONTO RAPTORS':'Raptors', 'UTAH JAZZ':'Jazz', 'WASHINGTON WIZARDS':'Wizards'}

In [10]:
def scrape_bbref_data():
    """Scrapes data from basketball-reference for months and years in above months_and_years.
    Currently scrapes from November 2016 to April 2021."""
    for m, y in months_and_years: #for current month, scrape up to today's date
        if m == datetime.today().month and y == CURRENT_YEAR:
            for d in range(1, datetime.today().day):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.player_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        elif m == 2 and y % 4 != 0:
            for d in range(1, 29):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.player_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        elif m == 2 and y % 4 == 0:
            for d in range(1, 30):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.player_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        elif m in [4, 9, 11]:
            for d in range(1, 31):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.player_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        else:
            for d in range(1, 32):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.player_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
scrape_bbref_data()

In [88]:
def scrape_team_box_scores():
    """Scrapes team box scores from basketball-reference for months and years in above months_and_years.
    Currently scrapes from November 2016 to April 2021."""
    for m, y in months_and_years: #for current month, scrape up to today's date
        if m == datetime.today().month and y == CURRENT_YEAR:
            for d in range(1, datetime.today().day):
                file_name = "./TeamBoxScores/{0}_{1}_{2}_team_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.team_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        elif m == 2 and y % 4 != 0:
            for d in range(1, 29):
                file_name = "./TeamBoxScores/{0}_{1}_{2}_team_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.team_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        elif m == 2 and y % 4 == 0:
            for d in range(1, 30):
                file_name = "./TeamBoxScores/{0}_{1}_{2}_team_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.team_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        elif m in [4, 9, 11]:
            for d in range(1, 31):
                file_name = "./TeamBoxScores/{0}_{1}_{2}_team_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.team_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
        else:
            for d in range(1, 32):
                file_name = "./TeamBoxScores/{0}_{1}_{2}_team_box_scores.csv".format(m, d, y)
                if path.exists(file_name):
                    continue
                client.team_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)

    def add_to_table(table, y, m, d):
        file_name_to_add = "./TeamBoxScores/{0}_{1}_{2}_team_box_scores.csv".format(m, d, y)
        temp = pd.read_csv(file_name_to_add)
        days = [d] * len(temp)
        months = [m] * len(temp)
        years = [y] * len(temp)
        temp["Day"] = days
        temp["Month"] = months
        temp["Year"] = years
        table = table.append(temp)
        return table
    
    teambox_columns = ["team", "minutes_played", "made_field_goals", "attempted_field_goals", 
               "made_three_point_field_goals", "attempted_three_point_field_goals", "made_free_throws",
              "attempted_free_throws", "offensive_rebounds", "defensive_rebounds", "assists",
              "steals", "blocks", "turnovers", "personal_fouls", "Date"]
    
    all_box_scores = pd.DataFrame(columns = teambox_columns)
    for m, y in months_and_years:
        if m == datetime.today().month and y == CURRENT_YEAR:
            for d in range(1, datetime.today().day):
                all_box_scores = add_to_table(all_box_scores, y = y, m = m, d = d)
        elif m == 2:
            if y % 4 == 0:
                for d in range(1, 30):
                    all_box_scores = add_to_table(all_box_scores, y = y, m = m, d = d)
            else:
                for d in range(1, 29):
                    all_box_scores = add_to_table(all_box_scores, y = y, m = m, d = d)

        elif m in [4, 9, 11]:
            for d in range(1, 31):
                all_box_scores = add_to_table(all_box_scores, y = y, m = m, d = d)
        
        else:
            for d in range(1, 32):
                all_box_scores = add_to_table(all_box_scores, y = y, m = m, d = d)
                
    all_box_scores.reset_index()
    all_box_scores.to_csv("./TeamBoxScores/all_box_scores.csv")

scrape_team_box_scores()

In [12]:
def load_bbref_data():
    """Loads data scraped above for months_and_years and returns it all in one table."""
    all_tables = []
    for m, y in months_and_years:
        if m == datetime.today().month and y == CURRENT_YEAR:
            for d in range(1, datetime.today().day):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                table  = pd.read_csv(file_name)
                date = datetime(y, m, d)
                dates = [date] * len(table)
                table["Date"] = dates
                all_tables.append(table)
        elif m == 2:
            if y % 4 == 0:
                for d in range(1, 30):
                    file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                    table  = pd.read_csv(file_name)
                    date = datetime(y, m, d)
                    dates = [date] * len(table)
                    table["Date"] = dates
                    all_tables.append(table)
            else:
                for d in range(1, 29):
                    file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                    table  = pd.read_csv(file_name)
                    date = datetime(y, m, d)
                    dates = [date] * len(table)
                    table["Date"] = dates
                    all_tables.append(table)

        elif m in [4, 9, 11]:
            for d in range(1, 31):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                table  = pd.read_csv(file_name)
                date = datetime(y, m, d)
                dates = [date] * len(table)
                table["Date"] = dates
                all_tables.append(table)
        
        else:
            for d in range(1, 32):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                table  = pd.read_csv(file_name)
                date = datetime(y, m, d)
                dates = [date] * len(table)
                table["Date"] = dates
                all_tables.append(table)
    return all_tables

In [13]:
def write_bbref_data():
    """Calls load_bbref_data() and writes it to all_games.csv."""
    all_tables = load_bbref_data()
    full_df = all_tables[0]
    for i in range(1, len(all_tables)):
        current_table = all_tables[i]
        full_df = full_df.append(current_table)
    full_df.to_csv("./OutputCSVs/all_games.csv")

write_bbref_data()

In [14]:
def scrape_defensive_ratings():
    res = requests.get('https://hashtagbasketball.com/nba-defense-vs-position')
    soup = BeautifulSoup(res.text, 'lxml')
    pg, sg, sf, pf, c = [], [], [], [], []
    table = soup.find('table', attrs={'id':'ContentPlaceHolder1_GridView1'})
    for tr in table.find_all('tr'):
        td_list = tr.find_all('td')
        if tr.find('td') is not None:
            text = " ".join([i for i in td_list[1].text.split() if not i.isdigit()])
            if td_list[0].text == 'PG':
                pg.append(text)
            elif td_list[0].text == 'SG':
                sg.append(text)
            elif td_list[0].text == 'SF':
                sf.append(text)
            elif td_list[0].text == 'PF':
                pf.append(text)
            elif td_list[0].text == 'C':
                c.append(text)
    rank = list(range(1, 31))
    columns = ['Team', 'vs PG', 'vs SG', 'vs SF', 'vs PF', 'vs C']
    df_pg = pd.DataFrame({'Team':pg, 'vs PG':rank}).sort_values('Team')
    df_sg = pd.DataFrame({'Team':sg, 'vs SG':rank}).sort_values('Team')
    df_sf = pd.DataFrame({'Team':sf, 'vs SF':rank}).sort_values('Team')
    df_pf = pd.DataFrame({'Team':pf, 'vs PF':rank}).sort_values('Team')
    df_c = pd.DataFrame({'Team':c, 'vs C':rank}).sort_values('Team')
    a = pd.merge(df_pg, df_sg, on='Team')
    b = pd.merge(a, df_sf, on='Team')
    c = pd.merge(b, df_pf, on='Team')
    d = pd.merge(c, df_c, on='Team')
    df = d.reindex(columns=columns)
    fileoutput = "./OutputCSVs/team_def_vs_position_" + str(CURRENT_YEAR) + ".csv"
    df.to_csv(fileoutput, header=True, index=False)
    
scrape_defensive_ratings()

In [15]:
def attach_b2b_indicators():
    # Rewrites all_games.csv by attaching indicators for whether a played game is the second game of a 
    # back-to-back double header or not by retrieving schedule data from basketball_reference_web_scraper.
    
    all_games = pd.read_csv("./OutputCSVs/all_games.csv").reset_index()
    schedule = pd.DataFrame(client.season_schedule(season_end_year = 2017)) 
    for year in range(2018,2022):
        schedule = schedule.append(pd.DataFrame(client.season_schedule(season_end_year = year)))
    schedule["start_time"] = schedule["start_time"] + timedelta(hours = -7)
    schedule["Date"] = schedule["start_time"].apply(lambda x: x.strftime("%Y-%m-%d"))
    schedule["away_team"] = schedule["away_team"].apply(lambda x: x.value)
    schedule["home_team"] = schedule["home_team"].apply(lambda x: x.value)
    schedule["b2b_away"], schedule["b2b_home"] = [False] * len(schedule), [False] * len(schedule)
    all_games["away_team"], all_games["home_team"] = [""] * len(all_games), [""] * len(all_games)

    for team in all_abbrv:
        team_schedule = schedule.loc[(schedule["away_team"] == team) | (schedule["home_team"] == team)].reset_index()
        schedule.loc[team_schedule.loc[0, "index"],"b2b_home"] = False
        schedule.loc[team_schedule.loc[0, "index"],"b2b_away"] = False
        for i in range(len(team_schedule)-1):
            time = team_schedule.loc[i, "start_time"] + timedelta(hours = 24)
            if time.strftime("%Y-%m-%d") == team_schedule.loc[i+1, "Date"]:
                if team_schedule.loc[i+1, "home_team"] == team:
                    schedule.loc[team_schedule.loc[i+1, "index"], "b2b_home"] = True
                else: 
                    schedule.loc[team_schedule.loc[i+1, "index"], "b2b_away"] = True

    for i, row in all_games.iterrows():
        if row["location"] == "HOME": 
            all_games.at[i,"home_team"], all_games.at[i,"away_team"] = row["team"], row["opponent"]
        else: 
            all_games.at[i,"home_team"], all_games.at[i,"away_team"] = row["opponent"], row["team"]

    merged_df = all_games.merge(schedule, how="inner", on=["away_team","home_team","Date"])
    merged_df["b2b_indicator"] = [False] * len(merged_df)
    for i, row in merged_df.iterrows():
        if row["location"] == "HOME": 
            merged_df.at[i,"b2b_indicator"] = row["b2b_home"]
        else: 
            merged_df.at[i,"b2b_indicator"] = row["b2b_away"]
        
    merged_df.drop(merged_df.columns[range(24,31)], axis=1, inplace=True)
    merged_df = merged_df.sort_values("index").reset_index().iloc[:,3:]
    merged_df.to_csv("./OutputCSVs/all_games.csv")

attach_b2b_indicators()

In [16]:
def attach_team_stats():
    # This line gets the advanced stats from updated_team_stats.csv. This is something we scraped from 
    # NBA Advanced Stats and periodically updated. If we could streamline this process and replace it
    # with a function call that would be a better process.

    df = pd.read_csv("./OutputCSVs/updated_team_stats.csv")


    df["team"] = df["team"].str.upper()

    # This is the file we just wrote with all the scraped games.

    all_games = pd.read_csv("./OutputCSVs/all_games.csv")

    team_def = []
    team_pace = []
    team_tov = []
    opp_def = []
    opp_pace = []
    opp_tov = []

    # Here we attach team stats and opponent stats to each row of all_games.

    all_games_teams = all_games[["team", "opponent"]]

    for i in range(len(all_games_teams)):
        game = all_games_teams.loc[i]
        team = game["team"]
        opponent = game["opponent"]
        team_def.append(df[df["team"] == team]["drtg"].iloc[0])
        team_pace.append(df[df["team"] == team]["pace"].iloc[0])
        team_tov.append(df[df["team"] == team]["tov%"].iloc[0])
        opp_def.append(df[df["team"] == opponent]["drtg"].iloc[0])
        opp_pace.append(df[df["team"] == opponent]["pace"].iloc[0])
        opp_tov.append(df[df["team"] == opponent]["tov%"].iloc[0])
    
    all_games["Team Defensive Rating"] = team_def
    all_games["Team Pace"] = team_pace
    all_games["Team Turnover %"] = team_tov
    all_games["Opponent Defensive Rating"] = opp_def
    all_games["Opponent Pace"] = opp_pace
    all_games["Opponent Turnover %"] = opp_tov

    # I don't really know why we write all_games to file and then immediately
    # read the file into df. This could be an artifact of putting this together over
    # a couple months.

    df = all_games

    # Here we do a little processing because the basketball-reference data doesn't have certain stats.
    # If we could get NBA Advanced Stats game logs this is probably an avoidable step. Several points in
    # this process involve working around limitations in the bbref data that it would be nice to be
    # able to streamline away, especially if the backend is going to be public.

    attempted_2s = df["attempted_field_goals"] - df["attempted_three_point_field_goals"]
    made_2s = df["made_field_goals"] - df["made_three_point_field_goals"]
    rebounds = df["offensive_rebounds"] + df["defensive_rebounds"]
    at_home = df["location"] == "HOME"
    df["attempted_two_point_field_goals"] = attempted_2s
    df["made_two_point_field_goals"] = made_2s
    df["total_rebounds"] = rebounds
    df["at_home"] = at_home

    # And now we write this to file.

    df.to_csv("./OutputCSVs/all_games_updated.csv")

attach_team_stats()

# Here we read it again and save this data locally. We end up accessing this file a couple lines
# into get_stats right below but I don't really like how we just have this sitting here. This would
# be something we could write into a function or find a way to deal with more cleanly. Ideally
# everything in this notebook should probably either be happening inside a function or be a function
# call.

all_games_actual = pd.read_csv("./OutputCSVs/all_games_updated.csv")
#all_games_actual = all_games_actual.iloc[:, 3:].reset_index()

In [17]:
def get_stats(player, date, number_rows, start_date = False):
    """Gets the last number_rows statlines from all_games_actual for a player up to the given date. This
    gets called in generate_input_vector when we want to get a number of rows for this player to generate the
    weighted average statlines. Returns a pandas DataFrame of the desired statlines.
    
    Params:
    player: A string corresponding to entries in the 'name' column in all_games_actual. Ex: 'LeBron James'
    date: A string YYYY-MM-DD. We parse this into a datetime object to compare it to the datetimes in
        all_games_actual.
    number_rows: An integer number of rows to return.
    start_date: A string YYYY-MM-DD to start collecting data after, 
    
    TODO:
        -Probably address the assumption that all_games_actual is just sitting there to be accessed? Just smells
        kind of ugly to me. Can probably become a function call.
    """
    converted_datetime = datetime.strptime(date, '%Y-%m-%d')
    player_rows = all_games_actual.loc[all_games_actual['name'] == player]
    #print(player_rows)
    selected_rows = []
    if start_date:
        for i in range(len(player_rows)):
            this_date = datetime.strptime(player_rows.iloc[i]['Date'], '%Y-%m-%d')
            if this_date <= converted_datetime and this_date >= datetime.strptime(start_date, '%Y-%m-%d'):
                selected_rows.append(player_rows.iloc[i])
        return pd.DataFrame(selected_rows).sort_values(by=['Date'], ascending = False)
    if (len(player_rows)) < number_rows:
        for i in range(len(player_rows)):
            selected_rows.append(player_rows.iloc[i])
        return pd.DataFrame(selected_rows)
    index = 0
    for i in range(len(player_rows)):
        curr_date = player_rows.iloc[i]['Date']
        if datetime.strptime(curr_date, '%Y-%m-%d') >= converted_datetime:
            index = i
            break
    if index != 0:
        if index + 1 - number_rows < 0:
            for i in range(index + 1):
                selected_rows.append(player_rows.iloc[i])
        else:
            for i in range(index + 1 - number_rows, index + 1):
                selected_rows.append(player_rows.iloc[i])
    else:
        for i in range(len(player_rows) - number_rows, len(player_rows)):
            selected_rows.append(player_rows.iloc[len(player_rows) - i - 1])
    return pd.DataFrame(selected_rows).sort_values(by=['Date'], ascending = False)

In [18]:
def double_double(threes, twos, fts, rebounds, assists):
    """Returns whether the input values for a given statline constitute a double_double.
    Inputs should already be floats but have caused trouble in unexpected spots.
    
    TODO:
        -Investigate whether the .tolist() calls are significantly wasting our time and if they
        can be removed. This function is only really called at the end of the pipeline but this could
        be a time waster."""
    points = float(threes) * 3 + float(twos) * 2 + float(fts)
    rebounds = rebounds.tolist()[0]
    assists = assists.tolist()[0]
    return (points >= 10 and rebounds >= 10) or (points >= 10 and assists >= 10) or (rebounds >= 10 and assists >= 10)

def triple_double(threes, twos, fts, rebounds, assists):
    """See above documentation for double_double. Returns whether statline constitutes a triple_double.
    Similar concerns regarding input types."""
    points = float(threes) * 3 + float(twos) * 2 + float(fts)
    rebounds = rebounds.tolist()[0]
    assists = assists.tolist()[0]
    return points >= 10 and rebounds >= 10 and assists >= 10

def get_points(row_data):
    """Returns a tuple of Fanduel points and equivalent Fanduel dollar value for a given statline.
    This gets called when we want to return a number of points for a statline.
    
    Params:
    row_data: One row of a pandas DataFrame. May or may not have the columns attribute, which we look for
        just in case.
    """
    if 'made_three_point_field_goals_y' in row_data.columns:
        three_pt_fgs = row_data['made_three_point_field_goals_y']
        two_pt_fgs = row_data['made_two_point_field_goals_y']
        made_fts = row_data['made_free_throws_y']
        total_rebounds = row_data['rebounds_y']
        assists = row_data['assists_y']
        blocks = row_data['blocks_y']
        steals = row_data['steals_y']
        turnovers = row_data['turnovers_y']
    else:
        three_pt_fgs = row_data['made_three_point_field_goals']
        two_pt_fgs = row_data['made_two_point_field_goals']
        made_fts = row_data['made_free_throws']
        total_rebounds = row_data['rebounds']
        assists = row_data['assists']
        blocks = row_data['blocks']
        steals = row_data['steals']
        turnovers = row_data['turnovers']
    FD_points = three_pt_fgs * 3 + two_pt_fgs * 2 + made_fts + total_rebounds * 1.2 + assists * 1.5 + blocks * 3 + steals * 3 - turnovers
    FD_dollars = FD_points * 200
    return (FD_points, FD_dollars)

def get_draftkings_points(row_data):
    """See above documentation for get_points. Returns a tuple of Draftkings points and dollar value based on DK
    scoring rules and relative values. Coerces some values to floats to be able to ensure double_double and triple_double
    functions work smoothly.
    
    Params:
    row_data: One row of a pandas DataFrame. May or may not have the columns attribute, which we look for
        just in case.
    """
    if 'made_three_point_field_goals_y' in row_data.columns:
        three_pt_fgs = float(row_data['made_three_point_field_goals_y'])
        two_pt_fgs = float(row_data['made_two_point_field_goals_y'])
        made_fts = float(row_data['made_free_throws_y'])
        total_rebounds = float(row_data['rebounds_y'])
        assists = float(row_data['assists_y'])
        blocks = row_data['blocks_y']
        steals = row_data['steals_y']
        turnovers = row_data['turnovers_y']
    else:
        three_pt_fgs = float(row_data['made_three_point_field_goals'])
        two_pt_fgs = float(row_data['made_two_point_field_goals'])
        made_fts = float(row_data['made_free_throws'])
        total_rebounds = row_data['rebounds']
        assists = row_data['assists']
        blocks = row_data['blocks']
        steals = row_data['steals']
        turnovers = row_data['turnovers']
    DK_points = three_pt_fgs * 3.5 + two_pt_fgs * 2 + made_fts + total_rebounds * 1.25 + assists * 1.5 + blocks * 2 + steals * 2 - .5 * turnovers + 1.5 * double_double(three_pt_fgs, two_pt_fgs, made_fts, total_rebounds, assists) + 3 * triple_double(three_pt_fgs, two_pt_fgs, made_fts, total_rebounds, assists)
    DK_dollars = DK_points * 187.5
    return (DK_points, DK_dollars)

In [19]:
def weight_function(statistic, weight):
    """Takes in an array of a statistic and produces a weighted sum according to the weight.
    Honestly I am not very sure how this function works. I wrote it up at the same time as time_weighted_average
    and remember it being a fairly elegant solution to the problem of producing a weighted average with a bias towards
    recent games but it is not really obvious to me why this works.
    
    Params:
    statistic: An array or number corresponding to one statistic.
    weight: A weight in which to bias the average towards more recent terms. A higher weight will result in an average
        more faithful to recent games. A statistic that occurred 'i' games ago will be multiplied by weight ** i, 
        summed to the running total before the whole thing is divided by the sum of weight ** n, where n is the number of rows
        we are constructing the weighted average from. 
    
    TODO:
        -Make this a more coherent comment I guess?
    """
    s = 0
    if type(statistic) == np.ndarray:
        for i in range(len(statistic)):
            s += statistic[len(statistic) - i - 1] * (weight ** i)
    else:
        for i in range(len(statistic)):
            s += statistic.iloc[len(statistic) - i - 1,] * (weight ** i)
    return s

In [20]:
def time_weighted_average(rows, statistic, weight):
    """Takes in a set of rows, a given statistic to make the weighted average for, and a weight to build this average with.
    See documentation for weight_function above.
    
    Params:
    rows: The rows we are given to make a weighted average for.
    statistic: The statistic to create the weighted average for.
    weight: The weight we will use to make this average.
    """
    if rows.empty:
        return 0
    stat = rows[statistic]
    this_num = 1 / weight_function(np.ones(len(stat)), weight)
    return this_num * weight_function(stat, weight)

In [102]:
def generate_input_vector(player_box_scores, input_statistics, sample_size = 5, weight = .8, per_min = False):
    """Takes in box scores, an array of input statistics, a sample size of games, and a weight.
    Produces, for each player in player_box_scores, a weighted average of each statistic in input_statistics
    over the preceding sample_size games for that player with the specified weight. So if sample_size is 5, 
    the first 5 statlines in generate_input_vector for each player will have incomplete versions of the weighted average.
    Because time_weighted_average adjusts for this there won't be any wacky numbers produced for those rows,
    but they will be very biased towards the most recent games in the sample as they aren't producing weighted averages
    over the full desired length of the sample window.
    
    This function generates the inputs for our models. Because of the above weirdness it is good to get very large sample
    sizes here so the first few rows don't end up biasing the models too much. I am pretty sure the object it returns is
    a pandas DataFrame.
    
    Params:
    player_box_scores: A pandas DataFrame of game statlines for each player. It has both the bbref box score stats
        and the NBA Advanced Team stats we helpfully tacked on earlier.
    input_statistics: An array of strings representing the columns of player_box_scores we want to create weighted
        averages of.
    sample_size: The number of rows we will use to create weighted averages. This is an input to time_weighted_average.
    weight: The weight we will use to create weighted averages. This is an input to time_weighted_average.
    
    TODO:
        -Maybe turn this into per-minute weighted averages? Not sure where in the pipeline we will want to implement
        per-minute numbers. It's possible we might want to stick with per-game weighted averages to produce per-
        minute predictions.
    """
    if per_min:
        return generate_input_vector_per_min(player_box_scores, input_statistics, sample_size, weight)
    player_box_scores = player_box_scores[~player_box_scores.index.duplicated()]
    player_box_scores.reindex(range(len(player_box_scores)), axis = "index")
    predicted_statlines = pd.DataFrame(index = player_box_scores.index, columns = input_statistics).fillna(0).T
    index_len = len(player_box_scores.index)
    for box_index in player_box_scores.index:
        box_score = player_box_scores.loc[box_index]
        player_name = box_score["name"]
        game_date = str(box_score["Date"])[:10]
        last_n_rows = get_stats(player_name, game_date, sample_size)
        weighted_stats = [player_name, box_score["team"], game_date, box_score["location"], box_score["opponent"]]
        for stat in input_statistics[5:]:
            weighted_stats.append(round(time_weighted_average(last_n_rows, stat, weight), 2))
        predicted_statlines[box_index] = weighted_stats
    inputs = predicted_statlines.T
    print(inputs.shape)
    inputs = add_team_defense(inputs)
    print(inputs.shape)
    inputs = add_over_under(inputs)
    print(inputs.shape)
    inputs = add_isolation_offense(inputs)
    print(inputs.shape)
    inputs = add_isolation_defense(inputs)
    print(inputs.shape)
    inputs = add_team_isolation_offense(inputs)
    print(inputs.shape)
    inputs = add_rate_statistics(inputs)
    print(inputs.shape)
    return inputs

def generate_input_vector_per_min(player_box_scores, input_statistics, sample_size = 5, weight = .8):
    """Takes in box scores, an array of input statistics, a sample size of games, and a weight.
    Produces, for each player in player_box_scores, a weighted average of each statistic in input_statistics
    over the preceding sample_size games for that player with the specified weight. So if sample_size is 5, 
    the first 5 statlines in generate_input_vector for each player will have incomplete versions of the weighted average.
    Because time_weighted_average adjusts for this there won't be any wacky numbers produced for those rows,
    but they will be very biased towards the most recent games in the sample as they aren't producing weighted averages
    over the full desired length of the sample window.
    
    This function generates the inputs for our models. Because of the above weirdness it is good to get very large sample
    sizes here so the first few rows don't end up biasing the models too much. I am pretty sure the object it returns is
    a pandas DataFrame.
    
    Params:
    player_box_scores: A pandas DataFrame of game statlines for each player. It has both the bbref box score stats
        and the NBA Advanced Team stats we helpfully tacked on earlier.
    input_statistics: An array of strings representing the columns of player_box_scores we want to create weighted
        averages of.
    sample_size: The number of rows we will use to create weighted averages. This is an input to time_weighted_average.
    weight: The weight we will use to create weighted averages. This is an input to time_weighted_average.
    
    TODO:
        -Maybe turn this into per-minute weighted averages? Not sure where in the pipeline we will want to implement
        per-minute numbers. It's possible we might want to stick with per-game weighted averages to produce per-
        minute predictions.
    """
    player_box_scores = player_box_scores[~player_box_scores.index.duplicated()]
    player_box_scores.reindex(range(len(player_box_scores)), axis = "index")
    predicted_statlines = pd.DataFrame(index = player_box_scores.index, columns = input_statistics).fillna(0).T
    index_len = len(player_box_scores.index)
    for box_index in player_box_scores.index:
        box_score = player_box_scores.loc[box_index]
        player_name = box_score["name"]
        game_date = str(box_score["Date"])[:10]
        last_n_rows = get_stats(player_name, game_date, sample_size)
        weighted_stats = [player_name, box_score["team"], game_date, box_score["location"], box_score["opponent"]]
        recent_minutes = time_weighted_average(last_n_rows, "seconds_played", weight)/60
        mins = player_box_scores.loc[box_index, "seconds_played"]
        for i in range(5, len(input_statistics)):
            stat = input_statistics[i]
            if recent_minutes > 0 and i < (len(input_statistics) - 6):
                weighted_stats.append(round(time_weighted_average(last_n_rows, stat, weight)/recent_minutes, 2))
            else:
                weighted_stats.append(round(time_weighted_average(last_n_rows, stat, weight), 2))
        predicted_statlines[box_index] = weighted_stats
    inputs = predicted_statlines.T
    print(inputs.shape)
    inputs = add_team_defense(inputs)
    print(inputs.shape)
    inputs = add_over_under(inputs)
    print(inputs.shape)
    inputs = add_isolation_offense(inputs)
    print(inputs.shape)
    inputs = add_isolation_defense(inputs)
    print(inputs.shape)
    inputs = add_team_isolation_offense(inputs)
    print(inputs.shape)
    inputs = add_rate_statistics(inputs)
    print(inputs.shape)
    return inputs
#THIS ONE IS THE PER MINUTE ONE

In [95]:
def n_game_average(player, date, sample_size, start_date = False):
    """Takes in a player, date, sample size of games, and optional date to start collecting from.
    Calls get_stats to return the player's statlines in this range and returns the average statline
    for the player in each category over the desired range of games.
    
    Params:
    player: A string corresponding to entries in the 'name' column in all_games_actual. Ex: 'LeBron James'
    date: A string YYYY-MM-DD. We parse this into a datetime object to compare it to the datetimes in
        all_games_actual.
    sample_size: An integer number of rows to return.
    start_date: A string YYYY-MM-DD to start collecting data after. 
    """
    season_stats = get_stats(player, date, sample_size, start_date)
    season_stats["is_win"] = season_stats["outcome"] == "WIN"
    cols_to_note = list(season_stats.loc[season_stats.index[0], ["name", "team"]])
    averages = [np.mean(season_stats[col]) for col in cols_to_average]
    return cols_to_note + averages

def season_average(player, year):
    """Takes in a player and year and returns the average statline for the player in each category
    for games in the season ending that year.
    
    Params:
    player: A string corresponding to entries in the 'name' column in all_games_actual. Ex: 'LeBron James'
    year: A number YYYY.
    """
    start_date = str(year - 1) + "-10-15"
    if year == 2020:
        end_date = str(year) + "-08-16"
    else:
        end_date = str(year) + "-04-17"
    return n_game_average(player, end_date, 82, start_date)

def generate_input_matrix(player, date, sample_size):
    """Takes in a player, date and sample size of games. Returns a matrix to be used for model input,
    with rows for each of the following inputs:
        -The player's season average statline across each category.
        -The player's previous average statline across each category.
        -The player's 15-game average statline across each category.
        -The player's 10-game average statline across each category.
        -The player's 5-game average statline across each category.
        -The player's game log in each category in the last sample_size games.
    
    Params:
    player: A string corresponding to entries in the 'name' column in all_games_actual. Ex: 'LeBron James'
    date: A string YYYY-MM-DD. We parse this into a datetime object to compare it to the datetimes in
        all_games_actual.
    sample_size: An integer number of rows to return.
    """
    date = str(date)[:10]
    last_n_rows = get_stats(player, date, sample_size)
    last_n_rows["is_win"] = last_n_rows["outcome"] == "WIN"
    last_n_rows["date"] = last_n_rows["Date"].astype(str)
    last_n_rows = add_team_defense(last_n_rows)
    last_n_rows = add_over_under(last_n_rows)
    last_n_rows = add_isolation_offense(last_n_rows)
    last_n_rows = add_isolation_defense(last_n_rows)
    last_n_rows = add_team_isolation_offense(last_n_rows)
    last_n_rows = add_rate_statistics(last_n_rows)
    this_season_average = season_average(player, int(date[:4]))
    last_season_average = season_average(player, int(date[:4]) - 1)
    l15_average = n_game_average(player, date, 15)
    l10_average = n_game_average(player, date, 10)
    l5_average = n_game_average(player, date, 5)
    for average in [this_season_average, last_season_average, l15_average, l10_average, l5_average]:
        count = 0
        array_to_append = []
        for c in last_n_rows.columns:
            if c in ["name", "team"] or c in cols_to_average:
                array_to_append.append(average[count])
                count = count + 1
            else:
                array_to_append.append(0)
        array_to_append = pd.Series(array_to_append, index = last_n_rows.columns)
        last_n_rows = last_n_rows.append(array_to_append, ignore_index = True)
    last_n_rows = last_n_rows.drop(columns = ["Unnamed: 0", "Unnamed: 0.1"])
    print(last_n_rows.shape)
    print(last_n_rows.columns)
    return last_n_rows

generate_input_matrix("Landry Shamet", datetime(2021, 4, 20), 10)

(15, 53)
Index(['slug', 'name', 'team', 'location', 'opponent', 'outcome',
       'seconds_played', 'made_field_goals', 'attempted_field_goals',
       'made_three_point_field_goals', 'attempted_three_point_field_goals',
       'made_free_throws', 'attempted_free_throws', 'offensive_rebounds',
       'defensive_rebounds', 'assists', 'steals', 'blocks', 'turnovers',
       'personal_fouls', 'game_score', 'Date', 'b2b_indicator',
       'Team Defensive Rating', 'Team Pace', 'Team Turnover %',
       'Opponent Defensive Rating', 'Opponent Pace', 'Opponent Turnover %',
       'attempted_two_point_field_goals', 'made_two_point_field_goals',
       'total_rebounds', 'at_home', 'is_win', 'date',
       'Opponent Defensive Rank vs Position', 'total', 'Iso POSS', 'Iso PPP',
       'Iso FGA', 'Opp D Iso POSS', 'Opp D Iso PPP', 'Opp D Iso FGA',
       'Opp D Iso Score %', 'Team Iso POSS', 'Team Iso PPP', 'Team Iso FGA',
       'Team Iso Score %', 'Usage Rate', 'OReb %', 'DReb %', 'Reb %', 'PIE'],

Unnamed: 0,slug,name,team,location,opponent,outcome,seconds_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,game_score,Date,b2b_indicator,Team Defensive Rating,Team Pace,Team Turnover %,Opponent Defensive Rating,Opponent Pace,Opponent Turnover %,attempted_two_point_field_goals,made_two_point_field_goals,total_rebounds,at_home,is_win,date,Opponent Defensive Rank vs Position,total,Iso POSS,Iso PPP,Iso FGA,Opp D Iso POSS,Opp D Iso PPP,Opp D Iso FGA,Opp D Iso Score %,Team Iso POSS,Team Iso PPP,Team Iso FGA,Team Iso Score %,Usage Rate,OReb %,DReb %,Reb %,PIE
0,shamela01,Landry Shamet,BROOKLYN NETS,HOME,CHARLOTTE HORNETS,WIN,1278.0,7.0,14.0,6.0,11.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,1,12.3,2021-04-16,1,108.3,101.51,15.1,112.8,96.24,15.0,3.0,1.0,0,1,1.0,2021-04-16,15,233.59,0,0,0,5.6,0.86,4.3,0.0,10.0,1.17,8.1,0.0,32.77,0.0,0.0,0.0,0.08
1,shamela01,Landry Shamet,BROOKLYN NETS,AWAY,PHILADELPHIA 76ERS,LOSS,2174.0,4.0,10.0,1.0,5.0,8.0,9.0,0.0,1.0,3.0,0.0,0.0,2.0,1,11.2,2021-04-14,1,108.3,101.51,15.1,107.6,99.38,14.2,5.0,3.0,1,0,0.0,2021-04-14,11,233.59,0,0,0,6.4,0.94,5.1,0.0,10.0,1.17,8.1,0.0,19.01,0.0,3.68,1.77,0.09
2,shamela01,Landry Shamet,BROOKLYN NETS,AWAY,MINNESOTA TIMBERWOLVES,WIN,1548.0,7.0,12.0,5.0,9.0,0.0,0.0,0.0,5.0,5.0,1.0,0.0,1.0,4,16.8,2021-04-13,0,108.3,101.51,15.1,111.6,103.94,14.6,3.0,2.0,5,0,1.0,2021-04-13,27,233.59,0,0,0,6.1,0.89,4.7,0.0,10.0,1.17,8.1,0.0,20.23,0.0,16.32,9.21,0.16
3,shamela01,Landry Shamet,BROOKLYN NETS,HOME,LOS ANGELES LAKERS,LOSS,1397.0,1.0,6.0,1.0,4.0,0.0,0.0,0.0,1.0,2.0,3.0,0.0,1.0,0,2.9,2021-04-10,1,108.3,101.51,15.1,105.5,101.11,14.9,2.0,0.0,1,1,0.0,2021-04-10,9,233.59,0,0,0,6.9,0.85,5.2,0.0,10.0,1.17,8.1,0.0,12.86,0.0,4.58,2.32,0.03
4,shamela01,Landry Shamet,BROOKLYN NETS,AWAY,CHICAGO BULLS,LOSS,1312.0,4.0,10.0,3.0,6.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1,8.9,2021-04-04,0,108.3,101.51,15.1,108.9,100.46,15.3,4.0,1.0,2,0,0.0,2021-04-04,18,233.59,0,0,0,6.4,0.82,5.1,0.0,10.0,1.17,8.1,0.0,23.63,5.78,5.93,5.85,0.08
5,shamela01,Landry Shamet,BROOKLYN NETS,HOME,CHARLOTTE HORNETS,WIN,1275.0,6.0,10.0,5.0,7.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,2,10.3,2021-04-01,1,108.3,101.51,15.1,112.8,96.24,15.0,3.0,1.0,1,1,1.0,2021-04-01,15,233.59,0,0,0,5.6,0.86,4.3,0.0,10.0,1.17,8.1,0.0,27.25,0.0,4.18,2.28,0.09
6,shamela01,Landry Shamet,BROOKLYN NETS,HOME,DETROIT PISTONS,WIN,1337.0,5.0,6.0,3.0,3.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0,12.4,2021-03-13,0,108.3,101.51,15.1,112.3,97.86,15.5,3.0,2.0,0,1,1.0,2021-03-13,17,233.59,0,0,0,6.9,0.96,5.1,0.0,10.0,1.17,8.1,0.0,15.51,0.0,0.0,0.0,0.12
7,shamela01,Landry Shamet,BROOKLYN NETS,HOME,BOSTON CELTICS,WIN,1922.0,6.0,12.0,6.0,9.0,0.0,0.0,1.0,3.0,2.0,0.0,0.0,0.0,3,13.8,2021-03-11,0,108.3,101.51,15.1,106.2,99.78,13.6,3.0,0.0,4,1,1.0,2021-03-11,10,233.59,0,0,0,7.0,0.89,5.4,0.0,10.0,1.17,8.1,0.0,16.1,3.84,9.99,7.14,0.14
8,shamela01,Landry Shamet,BROOKLYN NETS,AWAY,HOUSTON ROCKETS,WIN,1768.0,4.0,9.0,3.0,7.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,1.0,4,6.3,2021-03-03,1,108.3,101.51,15.1,109.9,103.62,14.1,2.0,1.0,3,0,1.0,2021-03-03,30,233.59,0,0,0,8.4,0.83,6.6,0.0,10.0,1.17,8.1,0.0,14.87,0.0,8.89,5.43,0.06
9,shamela01,Landry Shamet,BROOKLYN NETS,AWAY,SAN ANTONIO SPURS,WIN,1354.0,2.0,8.0,2.0,7.0,1.0,2.0,0.0,4.0,1.0,0.0,1.0,0.0,2,3.6,2021-03-01,0,108.3,101.51,15.1,112.8,100.6,12.1,1.0,0.0,4,0,1.0,2021-03-01,4,233.59,0,0,0,7.1,0.85,5.8,0.0,10.0,1.17,8.1,0.0,18.54,0.0,19.99,9.49,0.04


In [101]:
def input_matrix_to_tensor(input_matrix):
    input_columns_to_keep = ['wl',
       'seconds_played', 'made_field_goals', 'attempted_field_goals',
       'made_three_point_field_goals', 'attempted_three_point_field_goals',
       'made_free_throws', 'attempted_free_throws', 'offensive_rebounds',
       'defensive_rebounds', 'assists', 'steals', 'blocks', 'turnovers',
       'personal_fouls', 'game_score', 'b2b_indicator',
       'Team Defensive Rating', 'Team Pace', 'Team Turnover %',
       'Opponent Defensive Rating', 'Opponent Pace', 'Opponent Turnover %',
       'attempted_two_point_field_goals', 'made_two_point_field_goals',
       'total_rebounds', 'at_home', 'is_win',
       'Opponent Defensive Rank vs Position', 'total', 'Iso POSS', 'Iso PPP',
       'Iso FGA', 'Opp D Iso POSS', 'Opp D Iso PPP', 'Opp D Iso FGA',
       'Opp D Iso Score %', 'Team Iso POSS', 'Team Iso PPP', 'Team Iso FGA',
       'Team Iso Score %', 'Usage Rate', 'OReb %', 'DReb %', 'Reb %', 'PIE']
    input_matrix["winner"] = input_matrix["outcome"] == "WIN"
    input_matrix["loser"] = input_matrix["outcome"] == "LOSS"
    input_matrix["wl"] = input_matrix["winner"].astype(int) - input_matrix["loser"].astype(int)
    input_matrix = input_matrix[input_columns_to_keep]
    print(input_matrix.shape)
    input_matrix = np.array(input_matrix)
    input_tensor = tf.convert_to_tensor(input_matrix, dtype=tf.float32)
    return input_tensor

input_matrix_to_tensor(generate_input_matrix("Landry Shamet", datetime(2021, 4, 20), 12))

(17, 53)
Index(['slug', 'name', 'team', 'location', 'opponent', 'outcome',
       'seconds_played', 'made_field_goals', 'attempted_field_goals',
       'made_three_point_field_goals', 'attempted_three_point_field_goals',
       'made_free_throws', 'attempted_free_throws', 'offensive_rebounds',
       'defensive_rebounds', 'assists', 'steals', 'blocks', 'turnovers',
       'personal_fouls', 'game_score', 'Date', 'b2b_indicator',
       'Team Defensive Rating', 'Team Pace', 'Team Turnover %',
       'Opponent Defensive Rating', 'Opponent Pace', 'Opponent Turnover %',
       'attempted_two_point_field_goals', 'made_two_point_field_goals',
       'total_rebounds', 'at_home', 'is_win', 'date',
       'Opponent Defensive Rank vs Position', 'total', 'Iso POSS', 'Iso PPP',
       'Iso FGA', 'Opp D Iso POSS', 'Opp D Iso PPP', 'Opp D Iso FGA',
       'Opp D Iso Score %', 'Team Iso POSS', 'Team Iso PPP', 'Team Iso FGA',
       'Team Iso Score %', 'Usage Rate', 'OReb %', 'DReb %', 'Reb %', 'PIE'],

<tf.Tensor: shape=(17, 46), dtype=float32, numpy=
array([[ 1.00000000e+00,  1.27800000e+03,  7.00000000e+00,
         1.40000000e+01,  6.00000000e+00,  1.10000000e+01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  2.00000000e+00,  1.00000000e+00,
         1.23000002e+01,  1.00000000e+00,  1.08300003e+02,
         1.01510002e+02,  1.51000004e+01,  1.12800003e+02,
         9.62399979e+01,  1.50000000e+01,  3.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         1.00000000e+00,  1.50000000e+01,  2.33589996e+02,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         5.59999990e+00,  8.60000014e-01,  4.30000019e+00,
         0.00000000e+00,  1.00000000e+01,  1.16999996e+00,
         8.10000038e+00,  0.00000000e+00,  3.27700005e+01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         7.99999982e-02],
       [-1.00000000e+00,  2.17400000e+0

In [23]:
def minutes_predictor(weighted_stats):
    """Produces the minutes projections for a set of weighted statlines. Just returns the mean weighted seconds
    in the given rows. Right now this function doesn't actually get called anywhere, but if we come up with a way to
    get better minutes projections out of a set of weighted stats we can reimplement it.
    
    Params:
    weighted_stats: A pandas DataFrame of weighted game box score stats for one player.
    """
    if "seconds_played_y" in weighted_stats.index:
        return np.mean(weighted_stats["seconds_played_y"])/60
    else:
        return np.mean(weighted_stats["seconds_played"])/60

def recent_average(weighted_stats):
    """Produces an average of recent Fanduel points for the weighted statlines. Because the statlines are already weighted
    versions of the past several games, this is not giving the actual recent average of it but a recent average
    that is weighted more closely to recent performances.
    
    Params:
    weighted_stats: A pandas DataFrame of weighted game box score stats for one player.
    """
    if len(weighted_stats.index) == 0:
        return 0
    if "made_three_point_field_goals_y" in weighted_stats.index:
        return 3*np.mean(weighted_stats["made_three_point_field_goals_y"]) + 2*np.mean(weighted_stats["made_two_point_field_goals_y"]) + np.mean(weighted_stats["made_free_throws_y"]) + 1.2*(np.mean(weighted_stats["offensive_rebounds_y"]) + np.mean(weighted_stats["defensive_rebounds_y"])) + 1.5*np.mean(weighted_stats["assists_y"]) + 3*np.mean(weighted_stats["blocks_y"]) + 3*np.mean(weighted_stats["steals_y"]) - np.mean(weighted_stats["turnovers_y"])
    else:
        return 3*np.mean(weighted_stats["made_three_point_field_goals"]) + 2*np.mean(weighted_stats["made_two_point_field_goals"]) + np.mean(weighted_stats["made_free_throws"]) + 1.2*(np.mean(weighted_stats["offensive_rebounds"]) + np.mean(weighted_stats["defensive_rebounds"])) + 1.5*np.mean(weighted_stats["assists"]) + 3*np.mean(weighted_stats["blocks"]) + 3*np.mean(weighted_stats["steals"]) - np.mean(weighted_stats["turnovers"])

In [73]:

# This is a csv of player positions. This needs to be updated every so often but
# probably does not have to be automated.

positions = pd.read_csv('./OutputCSVs/all_player_positions.csv')

# This is a csv of team defense vs position. This is something that we really should
# be automating and replacing with a function call. Especially early in the season, this
# will be changing a lot. It's also potentially something we want to combine with past season data.
# I also don't love how it's being accessed and assumed to exist outside these functions.

dbp_2017 = pd.read_csv('./OutputCSVs/team_def_vs_position_2017.csv')
dbp_2018 = pd.read_csv('./OutputCSVs/team_def_vs_position_2018.csv')
dbp_2019 = pd.read_csv('./OutputCSVs/team_def_vs_position_2019.csv')
dbp_2020 = pd.read_csv('./OutputCSVs/team_def_vs_position_2020.csv')
dbp_2021 = pd.read_csv('./OutputCSVs/team_def_vs_position_2021.csv')

def process_dual_positions(position1, position2, team, opponent, dbp):
    """Given a position for a player on a team and a position for a player on the opposing team,
    this function searches the defense vs position csv for each team's rank against the other
    position. Samay wrote this one and I don't really know all the details about how it works.
    
    Params:
    position1: A string (?) representing the position for the player on 'team'.
    position2: A string (?) representing the position for the player on 'opponent'.
    team: The team that the player in position1 plays for, corresponding to the 'Team' column in the
        defense by position csv.
    opponent: The team that the player in position2 plays for, corresponding to the 'Team' column in the
        defense by position csv.
    dbp: CSV of team defense by position.

    """
    first_team_subrank =  dbp.loc[dbp['Team']==team, 'vs {0}'.format(position1)].iloc[0]
    second_team_subrank =  dbp.loc[dbp['Team']==team, 'vs {0}'.format(position2)].iloc[0]
    first_opp_subrank =  dbp.loc[dbp['Team']==opponent, 'vs {0}'.format(position1)].iloc[0]
    second_opp_subrank =  dbp.loc[dbp['Team']==opponent, 'vs {0}'.format(position2)].iloc[0]
    return first_team_subrank, second_team_subrank, first_opp_subrank, second_opp_subrank
    
def add_team_defense(main_df):
    """Takes in a DataFrame of statlines and tacks on the opponent defense against the position
    for each player, returning an augmented DataFrame. Searches the position for each player and finds the opponent's 
    rank against the position. This function as we applied it only used team defense against position as a 
    descriptive indicator in the frontend display - it didn't actually use the defense against position as a
    predictive input. It is probably adaptable to attach defense vs position to a set of inputs earlier in
    the pipeline so as to have predictive value in training the models.
    
    Params:
    main_df: A pandas DataFrame of players with positions, teams and opponents. As currently applied, this
        is a DataFrame of created predictions, the hope is that this works with a DataFrame of model inputs so
        that the defense vs position can be an actual model input.
    """
    team_def_vs_pos = []
    opp_def_vs_pos = []

    player_positions = pd.Series(positions['position'].values,index=positions['player name']).to_dict()

    for i in range(len(main_df)):
        name = main_df['name'].iloc[i]
        date = main_df['date'].iloc[i]
        if int(date[:4]) == 2021:
            dbp = dbp_2021
        elif int(date[:4]) == 2020:
            if int(date[5:7]) > 10:
                dbp = dbp_2021
            else:
                dbp = dbp_2020
        elif int(date[:4]) == 2019:
            if int(date[5:7]) > 9:
                dbp = dbp_2020
            else:
                dbp = dbp_2019
        elif int(date[:4]) == 2018:
            if int(date[5:7]) > 9:
                dbp = dbp_2019
            else:
                dbp = dbp_2018
        elif int(date[:4]) == 2017:
            if int(date[5:7]) > 9:
                dbp = dbp_2018
            else:
                dbp = dbp_2017
        else:
            dbp = dbp_2017
        position = player_positions.get(name)
        team = all_abbrv.get(main_df['team'].iloc[i])
        opponent = all_abbrv.get(main_df['opponent'].iloc[i])
        if position is None:
            team_def_vs_pos.append(15.5)
            opp_def_vs_pos.append(15.5)
        else:
            if position in ['PG','SG','SF','PF','C']:
                team_def_vs_pos.append(dbp.loc[dbp['Team']==team, 'vs {0}'.format(position)].iloc[0])
                opp_def_vs_pos.append(dbp.loc[dbp['Team']==opponent, 'vs {0}'.format(position)].iloc[0])
            elif position == 'G':
                pdp = process_dual_positions('PG', 'SG', team, opponent, dbp)
                team_def_vs_pos.append((pdp[0] + pdp[1])/2)
                opp_def_vs_pos.append((pdp[2] + pdp[3])/2)
            elif position == 'F':
                pdp = process_dual_positions('SF', 'PF', team, opponent, dbp)
                team_def_vs_pos.append((pdp[0] + pdp[1])/2)
                opp_def_vs_pos.append((pdp[2] + pdp[3])/2)
            elif position in ['G-F','F-G']:
                pdp = process_dual_positions('SG', 'SF', team, opponent, dbp)
                team_def_vs_pos.append((pdp[0] + pdp[1])/2)
                opp_def_vs_pos.append((pdp[2] + pdp[3])/2)
            elif position in ['F-C']:
                pdp = process_dual_positions('PF', 'C', team, opponent, dbp)
                team_def_vs_pos.append((pdp[0] + pdp[1])/2)
                opp_def_vs_pos.append((pdp[2] + pdp[3])/2)
    #main_df['team def vs pos'] = team_def_vs_pos
    main_df['Opponent Defensive Rank vs Position'] = opp_def_vs_pos
    return main_df
#add_team_defense(pd.read_csv('./AllCSVs/predictions_for_07_31_2020_unplayed.csv'))

ou_2017 = pd.read_csv("./OutputCSVs/2016-17_OU.csv")
ou_2018 = pd.read_csv("./OutputCSVs/2017-18_OU.csv")
ou_2019 = pd.read_csv("./OutputCSVs/2018-19_OU.csv")
ou_2020 = pd.read_csv("./OutputCSVs/2019-20_OU.csv")
ou_2021 = pd.read_csv("./OutputCSVs/2020-21_OU.csv")
ou = ou_2017.append(ou_2018).append(ou_2019).append(ou_2020).append(ou_2021)
ou.reset_index()
ou["year"] = ou["date"]//10000
ou["strdate"] = ou["date"].astype(str)
ou = ou[~ou.strdate.str.contains("202010")]

def add_over_under(main_df):
    main_df["newdate"] = main_df["date"].str.replace("-", "")
    over_under = []
    for i in main_df.index:
        newdate = int(main_df.loc[i, "newdate"])
        newyear = newdate // 10000
        location = main_df.loc[i, "location"]
        team = betting_dictionary.get(main_df['team'][i])
        opponent = betting_dictionary.get(main_df['opponent'][i])
        dates = ou["strdate"].unique()
        if newdate in dates:
            if i % 500 == 0:
                print(newdate)
            if location == "AWAY":
                over_under.append(ou.loc[(ou["date"] == newdate) & (ou["o:team"] == opponent), "total"].values[0])
            else:
                over_under.append(ou.loc[(ou["date"] == newdate) & (ou["team"] == team), "total"].values[0])
        else:
            over_under.append(np.mean(ou.loc[(ou["team"] == team) & (ou["year"] == newyear), "total"]))
    main_df = main_df.drop(["newdate"], axis = 1)
    main_df["total"] = over_under
    return main_df

add_over_under(pd.read_csv("./AllCSVs/predictions_for_04_11_2021_unplayed.csv"))

Unnamed: 0.1,Unnamed: 0,name,team,date,location,opponent,minutes,made_two_point_field_goals,made_three_point_field_goals,made_free_throws,rebounds,assists,blocks,steals,turnovers,recent_average,10_game_average,3_game_average,10_3_ratio,10_3_difference,hot,cold,fantasy_points,Opponent Defensive Rank vs Position,projected_points,projected_points_draftkings,projected_value,projected_value_draftkings,total
0,114919,Jarnell Stokes,DENVER NUGGETS,2021-04-11,HOME,BOSTON CELTICS,6.83,0.23,0.0,0.29,0.48,0.4,0.1,0.18,0.07,6.08,5.7,5.7,1.0,0.0,0.0,0.0,0,15.5,2.7,2.48,539,464,224.75
1,114920,Quincy Acy,DALLAS MAVERICKS,2021-04-11,AWAY,SAN ANTONIO SPURS,5.38,0.11,0.1,0.1,0.44,0.21,0.08,0.06,0.03,3.43,7.0,0.4,5.714286,6.6,0.0,1.422144,0,15.5,1.85,1.8,371,338,224.95
2,114921,Archie Goodwin,NEW ORLEANS PELICANS,2021-04-11,AWAY,CLEVELAND CAVALIERS,10.65,1.24,0.32,1.08,0.96,0.7,0.1,0.15,0.14,10.61,10.52,6.5,1.536,4.02,0.0,0.0,0,15.5,7.33,7.36,1466,1380,226.645161
3,114922,John Lucas III,MINNESOTA TIMBERWOLVES,2021-04-11,AWAY,CHICAGO BULLS,1.66,0.02,0.0,0.0,0.0,0.03,0.0,0.02,0.0,1.33,1.9,3.17,0.695444,-1.27,0.0,0.0,0,15.5,0.14,0.12,29,23,226.359375
4,114923,Chris Andersen,CLEVELAND CAVALIERS,2021-04-11,AWAY,NEW ORLEANS PELICANS,14.52,0.54,0.0,0.27,1.93,0.71,0.31,0.25,0.33,7.98,8.58,7.33,1.15006,1.25,0.0,0.0,0,15.5,6.08,5.78,1216,1084,218.16129
5,114924,Nicolás Laprovíttola,SAN ANTONIO SPURS,2021-04-11,AWAY,DALLAS MAVERICKS,6.42,0.15,0.18,0.17,0.22,0.43,0.0,0.02,0.1,5.35,7.1,6.87,1.029225,0.23,0.0,0.0,0,15.5,1.88,2.01,376,377,224.982143
6,114925,Steve Novak,MILWAUKEE BUCKS,2021-04-11,AWAY,ORLANDO MAGIC,3.54,0.15,0.05,0.01,0.18,0.0,0.0,0.04,0.04,0.95,1.08,0.4,1.485714,0.68,0.0,0.0,0,15.5,0.76,0.77,151,144,230.854839
7,114926,Arinze Onuaku,ORLANDO MAGIC,2021-04-11,HOME,MILWAUKEE BUCKS,4.62,0.05,0.04,0.05,0.31,0.13,0.04,0.02,0.03,1.97,1.9,2.07,0.944625,-0.17,0.0,0.0,0,15.5,0.99,0.98,197,183,218.33871
8,114927,Alonzo Gee,DENVER NUGGETS,2021-04-11,HOME,BOSTON CELTICS,11.55,0.33,0.01,0.58,1.08,0.54,0.2,0.27,0.27,5.94,5.16,2.37,1.827893,2.79,0.0,0.0,0,15.5,4.52,4.24,903,795,224.75
9,114928,Gary Neal,ATLANTA HAWKS,2021-04-11,HOME,CHARLOTTE HORNETS,4.65,0.21,0.04,0.2,0.05,0.07,0.0,0.06,0.03,3.08,3.35,3.35,1.0,0.0,0.0,0.0,0,15.5,1.05,1.03,211,194,226.828125


In [76]:
playeriso_2017 = pd.read_csv("./IsolationStats/PlayerIsolationOffense/CSVs/2016-17 Player Isolation Offense.csv")
playeriso_2017["Year"] = [2017 for _ in range(playeriso_2017.shape[0])]
playeriso_2018 = pd.read_csv("./IsolationStats/PlayerIsolationOffense/CSVs/2017-18 Player Isolation Offense.csv")
playeriso_2018["Year"] = [2018 for _ in range(playeriso_2018.shape[0])]
playeriso_2019 = pd.read_csv("./IsolationStats/PlayerIsolationOffense/CSVs/2018-19 Player Isolation Offense.csv")
playeriso_2019["Year"] = [2019 for _ in range(playeriso_2019.shape[0])]
playeriso_2020 = pd.read_csv("./IsolationStats/PlayerIsolationOffense/CSVs/2019-20 Player Isolation Offense.csv")
playeriso_2020["Year"] = [2020 for _ in range(playeriso_2020.shape[0])]
playeriso_2021 = pd.read_csv("./IsolationStats/PlayerIsolationOffense/CSVs/2020-21 Player Isolation Offense.csv")
playeriso_2021["Year"] = [2021 for _ in range(playeriso_2021.shape[0])]

playeriso = playeriso_2017.append(playeriso_2018).append(playeriso_2019).append(playeriso_2020).append(playeriso_2021)
playeriso.reset_index()

def add_isolation_offense(main_df):
    main_df["newdate"] = main_df["date"].str.replace("-", "")
    poss = []
    ppp = []
    fga = []
    for i in main_df.index:
        newdate = int(main_df.loc[i, "newdate"])
        playername = main_df.loc[i, "name"]
        newyear = newdate // 10000
        if (newdate // 100) - (100 * newyear) > 10:
            newyear = newyear + 1
        iso_row = playeriso.loc[(playeriso["Year"] == newyear) & (playeriso["PLAYER"] == playername), ["POSS", "PPP", "FGA"]]
        if iso_row.shape[0] > 0:
            poss.append(iso_row["POSS"].values[0])
            ppp.append(iso_row["PPP"].values[0])
            fga.append(iso_row["FGA"].values[0])
        else:
            poss.append(0)
            ppp.append(0)
            fga.append(0)
    main_df = main_df.drop(["newdate"], axis = 1)
    main_df["Iso POSS"] = poss
    main_df["Iso PPP"] = ppp
    main_df["Iso FGA"] = fga
    return main_df

add_isolation_offense(pd.read_csv("./AllCSVs/predictions_for_04_01_2021_unplayed.csv"))

Unnamed: 0.1,Unnamed: 0,name,team,date,location,opponent,minutes,made_two_point_field_goals,made_three_point_field_goals,made_free_throws,rebounds,assists,blocks,steals,turnovers,recent_average,10_game_average,3_game_average,10_3_ratio,10_3_difference,hot,cold,fantasy_points,Opponent Defensive Rank vs Position,projected_points,projected_points_draftkings,projected_value,projected_value_draftkings,Iso POSS,Iso PPP,Iso FGA
0,113349,Lance Stephenson,NEW ORLEANS PELICANS,2021-04-01,HOME,ORLANDO MAGIC,22.81,2.02,0.8,0.97,2.31,1.45,0.3,0.29,0.95,10.6,12.93,4.93,2.349073,8.0,0.0,2.553753,0,,13.18,13.58,2635,2546,0.0,0.0,0.0
1,113350,Jarnell Stokes,DENVER NUGGETS,2021-04-01,HOME,LOS ANGELES CLIPPERS,5.99,0.17,0.05,0.2,0.62,0.26,0.05,0.12,0.14,6.08,5.7,5.7,1.0,0.0,0.0,0.0,0,,2.19,2.15,439,403,0.0,0.0,0.0
2,113351,Danuel House,WASHINGTON WIZARDS,2021-04-01,HOME,DETROIT PISTONS,8.77,0.75,0.19,0.47,1.46,0.82,0.15,0.26,0.33,18.85,19.94,14.5,1.350968,5.44,0.0,0.0,0,8.0,6.42,6.34,1284,1190,0.0,0.0,0.0
3,113352,Chris Andersen,CLEVELAND CAVALIERS,2021-04-01,AWAY,PHILADELPHIA 76ERS,12.58,0.56,0.0,0.16,1.69,0.38,0.34,0.24,0.42,7.98,8.58,7.33,1.15006,1.25,0.0,0.0,0,,5.2,4.91,1040,921,0.0,0.0,0.0
4,113353,Josh McRoberts,MIAMI HEAT,2021-04-01,AWAY,GOLDEN STATE WARRIORS,15.58,1.08,0.41,0.12,2.16,0.84,0.2,0.39,0.55,8.03,8.81,1.5,3.924,7.31,0.0,2.793107,0,,8.58,8.58,1716,1609,0.0,0.0,0.0
5,113354,Nicolás Laprovíttola,SAN ANTONIO SPURS,2021-04-01,AWAY,ATLANTA HAWKS,4.8,0.13,0.11,0.12,0.34,0.33,0.0,0.11,0.15,5.35,7.1,6.87,1.029225,0.23,0.0,0.0,0,,1.79,1.83,359,343,0.0,0.0,0.0
6,113355,Aaron Harrison,CHARLOTTE HORNETS,2021-04-01,HOME,BROOKLYN NETS,12.56,0.64,0.27,0.62,1.42,0.63,0.11,0.24,0.34,12.8,13.8,12.8,1.072464,1.0,0.0,0.0,0,,6.07,6.1,1214,1143,0.0,0.0,0.0
7,113356,Marcus Thornton,WASHINGTON WIZARDS,2021-04-01,AWAY,DETROIT PISTONS,20.07,1.78,1.03,0.69,2.12,1.47,0.0,0.45,0.76,12.8,14.73,8.33,1.685959,6.4,0.0,0.0,0,,12.68,13.23,2536,2481,0.0,0.0,0.0
8,113357,Arinze Onuaku,ORLANDO MAGIC,2021-04-01,HOME,NEW ORLEANS PELICANS,6.47,0.11,0.06,0.09,0.41,0.08,0.04,0.05,0.09,1.97,1.9,2.07,0.944625,-0.17,0.0,0.0,0,,1.28,1.29,256,241,0.0,0.0,0.0
9,113358,Anthony Bennett,BROOKLYN NETS,2021-04-01,AWAY,CHARLOTTE HORNETS,8.01,0.5,0.28,0.3,1.23,0.28,0.05,0.16,0.19,8.45,9.98,3.87,2.25462,6.11,0.0,0.0,0,,4.48,4.56,895,855,0.0,0.0,0.0


In [75]:
teamdiso_2017 = pd.read_csv("./IsolationStats/TeamIsolationDefense/Defense/2016-17 Team Isolation Defense.csv")
teamdiso_2017["Year"] = [2017 for _ in range(teamdiso_2017.shape[0])]
teamdiso_2018 = pd.read_csv("./IsolationStats/TeamIsolationDefense/Defense/2017-18 Team Isolation Defense.csv")
teamdiso_2018["Year"] = [2018 for _ in range(teamdiso_2018.shape[0])]
teamdiso_2019 = pd.read_csv("./IsolationStats/TeamIsolationDefense/Defense/2018-19 Team Isolation Defense.csv")
teamdiso_2019["Year"] = [2019 for _ in range(teamdiso_2019.shape[0])]
teamdiso_2020 = pd.read_csv("./IsolationStats/TeamIsolationDefense/Defense/2019-20 Team Isolation Defense.csv")
teamdiso_2020["Year"] = [2020 for _ in range(teamdiso_2020.shape[0])]
teamdiso_2021 = pd.read_csv("./IsolationStats/TeamIsolationDefense/Defense/2020-21 Team Isolation Defense.csv")
teamdiso_2021["Year"] = [2021 for _ in range(teamdiso_2021.shape[0])]

teamdiso = teamdiso_2017.append(teamdiso_2018).append(teamdiso_2019).append(teamdiso_2020).append(teamdiso_2021)
teamdiso.reset_index()

def add_isolation_defense(main_df):
    main_df["newdate"] = main_df["date"].str.replace("-", "")
    main_df["opponent"] = main_df["opponent"].str.lower()
    teamdiso["TEAM"] = teamdiso["TEAM"].str.lower()
    teamdiso["SCORE FREQ"] = teamdiso["SCORE FREQ"].astype(str)
    teamdiso["SCORE FREQ"] = teamdiso["SCORE FREQ"].str.rstrip('%').astype('float')
    teamdiso["SCORE FREQ"] = teamdiso["SCORE FREQ"] / 100
    poss = []
    ppp = []
    fga = []
    freq = []
    for i in main_df.index:
        newdate = int(main_df.loc[i, "newdate"])
        opponent = main_df.loc[i, "opponent"]
        newyear = newdate // 10000
        if (newdate // 100) - (100 * newyear) > 10:
            newyear = newyear + 1
        iso_row = teamdiso.loc[(teamdiso["Year"] == newyear) & (teamdiso["TEAM"] == opponent), ["POSS", "PPP", "FGA", "SCORE FREQ"]]
        if iso_row.shape[0] > 0:
            poss.append(iso_row["POSS"].values[0])
            ppp.append(iso_row["PPP"].values[0])
            fga.append(iso_row["FGA"].values[0])
            freq.append(iso_row["SCORE FREQ"].values[0])
        else:
            poss.append(0)
            ppp.append(0)
            fga.append(0)
            freq.append(np.mean(freq))
    main_df = main_df.drop(["newdate"], axis = 1)
    main_df["Opp D Iso POSS"] = poss
    main_df["Opp D Iso PPP"] = ppp
    main_df["Opp D Iso FGA"] = fga
    main_df["Opp D Iso Score %"] = freq
    main_df = main_df.round(2)
    main_df["opponent"] = main_df["opponent"].str.upper()
    return main_df

teamoiso_2017 = pd.read_csv("./IsolationStats/TeamIsolationOffense/Offense/2016-17 Team Isolation Offense.csv")
teamoiso_2017["Year"] = [2017 for _ in range(teamoiso_2017.shape[0])]
teamoiso_2018 = pd.read_csv("./IsolationStats/TeamIsolationOffense/Offense/2017-18 Team Isolation Offense.csv")
teamoiso_2018["Year"] = [2018 for _ in range(teamoiso_2018.shape[0])]
teamoiso_2019 = pd.read_csv("./IsolationStats/TeamIsolationOffense/Offense/2018-19 Team Isolation Offense.csv")
teamoiso_2019["Year"] = [2019 for _ in range(teamoiso_2019.shape[0])]
teamoiso_2020 = pd.read_csv("./IsolationStats/TeamIsolationOffense/Offense/2019-20 Team Isolation Offense.csv")
teamoiso_2020["Year"] = [2020 for _ in range(teamoiso_2020.shape[0])]
teamoiso_2021 = pd.read_csv("./IsolationStats/TeamIsolationOffense/Offense/2020-21 Team Isolation Offense.csv")
teamoiso_2021["Year"] = [2021 for _ in range(teamoiso_2021.shape[0])]

teamoiso = teamoiso_2017.append(teamoiso_2018).append(teamoiso_2019).append(teamoiso_2020).append(teamoiso_2021)
teamoiso.reset_index()

def add_team_isolation_offense(main_df):
    main_df["newdate"] = main_df["date"].str.replace("-", "")
    main_df["team"] = main_df["team"].str.lower()
    teamoiso["TEAM"] = teamoiso["TEAM"].str.lower()
    teamoiso["SCORE FREQ"] = teamoiso["SCORE FREQ"].astype(str)
    teamoiso["SCORE FREQ"] = teamoiso["SCORE FREQ"].str.rstrip('%').astype('float')
    teamoiso["SCORE FREQ"] = teamoiso["SCORE FREQ"] / 100
    poss = []
    ppp = []
    fga = []
    freq = []
    for i in main_df.index:
        newdate = int(main_df.loc[i, "newdate"])
        team = main_df.loc[i, "team"]
        newyear = newdate // 10000
        if (newdate // 100) - (100 * newyear) > 10:
            newyear = newyear + 1
        iso_row = teamoiso.loc[(teamoiso["Year"] == newyear) & (teamoiso["TEAM"] == team), ["POSS", "PPP", "FGA", "SCORE FREQ"]]
        if iso_row.shape[0] > 0:
            poss.append(iso_row["POSS"].values[0])
            ppp.append(iso_row["PPP"].values[0])
            fga.append(iso_row["FGA"].values[0])
            freq.append(iso_row["SCORE FREQ"].values[0])
        else:
            poss.append(0)
            ppp.append(0)
            fga.append(0)
            freq.append(np.mean(freq))
    main_df = main_df.drop(["newdate"], axis = 1)
    main_df["Team Iso POSS"] = poss
    main_df["Team Iso PPP"] = ppp
    main_df["Team Iso FGA"] = fga
    main_df["Team Iso Score %"] = freq
    main_df = main_df.round(2)
    main_df["team"] = main_df["team"].str.upper()
    return main_df

add_team_isolation_offense(add_isolation_defense(add_isolation_offense(pd.read_csv("./AllCSVs/predictions_for_04_01_2021_unplayed.csv"))))

Unnamed: 0.1,Unnamed: 0,name,team,date,location,opponent,minutes,made_two_point_field_goals,made_three_point_field_goals,made_free_throws,rebounds,assists,blocks,steals,turnovers,recent_average,10_game_average,3_game_average,10_3_ratio,10_3_difference,hot,cold,fantasy_points,Opponent Defensive Rank vs Position,projected_points,projected_points_draftkings,projected_value,projected_value_draftkings,Iso POSS,Iso PPP,Iso FGA,Opp D Iso POSS,Opp D Iso PPP,Opp D Iso FGA,Opp D Iso Score %,Team Iso POSS,Team Iso PPP,Team Iso FGA,Team Iso Score %
0,113349,Lance Stephenson,NEW ORLEANS PELICANS,2021-04-01,HOME,ORLANDO MAGIC,22.81,2.02,0.8,0.97,2.31,1.45,0.3,0.29,0.95,10.6,12.93,4.93,2.35,8.0,0.0,2.55,0,,13.18,13.58,2635,2546,0.0,0.0,0.0,5.4,1.12,4.8,0.5,6.0,1.05,4.6,0.51
1,113350,Jarnell Stokes,DENVER NUGGETS,2021-04-01,HOME,LOS ANGELES CLIPPERS,5.99,0.17,0.05,0.2,0.62,0.26,0.05,0.12,0.14,6.08,5.7,5.7,1.0,0.0,0.0,0.0,0,,2.19,2.15,439,403,0.0,0.0,0.0,0.0,0.0,0.0,0.5,6.0,0.95,5.3,0.44
2,113351,Danuel House,WASHINGTON WIZARDS,2021-04-01,HOME,DETROIT PISTONS,8.77,0.75,0.19,0.47,1.46,0.82,0.15,0.26,0.33,18.85,19.94,14.5,1.35,5.44,0.0,0.0,0,8.0,6.42,6.34,1284,1190,0.0,0.0,0.0,6.9,0.96,5.1,0.46,8.0,0.77,6.5,0.38
3,113352,Chris Andersen,CLEVELAND CAVALIERS,2021-04-01,AWAY,PHILADELPHIA 76ERS,12.58,0.56,0.0,0.16,1.69,0.38,0.34,0.24,0.42,7.98,8.58,7.33,1.15,1.25,0.0,0.0,0,,5.2,4.91,1040,921,0.0,0.0,0.0,6.4,0.94,5.1,0.45,6.0,0.66,4.8,0.31
4,113353,Josh McRoberts,MIAMI HEAT,2021-04-01,AWAY,GOLDEN STATE WARRIORS,15.58,1.08,0.41,0.12,2.16,0.84,0.2,0.39,0.55,8.03,8.81,1.5,3.92,7.31,0.0,2.79,0,,8.58,8.58,1716,1609,0.0,0.0,0.0,6.6,0.89,5.0,0.4,6.0,0.89,4.2,0.43
5,113354,Nicolás Laprovíttola,SAN ANTONIO SPURS,2021-04-01,AWAY,ATLANTA HAWKS,4.8,0.13,0.11,0.12,0.34,0.33,0.0,0.11,0.15,5.35,7.1,6.87,1.03,0.23,0.0,0.0,0,,1.79,1.83,359,343,0.0,0.0,0.0,7.3,1.08,5.8,0.5,6.0,1.04,4.8,0.51
6,113355,Aaron Harrison,CHARLOTTE HORNETS,2021-04-01,HOME,BROOKLYN NETS,12.56,0.64,0.27,0.62,1.42,0.63,0.11,0.24,0.34,12.8,13.8,12.8,1.07,1.0,0.0,0.0,0,,6.07,6.1,1214,1143,0.0,0.0,0.0,7.4,0.97,5.9,0.45,6.0,0.85,5.0,0.4
7,113356,Marcus Thornton,WASHINGTON WIZARDS,2021-04-01,AWAY,DETROIT PISTONS,20.07,1.78,1.03,0.69,2.12,1.47,0.0,0.45,0.76,12.8,14.73,8.33,1.69,6.4,0.0,0.0,0,,12.68,13.23,2536,2481,0.0,0.0,0.0,6.9,0.96,5.1,0.46,8.0,0.77,6.5,0.38
8,113357,Arinze Onuaku,ORLANDO MAGIC,2021-04-01,HOME,NEW ORLEANS PELICANS,6.47,0.11,0.06,0.09,0.41,0.08,0.04,0.05,0.09,1.97,1.9,2.07,0.94,-0.17,0.0,0.0,0,,1.28,1.29,256,241,0.0,0.0,0.0,5.6,1.05,4.7,0.47,4.0,0.92,3.3,0.44
9,113358,Anthony Bennett,BROOKLYN NETS,2021-04-01,AWAY,CHARLOTTE HORNETS,8.01,0.5,0.28,0.3,1.23,0.28,0.05,0.16,0.19,8.45,9.98,3.87,2.25,6.11,0.0,0.0,0,,4.48,4.56,895,855,0.0,0.0,0.0,5.6,0.86,4.3,0.41,10.0,1.17,8.1,0.5


In [93]:
def add_rate_statistics(main_df):
    team_box_scores = pd.read_csv("./TeamBoxScores/all_box_scores.csv")
    main_df["newdate"] = main_df["date"].str.replace("-", "").astype(int)
    main_df["year"] = main_df["newdate"] // 10000
    main_df["month"] = (main_df["newdate"] - 10000 * main_df["year"]) // 100
    main_df["day"] = main_df["newdate"] - 10000 * main_df["year"] - 100 * main_df["month"]
    usage = []
    orebrate = []
    drebrate = []
    rebrate = []
    pie = []
    for i in main_df.index:
        t = main_df.loc[i, "team"]
        o = main_df.loc[i, "opponent"]
        y = main_df.loc[i, "year"]
        m = main_df.loc[i, "month"]
        d = main_df.loc[i, "day"]
        game_row = team_box_scores.loc[(team_box_scores["Year"] == y) & 
                                       (team_box_scores["Month"] == m) & 
                                       (team_box_scores["Day"] == d) & 
                                       (team_box_scores["team"] == t), 
                                       ["minutes_played", "made_field_goals", "attempted_field_goals", "made_three_point_field_goals",
                                       "attempted_three_point_field_goals", "made_free_throws", "attempted_free_throws",
                                       "offensive_rebounds", "defensive_rebounds", "assists", "steals", "blocks",
                                       "turnovers", "personal_fouls"]]
        opponent_row = team_box_scores.loc[(team_box_scores["Year"] == y) & 
                                       (team_box_scores["Month"] == m) & 
                                       (team_box_scores["Day"] == d) & 
                                       (team_box_scores["team"] == o), 
                                       ["offensive_rebounds", "defensive_rebounds"]]
        
        if len(game_row["minutes_played"].values) > 0:
            usage.append((100 * ((main_df.loc[i, "attempted_two_point_field_goals"] + main_df.loc[i, "attempted_three_point_field_goals"]) +
                        .44 * main_df.loc[i, "attempted_free_throws"] + 
                             main_df.loc[i, "turnovers"]) * 
                      game_row["minutes_played"].values[0]) /
                    ((game_row["attempted_field_goals"].values[0] + .44 * game_row["attempted_free_throws"].values[0] + game_row["turnovers"].values[0]) *
                     (main_df.loc[i, "seconds_played"] / 12)))
            orebrate.append((20 * (main_df.loc[i, "offensive_rebounds"]) * game_row["minutes_played"].values[0]) /
                       ((main_df.loc[i, "seconds_played"] / 60) * (game_row["offensive_rebounds"].values[0] + opponent_row["defensive_rebounds"].values[0])))
            drebrate.append((20 * (main_df.loc[i, "defensive_rebounds"]) * game_row["minutes_played"].values[0]) /
                       ((main_df.loc[i, "seconds_played"] / 60) * (opponent_row["offensive_rebounds"].values[0] + game_row["defensive_rebounds"].values[0])))
            rebrate.append((20 * (main_df.loc[i, "offensive_rebounds"] + main_df.loc[i, "defensive_rebounds"]) * game_row["minutes_played"].values[0]) /
                       ((main_df.loc[i, "seconds_played"] / 60) * (opponent_row["offensive_rebounds"].values[0] + opponent_row["defensive_rebounds"].values[0] + game_row["offensive_rebounds"].values[0] + game_row["defensive_rebounds"].values[0])))
            pie.append((3 * main_df.loc[i, "made_two_point_field_goals"] + 4 * main_df.loc[i, "made_three_point_field_goals"] + 2 * main_df.loc[i, "made_free_throws"] - 
                    main_df.loc[i, "attempted_two_point_field_goals"] - main_df.loc[i, "attempted_three_point_field_goals"] - main_df.loc[i, "attempted_free_throws"] +
                    main_df.loc[i, "defensive_rebounds"] + .5 * main_df.loc[i, "offensive_rebounds"] + main_df.loc[i, "assists"] +
                    main_df.loc[i, "steals"] + .5 * main_df.loc[i, "blocks"] - main_df.loc[i, "turnovers"]) /
                  (3 * (game_row["made_field_goals"].values[0] - game_row["made_three_point_field_goals"].values[0]) + 4 * game_row["made_three_point_field_goals"].values[0] + 2 * game_row["made_free_throws"].values[0] -
                  game_row["attempted_field_goals"].values[0] - game_row["attempted_free_throws"].values[0] + game_row["defensive_rebounds"].values[0] + .5 * game_row["offensive_rebounds"].values[0] +
                  game_row["assists"].values[0] + game_row["steals"].values[0] + .5 * game_row["blocks"].values[0] - game_row["turnovers"].values[0]))
        else:
            usage.append(0)
            orebrate.append(0)
            drebrate.append(0)
            rebrate.append(0)
            pie.append(0)
        
    main_df = main_df.drop(["newdate", "year", "month", "day"], axis = 1)
    main_df["Usage Rate"] = usage
    main_df["OReb %"] = orebrate
    main_df["DReb %"] = drebrate
    main_df["Reb %"] = rebrate
    main_df["PIE"] = pie
    main_df = main_df.round(2)
    return main_df

In [44]:
def statline_output(player_box_scores, input_statistics, per_min = False, retrain = True, pretrain_inputs = False):
    """Here is the meat of the modeling process. Takes in player_box_scores, a set of box scores from the
    desired start date of the data until the game date, and produces several versions of weighted statlines
    using generate_input_vector and consisting of the statistics in input_statistics. We then keep the weighted statlines
    with over 800 seconds played (potential yikes here) and use these as model inputs in concurrent models
    for each of the statistics in output_statistics. Each of the output_statistics has a corresponding model
    and best set of weighted lines which was determined through testing. We keep the data for model inputs, train
    the models by merging the weighted lines with the actual lines and predicting each actual statistic with the weighted
    lines, and then make an empty DataFrame that we fill with predictions using these models. This DataFrame is processed
    and returned.
    
    Params:
    player_box_scores: Box scores over the desired range of days
    input_statistics: Array of statistics to be used as model inputs.
    
    TODO:
        -Make this predict FPPM. Generally make the prediction process more intelligent.
    """
    input_indices = [3, 7, 6, 9, 10, 12, 11, 13, 14, 15, 18, 16, 17, 22, 23, 24, 20, 25, 21, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
    output_indices = {"seconds": 51,
                      "threepoints": 54,
                      "freethrows": 56,
                      "assists": 60,
                      "steals": 61,
                      "blocks": 62,
                      "turnovers": 63,
                      "twopoints": 74,
                      "rebounds":75}
    
    if not pretrain_inputs:
        input_indices = [i + 1 for i in input_indices]
        for stat in output_indices.keys():
            output_indices[stat] = output_indices[stat] + 1
        
    
    if per_min:
        for box_index in player_box_scores.index:
            mins = player_box_scores.loc[box_index, "seconds_played"]
            for j in player_box_scores.columns[6:20]:
                if mins > 0:
                    player_box_scores.loc[box_index, j] = player_box_scores.loc[box_index, j]*60/player_box_scores.loc[box_index, "seconds_played"]
    
    # Creating the four sets of weighted trailing statlines, using sample sizes of 7 and 8
    # and weights of .8, .85 and .9.
    
    
    filename_685 = "./pretrained/685.csv"
    filename_985 = "./pretrained/985.csv"
    filename_1085 = "./pretrained/1085.csv"
    
    if path.exists(filename_685):
        if not pretrain_inputs:
            weighted_lines_6_85 = pd.read_csv(filename_685)
            print("a")
        else:
            weighted_lines_6_85 = generate_input_vector(player_box_scores, input_statistics, 6, .85, per_min)
            weighted_lines_6_85.to_csv(filename_685)
            print("b")
    else:
        weighted_lines_6_85 = generate_input_vector(player_box_scores, input_statistics, 6, .85, per_min)
        weighted_lines_6_85.to_csv(filename_685)
        print("c")
    if path.exists(filename_985):
        if not pretrain_inputs:
            weighted_lines_9_85 = pd.read_csv(filename_985)
        else:
            weighted_lines_9_85 = generate_input_vector(player_box_scores, input_statistics, 9, .85, per_min) 
            weighted_lines_9_85.to_csv(filename_985)
    else:
        weighted_lines_9_85 = generate_input_vector(player_box_scores, input_statistics, 9, .85, per_min) 
        weighted_lines_9_85.to_csv(filename_985)
    if path.exists(filename_1085):
        if not pretrain_inputs:
            weighted_lines_10_85 = pd.read_csv(filename_1085)
        else:
            weighted_lines_10_85 = generate_input_vector(player_box_scores, input_statistics, 10, .85, per_min)
            weighted_lines_10_85.to_csv(filename_1085)
    else:
        weighted_lines_10_85 = generate_input_vector(player_box_scores, input_statistics, 10, .85, per_min)
        weighted_lines_10_85.to_csv(filename_1085)
                
    weighted_lines_to_keep_6_85 = weighted_lines_6_85
    weighted_lines_to_keep_9_85 = weighted_lines_9_85
    weighted_lines_to_keep_10_85 = weighted_lines_10_85
    player_box_scores["Date"] = player_box_scores["Date"].astype(str)
    df_to_keep = player_box_scores[~player_box_scores["Date"].str.contains("2020-10")]
    
    print(weighted_lines_to_keep_6_85)
    
    # Doing some processing to merge name and date and ensure we can access unique statlines.
    
    df_to_keep["attempted_two_point_field_goals"] = df_to_keep["attempted_field_goals"] - df_to_keep["attempted_three_point_field_goals"]
    df_to_keep["made_two_point_field_goals"] = df_to_keep["made_field_goals"] - df_to_keep["made_three_point_field_goals"]
    weighted_lines_to_keep_6_85['name_date'] = weighted_lines_to_keep_6_85["name"] + weighted_lines_to_keep_6_85["date"].astype(str)
    weighted_lines_to_keep_9_85['name_date'] = weighted_lines_to_keep_9_85["name"] + weighted_lines_to_keep_9_85["date"].astype(str)
    weighted_lines_to_keep_10_85['name_date'] = weighted_lines_to_keep_10_85["name"] + weighted_lines_to_keep_10_85["date"].astype(str)
    df_to_keep['name_date'] = df_to_keep["name"] + df_to_keep["Date"]
    
    # Merging the weighted lines (which are just weighted versions of past performances) with the actual lines.
    # We train the models using the weighted lines as inputs to predict each statistic in actual_lines,
    # which means merging both into the same DataFrame. A little processing is required to get the columns
    # to match with output_statistics.
    def merge_with_actual(data, actual):
        merged = data.merge(actual, left_on = 'name_date', right_on = 'name_date')
        merged["rebounds_y"] = merged["offensive_rebounds_y"] + merged["defensive_rebounds_y"]
        merged["location_x"] = merged["location_x"] == "HOME"
        merged["location_y"] = merged["location_y"] == "HOME"
        print(merged.shape, data.shape, actual.shape)
        return merged
        
    df_merged_6_85 = merge_with_actual(weighted_lines_to_keep_6_85, df_to_keep)
    df_merged_9_85 = merge_with_actual(weighted_lines_to_keep_9_85, df_to_keep)
    df_merged_10_85 = merge_with_actual(weighted_lines_to_keep_10_85, df_to_keep)
    
    # Here we isolate the statistics from the weighted lines that we will use as model inputs.
    
    print(len(df_merged_6_85.columns))
    print(df_merged_6_85.columns)
    
    predictors_6_85 = df_merged_6_85.iloc[:, input_indices]
    predictors_9_85 = df_merged_9_85.iloc[:, input_indices]
    predictors_10_85 = df_merged_10_85.iloc[:, input_indices]
    
    print(predictors_6_85)
    
    # Producing the number of fantasy points for each game. Not really relevant but we used it to see if we could
    # make fantasy_points an output to predict with the weighted lines. Turned out not to be very useful, but worth keeping
    # around to see if future models have a better time with it.
    
    df_merged_6_85["fantasy_points"] = [float(get_points(df_merged_6_85[df_merged_6_85["name_date"] == player_name])[0]) for player_name in df_merged_6_85["name_date"]]
    df_merged_9_85["fantasy_points"] = [float(get_points(df_merged_9_85[df_merged_9_85["name_date"] == player_name])[0]) for player_name in df_merged_9_85["name_date"]]
    df_merged_10_85["fantasy_points"] = [float(get_points(df_merged_10_85[df_merged_10_85["name_date"] == player_name])[0]) for player_name in df_merged_10_85["name_date"]]
    
    # Getting the target output columns from their respective weighted line DataFrames.
    #fantasy_points_8_9 = df_merged_8_9.iloc[:,60]
    seconds = df_merged_6_85.iloc[:, output_indices["seconds"]]
    seconds = seconds.astype(float)
    threepoints = df_merged_9_85.iloc[:, output_indices["threepoints"]]
    freethrows = df_merged_10_85.iloc[:, output_indices["freethrows"]]
    assists = df_merged_10_85.iloc[:, output_indices["assists"]]
    steals = df_merged_9_85.iloc[:, output_indices["steals"]]
    blocks = df_merged_10_85.iloc[:, output_indices["blocks"]]
    turnovers = df_merged_9_85.iloc[:, output_indices["turnovers"]]
    twopoints = df_merged_10_85.iloc[:, output_indices["twopoints"]]
    rebounds = df_merged_10_85.iloc[:, output_indices["rebounds"]]

    
    # Training the models! Using the predictors and their corresponding output, selected from the weighted training
    # data that we tested to predict the outputs the best.
    
    def get_model(stat, model, predictors, target, retrain = False):
        fileext = stat + ".joblib"
        temp = tempfile.mkdtemp()
        filename = os.path.join(temp, fileext)
        if not path.exists(fileext) or retrain:
            output = model.fit(predictors, target)
            joblib.dump(output, filename)
            return output
        else:
            return joblib.load(filename, fileext)

    freethrow_model = get_model("freethrow", RidgeCV(), predictors_10_85, freethrows, retrain)
    twopoint_model = get_model("twopoint", BayesianRidge(n_iter = 400), predictors_10_85, twopoints, retrain)
    threepoint_model = get_model("threepoint", GradientBoostingRegressor(), predictors_9_85, threepoints, retrain)
    block_model = get_model("block", RidgeCV(), predictors_10_85, blocks, retrain)
    assist_model = get_model("assist", BayesianRidge(n_iter = 400), predictors_10_85, assists, retrain)
    rebound_model = get_model("rebound", RidgeCV(), predictors_10_85, rebounds, retrain)
    turnover_model = get_model("turnover", RidgeCV(), predictors_9_85, turnovers, retrain)
    steal_model = get_model("steal", RidgeCV(), predictors_9_85, steals, retrain)
    second_model = get_model("second", GradientBoostingRegressor(), predictors_6_85, seconds, retrain)

    # Creating blank output statlines and matching the non-numeric details.
    
    output_statlines = pd.DataFrame(index = weighted_lines_10_85.index, columns = output_statistics).fillna(0)
    output_statlines["name"] = weighted_lines_10_85["name"]
    output_statlines["team"] = weighted_lines_10_85["team"]
    output_statlines["date"] = weighted_lines_10_85["date"]
    output_statlines["location"] = weighted_lines_10_85["location"]
    output_statlines["opponent"] = weighted_lines_10_85["opponent"]
    
    # Processing the weighted lines for use as predictive inputs in the trained models.

    weighted_lines_6_85["location"] = weighted_lines_6_85["location"] == "HOME"
    weighted_lines_9_85["location"] = weighted_lines_9_85["location"] == "HOME"
    weighted_lines_10_85["location"] = weighted_lines_10_85["location"] == "HOME"
    weighted_lines_6_85 = weighted_lines_6_85.iloc[:, input_indices]
    weighted_lines_9_85 = weighted_lines_9_85.iloc[:, input_indices]
    weighted_lines_10_85 = weighted_lines_10_85.iloc[:, input_indices]
    #weighted_lines_8_9 = weighted_lines_8_9.rename(columns={'made_two_point_field_goals': 'made_two_point_field_goals_x', 'attempted_two_point_field_goals': 'attempted_two_point_field_goals_x'})
    
    # Using the models to predict each statistic! We then fill in output_statlines with each prediction.
    
    output_statlines["minutes"] = second_model.predict(weighted_lines_6_85) / 60
    output_statlines["made_two_point_field_goals"] = twopoint_model.predict(weighted_lines_10_85)
    output_statlines["made_three_point_field_goals"] = threepoint_model.predict(weighted_lines_9_85) 
    output_statlines["made_free_throws"] = freethrow_model.predict(weighted_lines_10_85)
    output_statlines["rebounds"] = rebound_model.predict(weighted_lines_10_85)
    output_statlines["assists"] = assist_model.predict(weighted_lines_10_85)
    output_statlines["blocks"] = block_model.predict(weighted_lines_10_85)
    output_statlines["steals"] = steal_model.predict(weighted_lines_9_85) 
    output_statlines["turnovers"] = turnover_model.predict(weighted_lines_9_85)
    #output_statlines["fantasy_points_8_9"] = fantasy_model_8_9.predict(weighted_lines_8_9)
    
    # Going through each row and making all the outputs clean. Dealing with weird outliers and edge cases.
    
    for box_index in output_statlines.index:
#         if output_statlines.loc[box_index, "fantasy_points_8_9"] < -100:
#             output_statlines.loc[box_index, "minutes"] = 1
        pred_minutes = max(0, output_statlines.loc[box_index, "minutes"])
        if pred_minutes <= 19:
            output_statlines.loc[box_index, "made_two_point_field_goals"] = output_statlines.loc[box_index, "made_two_point_field_goals"] * pred_minutes/19
            output_statlines.loc[box_index, "made_three_point_field_goals"] = output_statlines.loc[box_index, "made_three_point_field_goals"] * pred_minutes/19
            output_statlines.loc[box_index, "made_free_throws"] = output_statlines.loc[box_index, "made_free_throws"] * pred_minutes/19
            output_statlines.loc[box_index, "rebounds"] = output_statlines.loc[box_index, "rebounds"] * pred_minutes/19
            output_statlines.loc[box_index, "assists"] = output_statlines.loc[box_index, "assists"] * pred_minutes/19
            output_statlines.loc[box_index, "blocks"] = output_statlines.loc[box_index, "blocks"] * pred_minutes/19
            output_statlines.loc[box_index, "steals"] = output_statlines.loc[box_index, "steals"] * pred_minutes/19
            output_statlines.loc[box_index, "turnovers"] = output_statlines.loc[box_index, "turnovers"] * pred_minutes/19
        output_statlines.loc[box_index, "minutes"] = round(pred_minutes, 2)
        output_statlines.loc[box_index, "made_two_point_field_goals"] = round(max(0, output_statlines.loc[box_index, "made_two_point_field_goals"]), 2)
        output_statlines.loc[box_index, "made_three_point_field_goals"] = round(max(0, output_statlines.loc[box_index, "made_three_point_field_goals"]), 2)
        output_statlines.loc[box_index, "made_free_throws"] = round(max(0, output_statlines.loc[box_index, "made_free_throws"]), 2)
        output_statlines.loc[box_index, "rebounds"] = round(max(0, output_statlines.loc[box_index, "rebounds"]), 2)
        output_statlines.loc[box_index, "assists"] = round(max(0, output_statlines.loc[box_index, "assists"]), 2)
        output_statlines.loc[box_index, "blocks"] = round(max(0, output_statlines.loc[box_index, "blocks"]), 2)
        output_statlines.loc[box_index, "steals"] = round(max(0, output_statlines.loc[box_index, "steals"]), 2)
        output_statlines.loc[box_index, "turnovers"] = round(max(0, output_statlines.loc[box_index, "turnovers"]), 2)
        output_statlines.loc[box_index, "recent_average"] = round(np.mean([recent_average(weighted_lines_10_85.loc[box_index]), recent_average(weighted_lines_6_85.loc[box_index])]), 2)
        last_10_games = get_stats(output_statlines.loc[box_index, "name"], output_statlines.loc[box_index, "date"], 10)
        last_3_games = get_stats(output_statlines.loc[box_index, "name"], output_statlines.loc[box_index, "date"], 3)
        output_statlines.loc[box_index, "10_game_average"] = round(recent_average(last_10_games), 2)
        output_statlines.loc[box_index, "3_game_average"] = round(recent_average(last_3_games), 2)
        output_statlines.loc[box_index, "10_3_ratio"] = (output_statlines.loc[box_index, "10_game_average"] + 1)/(output_statlines.loc[box_index, "3_game_average"] + 1)
        output_statlines.loc[box_index, "10_3_difference"] = output_statlines.loc[box_index, "10_game_average"] - output_statlines.loc[box_index, "3_game_average"]
        output_statlines.loc[box_index, "hot"] = np.log(((-7 * min(0, output_statlines.loc[box_index, "10_3_ratio"] - .83)) * (-1 * min(0, output_statlines.loc[box_index, "10_3_difference"] + 6))) + 1)
        output_statlines.loc[box_index, "cold"] = np.log(((7 * max(0, output_statlines.loc[box_index, "10_3_ratio"] - 1.22)) * (max(0, output_statlines.loc[box_index, "10_3_difference"] - 6.5))) + 1)
    
    # Adding team defense onto the statlines.
    
    output_statlines = add_team_defense(output_statlines)
    return output_statlines

In [29]:
def generate_optimal_lineup(players, positions, values, costs, budget):
    """Generates the optimal lineup given a list of players, their positions, their modeled Fanduel point values,
    their Fanduel salaries and the budget for this lineup.
    
    Params:
    players: Array of strings representing player names.
    positions: Array of strings representing player positions.
    values: Array of float modeled Fanduel values for each player.
    costs: Array of integer salaries for each player.
    budget: Integer, maximum sum of costs in lineup.
    """
    num_variables = len(players)
    
    lp = LpProblem("My LP Problem", LpMaximize)
    
    d = {}
    for i in range(0, num_variables):
        d[players[i]] = LpVariable(players[i], cat="Binary")
    
    objective = sum(np.array(values) * np.array(list(d.values())))
    lp += objective
    
    pg_constraint = 0
    sg_constraint = 0
    sf_constraint = 0
    pf_constraint = 0
    c_constraint = 0
    for i in range(0, len(positions)):
        if positions[i] == "PG":
            pg_constraint += d[players[i]]
        elif positions[i] == "SG":
            sg_constraint += d[players[i]]
        elif positions[i] == "SF":
            sf_constraint += d[players[i]]
        elif positions[i] == "PF":
            pf_constraint += d[players[i]]
        else:
            c_constraint += d[players[i]]
    lp += pg_constraint == 2
    lp += sg_constraint == 2
    lp += sf_constraint == 2
    lp += pf_constraint == 2
    lp += c_constraint == 1
    
    cost = sum(np.array(costs) * np.array(list(d.values())))
    lp += cost <= budget
    
    lp.solve()
    
    lineup = [variable.name for variable in lp.variables() if variable.varValue == 1]
    return lineup

def generate_optimal_lineup_draftkings(players, positions, values, costs, budget):
    """See above documentation for generate_optimal_lineup. Draftkings equivalent.
    """
    num_variables = len(players)
    
    lp = LpProblem("My LP Problem", LpMaximize)
    
    d = {}
    for i in range(0, num_variables):
        d[players[i]] = LpVariable(players[i], cat="Binary")
    
    objective = sum(np.array(values) * np.array(list(d.values())))
    lp += objective
    
    pg_constraint = 0
    sg_constraint = 0
    sf_constraint = 0
    pf_constraint = 0
    c_constraint = 0
    g_constraint = 0
    f_constraint = 0
    player_constraint = 0
    for i in range(0, len(positions)):
        if "PG" in positions[i] or "SG" in positions[i]:
            if "PG" in positions[i]:
                pg_constraint += d[players[i]]
            if "SG" in positions[i]:
                sg_constraint += d[players[i]]
            g_constraint += d[players[i]]
        if "PF" in positions[i] or "SF" in positions[i]:
            if "PF" in positions[i]:
                pf_constraint += d[players[i]]
            if "SF" in positions[i]:
                sf_constraint += d[players[i]]
            f_constraint += d[players[i]]
        if "C" in positions[i]:
            c_constraint += d[players[i]]
        player_constraint += d[players[i]]
    lp += pg_constraint <= 3
    lp += sg_constraint <= 3
    lp += sf_constraint <= 3
    lp += pf_constraint <= 3
    lp += pg_constraint >= 1
    lp += sg_constraint >= 1
    lp += sf_constraint >= 1
    lp += pf_constraint >= 1
    lp += c_constraint >= 1
    lp += g_constraint <= 4
    lp += f_constraint <= 4
    lp += g_constraint >= 3
    lp += f_constraint >= 3
    lp += c_constraint <= 2
    lp += player_constraint == 8
    
    cost = sum(np.array(costs) * np.array(list(d.values())))
    lp += cost <= budget
    
    lp.solve()
    
    lineup = [variable.name for variable in lp.variables() if variable.varValue == 1]
    return lineup

In [30]:
def build_n_lineups_fanduel(players, positions, values, costs, budget, n, sd_param = .35):
    lineups = []
    players_in = {}
    for i in range(n):
        randomness = np.random.normal(loc = 1, scale = sd_param, size = len(values))
        values = values * randomness
        lineup = [n.replace("_", " ") for n in generate_optimal_lineup(players, positions, values, costs, budget)]
        for player in lineup:
            if player not in players_in.keys():
                players_in[player] = 1
            else:
                players_in[player] += 1
        lineups.append(lineup)
    return lineups, players_in
    
def build_n_lineups_draftkings(players, positions, values, costs, budget, n, sd_param = .35):
    lineups = []
    players_in = {}
    for i in range(n):
        randomness = np.random.normal(loc = 1, scale = sd_param, size = len(values))
        values = values * randomness
        lineup = [n.replace("_", " ") for n in generate_optimal_lineup_draftkings(players, positions, values, costs, budget)]
        for player in lineup:
            if player not in players_in.keys():
                players_in[player] = 1
            else:
                players_in[player] += 1
        lineups.append(lineup)
    return lineups, players_in

In [31]:
def box_scores_for_range_of_days(start_date, end_date):
    """Returns pandas DataFrame of all basketball-reference box score statlines between start_date and end_date.
    
    Params:
    start_date: Datetime object representing date to begin collecting data from.
    end_date. Datetime object representing last date to collect data for.
    """
    all_tables = []
    start_month = start_date.month
    end_month = end_date.month
    start_year = start_date.year
    end_year = end_date.year
    
    for y in range(start_year, end_year + 1):
        sm, em = 1, 12
        if y == start_year:
            sm = start_month
        if y == end_year:
            em = end_month
        for m in range(sm, em + 1):
            if m == start_month and y == start_year:
                start_day = start_date.day
            else:
                start_day = 1
            if m == end_month and y == end_year:
                end_day = end_date.day
            else:
                if m == 2:
                    if y % 4 == 0:
                        end_day = 29
                    else:
                        end_day = 28
                elif m in [9, 4, 6, 11]:
                    end_day = 30
                else:
                    end_day = 31
        
            for d in range(start_day, end_day + 1):
                file_name = "./AllCSVs/{0}_{1}_{2}_box_scores.csv".format(m, d, y)
                if not path.exists(file_name):
                    continue
                if pd.read_csv(file_name).empty:
                    client.player_box_scores(day=d, month=m, year=y, output_type=OutputType.CSV, output_file_path=file_name)
                table = pd.read_csv(file_name)
                date = datetime(y, m, d)
                dates = [date] * len(table)
                table["Date"] = dates
                all_tables.append(table)

    full_df = all_tables[0]
    for i in range(1, len(all_tables)):
        current_table = all_tables[i]
        full_df = full_df.append(current_table)
    
    full_df.index = range(full_df.shape[0])
    df = pd.read_csv("./OutputCSVs/updated_team_stats.csv")
    df["team"] = df["team"].str.upper()

    team_def = []
    team_pace = []
    team_tov = []
    opp_def = []
    opp_pace = []
    opp_tov = []
    all_games_teams = full_df[["team", "opponent"]]

    for i in range(len(all_games_teams)):
        game = all_games_teams.loc[i]
        team = game["team"].upper()
        opponent = game["opponent"].upper()
        team_def.append(df[df["team"] == team]["drtg"].iloc[0])
        team_pace.append(df[df["team"] == team]["pace"].iloc[0])
        team_tov.append(df[df["team"] == team]["tov%"].iloc[0])
        opp_def.append(df[df["team"] == opponent]["drtg"].iloc[0])
        opp_pace.append(df[df["team"] == opponent]["pace"].iloc[0])
        opp_tov.append(df[df["team"] == opponent]["tov%"].iloc[0])
    
    full_df["Team Defensive Rating"] = team_def
    full_df["Team Pace"] = team_pace
    full_df["Team Turnover %"] = team_tov
    full_df["Opponent Defensive Rating"] = opp_def
    full_df["Opponent Pace"] = opp_pace
    full_df["Opponent Turnover %"] = opp_tov
    return full_df

In [32]:
def make_predictions(start_date, end_date, output = True, per_min = False):
    """Makes predictions for each date between start_date and end_date. Gets box scores for range of dates,
    calls statline_output to make predicted statlines for each game.
    
    Params:
    start_date: Datetime object representing date to begin collecting data from.
    end_date: Datetime object representing last date to collect data for.
    output: Boolean, whether to write predictions to file.
    """
    full_df = box_scores_for_range_of_days(start_date, end_date)
    predicted_statlines = statline_output(full_df, input_statistics, per_min)
    if output:
        output_filename = './AllCSVs/' + str(start_date.month) + '_' + str(start_date.day) + '_' + str(end_date.month) + '_' + str(end_date.day) + '_' + str(end_date.year) + '_predicted_box_scores.csv'
        predicted_statlines.to_csv(output_filename)
    return predicted_statlines

In [33]:
def predict_next_day(start_date, game_date):
    """Makes predictions for game_date. Calls make_predictions to produce predictions for game_date and
    keeps players who are playing on game_date. Writes predictions to file.
    
    
    Params:
    start_date: Datetime object representing date to begin collecting data from.
    game_date: Datetime object representing date to produce outputs for.
    """
    season_schedule = client.season_schedule(season_end_year = game_date.year)
    schedule_on_date = [game for game in season_schedule if (game['start_time'] - timedelta(hours = 4)).day == game_date.day and game['start_time'].month == game_date.month]
    team_objs_on_date = [game["home_team"] for game in schedule_on_date] + [game["away_team"] for game in schedule_on_date]
    teams_on_date = [t.name.replace("_", " ") for t in team_objs_on_date]
    predictions_for_date = make_predictions(start_date, game_date, output = False)
    players_on_date = predictions_for_date[predictions_for_date.team.isin(teams_on_date)]
    mstr, dstr = str(game_date.month), str(game_date.day)
    if game_date.month < 10:
        mstr = "0" + mstr
    if game_date.day < 10:
        dstr = "0" + dstr
    tstr = mstr + "-" + dstr
    statlines_on_date = players_on_date[players_on_date.date.str.contains(tstr)]
    statlines_on_date.index = range(statlines_on_date.shape[0])
    output_filename = './AllCSVs/predictions_for_' + mstr + "_" + dstr + '_from_' + str(start_date.month) + '_' + str(start_date.day) + '_' + str(start_date.year) + '.csv'
    statlines_on_date.to_csv(output_filename)
    return statlines_on_date
#feb_28_predictions = predict_next_day(datetime(2020, 1, 27), datetime(2020, 2, 28))

In [34]:
def append_actual_results(start_date, game_date):
    """Created predicted statlines for game_date and, if actual statlines exist, attaches
    actual statlines to predicted statlines for use in testing. Writes output to file.
    
    Params:
    start_date: Datetime object representing date to begin collecting data from.
    game_date: Datetime object representing date to produce outputs for.
    """   
    predicted_results = predict_next_day(start_date, game_date)
    actual_results = box_scores_for_range_of_days(game_date, game_date)
    actual_results["made_two_point_field_goals"] = actual_results["made_field_goals"] - actual_results["made_three_point_field_goals"]
    actual_results["rebounds"] = actual_results["offensive_rebounds"] + actual_results["defensive_rebounds"]
    pred_outputs = [float(get_points(predicted_results[predicted_results["name"] == player_name])[0]) for player_name in predicted_results["name"]]
    actual_outputs = [float(get_points(actual_results[actual_results["name"] == player_name])[0]) for player_name in predicted_results["name"]]
    predicted_results["predicted_points"] = pred_outputs
    predicted_results["actual_points"] = actual_outputs
    output_filename = './AllCSVs/' + str(game_date.month) + '_' + str(game_date.day) + '_' + str(game_date.year) + '_box_scores_predicted_with_actual.csv'
    predicted_results.to_csv(output_filename)
    return predicted_results

#append_actual_results(datetime(2020, 1, 1), datetime(2020, 2, 12))

In [35]:
def alternate_contests(start_date, game_date, search_datetime, inclusive):
    """Makes predictions for game_date, only for game times occurring after search_datetime.
    If inclusive is true, makes predictions for game times on or after search_datetime.
    
    Params:
    start_date: Datetime object representing date to begin collecting data from.
    game_date: Datetime object representing date to produce outputs for.
    search_datetime: Datetime object representing alternate game start time to filter for.
    inclusive: Boolean, determines whether games starting exactly at search_datetime are included.
    """
    season_schedule = client.season_schedule(season_end_year = game_date.year)
    schedule_on_date = [game for game in season_schedule if (game['start_time'] - timedelta(hours = 4)).day == game_date.day and (game['start_time'] - timedelta(hours = 4)).month == game_date.month]
    times_on_date = np.unique([game['start_time'] - timedelta(hours = 4) for game in schedule_on_date])
    print(times_on_date)
    if not inclusive:
        game_teams = [game["home_team"] for game in schedule_on_date if (game['start_time'] - timedelta(hours = 4)) == search_datetime] + [game["away_team"] for game in schedule_on_date if (game['start_time'] - timedelta(hours = 4)) == search_datetime]
    else:
        times_after_search = [t for t in times_on_date if 60 * t.hour + t.minute >= 60 * search_datetime.hour + search_datetime.minute]
        game_teams = [game["home_team"] for game in schedule_on_date if (game['start_time'] - timedelta(hours = 4)) in times_after_search] + [game["away_team"] for game in schedule_on_date if (game['start_time'] - timedelta(hours = 4)) in times_after_search]
    game_team_strings = [str(team)[5:].replace("_", " ") for team in game_teams]
    predictions_for_date = predict_next_day(start_date, game_date)
    players_on_date = predictions_for_date[predictions_for_date.team.isin(game_team_strings)]
    output_filename = './AllCSVs/' + str(game_date.month) + '_' + str(game_date.day) + '_' + str(game_date.year) + '_alternate_' + str(search_datetime.hour) + "_" + str(search_datetime.minute) + '.csv'
    players_on_date.to_csv(output_filename)
    return players_on_date

In [103]:
def matchup_lookup(matchups, team):
    """Returns, given a schedule of games, the opponent of a team playing in the schedule.
    
    Params:
    matchups: A list of tuples where each tuple is a game taking place in the schedule.
    teams: A string representing a team competing in one of the games on the schedule.
    """
    matchup = [m for m in matchups if team in m][0]
    if team == matchup[0]:
        return matchup[1]
    else:
        return matchup[0]

def predict_unplayed_games(start_date, game_date, retrain = False, pretrain_inputs = True):
    """Makes predictions for unplayed games on game_date, using data beginning at start_date to make its
    predictions. Finds the date's schedule, predicts statlines for each player on the competing teams, and
    writes the output predictions to file.
    
    Params:
    start_date: Datetime object representing date to begin collecting data from.
    game_date: Datetime object representing date to produce outputs for.
    """
    season_schedule = client.season_schedule(season_end_year = game_date.year)
    schedule_on_date = [game for game in season_schedule if (game['start_time'] - timedelta(hours = 4)).day == game_date.day and (game['start_time'] - timedelta(hours = 4)).month == game_date.month]
    available_box_scores = box_scores_for_range_of_days(start_date, game_date)
    game_teams = [game["home_team"] for game in schedule_on_date] + [game["away_team"] for game in schedule_on_date]
    matchups = [[str(game["home_team"])[5:].replace("_", " "), str(game["away_team"])[5:].replace("_", " ")] for game in schedule_on_date]
    game_team_strings = [str(team)[5:].replace("_", " ") for team in game_teams]
    relevant_lines = available_box_scores[available_box_scores.team.isin(game_team_strings)]
    relevant_players = relevant_lines.drop_duplicates(subset = ["name"], keep = 'last')
    mstr, dstr = str(game_date.month), str(game_date.day)
    if game_date.month < 10:
        mstr = "0" + mstr
    if game_date.day < 10:
        dstr = "0" + dstr
    tstr = str(game_date.year) + "-" + mstr + "-" + dstr
    relevant_players.Date = [tstr for _ in range(len(relevant_players))]
    relevant_players.opponent = [matchup_lookup(matchups, t) for t in relevant_players.team]
    augmented_box_scores = available_box_scores.append(relevant_players)
    augmented_box_scores.index = range(augmented_box_scores.shape[0])
    predicted_statlines = statline_output(augmented_box_scores, input_statistics, retrain = retrain, pretrain_inputs = pretrain_inputs)
    statlines_on_date = predicted_statlines[predicted_statlines.date.str.contains(tstr)]
    statlines_on_date['projected_points'] = [round(float(get_points(statlines_on_date[statlines_on_date["name"] == player_name])[0]), 2) for player_name in statlines_on_date.name]
    statlines_on_date['projected_points_draftkings'] = [round(float(get_draftkings_points(statlines_on_date[statlines_on_date["name"] == player_name])[0]), 2) for player_name in statlines_on_date.name]
    statlines_on_date['projected_value'] = [round(float(get_points(statlines_on_date[statlines_on_date["name"] == player_name])[1])) for player_name in statlines_on_date.name]
    statlines_on_date['projected_value_draftkings'] = [round(float(get_draftkings_points(statlines_on_date[statlines_on_date["name"] == player_name])[1])) for player_name in statlines_on_date.name]
    output_filename = './AllCSVs/predictions_for_' + mstr + "_" + dstr + "_" + str(game_date.year) + "_unplayed.csv"
    statlines_on_date.to_csv(output_filename)
    return statlines_on_date

# Here is where we make predictions for a day of games!

april_20_predictions = predict_unplayed_games(datetime(2016, 11, 1), datetime(2021, 4, 20), retrain = True, pretrain_inputs = True)
april_20_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


(116824, 26)
(116824, 27)
(116824, 28)
(116824, 31)
(116824, 35)
(116824, 39)
(116824, 44)
b
(116824, 26)
(116824, 27)
(116824, 28)
(116824, 31)
(116824, 35)
(116824, 39)
(116824, 44)
(116824, 26)
(116824, 27)
(116824, 28)
(116824, 31)
(116824, 35)
(116824, 39)
(116824, 44)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_keep["attempted_two_point_field_goals"] = df_to_keep["attempted_field_goals"] - df_to_keep["attempted_three_point_field_goals"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_ke

(480, 76) (116824, 45) (116728, 31)
(480, 76) (116824, 45) (116728, 31)
(480, 76) (116824, 45) (116728, 31)
76
Index(['name_x', 'team_x', 'date', 'location_x', 'opponent_x',
       'made_field_goals_x', 'made_two_point_field_goals_x',
       'attempted_two_point_field_goals_x', 'attempted_field_goals_x',
       'made_three_point_field_goals_x', 'attempted_three_point_field_goals_x',
       'attempted_free_throws_x', 'made_free_throws_x', 'offensive_rebounds_x',
       'defensive_rebounds_x', 'assists_x', 'blocks_x', 'turnovers_x',
       'steals_x', 'seconds_played_x', 'Opponent Defensive Rating_x',
       'Opponent Turnover %_x', 'Team Defensive Rating_x', 'Team Pace_x',
       'Team Turnover %_x', 'Opponent Pace_x',
       'Opponent Defensive Rank vs Position', 'total', 'Iso POSS', 'Iso PPP',
       'Iso FGA', 'Opp D Iso POSS', 'Opp D Iso PPP', 'Opp D Iso FGA',
       'Opp D Iso Score %', 'Team Iso POSS', 'Team Iso PPP', 'Team Iso FGA',
       'Team Iso Score %', 'Usage Rate', 'OReb 

  output_statlines.loc[box_index, "10_3_ratio"] = (output_statlines.loc[box_index, "10_game_average"] + 1)/(output_statlines.loc[box_index, "3_game_average"] + 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  statlines_on_date['projected_points'] = [round(float(get_points(statlines_on_date[statlines_on_date["name"] == player_name])[0]), 2) for player_name in statlines_on_date.name]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  statlines_on_date['projected_points_draftkings'] = [round(float(get_draftkings_points(statlines_on_date[statlines_on_date["name"] =

Unnamed: 0,name,team,date,location,opponent,minutes,made_two_point_field_goals,made_three_point_field_goals,made_free_throws,rebounds,assists,blocks,steals,turnovers,recent_average,10_game_average,3_game_average,10_3_ratio,10_3_difference,hot,cold,fantasy_points,Opponent Defensive Rank vs Position,projected_points,projected_points_draftkings,projected_value,projected_value_draftkings
116344,Jordan Farmar,SACRAMENTO KINGS,2021-04-20,AWAY,MINNESOTA TIMBERWOLVES,13.4,1.27,0.95,0.85,1.12,2.83,0.0,0.5,1.1,15.26,16.05,16.05,1.0,0.0,0.0,0.0,0,15.5,12.23,12.81,2446,2402
116345,John Lucas III,MINNESOTA TIMBERWOLVES,2021-04-20,AWAY,SACRAMENTO KINGS,2.93,0.1,0.04,0.0,0.07,0.11,0.0,0.04,0.05,1.33,1.9,3.17,0.695444,-1.27,0.0,0.0,0,15.5,0.64,0.65,128,121
116346,Aaron Harrison,CHARLOTTE HORNETS,2021-04-20,HOME,NEW YORK KNICKS,15.32,1.01,0.84,0.89,1.77,1.01,0.24,0.38,0.53,12.8,13.8,12.8,1.072464,1.0,0.0,0.0,0,15.5,10.4,10.55,2080,1979
116347,Kyle Korver,ATLANTA HAWKS,2021-04-20,AWAY,ORLANDO MAGIC,14.42,0.71,0.3,1.11,1.3,0.98,0.13,0.36,0.39,11.4,10.97,10.6,1.031897,0.37,0.0,0.0,0,28.0,7.54,7.46,1508,1399
116348,Arinze Onuaku,ORLANDO MAGIC,2021-04-20,HOME,ATLANTA HAWKS,7.02,0.2,0.06,0.15,0.46,0.18,0.04,0.05,0.06,1.97,1.9,2.07,0.944625,-0.17,0.0,0.0,0,15.5,1.76,1.76,352,329
116349,Anthony Bennett,BROOKLYN NETS,2021-04-20,AWAY,NEW ORLEANS PELICANS,5.95,0.38,0.16,0.24,0.85,0.32,0.08,0.12,0.16,8.45,9.98,3.87,2.25462,6.11,0.0,0.0,0,15.5,3.42,3.42,684,642
116350,Rudy Gay,SACRAMENTO KINGS,2021-04-20,HOME,MINNESOTA TIMBERWOLVES,26.75,2.54,2.6,1.29,4.09,1.61,0.43,0.73,1.37,24.66,23.04,28.07,0.826969,-5.03,0.0,0.0,0,26.0,23.6,24.63,4721,4619
116351,Gary Neal,ATLANTA HAWKS,2021-04-20,HOME,ORLANDO MAGIC,5.45,0.19,0.0,0.25,0.07,0.17,0.0,0.1,0.17,3.08,3.35,3.35,1.0,0.0,0.0,0.0,0,15.5,1.1,1.09,220,204
116352,Roy Hibbert,CHARLOTTE HORNETS,2021-04-20,AWAY,NEW YORK KNICKS,10.91,1.08,0.0,0.51,1.7,0.27,0.2,0.19,0.24,9.85,11.13,11.6,0.962698,-0.47,0.0,0.0,0,15.5,6.04,5.86,1209,1099
116353,Spencer Hawes,CHARLOTTE HORNETS,2021-04-20,AWAY,NEW YORK KNICKS,22.37,2.71,0.21,0.69,5.16,1.94,0.52,0.63,1.02,19.48,20.07,21.7,0.928194,-1.63,0.0,0.0,0,15.5,18.27,18.0,3654,3374


In [None]:
#Testing cell

# contests(sport=Sport.NBA)
# draftables(draft_group_id=4)
# draft_group_details(draft_group_id=1)
# available_players(draft_group_id=4)
#make_predictions(datetime(2019, 11, 1), datetime(2020, 7, 31))
season_predictions = pd.read_csv("./AllCSVs/11_1_7_31_2020_predicted_box_scores.csv")
#actual_box_scores = box_scores_for_range_of_days(datetime(2019, 11, 1), datetime(2020, 7, 31))
actual_box_scores["made_two_point_field_goals"] = actual_box_scores["made_field_goals"] - actual_box_scores["made_three_point_field_goals"]
season_predictions = season_predictions[~season_predictions.date.str.contains("2019-11")]
season_predictions['projected_points'] = 3 * season_predictions["made_three_point_field_goals"] + 2 * season_predictions["made_two_point_field_goals"] + season_predictions["made_free_throws"] + 1.2 * season_predictions["rebounds"] + 1.5 * season_predictions["assists"] + 3 * season_predictions["blocks"] + 3 * season_predictions["steals"] - season_predictions["turnovers"]
actual_box_scores['actual_points'] = 3 * actual_box_scores["made_three_point_field_goals"] + 2 *actual_box_scores["made_two_point_field_goals"] + actual_box_scores["made_free_throws"] + 1.2 * (actual_box_scores["offensive_rebounds"] + actual_box_scores["defensive_rebounds"]) + 1.5 * actual_box_scores["assists"] + 3 * actual_box_scores["blocks"] + 3 * actual_box_scores["steals"] - actual_box_scores["turnovers"]
future_games = preds.append(preds_1)as
sns.distplot(season_predictions["projected_points"])
#sns.distplot(actual_box_scores["actual_points"])
sns.distplot(future_games["projected_points"], color = "green")
pyplot.axvline(73.85, 0, .1, color = "red")

In [None]:
box_scores_for_range_of_days(datetime(2021, 1, 1), datetime(2021, 1, 1))

In [37]:
# Maps of names from those in the data to the formats Fanduel and Draftkings have.
# These only include players whose teams were in the bubble, and of course are not yet updated
# for the 2020-21 season. Additional work is required to make these lists complete.

difficult_names_map_fanduel = {"Luka Dončić": "Luka Doncic", 
                       "Luka Šamanić": "Luka Samanic", 
                       "Kristaps Porziņģis": "Kristaps Porzingis", 
                       "Nikola Vučević": "Nikola Vucevic",
                       "Jonas Valančiūnas": "Jonas Valanciunas",
                       "Bogdan Bogdanović": "Bogdan Bogdanovic",
                       "Dario Šarić": "Dario Saric",
                       "Timothé Luwawu-Cabarrot": "Timothe Luwawu-Cabarrot",
                       "Džanan Musa": "Dzanan Musa",
                        "Dāvis Bertāns": "Davis Bertans",
                        "Boban Marjanović": "Boban Marjanovic",
                        "Ersan İlyasova": "Ersan Ilyasova",
                        "Anžejs Pasečņiks": "Anzejs Pasecniks",
                       "Bojan Bogdanović": "Bojan Bogdanovic",
                        "Nicolò Melli": "Nicolo Melli",
                        "Nikola Jokić": "Nikola Jokic",
                        "Jusuf Nurkić": "Jusuf Nurkic",
                        "Goran Dragić": "Goran Dragic",
                        "Dennis Schröder" :"Dennis Schroder",
                       "Gary Payton": "Gary Payton II",
                       "Mohamed Bamba": "Mo Bamba",
                       "Wesley Iwundu": "Wes Iwundu",
                        "J.J. Redick": "JJ Redick",
                        "B.J. Johnson": "BJ Johnson"} #Check this for August 1

difficult_names_map_draftkings = {"Luka Dončić": "Luka Doncic", 
                       "Luka Šamanić": "Luka Samanic", 
                       "Kristaps Porziņģis": "Kristaps Porzingis", 
                       "Nikola Vučević": "Nikola Vucevic",
                       "Jonas Valančiūnas": "Jonas Valanciunas",
                       "Bogdan Bogdanović": "Bogdan Bogdanovic",
                       "Dario Šarić": "Dario Saric",
                       "Timothé Luwawu-Cabarrot": "Timothe Luwawu-Cabarrot",
                       "Džanan Musa": "Dzanan Musa",
                        "Boban Marjanović": "Boban Marjanovic",
                        "Ersan İlyasova": "Ersan Ilyasova",
                        "Anžejs Pasečņiks": "Anzejs Pasecniks",
                       "Bojan Bogdanović": "Bojan Bogdanovic",
                        "Dāvis Bertāns": "Davis Bertans",
                        "Nicolò Melli": "Nicolo Melli",
                        "Nikola Jokić": "Nikola Jokic",
                        "Jusuf Nurkić": "Jusuf Nurkic",
                        "Goran Dragić": "Goran Dragic",
                        "Dennis Schröder" :"Dennis Schroder",
                       "Gary Payton": "Gary Payton II",
                       "Frank Mason": "Frank Mason III",
                       "Marvin Bagley": "Marvin Bagley III",
                       "James Ennis": "James Ennis III",
                       "Harry Giles": "Harry Giles III",
                        "Lonnie Walker": "Lonnie Walker IV",
                       "Mohamed Bamba": "Mo Bamba",
                       "Wesley Iwundu": "Wes Iwundu",
                        "J.J. Redick": "JJ Redick",
                        "B.J. Johnson": "BJ Johnson",
                        "Melvin Frazier": "Melvin Frazier Jr.",
                        "Gary Trent": "Gary Trent Jr.",
                        "Danuel House": "Danuel House Jr.",
                        "Tim Hardaway": "Tim Hardaway Jr.",
                        "Jaren Jackson": "Jaren Jackson Jr.",
                        "Kelly Oubre": "Kelly Oubre Jr.",
                        "Troy Brown": "Troy Brown Jr.",
                        "Marcus Morris": "Marcus Morris Sr."} #Check this for August 1

# Incomplete list of players whose names the optimal_lineup function had difficulty handling.
# If the optimal lineup is one player short, that player may belong in this dictionary.


punctuation_names = {"Kentavious Caldwell Pope": "Kentavious Caldwell-Pope",
                    "Marcus Morris Sr.": "Marcus Morris",
                    "Shai Gilgeous Alexander": "Shai Gilgeous-Alexander",
                    "Al Farouq Aminu": "Al-Farouq Aminu",
                    "Naz Mitrou Long": "Naz Mitrou-Long",
                    "Talen Horton Tucker": "Talen Horton-Tucker",
                    "Willie Cauley Stein": "Willie Cauley-Stein",
                    "Karl Anthony Towns": "Karl-Anthony Towns",
                    "Timothe Luwawu Cabarrot": "Timothe Luwawu-Cabarrot",
                    "Troy Brown Jr.": "Troy Brown",
                    "Danuel House Jr.": "Danuel House",
                    "Tim Hardaway Jr.": "Tim Hardaway",
                    "Kelly Oubre Jr.": "Kelly Oubre",
                    "Dorian Finney Smith": "Dorian Finney-Smith",
                    "Juan Toscano Anderson": "Juan Toscano-Anderson",
                    "Michael Carter Williams": "Michael Carter-Williams",
                    "Nickeil Alexander Walker": "Nickeil Alexander-Walker"}
fd_to_dk = {"James Ennis": "James Ennis III",
           "Gary Trent": "Gary Trent Jr.",
           "Marcus Morris": "Marcus Morris Sr.",
           "Tim Hardaway": "Tim Hardaway Jr."}

In [None]:
def per_minute_projections(projections):
    statline_categories = ['2PT FG', '3PT FG', 'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers']
    statline = projections.loc[:, statline_categories]
    per_minute_statline = statline.div(projections['Minutes'], axis=0)
    projections.loc[:,statline_categories] = per_minute_statline
    projections = projections.drop(["Minutes"], axis=1)
    projections["FanDuel Points Per Minute"] = 2 * projections["2PT FG"] + 3 * projections["3PT FG"] + projections["FTM"] + 1.2 * projections["Rebounds"] + 1.5 * projections["Assists"] + 3 * projections["Blocks"] + 3 * projections["Steals"] - projections["Turnovers"]
    projections["DraftKings Points Per Minute"] = 2 * projections["2PT FG"] + 3.5 * projections["3PT FG"] + projections["FTM"] + 1.25 * projections["Rebounds"] + 1.5 * projections["Assists"] + 2 * projections["Blocks"] + 2 * projections["Steals"] - .5 * projections["Turnovers"]
    projections = projections[['Name', 'Position', 'Game', 'Team',
        'FanDuel Points Per Minute', 'DraftKings Points Per Minute', 
        '2PT FG', '3PT FG', 'FTM', 'Rebounds', 'Assists', 'Blocks', 
        'Steals', 'Turnovers', 'Opponent Defensive Rank vs Position', 
        'Injury Indicator', 'Injury Details']]
    return projections

per_minute_projections(pd.read_csv("./AllCSVs/statline_projections_to_display_4_15_2021.csv")).sort_values("DraftKings Points Per Minute", ascending = False)

In [107]:
def optimal_lineup_fanduel_games(game_date, predictions, fanduel_csv, players_out = []):
    """Given a game_date, a csv of predicted statlines, and Fanduel's csv containing data for this contest including
    player salaries and injury information, produces the optimal lineup for Fanduel. Injured players should be
    manually entered into players_out so as to exclude them from the optimal lineup.
    
    Params:
    game_date: Datetime object representing date to produce outputs for.
    predictions: Predicted statlines for the game_date. Should have a csv written to file that is generated
        by calling predict_unplayed_games earlier.
    fanduel_csv: CSV of data for the Fanduel contest.
    players_out: List of players who will not be available. Usually manually updated, if this function produces
        an optimal lineup with an injured player, add the player's name to players_out and run the function again.
    """
    fanduel_data = pd.read_csv(fanduel_csv).fillna({"FPPG": 0})
    print("Done predicting!")
    for i in predictions.index:
        if predictions.loc[i, "name"] in difficult_names_map_fanduel.keys():
            n = predictions.loc[i, "name"]
            predictions.loc[i, "name"] = difficult_names_map_fanduel[n]
    fanduel_data_with_predictions = fanduel_data.merge(predictions, left_on = "Nickname", right_on = "name").fillna(" ")
    fanduel_data_with_predictions["FPPG"] = fanduel_data_with_predictions["FPPG"].round(2)
    fanduel_data_with_predictions["Value above Market Value"] = fanduel_data_with_predictions["projected_value"] - fanduel_data_with_predictions["Salary"]
    fanduel_data_with_predictions["Value above Market Value"] = np.where(fanduel_data_with_predictions["Value above Market Value"] < 0, '-$' + fanduel_data_with_predictions["Value above Market Value"].astype(str).str[1:], '$' + fanduel_data_with_predictions["Value above Market Value"].astype(str))
    fanduel_data_with_predictions_injuries = fanduel_data_with_predictions[fanduel_data_with_predictions["Injury Indicator"] != "O"]
    players_not_out = [(name not in players_out) for name in fanduel_data_with_predictions_injuries["Nickname"]]
    fanduel_data_with_predictions_injuries = fanduel_data_with_predictions_injuries[players_not_out]
    fanduel_data_with_predictions_injuries.index = range(fanduel_data_with_predictions_injuries.shape[0])
    #print(fanduel_data_with_predictions_injuries)
    values = fanduel_data_with_predictions_injuries["projected_points"]
    players = fanduel_data_with_predictions_injuries["Nickname"]
    positions = fanduel_data_with_predictions_injuries["Position"]
    costs = fanduel_data_with_predictions_injuries["Salary"]
    data_to_feed = pd.DataFrame(data = {'players': players, 'positions': positions, 'values': values, 'costs': costs})
    optimal_lineup = [n.replace("_", " ") for n in generate_optimal_lineup(players, positions, values, costs, 60000)]
    print(optimal_lineup)
    for i in range(len(optimal_lineup)):
        if optimal_lineup[i] in punctuation_names.keys():
            optimal_lineup[i] = punctuation_names[optimal_lineup[i]]
    fanduel_data_with_predictions_injuries["projected_value"] = ['$' + str(s) for s in fanduel_data_with_predictions_injuries["projected_value"]]
    fanduel_data_with_predictions_injuries["Salary"] = ['$' + str(s) for s in fanduel_data_with_predictions_injuries["Salary"]]
    projs_in_optimal = [(fanduel_data_with_predictions_injuries.loc[i, "Nickname"] in optimal_lineup) for i in fanduel_data_with_predictions_injuries.index]
    lineup_return = fanduel_data_with_predictions_injuries[projs_in_optimal].sort_values(by=["Position"], ascending = False)
    lineup_return = lineup_return.loc[:, ["Nickname", "Position", "Game", "Team", 'projected_points', 'projected_value', "Salary", "Value above Market Value", 'projected_points_draftkings', 'projected_value_draftkings', "FPPG", '10_game_average', '3_game_average', "Injury Indicator", "Injury Details", "minutes", "made_two_point_field_goals", 'made_three_point_field_goals',
       'made_free_throws', 'rebounds', 'assists', 'blocks', 'steals', 'turnovers','hot', 'cold']]
    lineup_return.columns = ["Name", "Position", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Fanduel Salary",  "Value above Fanduel Value", 'Projected Draftkings Points', 'Projected Draftkings Value', "FPPG (Fanduel)", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Injury Indicator", "Injury Details", "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers','Hot', 'Cold']
    csv_return = fanduel_data_with_predictions.loc[:, ["Nickname", "Position", "Game", "Team", 'projected_points', 'projected_value', "Salary", "Value above Market Value", 'projected_points_draftkings', 'projected_value_draftkings', "FPPG", '10_game_average', '3_game_average', "Injury Indicator", "Injury Details", "minutes", "made_two_point_field_goals", 'made_three_point_field_goals',
       'made_free_throws', 'rebounds', 'assists', 'blocks', 'steals', 'turnovers', "Opponent Defensive Rank vs Position", 'hot', 'cold']]
    csv_return.columns = ["Name", "Position", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Fanduel Salary", "Value above Fanduel Value", 'Projected Draftkings Points', 'Projected Draftkings Value', "FPPG (Fanduel)", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Injury Indicator", "Injury Details", "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers', "Opponent Defensive Rank vs Position", 'Hot', 'Cold']
    csv_return_dfs = csv_return.loc[:, ["Name", "Position", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Fanduel Salary", "Value above Fanduel Value", 'Projected Draftkings Points', 'Projected Draftkings Value', "FPPG (Fanduel)", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Opponent Defensive Rank vs Position",  "Injury Indicator", "Injury Details", 'Hot', 'Cold']]
    csv_return_projections = csv_return.loc[:, ["Name", "Position", "Game", "Team", "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers', "Opponent Defensive Rank vs Position", "Injury Indicator", "Injury Details"]]
    lineup_return = lineup_return.loc[:, ["Name", "Position", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Fanduel Salary", "Value above Fanduel Value", "FPPG (Fanduel)", "Injury Indicator", "Injury Details", 'Hot', 'Cold']]
    
    csv_return_dfs.to_csv("./AllCSVs/dfs_projections_to_display_" + str(game_date.month) + "_" + str(game_date.day) + "_" + str(game_date.year) + ".csv")
    csv_return_projections.to_csv("./AllCSVs/statline_projections_to_display_" + str(game_date.month) + "_" + str(game_date.day) + "_" + str(game_date.year) + ".csv")
    lineup_return.to_csv("./AllCSVs/optimal_fanduel_lineup_" + str(game_date.month) + "_" + str(game_date.day) + "_" + str(game_date.year) + ".csv")
    return csv_return_dfs, csv_return_projections, lineup_return

def optimal_lineup_draftkings_games(game_date, predictions, draftkings_csv, players_out = []):
    """See above documentation for optimal_lineup_fanduel_games. Draftkings equivalent."""
    draftkings_data = pd.read_csv(draftkings_csv)
    print("Done predicting!")
    for i in predictions.index:
        if predictions.loc[i, "name"] in difficult_names_map_draftkings.keys():
            n = predictions.loc[i, "name"]
            predictions.loc[i, "name"] = difficult_names_map_draftkings[n]
    draftkings_data_with_predictions = draftkings_data.merge(predictions, left_on = "Name", right_on = "name")
    draftkings_data_with_predictions["AvgPointsPerGame"] = draftkings_data_with_predictions["AvgPointsPerGame"].round(2)
    draftkings_data_with_predictions["Game Info"] = [g[:7] for g in draftkings_data_with_predictions["Game Info"]]
    draftkings_data_with_predictions["Value above Market Value"] = draftkings_data_with_predictions["projected_value_draftkings"] - draftkings_data_with_predictions["Salary"]
    draftkings_data_with_predictions["Value above Market Value"] = np.where(draftkings_data_with_predictions["Value above Market Value"] < 0, '-$' + draftkings_data_with_predictions["Value above Market Value"].astype(str).str[1:], '$' + draftkings_data_with_predictions["Value above Market Value"].astype(str))
    draftkings_data_with_predictions_injuries = draftkings_data_with_predictions#[draftkings_data_with_predictions["Injury Indicator"] != "O"]
    players_not_out = [(name not in players_out) for name in draftkings_data_with_predictions_injuries["Name"]]
    draftkings_data_with_predictions_injuries = draftkings_data_with_predictions_injuries[players_not_out]
    draftkings_data_with_predictions_injuries.index = range(draftkings_data_with_predictions_injuries.shape[0])
    #print(fanduel_data_with_predictions_injuries)
    draftkings_data_with_predictions_injuries.index = range(len(draftkings_data_with_predictions_injuries))
    values = draftkings_data_with_predictions_injuries["projected_points_draftkings"]
    players = draftkings_data_with_predictions_injuries["Name"]
    positions_as_lists = [list(st.split("/")) for st in draftkings_data_with_predictions_injuries["Position"]]
    positions = draftkings_data_with_predictions_injuries["Position"]
    costs = draftkings_data_with_predictions_injuries["Salary"]
    data_to_feed = pd.DataFrame(data = {'players': players, 'positions': positions_as_lists, 'values': values, 'costs': costs})
    pd.set_option("display.max_rows", None, "display.max_columns", None)
    print(data_to_feed)
    optimal_lineup = [n.replace("_", " ") for n in generate_optimal_lineup_draftkings(players, positions_as_lists, values, costs, 50000)]
    print(optimal_lineup)
    for i in range(len(optimal_lineup)):
        if optimal_lineup[i] in punctuation_names.keys():
            optimal_lineup[i] = punctuation_names[optimal_lineup[i]]
    draftkings_data_with_predictions_injuries["projected_value_draftkings"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["projected_value_draftkings"]]
    draftkings_data_with_predictions_injuries["Salary"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["Salary"]]
    projs_in_optimal = [(draftkings_data_with_predictions_injuries.loc[i, "Name"] in optimal_lineup) for i in draftkings_data_with_predictions_injuries.index]
    lineup_return = draftkings_data_with_predictions_injuries[projs_in_optimal].sort_values(by=["Position"], ascending = False)
    lineup_return = lineup_return.loc[:, ["Name", "Position", "Game Info", "TeamAbbrev", 'projected_points', 'projected_value', 'projected_points_draftkings', 'projected_value_draftkings', "Salary", "Value above Market Value", "AvgPointsPerGame", '10_game_average', '3_game_average', "minutes", "made_two_point_field_goals", 'made_three_point_field_goals',
       'made_free_throws', 'rebounds', 'assists', 'blocks', 'steals', 'turnovers','hot', 'cold']].fillna(" ")
    lineup_return.columns = ["Name", "Position", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', 'Projected Draftkings Points', 'Projected Draftkings Value', "Draftkings Salary", "Value above Draftkings Value", "FPPG", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers','Hot', 'Cold']
    csv_return = draftkings_data_with_predictions.loc[:, ["Name", "Position", "Salary", "Game Info", "TeamAbbrev", 'projected_points', 'projected_value', "Value above Market Value", 'projected_points_draftkings', 'projected_value_draftkings', "AvgPointsPerGame", '10_game_average', '3_game_average', "minutes", "made_two_point_field_goals", 'made_three_point_field_goals',
       'made_free_throws', 'rebounds', 'assists', 'blocks', 'steals', 'turnovers', "Opponent Defensive Rank vs Position", 'hot', 'cold']].fillna(" ")
    csv_return.columns = ["Name", "Position", "Salary", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Value above Fanduel Value", 'Projected Draftkings Points', 'Projected Draftkings Value', "FPPG", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers', "Opponent Defensive Rank vs Position", 'Hot', 'Cold']
    csv_return_dfs = csv_return.loc[:, ["Name", "Position", "Salary", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Value above Fanduel Value", 'Projected Draftkings Points', 'Projected Draftkings Value', "FPPG", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Opponent Defensive Rank vs Position", 'Hot', 'Cold']]
    csv_return_projections = csv_return.loc[:, ["Name", "Position", "Game", "Team", "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers', "Opponent Defensive Rank vs Position"]]
    lineup_return = lineup_return.loc[:, ["Name", "Position", "Game", "Team", 'Projected Draftkings Points', 'Projected Draftkings Value', "Draftkings Salary", "Value above Draftkings Value", "FPPG", 'Hot', 'Cold']]
    
    fanduel_statlines = pd.read_csv("./AllCSVs/statline_projections_to_display_" + str(game_date.month) + "_" + str(game_date.day) + "_" + str(game_date.year) + ".csv")
    injury_indicators = []
    injury_details = []
    for i in lineup_return.index:
        print(lineup_return.loc[i, "Name"])
        for n in fanduel_statlines.index:
            if fanduel_statlines.loc[n, "Name"] in fd_to_dk.keys():
                nam = fd_to_dk[fanduel_statlines.loc[n, "Name"]]
                fanduel_statlines.loc[n, "Name"] = nam
            if fanduel_statlines.loc[n, "Name"] == lineup_return.loc[i, "Name"]:
                print(i)
                injury_indicators.append(fanduel_statlines.loc[n, "Injury Indicator"])
                injury_details.append(fanduel_statlines.loc[n, "Injury Details"])
            elif lineup_return.loc[i, "Name"] in punctuation_names.keys():
                if punctuation_names[lineup_return.loc[i, "Name"]] == fanduel_statline.loc[n, "Name"]:
                    print(i)
                    injury_indicators.append(fanduel_statlines.loc[n, "Injury Indicator"])
                    injury_details.append(fanduel_statlines.loc[n, "Injury Details"])
    lineup_return["Injury Indicator"] = injury_indicators
    lineup_return["Injury Details"] = injury_details
    
    lineup_return.to_csv("./AllCSVs/optimal_draftkings_lineup_" + str(game_date.month) + "_" + str(game_date.day) + "_" + str(game_date.year) + ".csv")
    return csv_return_dfs, csv_return_projections, lineup_return

preds = pd.read_csv("./AllCSVs/predictions_for_04_20_2021_unplayed.csv")#.append(pd.read_csv("./AllCSVs/predictions_for_09_20_2020_unplayed.csv"))
#preds.index = range(0, len(preds))

fd_csv = "./AllCSVs/FanDuel-NBA-2021-04-20-players-list.csv"

dk_csv = "./AllCSVs/DKSalaries_04202021.csv"

players_out = ["Victor Oladipo",
               "Shai Gilgeous-Alexander",
               "Eric Paschall",
               "Alec Burks",
               "Michael Carter-Williams",
               "Zach LaVine",
               "Devonte' Graham",
               "Al Horford",
               "Kris Dunn",
               "Kevin Durant",
               "Jamal Murray",
               "Danilo Gallinari",
               "Evan Fournier",
               "Mike Muscala",
               "Malik Monk",
               "LaMelo Ball",
               "Eric Gordon",
               "LeBron James", 
               "Markelle Fultz", 
               "Spencer Dinwiddie", 
               "Anthony Davis", 
               "T.J. Warren", 
               "Marquese Chriss",
               "Jonathan Isaac", 
               "Klay Thompson", 
               "Jabari Parker"]

#optimal_lineup_fanduel_games(datetime(2021, 4, 20), preds, fd_csv, players_out)[2]
optimal_lineup_draftkings_games(datetime(2021, 4, 20), preds, dk_csv, players_out)[2]

Done predicting!
                      players positions  values  costs
0                James Harden      [PG]   22.38  10800
1          Karl-Anthony Towns       [C]   46.65  10600
2               Julius Randle   [PF, C]   54.93  10100
3                Kyrie Irving  [PG, SG]   36.19   9900
4                De'Aaron Fox      [PG]   35.59   9800
5             Zion Williamson   [PF, C]   36.45   9700
6              Damian Lillard      [PG]   42.81   9600
7                 Paul George      [SG]   42.44   9400
8               Kawhi Leonard      [SF]   39.82   9200
9                Clint Capela       [C]   32.60   9100
10                 Trae Young      [PG]   31.03   9000
11                CJ McCollum  [PG, SG]   25.22   8900
12             Brandon Ingram      [SF]   39.40   8400
13               Terry Rozier  [PG, SG]   33.31   8000
14          Bogdan Bogdanovic      [SG]   37.14   7800
15              Miles Bridges  [SF, PF]   39.56   7500
16             Gordon Hayward      [PF]   33.76 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  draftkings_data_with_predictions_injuries["projected_value_draftkings"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["projected_value_draftkings"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  draftkings_data_with_predictions_injuries["Salary"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["Salary"]]



Buddy Hield
20
Landry Shamet
67
Miles Bridges
15
Solomon Hill
57
D'Angelo Russell
27
Cole Anthony
26
Julius Randle
2
Cody Zeller
80


Unnamed: 0,Name,Position,Game,Team,Projected Draftkings Points,Projected Draftkings Value,Draftkings Salary,Value above Draftkings Value,FPPG,Hot,Cold,Injury Indicator,Injury Details
20,Buddy Hield,SG/SF,MIN@SAC,SAC,37.21,$6977,$6800,$177,30.3,0.0,0.0,,
67,Landry Shamet,SG,BKN@NO,BKN,30.71,$5758,$4400,$1358,15.29,0.0,0.0,,
15,Miles Bridges,SF/PF,CHA@NY,CHA,39.56,$7417,$7500,-$83,24.7,0.0,0.0,,
57,Solomon Hill,SF/PF,ORL@ATL,ATL,25.52,$4784,$4600,$184,11.57,0.0,0.0,,
27,D'Angelo Russell,PG/SG,MIN@SAC,MIN,29.91,$5609,$6000,-$391,32.29,0.0,0.0,,
26,Cole Anthony,PG,ORL@ATL,ORL,30.38,$5697,$6000,-$303,25.02,0.0,0.0,,
2,Julius Randle,PF/C,CHA@NY,NY,54.93,$10299,$10100,$199,48.93,0.0,0.0,,
80,Cody Zeller,C,CHA@NY,CHA,20.22,$3792,$3900,-$108,22.75,0.0,0.0,,


In [71]:
def gen_n_lineups_draftkings(game_date, predictions, draftkings_csv, players_out = [], n = 100):
    """See above documentation for optimal_lineup_fanduel_games. Draftkings equivalent."""
    draftkings_data = pd.read_csv(draftkings_csv)
    print("Done predicting!")
    for i in predictions.index:
        if predictions.loc[i, "name"] in difficult_names_map_draftkings.keys():
            n = predictions.loc[i, "name"]
            predictions.loc[i, "name"] = difficult_names_map_draftkings[n]
    draftkings_data_with_predictions = draftkings_data.merge(predictions, left_on = "Name", right_on = "name")
    draftkings_data_with_predictions["AvgPointsPerGame"] = draftkings_data_with_predictions["AvgPointsPerGame"].round(2)
    draftkings_data_with_predictions["Game Info"] = [g[:7] for g in draftkings_data_with_predictions["Game Info"]]
    draftkings_data_with_predictions["Value above Market Value"] = draftkings_data_with_predictions["projected_value_draftkings"] - draftkings_data_with_predictions["Salary"]
    draftkings_data_with_predictions["Value above Market Value"] = np.where(draftkings_data_with_predictions["Value above Market Value"] < 0, '-$' + draftkings_data_with_predictions["Value above Market Value"].astype(str).str[1:], '$' + draftkings_data_with_predictions["Value above Market Value"].astype(str))
    draftkings_data_with_predictions_injuries = draftkings_data_with_predictions#[draftkings_data_with_predictions["Injury Indicator"] != "O"]
    players_not_out = [(name not in players_out) for name in draftkings_data_with_predictions_injuries["Name"]]
    draftkings_data_with_predictions_injuries = draftkings_data_with_predictions_injuries[players_not_out]
    draftkings_data_with_predictions_injuries.index = range(draftkings_data_with_predictions_injuries.shape[0])
    #print(fanduel_data_with_predictions_injuries)
    draftkings_data_with_predictions_injuries.index = range(len(draftkings_data_with_predictions_injuries))
    values = draftkings_data_with_predictions_injuries["projected_points_draftkings"]
    players = draftkings_data_with_predictions_injuries["Name"]
    positions_as_lists = [list(st.split("/")) for st in draftkings_data_with_predictions_injuries["Position"]]
    positions = draftkings_data_with_predictions_injuries["Position"]
    costs = draftkings_data_with_predictions_injuries["Salary"]
    data_to_feed = pd.DataFrame(data = {'players': players, 'positions': positions_as_lists, 'values': values, 'costs': costs})
    pd.set_option("display.max_rows", None, "display.max_columns", None)
    print(data_to_feed)
    optimal_lineup = [n.replace("_", " ") for n in generate_optimal_lineup_draftkings(players, positions_as_lists, values, costs, 50000)]
    lineups_100, players_in = build_n_lineups_draftkings(players, positions_as_lists, values, costs, budget = 50000, n = n)
    print(optimal_lineup)
    for i in range(len(optimal_lineup)):
        if optimal_lineup[i] in punctuation_names.keys():
            optimal_lineup[i] = punctuation_names[optimal_lineup[i]]
    draftkings_data_with_predictions_injuries["projected_value_draftkings"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["projected_value_draftkings"]]
    draftkings_data_with_predictions_injuries["Salary"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["Salary"]]
    projs_in_optimal = [(draftkings_data_with_predictions_injuries.loc[i, "Name"] in optimal_lineup) for i in draftkings_data_with_predictions_injuries.index]
    lineup_return = draftkings_data_with_predictions_injuries[projs_in_optimal].sort_values(by=["Position"], ascending = False)
    lineup_return = lineup_return.loc[:, ["Name", "Position", "Game Info", "TeamAbbrev", 'projected_points', 'projected_value', 'projected_points_draftkings', 'projected_value_draftkings', "Salary", "Value above Market Value", "AvgPointsPerGame", '10_game_average', '3_game_average', "minutes", "made_two_point_field_goals", 'made_three_point_field_goals',
       'made_free_throws', 'rebounds', 'assists', 'blocks', 'steals', 'turnovers','hot', 'cold']].fillna(" ")
    lineup_return.columns = ["Name", "Position", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', 'Projected Draftkings Points', 'Projected Draftkings Value', "Draftkings Salary", "Value above Draftkings Value", "FPPG", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers','Hot', 'Cold']
    csv_return = draftkings_data_with_predictions.loc[:, ["Name", "Position", "Salary", "Game Info", "TeamAbbrev", 'projected_points', 'projected_value', "Value above Market Value", 'projected_points_draftkings', 'projected_value_draftkings', "AvgPointsPerGame", '10_game_average', '3_game_average', "minutes", "made_two_point_field_goals", 'made_three_point_field_goals',
       'made_free_throws', 'rebounds', 'assists', 'blocks', 'steals', 'turnovers', "Opponent Defensive Rank vs Position", 'hot', 'cold']].fillna(" ")
    csv_return.columns = ["Name", "Position", "Salary", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Value above Fanduel Value", 'Projected Draftkings Points', 'Projected Draftkings Value', "FPPG", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers', "Opponent Defensive Rank vs Position", 'Hot', 'Cold']
    csv_return_dfs = csv_return.loc[:, ["Name", "Position", "Salary", "Game", "Team", 'Projected Fanduel Points', 'Projected Fanduel Value', "Value above Fanduel Value", 'Projected Draftkings Points', 'Projected Draftkings Value', "FPPG", '10 Game Average (Fanduel)', '3 Game Average (Fanduel)', "Opponent Defensive Rank vs Position", 'Hot', 'Cold']]
    csv_return_projections = csv_return.loc[:, ["Name", "Position", "Game", "Team", "Minutes", "2PT FG", '3PT FG',
       'FTM', 'Rebounds', 'Assists', 'Blocks', 'Steals', 'Turnovers', "Opponent Defensive Rank vs Position"]]
    lineup_return = lineup_return.loc[:, ["Name", "Position", "Game", "Team", 'Projected Draftkings Points', 'Projected Draftkings Value', "Draftkings Salary", "Value above Draftkings Value", "FPPG", 'Hot', 'Cold']]
    
    fanduel_statlines = pd.read_csv("./AllCSVs/statline_projections_to_display_" + str(game_date.month) + "_" + str(game_date.day) + "_" + str(game_date.year) + ".csv")
    injury_indicators = []
    injury_details = []
    for i in lineup_return.index:
        print(lineup_return.loc[i, "Name"])
        for n in fanduel_statlines.index:
            if fanduel_statlines.loc[n, "Name"] in fd_to_dk.keys():
                nam = fd_to_dk[fanduel_statlines.loc[n, "Name"]]
                fanduel_statlines.loc[n, "Name"] = nam
            if fanduel_statlines.loc[n, "Name"] == lineup_return.loc[i, "Name"]:
                print(i)
                injury_indicators.append(fanduel_statlines.loc[n, "Injury Indicator"])
                injury_details.append(fanduel_statlines.loc[n, "Injury Details"])
            elif lineup_return.loc[i, "Name"] in punctuation_names.keys():
                if punctuation_names[lineup_return.loc[i, "Name"]] == fanduel_statline.loc[n, "Name"]:
                    print(i)
                    injury_indicators.append(fanduel_statlines.loc[n, "Injury Indicator"])
                    injury_details.append(fanduel_statlines.loc[n, "Injury Details"])
    lineup_return["Injury Indicator"] = injury_indicators
    lineup_return["Injury Details"] = injury_details
    
    lineup_return.to_csv("./AllCSVs/optimal_draftkings_lineup_" + str(game_date.month) + "_" + str(game_date.day) + "_" + str(game_date.year) + ".csv")
    return csv_return_dfs, csv_return_projections, lineup_return, lineups_100, players_in

gen_n_lineups_draftkings(datetime(2021, 4, 16), preds, dk_csv, players_out)[4]

Done predicting!
                      players positions  values  costs
0           Russell Westbrook      [PG]   57.10  11000
1                Nikola Jokic       [C]   46.49  10900
2                 Luka Doncic  [PG, SF]   45.78  10800
3          Karl-Anthony Towns       [C]   46.70  10700
4                James Harden      [PG]   30.06  10600
5                 Joel Embiid       [C]   57.52  10500
6             Zion Williamson   [PF, C]   44.88  10000
7                Kyrie Irving  [PG, SG]   40.13   9900
8               Kawhi Leonard      [SF]   47.44   9700
9              Nikola Vucevic       [C]   46.51   9700
10              Julius Randle      [PF]   58.54   9600
11             Damian Lillard      [PG]   45.64   9500
12               Kevin Durant      [PF]   49.58   9400
13               Jimmy Butler      [SF]   42.93   9200
14                Paul George      [SG]   45.26   9100
15               Bradley Beal      [SG]   41.29   8900
16         Kristaps Porzingis   [PF, C]   36.88 



['Carmelo Anthony', 'Grayson Allen', 'Ja Morant', 'James Ennis III', 'Joel Embiid', 'Julius Randle', 'Luguentz Dort', 'Patty Mills']
Luguentz Dort
76
Grayson Allen
120
James Ennis III
80
Patty Mills
175
Ja Morant
36
Julius Randle
10
Carmelo Anthony
147
Joel Embiid
5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  draftkings_data_with_predictions_injuries["projected_value_draftkings"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["projected_value_draftkings"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  draftkings_data_with_predictions_injuries["Salary"] = ['$' + str(s) for s in draftkings_data_with_predictions_injuries["Salary"]]


{'Anthony Edwards': 3,
 'Brandon Ingram': 1,
 'CJ McCollum': 6,
 'DeAndre Jordan': 5,
 'George Hill': 2,
 'Joel Embiid': 7,
 'Patty Mills': 4,
 'Trevor Ariza': 1,
 'Aaron Gordon': 38,
 'Carmelo Anthony': 9,
 'Nikola Vucevic': 1,
 'Reggie Jackson': 3,
 'Furkan Korkmaz': 7,
 'Kyle Lowry': 5,
 'Naz Reid': 1,
 'Russell Westbrook': 44,
 'Saddiq Bey': 2,
 'Al Horford': 29,
 "D'Angelo Russell": 47,
 'Davis Bertans': 28,
 'Lonzo Ball': 3,
 'Zion Williamson': 8,
 'Alex Len': 17,
 'Bruce Brown': 1,
 'Damian Lillard': 12,
 'Desmond Bane': 10,
 'Kendrick Nunn': 1,
 'Miles Bridges': 3,
 'Timothe Luwawu Cabarrot': 5,
 'JJ Redick': 5,
 'John Konchar': 24,
 'Saben Lee': 14,
 'Duncan Robinson': 1,
 'Maxi Kleber': 11,
 'Mitchell Robinson': 49,
 'Jimmy Butler': 81,
 'Frank Ntilikina': 9,
 'Paul George': 1,
 'Grayson Allen': 1,
 'Paul Watson': 16,
 'Jerami Grant': 63,
 'Cody Zeller': 1,
 'Luka Doncic': 23,
 'Malik Beasley': 11,
 'Willie Cauley Stein': 7,
 'Garrett Temple': 4,
 'Jakob Poeltl': 1,
 'Cody Ma

In [None]:
def regression_by_sample(start_date, end_date, sample_size, weight, per_min = False):
    """Fits models to random samples of training data on each of the output statistics.
    This function is useful for model selection, validation and testing, and for tuning the sample_size, weight
    and min_seconds hyperparameters. Fits several models to each output statistic using multiple different
    random train-test splits and displays model accuracy in r^2 score and MSE.
    
    Params:
    start_date: Datetime object representing date to begin collecting data from.
    end_date: Datetime object representing date to collect data until.
    sample_size: Integer representing length of sample window to produce weighted average statlines with.
    weight: Float representing weight to bias weighted average towards recent results.
    per_min: Boolean representing whether we are doing per minute regression.
    
    TODO:
        -Switch to FPPM modeling and adjust outputs to measure accuracy in per-minute modeling.
        -Test out some more models. Catboost, RidgeCV and GradientBoostingRegressor all seem to be fine
            options but within the FPPM paradigm we should be able to branch out.
    """
    
    # Here we follow the same procedure as in statline_output to generate inputs and outputs
    # suitable for modeling and prediction.
    
    full_df = box_scores_for_range_of_days(start_date, end_date)
    
    input_indices = [3, 7, 6, 9, 10, 12, 11, 13, 14, 15, 18, 16, 17, 22, 23, 24, 20, 25, 21, 26, 27]
    
    weighted_statlines = generate_input_vector(full_df, input_statistics, sample_size, weight)
    
    
    if per_min:
        for box_index in full_df.index:
            mins = full_df.loc[box_index, "seconds_played"]
            for j in full_df.columns[7:21]:
                if mins > 0:
                    full_df.loc[box_index, j] = full_df.loc[box_index, j]*60/full_df.loc[box_index, "seconds_played"]
                    
    print(full_df.head())
    print(full_df.columns)
    
    #weighted_statlines_by_min = generate_input_vector(full_df, input_statistics, sample_size, weight)
    weighted_statlines_to_keep = weighted_statlines
    df_to_keep = full_df
    df_to_keep["attempted_two_point_field_goals"] = df_to_keep["attempted_field_goals"] - df_to_keep["attempted_three_point_field_goals"]
    df_to_keep["made_two_point_field_goals"] = df_to_keep["made_field_goals"] - df_to_keep["made_three_point_field_goals"]
    weighted_statlines_to_keep['name_date'] = weighted_statlines_to_keep["name"] + weighted_statlines_to_keep["date"].astype(str)
    df_to_keep['name_date'] = df_to_keep["name"] + df_to_keep["Date"].astype(str)
    df_merged = weighted_statlines_to_keep.merge(df_to_keep, left_on = 'name_date', right_on = 'name_date')
    df_merged["rebounds_y"] = df_merged["offensive_rebounds_y"] + df_merged["defensive_rebounds_y"]
    df_merged["location_x"] = df_merged["location_x"] == "HOME"
    df_merged["location_y"] = df_merged["location_y"] == "HOME"
    print(df_merged.columns)
    df_merged["fantasy_points"] = [float(get_points(df_merged[df_merged["name_date"] == player_name])[0]) for player_name in df_merged["name_date"]]
    df_merged = df_merged[df_merged.Date.astype(str).str.contains("2020-02")]
    predictors = df_merged.iloc[:, input_indices]
    print(df_merged.columns[input_indices])
    print(df_merged.columns[~input_indices])
    print(len(df_merged.columns))
    print(df_merged.head())
    
    
    # Right now this just models seconds played. To get this to model the other output
    # statistics you can uncomment the rest of the indices in the list.
    
    for desired_output in [35, 38, 40, 44, 45, 46, 47, 48, 58, 59]:
        colname = df_merged.columns[desired_output]
        print(colname)
        y = df_merged.iloc[:,desired_output]
        
        pred_train, pred_test, y_train, y_test = train_test_split(predictors, y, test_size=0.1, random_state=85733)
        pred_train1, pred_test1, y_train1, y_test1 = train_test_split(predictors, y, test_size=0.1, random_state=433)
        pred_train2, pred_test2, y_train2, y_test2 = train_test_split(predictors, y, test_size=0.1, random_state=96323)
        pred_train3, pred_test3, y_train3, y_test3 = train_test_split(predictors, y, test_size=0.1, random_state=76243)
        pred_train4, pred_test4, y_train4, y_test4 = train_test_split(predictors, y, test_size=0.1, random_state=76343)
#         x_train, x_test = xgb.DMatrix(pred_train, label = y_train, enable_categorical= True), xgb.DMatrix(pred_test, label = y_test, enable_categorical= True)
#         x_train1, x_test1 = xgb.DMatrix(pred_train1, label = y_train1, enable_categorical = True), xgb.DMatrix(pred_test1, label = y_test1, enable_categorical= True)
#         x_train2, x_test2 = xgb.DMatrix(pred_train2, label = y_train2, enable_categorical = True), xgb.DMatrix(pred_test2, label = y_test2, enable_categorical= True)
#         x_train3, x_test3 = xgb.DMatrix(pred_train3, label = y_train3, enable_categorical = True), xgb.DMatrix(pred_test3, label = y_test3, enable_categorical= True)
#         x_train4, x_test4 = xgb.DMatrix(pred_train4, label = y_train4, enable_categorical = True), xgb.DMatrix(pred_test4, label = y_test4, enable_categorical= True)
#         x_param = {'eta': 0.25, 'max_depth': 5, 'objective': 'reg:gamma', 'eval_metric': 'mae'}
        
        y_train = y_train.astype(float)
        gaussian_model = GaussianProcessRegressor().fit(pred_train, y_train)
        decisiontree_model = DecisionTreeRegressor().fit(pred_train, y_train)
        sgd_model = SGDRegressor(loss = 'huber').fit(pred_train, y_train)
        ridge_model = RidgeCV().fit(pred_train, y_train)
#         x_model = xgb.train(x_param, x_train, 25)
        ard_model = ARDRegression(n_iter = 400).fit(pred_train, y_train)
        bayesian_model = BayesianRidge(n_iter = 400).fit(pred_train, y_train)
        neural_model = MLPRegressor(max_iter = 600).fit(pred_train, y_train)
        cat_model = CatBoostRegressor().fit(pred_train, y_train, cat_features = ["location_x"], verbose_eval = False)
        linear_model = GradientBoostingRegressor().fit(pred_train, y_train)
        print("fit 0")
        ridge_model1 = RidgeCV().fit(pred_train1, y_train1)
        y_train1 = y_train1.astype(float)
        gaussian_model1 = GaussianProcessRegressor().fit(pred_train1, y_train1)
        decisiontree_model1 = DecisionTreeRegressor().fit(pred_train1, y_train1)
        sgd_model1 = SGDRegressor(loss = 'huber').fit(pred_train1, y_train1)
#         x_model1 = xgb.train(x_param, x_train1, 25)
        ard_model1 = ARDRegression(n_iter = 400).fit(pred_train1, y_train1)
        bayesian_model1 = BayesianRidge(n_iter = 400).fit(pred_train1, y_train1)
        neural_model1 = MLPRegressor(max_iter = 600).fit(pred_train1, y_train1)
        cat_model1 = CatBoostRegressor().fit(pred_train1, y_train1, cat_features = ["location_x"], verbose_eval = False)
        linear_model1 = GradientBoostingRegressor().fit(pred_train1, y_train1)
        print("fit 1")
        ridge_model2 = RidgeCV().fit(pred_train2, y_train2)
        y_train2 = y_train2.astype(float)
        gaussian_model2 = GaussianProcessRegressor().fit(pred_train2, y_train2)
        decisiontree_model2 = DecisionTreeRegressor().fit(pred_train2, y_train2)
        sgd_model2 = SGDRegressor(loss = 'huber').fit(pred_train2, y_train2)
#         x_model2 = xgb.train(x_param, x_train2, 25)
        ard_model2 = ARDRegression(n_iter = 400).fit(pred_train2, y_train2)
        bayesian_model2 = BayesianRidge(n_iter = 400).fit(pred_train2, y_train2)
        neural_model2 = MLPRegressor(max_iter = 600).fit(pred_train2, y_train2)
        cat_model2 = CatBoostRegressor().fit(pred_train2, y_train2, cat_features = ["location_x"], verbose_eval = False)
        linear_model2 = GradientBoostingRegressor().fit(pred_train2, y_train2)
        print("fit 2")
        ridge_model3 = RidgeCV().fit(pred_train3, y_train3)
        y_train3 = y_train3.astype(float)
        gaussian_model3 = GaussianProcessRegressor().fit(pred_train3, y_train3)
        decisiontree_model3 = DecisionTreeRegressor().fit(pred_train3, y_train3)
        sgd_model3 = SGDRegressor(loss = 'huber').fit(pred_train3, y_train3)
#         x_model3 = xgb.train(x_param, x_train3, 25)
        ard_model3 = ARDRegression(n_iter = 400).fit(pred_train3, y_train3)
        bayesian_model3 = BayesianRidge(n_iter = 400).fit(pred_train3, y_train3)
        neural_model3 = MLPRegressor(max_iter = 600).fit(pred_train3, y_train3)
        cat_model3 = CatBoostRegressor().fit(pred_train3, y_train3, cat_features = ["location_x"], verbose_eval = False)
        linear_model3 = GradientBoostingRegressor().fit(pred_train3, y_train3)
        print("fit 3")
        ridge_model4 = RidgeCV().fit(pred_train4, y_train4)
        y_train4 = y_train4.astype(float)
        gaussian_model4 = GaussianProcessRegressor().fit(pred_train4, y_train4)
        decisiontree_model4 = DecisionTreeRegressor().fit(pred_train4, y_train4)
        sgd_model4 = SGDRegressor(loss = 'huber').fit(pred_train4, y_train4)
        ard_model4 = ARDRegression(n_iter = 400).fit(pred_train4, y_train4)
        bayesian_model4 = BayesianRidge(n_iter = 400).fit(pred_train4, y_train4)
        neural_model4 = MLPRegressor(max_iter = 600).fit(pred_train4, y_train4)
        cat_model4 = CatBoostRegressor().fit(pred_train4, y_train4, cat_features = ["location_x"], verbose_eval = False)
        linear_model4 = GradientBoostingRegressor().fit(pred_train4, y_train4)
        print("fit 4")
#         ridge_model5 = RidgeCV().fit(pred_train5, y_train5)
#         y_train5 = y_train5.astype(float)
#         cat_model5 = CatBoostRegressor().fit(pred_train5, y_train5, cat_features = ["location_x"], verbose_eval = False)
#         linear_model5 = GradientBoostingRegressor().fit(pred_train5, y_train5)
#         ridge_model6 = RidgeCV().fit(pred_train6, y_train6)
#         y_train6 = y_train6.astype(float)
#         cat_model6 = CatBoostRegressor().fit(pred_train6, y_train6, cat_features = ["location_x"], verbose_eval = False)
#         linear_model6 = GradientBoostingRegressor().fit(pred_train6, y_train6)
#         print(df_merged.columns[desired_output])
        
        
        y_test = y_test.astype(float)
        y_test1 = y_test1.astype(float)
        y_test2 = y_test2.astype(float)
        y_test3 = y_test3.astype(float)
        y_test4 = y_test4.astype(float)
#         y_test5 = y_test5.astype(float)
#         y_test6 = y_test6.astype(float)
        
        results_df = pd.DataFrame(columns=["Gaussian", "Decision Tree", "SGD", "RidgeCV", "ARD", "Bayesian", "Neural", "Cat", "Linear"])
        errors_model0 = [np.mean(abs(gaussian_model.predict(pred_test) - y_test)),
            np.mean(abs(decisiontree_model.predict(pred_test) - y_test)),
            np.mean(abs(sgd_model.predict(pred_test) - y_test)),
            np.mean(abs(ridge_model.predict(pred_test) - y_test)),
            np.mean(abs(ard_model.predict(pred_test) - y_test)),
            np.mean(abs(bayesian_model.predict(pred_test) - y_test)),
            np.mean(abs(neural_model.predict(pred_test) - y_test)),
            np.mean(abs(cat_model.predict(pred_test) - y_test)),
            np.mean(abs(linear_model.predict(pred_test) - y_test))]
        scores_model0 = [gaussian_model.score(pred_test, y_test),
            decisiontree_model.score(pred_test, y_test),
            sgd_model.score(pred_test, y_test),
            ridge_model.score(pred_test, y_test),
            ard_model.score(pred_test, y_test),
            bayesian_model.score(pred_test, y_test),
            neural_model.score(pred_test, y_test),
            cat_model.score(pred_test, y_test),
            linear_model.score(pred_test, y_test)]
        errors_model1 = [np.mean(abs(gaussian_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(decisiontree_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(sgd_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(ridge_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(ard_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(bayesian_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(neural_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(cat_model1.predict(pred_test1) - y_test1)),
            np.mean(abs(linear_model1.predict(pred_test1) - y_test1))]
        scores_model1 = [gaussian_model1.score(pred_test1, y_test1),
            decisiontree_model1.score(pred_test1, y_test1),
            sgd_model1.score(pred_test1, y_test1),
            ridge_model1.score(pred_test1, y_test1),
            ard_model1.score(pred_test1, y_test1),
            bayesian_model1.score(pred_test1, y_test1),
            neural_model1.score(pred_test1, y_test1),
            cat_model1.score(pred_test1, y_test1),
            linear_model1.score(pred_test1, y_test1)]
        errors_model2 = [np.mean(abs(gaussian_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(decisiontree_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(sgd_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(ridge_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(ard_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(bayesian_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(neural_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(cat_model2.predict(pred_test2) - y_test2)),
            np.mean(abs(linear_model2.predict(pred_test2) - y_test2))]
        scores_model2 = [gaussian_model2.score(pred_test2, y_test2),
            decisiontree_model2.score(pred_test2, y_test2),
            sgd_model2.score(pred_test2, y_test2),
            ridge_model2.score(pred_test2, y_test2),
            ard_model2.score(pred_test2, y_test2),
            bayesian_model2.score(pred_test2, y_test2),
            neural_model2.score(pred_test2, y_test2),
            cat_model2.score(pred_test2, y_test2),
            linear_model2.score(pred_test2, y_test2)]
        errors_model3 = [np.mean(abs(gaussian_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(decisiontree_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(sgd_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(ridge_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(ard_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(bayesian_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(neural_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(cat_model3.predict(pred_test3) - y_test3)),
            np.mean(abs(linear_model3.predict(pred_test3) - y_test3))]
        scores_model3 = [gaussian_model3.score(pred_test3, y_test3),
            decisiontree_model3.score(pred_test3, y_test3),
            sgd_model3.score(pred_test3, y_test3),
            ridge_model3.score(pred_test3, y_test3),
            ard_model3.score(pred_test3, y_test3),
            bayesian_model3.score(pred_test3, y_test3),
            neural_model3.score(pred_test3, y_test3),
            cat_model3.score(pred_test3, y_test3),
            linear_model3.score(pred_test3, y_test3)]
        errors_model4 = [np.mean(abs(gaussian_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(decisiontree_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(sgd_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(ridge_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(ard_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(bayesian_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(neural_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(cat_model4.predict(pred_test4) - y_test4)),
            np.mean(abs(linear_model4.predict(pred_test4) - y_test4))]
        scores_model4 = [gaussian_model4.score(pred_test4, y_test4),
            decisiontree_model4.score(pred_test4, y_test4),
            sgd_model4.score(pred_test4, y_test4),
            ridge_model4.score(pred_test4, y_test4),
            ard_model4.score(pred_test4, y_test4),
            bayesian_model4.score(pred_test4, y_test4),
            neural_model4.score(pred_test4, y_test4),
            cat_model4.score(pred_test4, y_test4),
            linear_model4.score(pred_test4, y_test4)]
        
        results_df = results_df.append(pd.Series(errors_model0, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(scores_model0, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(errors_model1, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(scores_model1, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(errors_model2, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(scores_model2, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(errors_model3, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(scores_model3, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(errors_model4, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(scores_model4, index = results_df.columns), ignore_index = True)
        

#         print(np.mean((ridge_model5.predict(pred_test5) - y_test5)**2))
#         print(np.mean((cat_model5.predict(pred_test5) - y_test5)**2))
#         print(np.mean((linear_model5.predict(pred_test5) - y_test5)**2))
#         print(ridge_model5.score(pred_test5, y_test5))
#         print(cat_model5.score(pred_test5, y_test5))
#         print(linear_model5.score(pred_test5, y_test5))
#         print(np.mean((ridge_model6.predict(pred_test6) - y_test6)**2))
#         print(np.mean((cat_model6.predict(pred_test6) - y_test6)**2))
#         print(np.mean((linear_model6.predict(pred_test6) - y_test6)**2))
#         print(ridge_model6.score(pred_test6, y_test6))
#         print(cat_model6.score(pred_test6, y_test6))
#         print(linear_model6.score(pred_test6, y_test6))

        mses = [np.mean([np.mean((gaussian_model.predict(pred_test) - y_test)**2), np.mean((gaussian_model1.predict(pred_test1) - y_test1)**2), np.mean((gaussian_model2.predict(pred_test2) - y_test2)**2), np.mean((gaussian_model3.predict(pred_test3) - y_test3)**2), np.mean((gaussian_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((decisiontree_model.predict(pred_test) - y_test)**2), np.mean((decisiontree_model1.predict(pred_test1) - y_test1)**2), np.mean((decisiontree_model2.predict(pred_test2) - y_test2)**2), np.mean((decisiontree_model3.predict(pred_test3) - y_test3)**2), np.mean((decisiontree_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((sgd_model.predict(pred_test) - y_test)**2), np.mean((sgd_model1.predict(pred_test1) - y_test1)**2), np.mean((sgd_model2.predict(pred_test2) - y_test2)**2), np.mean((sgd_model3.predict(pred_test3) - y_test3)**2), np.mean((sgd_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((ridge_model.predict(pred_test) - y_test)**2), np.mean((ridge_model1.predict(pred_test1) - y_test1)**2), np.mean((ridge_model2.predict(pred_test2) - y_test2)**2), np.mean((ridge_model3.predict(pred_test3) - y_test3)**2), np.mean((ridge_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((ard_model.predict(pred_test) - y_test)**2), np.mean((ard_model1.predict(pred_test1) - y_test1)**2), np.mean((ard_model2.predict(pred_test2) - y_test2)**2), np.mean((ard_model3.predict(pred_test3) - y_test3)**2), np.mean((ard_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((bayesian_model.predict(pred_test) - y_test)**2), np.mean((bayesian_model1.predict(pred_test1) - y_test1)**2), np.mean((bayesian_model2.predict(pred_test2) - y_test2)**2), np.mean((bayesian_model3.predict(pred_test3) - y_test3)**2), np.mean((bayesian_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((neural_model.predict(pred_test) - y_test)**2), np.mean((neural_model1.predict(pred_test1) - y_test1)**2), np.mean((neural_model2.predict(pred_test2) - y_test2)**2), np.mean((neural_model3.predict(pred_test3) - y_test3)**2), np.mean((neural_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((cat_model.predict(pred_test) - y_test)**2), np.mean((cat_model1.predict(pred_test1) - y_test1)**2), np.mean((cat_model2.predict(pred_test2) - y_test2)**2), np.mean((cat_model3.predict(pred_test3) - y_test3)**2), np.mean((neural_model4.predict(pred_test4) - y_test4)**2)]),
            np.mean([np.mean((linear_model.predict(pred_test) - y_test)**2), np.mean((linear_model1.predict(pred_test1) - y_test1)**2), np.mean((linear_model2.predict(pred_test2) - y_test2)**2), np.mean((linear_model3.predict(pred_test3) - y_test3)**2), np.mean((linear_model4.predict(pred_test4) - y_test4)**2)])]
        r2s = [np.mean([gaussian_model.score(pred_test, y_test), gaussian_model1.score(pred_test1, y_test1), gaussian_model2.score(pred_test2, y_test2), gaussian_model3.score(pred_test3, y_test3), gaussian_model4.score(pred_test4, y_test4)]),
            np.mean([decisiontree_model.score(pred_test, y_test), decisiontree_model1.score(pred_test1, y_test1), decisiontree_model2.score(pred_test2, y_test2), decisiontree_model3.score(pred_test3, y_test3), decisiontree_model4.score(pred_test4, y_test4)]),
            np.mean([sgd_model.score(pred_test, y_test), sgd_model1.score(pred_test1, y_test1), sgd_model2.score(pred_test2, y_test2), sgd_model3.score(pred_test3, y_test3), sgd_model4.score(pred_test4, y_test4)]),
            np.mean([ridge_model.score(pred_test, y_test), ridge_model1.score(pred_test1, y_test1), ridge_model2.score(pred_test2, y_test2), ridge_model3.score(pred_test3, y_test3), ridge_model4.score(pred_test4, y_test4)]),
            np.mean([ard_model.score(pred_test, y_test), ard_model1.score(pred_test1, y_test1), ard_model2.score(pred_test2, y_test2), ard_model3.score(pred_test3, y_test3), ard_model4.score(pred_test4, y_test4)]),
            np.mean([bayesian_model.score(pred_test, y_test), bayesian_model1.score(pred_test1, y_test1), bayesian_model2.score(pred_test2, y_test2), bayesian_model3.score(pred_test3, y_test3), bayesian_model4.score(pred_test4, y_test4)]),
            np.mean([neural_model.score(pred_test, y_test), neural_model1.score(pred_test1, y_test1), neural_model2.score(pred_test2, y_test2), neural_model3.score(pred_test3, y_test3), neural_model4.score(pred_test4, y_test4)]),
            np.mean([cat_model.score(pred_test, y_test), cat_model1.score(pred_test1, y_test1), cat_model2.score(pred_test2, y_test2), cat_model3.score(pred_test3, y_test3), cat_model4.score(pred_test4, y_test4)]),
            np.mean([linear_model.score(pred_test, y_test), linear_model1.score(pred_test1, y_test1), linear_model2.score(pred_test2, y_test2), linear_model3.score(pred_test3, y_test3), linear_model4.score(pred_test4, y_test4)])]
        
        results_df = results_df.append(pd.Series(mses, index = results_df.columns), ignore_index = True)
        results_df = results_df.append(pd.Series(r2s, index = results_df.columns), ignore_index = True)
        
        print(results_df)
        
        output_filename = "./OutputCSVs/model_testing_" + colname + "_" + str(sample_size) + "_" + datetime.now().strftime("%m_%d_%Y") + ".csv"
        results_df.to_csv(output_filename)
    return results_df

     
# Here we run cross-validation by testing out combinations of hyperparameters.


for i in [6, 7, 8, 9, 10]:
         regression_by_sample(datetime(2018, 11, 2), datetime(2021, 1, 30), i, .85, True)

In [None]:
# A bunch of plots that were useful at one point.






#july_30_predictions.groupby(["team"]).sum()
#july_30_predictions[july_30_predictions["team"] == "LOS ANGELES CLIPPERS"]
#sns.distplot(jan_25_pred_actual.predicted_points)
#sns.distplot(jan_25_pred_actual.actual_points)
#sns.jointplot(data = jan_25_pred_actual, x = "predicted_points", y = "actual_points", kind = 'reg')
#jan_24_pred_actual["blended_prediction"] = .7*jan_24_pred_actual["recent_average"] + .3*jan_24_pred_actual["predicted_points"]
#feb_12_pred_actual_sig = feb_12_pred_actual[feb_12_pred_actual["minutes"] <= 19]

#print(r2_score(feb_25_pred_actual.fantasy_points_8_9, feb_25_pred_actual.actual_points), r2_score(feb_25_pred_actual.predicted_points, feb_25_pred_actual.actual_points))
#print(r2_score(feb_26_pred_actual.fantasy_points_8_9, feb_26_pred_actual.actual_points), r2_score(feb_26_pred_actual.predicted_points, feb_26_pred_actual.actual_points))
#print(r2_score(feb_27_pred_actual.fantasy_points_8_9, feb_27_pred_actual.actual_points), r2_score(feb_27_pred_actual.predicted_points, feb_27_pred_actual.actual_points))
#print(r2_score(feb_28_pred_actual.fantasy_points_8_9, feb_28_pred_actual.actual_points), r2_score(feb_28_pred_actual.predicted_points, feb_28_pred_actual.actual_points))

#sns.jointplot(data = mar_7_pred_actual, x = "fantasy_points_8_9", y = "predicted_points", kind = 'reg')

# print(len(july_30_predictions[july_30_predictions["hot"] > 0])/len(july_30_predictions))
# print(len(july_30_predictions[july_30_predictions["cold"] > 0])/len(july_30_predictions))
#sns.distplot(july_31_predictions.hot)
# july_30_predictions[july_30_predictions["hot"] > 0]
#july_31_predictions[july_31_predictions["cold"] > 0]

# print(r2_score(mar_1_pred_actual.fantasy_points_7_9, mar_1_pred_actual.actual_points), r2_score(mar_1_pred_actual.fantasy_points_8_8, mar_1_pred_actual.actual_points), r2_score(mar_1_pred_actual.fantasy_points_8_85, mar_1_pred_actual.actual_points), r2_score(mar_1_pred_actual.fantasy_points_8_9, mar_1_pred_actual.actual_points), r2_score(mar_1_pred_actual.predicted_points, mar_1_pred_actual.actual_points))
# print(r2_score(mar_2_pred_actual.fantasy_points_7_9, mar_2_pred_actual.actual_points), r2_score(mar_2_pred_actual.fantasy_points_8_8, mar_2_pred_actual.actual_points), r2_score(mar_2_pred_actual.fantasy_points_8_85, mar_2_pred_actual.actual_points), r2_score(mar_2_pred_actual.fantasy_points_8_9, mar_2_pred_actual.actual_points), r2_score(mar_2_pred_actual.predicted_points, mar_2_pred_actual.actual_points))
# print(r2_score(mar_3_pred_actual.fantasy_points_7_9, mar_3_pred_actual.actual_points), r2_score(mar_3_pred_actual.fantasy_points_8_8, mar_3_pred_actual.actual_points), r2_score(mar_3_pred_actual.fantasy_points_8_85, mar_3_pred_actual.actual_points), r2_score(mar_3_pred_actual.fantasy_points_8_9, mar_3_pred_actual.actual_points), r2_score(mar_3_pred_actual.predicted_points, mar_3_pred_actual.actual_points))
# print(r2_score(mar_4_pred_actual.fantasy_points_7_9, mar_4_pred_actual.actual_points), r2_score(mar_4_pred_actual.fantasy_points_8_8, mar_4_pred_actual.actual_points), r2_score(mar_4_pred_actual.fantasy_points_8_85, mar_4_pred_actual.actual_points), r2_score(mar_4_pred_actual.fantasy_points_8_9, mar_4_pred_actual.actual_points), r2_score(mar_4_pred_actual.predicted_points, mar_4_pred_actual.actual_points))
# print(r2_score(mar_5_pred_actual.fantasy_points_7_9, mar_5_pred_actual.actual_points), r2_score(mar_5_pred_actual.fantasy_points_8_8, mar_5_pred_actual.actual_points), r2_score(mar_5_pred_actual.fantasy_points_8_85, mar_5_pred_actual.actual_points), r2_score(mar_5_pred_actual.fantasy_points_8_9, mar_5_pred_actual.actual_points), r2_score(mar_5_pred_actual.predicted_points, mar_5_pred_actual.actual_points))
# print(r2_score(mar_6_pred_actual.fantasy_points_7_9, mar_6_pred_actual.actual_points), r2_score(mar_6_pred_actual.fantasy_points_8_8, mar_6_pred_actual.actual_points), r2_score(mar_6_pred_actual.fantasy_points_8_85, mar_6_pred_actual.actual_points), r2_score(mar_6_pred_actual.fantasy_points_8_9, mar_6_pred_actual.actual_points), r2_score(mar_6_pred_actual.predicted_points, mar_6_pred_actual.actual_points))
# print(r2_score(mar_7_pred_actual.fantasy_points_7_9, mar_7_pred_actual.actual_points), r2_score(mar_7_pred_actual.fantasy_points_8_8, mar_7_pred_actual.actual_points), r2_score(mar_7_pred_actual.fantasy_points_8_85, mar_7_pred_actual.actual_points), r2_score(mar_7_pred_actual.fantasy_points_8_9, mar_7_pred_actual.actual_points), r2_score(mar_7_pred_actual.predicted_points, mar_7_pred_actual.actual_points))
# print(r2_score(mar_8_pred_actual.fantasy_points_7_9, mar_8_pred_actual.actual_points), r2_score(mar_8_pred_actual.fantasy_points_8_8, mar_8_pred_actual.actual_points), r2_score(mar_8_pred_actual.fantasy_points_8_85, mar_8_pred_actual.actual_points), r2_score(mar_8_pred_actual.fantasy_points_8_9, mar_8_pred_actual.actual_points), r2_score(mar_8_pred_actual.predicted_points, mar_8_pred_actual.actual_points))
# print(r2_score(mar_9_pred_actual.fantasy_points_7_9, mar_9_pred_actual.actual_points), r2_score(mar_9_pred_actual.fantasy_points_8_8, mar_9_pred_actual.actual_points), r2_score(mar_9_pred_actual.fantasy_points_8_85, mar_9_pred_actual.actual_points), r2_score(mar_9_pred_actual.fantasy_points_8_9, mar_9_pred_actual.actual_points), r2_score(mar_9_pred_actual.predicted_points, mar_9_pred_actual.actual_points))
# print(r2_score(mar_10_pred_actual.fantasy_points_7_9, mar_10_pred_actual.actual_points), r2_score(mar_10_pred_actual.fantasy_points_8_8, mar_10_pred_actual.actual_points), r2_score(mar_10_pred_actual.fantasy_points_8_85, mar_10_pred_actual.actual_points), r2_score(mar_10_pred_actual.fantasy_points_8_9, mar_10_pred_actual.actual_points), r2_score(mar_10_pred_actual.predicted_points, mar_10_pred_actual.actual_points))
# print(np.mean([r2_score(mar_1_pred_actual.fantasy_points_7_9, mar_1_pred_actual.actual_points), r2_score(mar_2_pred_actual.fantasy_points_7_9, mar_2_pred_actual.actual_points), r2_score(mar_3_pred_actual.fantasy_points_7_9, mar_3_pred_actual.actual_points), r2_score(mar_4_pred_actual.fantasy_points_7_9, mar_4_pred_actual.actual_points), r2_score(mar_5_pred_actual.fantasy_points_7_9, mar_5_pred_actual.actual_points), r2_score(mar_6_pred_actual.fantasy_points_7_9, mar_6_pred_actual.actual_points), r2_score(mar_7_pred_actual.fantasy_points_7_9, mar_7_pred_actual.actual_points), r2_score(mar_8_pred_actual.fantasy_points_7_9, mar_8_pred_actual.actual_points), r2_score(mar_9_pred_actual.fantasy_points_7_9, mar_9_pred_actual.actual_points), r2_score(mar_10_pred_actual.fantasy_points_7_9, mar_10_pred_actual.actual_points)]))
# print(np.mean([r2_score(mar_1_pred_actual.fantasy_points_8_8, mar_1_pred_actual.actual_points), r2_score(mar_2_pred_actual.fantasy_points_8_8, mar_2_pred_actual.actual_points), r2_score(mar_3_pred_actual.fantasy_points_8_8, mar_3_pred_actual.actual_points), r2_score(mar_4_pred_actual.fantasy_points_8_8, mar_4_pred_actual.actual_points), r2_score(mar_5_pred_actual.fantasy_points_8_8, mar_5_pred_actual.actual_points), r2_score(mar_6_pred_actual.fantasy_points_8_8, mar_6_pred_actual.actual_points), r2_score(mar_7_pred_actual.fantasy_points_8_8, mar_7_pred_actual.actual_points), r2_score(mar_8_pred_actual.fantasy_points_8_8, mar_8_pred_actual.actual_points), r2_score(mar_9_pred_actual.fantasy_points_8_8, mar_9_pred_actual.actual_points), r2_score(mar_10_pred_actual.fantasy_points_8_8, mar_10_pred_actual.actual_points)]))
# print(np.mean([r2_score(mar_1_pred_actual.fantasy_points_8_85, mar_1_pred_actual.actual_points), r2_score(mar_2_pred_actual.fantasy_points_8_85, mar_2_pred_actual.actual_points), r2_score(mar_3_pred_actual.fantasy_points_8_85, mar_3_pred_actual.actual_points), r2_score(mar_4_pred_actual.fantasy_points_8_85, mar_4_pred_actual.actual_points), r2_score(mar_5_pred_actual.fantasy_points_8_85, mar_5_pred_actual.actual_points), r2_score(mar_6_pred_actual.fantasy_points_8_85, mar_6_pred_actual.actual_points), r2_score(mar_7_pred_actual.fantasy_points_8_85, mar_7_pred_actual.actual_points), r2_score(mar_8_pred_actual.fantasy_points_8_85, mar_8_pred_actual.actual_points), r2_score(mar_9_pred_actual.fantasy_points_8_85, mar_9_pred_actual.actual_points), r2_score(mar_10_pred_actual.fantasy_points_8_85, mar_10_pred_actual.actual_points)]))
# print(np.mean([r2_score(mar_1_pred_actual.fantasy_points_8_9, mar_1_pred_actual.actual_points), r2_score(mar_2_pred_actual.fantasy_points_8_9, mar_2_pred_actual.actual_points), r2_score(mar_3_pred_actual.fantasy_points_8_9, mar_3_pred_actual.actual_points), r2_score(mar_4_pred_actual.fantasy_points_8_9, mar_4_pred_actual.actual_points), r2_score(mar_5_pred_actual.fantasy_points_8_9, mar_5_pred_actual.actual_points), r2_score(mar_6_pred_actual.fantasy_points_8_9, mar_6_pred_actual.actual_points), r2_score(mar_7_pred_actual.fantasy_points_8_9, mar_7_pred_actual.actual_points), r2_score(mar_8_pred_actual.fantasy_points_8_9, mar_8_pred_actual.actual_points), r2_score(mar_9_pred_actual.fantasy_points_8_9, mar_9_pred_actual.actual_points), r2_score(mar_10_pred_actual.fantasy_points_8_9, mar_10_pred_actual.actual_points)]))
# print(np.mean([r2_score(mar_1_pred_actual.predicted_points, mar_1_pred_actual.actual_points), r2_score(mar_2_pred_actual.predicted_points, mar_2_pred_actual.actual_points), r2_score(mar_3_pred_actual.predicted_points, mar_3_pred_actual.actual_points), r2_score(mar_4_pred_actual.predicted_points, mar_4_pred_actual.actual_points), r2_score(mar_5_pred_actual.predicted_points, mar_5_pred_actual.actual_points), r2_score(mar_6_pred_actual.predicted_points, mar_6_pred_actual.actual_points), r2_score(mar_7_pred_actual.predicted_points, mar_7_pred_actual.actual_points), r2_score(mar_8_pred_actual.predicted_points, mar_8_pred_actual.actual_points), r2_score(mar_9_pred_actual.predicted_points, mar_9_pred_actual.actual_points), r2_score(mar_10_pred_actual.predicted_points, mar_10_pred_actual.actual_points)]))
#np.mean(jan_24_pred_actual["recent_average"]), np.mean(jan_24_pred_actual["predicted_points"]), np.mean(jan_24_pred_actual["actual_points"])
#r2_score(july_31_predictions["recent_average"], july_31_predictions["projected_points"])
#r2_score(jan_24_pred_actual["recent_average"], jan_24_pred_actual["predicted_points"])

In [None]:
# Basically a garbage cell.
# This didn't really work but could be useful for manually inputting minutes.





unlisted_players = {"Jamal Crawford": 17,
               "Zach Collins": 18,
               "Al-Farouq Aminu": 16,
               "Frank Kaminsky": 19,
               "Tyler Zeller": 10,
               "Jerian Grant": 12,
               "Josh Reaves": 1,
               "Trey Burke": 7,
               "Jonathan Isaac": 19,
               "Cameron Payne": 12,
               "Corey Brewer": 11,
               "Jaylen Adams": 9,
               "Jontay Porter": 1,
               "Kyle Guy": 2,
               "Grayson Allen": 12,
               "Nicolo Melli": 11.2,
               "JR Smith": 15.8,
               "Joakim Noah": 1.9,
               "Kenrich Williams":  8,
               "Dion Waiters": 8.3,
               "Talen Horton-Tucker": 4.2,
               "Nigel Williams-Goss": 1.2,
               "Devontae Cacok": 0.7,
               "Zylan Cheatham": 1.0,
               "Kostas Antetokounmpo":  0.4,
               "Jusuf Nurkic": 24,
               "Donta Hall": 4,
               "Michael Beasley": 6.5,
               "Lance Thomas": 11,
               "Sindarius Thornwell": 5,
               "Naz Mitrou-Long": 13,
               "Bol Bol": 14,
               "Keita Bates-Diop": 10,
               "Tyler Cook": 3,
               "Troy Daniels":  8,
               "Kyle Alexander": 1,
               "Meyers Leonard": 1,
               "KZ Okpala": 5,
               "Darius Bazley": 14,
               "Devon Hall": 4,
               "Andre Roberson": 5,
               "Marial Shayok": 3,
               "Dewan Hernandez": 4,
                   "Kawhi Leonard": 36.9,
                   "LeBron James": 35.5,
                   "Anthony Davis": 35.2,
                   "Stephen Curry": 34.6,
                   "Paul George": 34.5,
                   "Kyrie Irving": 34.1,
                   "Eric Paschall": 33.6,
                   "Kelly Oubre": 31.6,
                   "Andrew Wiggins": 31,
                   "Dennis Schroder": 30.7,
                   "Kevin Durant": 27.6,
                   "Marcus Morris": 27.6,
                   "Montrezl Harrell": 27.4,
                   "Serge Ibaka": 27.4,
                   "Caris LeVert": 26.8,
                   "Lou Williams": 26.5,
                   "Luke Kennard": 25.8,
                   "Patrick Beverley": 25.4,
                   "Kentavious Caldwell-Pope": 25.2,
                   "Spencer Dinwiddie": 25,
                   "Taurean Prince": 24.6,
                   "DeAndre Jordan": 24.6,
                   "Joe Harris": 24.4,
                   "Marc Gasol": 23.7,
                   "Jarrett Allen": 22.7,
                   "Kyle Kuzma": 22.2,
                   "Wesley Matthews": 22,
                   "Damion Lee": 20.7,
                   "Kent Bazemore": 20.7,
                   "Ivica Zubac": 18.3,
                   "Landry Shamet": 18.3,
                   "Kevon Looney": 18.1,
                   "James Wiseman": 17,
                   "Marquese Chriss": 16.4,
                   "Alex Caruso": 16,
                   "Jeff Green": 14.7,
                   "Jordan Poole": 12,
                   "Reggie Jackson": 8.7,
                   "Nicolas Batum": 7.9,
                   "Alen Smailagic": 3.1,
                   "Talen Horton-Tucker": 2.6,
                   "Brad Wanamaker": 1.5,
                   "Terance Mann": .7,
                   "Bruce Brown": .5,
                   "Timothe Luwawu-Cabarrot": .3,
                   "Rodions Kurucs": .3,
                   "Tyler Johnson": 0,
                   "Patrick Patterson": 0,
                   "Mychal Mulder": 0,
                   "Chris Chiozza": 0,
                   "Jared Dudley": 0,
                   "Quinn Cook": 0,
                   "Markieff Morris": 0,
                   "Nico Mannion":, 0}

def translate_stats_to_game(input_data, player_dict):
    for player_index in input_data.index:
        minutes = player_dict[input_data.loc[player_index, "Name"]]
        twopt_guess = minutes*position_guess["2PT FG"]
        threept_guess = minutes*position_guess["3PT FG"]
        ft_guess = minutes*position_guess["FTM"]
        rebounds_guess = minutes*position_guess["Rebounds"]
        assists_guess = minutes*position_guess["Assists"]
        blocks_guess = minutes*position_guess["Blocks"]
        steals_guess = minutes*position_guess["Steals"]
        tov_guess = minutes*position_guess["Turnovers"]
        fd_guess = threept_guess * 3 + twopt_guess * 2 + ft_guess + rebounds_guess * 1.2 + assists_guess * 1.5 + blocks_guess * 3 + steals_guess * 3 - tov_guess
        dk_guess = threept_guess * 3.5 + twopt_guess * 2 + ft_guess + rebounds_guess * 1.25 + assists_guess * 1.5 + blocks_guess * 2 + steals_guess * 2 - .5 * tov_guess + 1.5 * double_double(threept_guess, twopt_guess, ft_guess, rebounds_guess, assists_guess) + 3 * triple_double(threept_guess, twopt_guess, ft_guess, rebounds_guess, assists_guess)
        new_row = [0, player, 0, 0, 0, 0, minutes_guess, twopt_guess, threept_guess, ft_guess, rebounds_guess, assists_guess, blocks_guess, steals_guess, tov_guess, 0, 0, 0, 0, 0, 0, 0, 
                          0, 0, 0, fd_guess, dk_guess, fd_guess*200, dk_guess*187.5]
        input_data = input_data.append(new_row)
    return input_data

#add_unlisted_players(pd.read_csv("./AllCSVs/predictions_for_07_30_2020_unplayed.csv"), unlisted_players)