In [1]:
import pandas as pd
from datetime import datetime

In [3]:
# create dataframe of past salaries and past performance
fantasy_df = pd.read_csv("../Data/testdata3.csv")

# Print the first five rows of data to the screen
fantasy_df.head()

Unnamed: 0,Name,GID,Date,Team,Opp,Home,GTime(ET),Team pts,Opp pts,Win-Lose,...,STL,BLK,TOV,3P,FGM,FGA,FG%,FTM,FTA,FT%
0,Drew Eubanks,5679,01/10/20,sas,mem,0,20.0,121,134,0,...,0,0,0,0,0,0,0.0%,2,2,100.0%
1,Drew Eubanks,5679,01/08/20,sas,bos,0,19.0,129,114,1,...,0,0,0,0,1,1,,0,0,
2,Drew Eubanks,5679,12/28/19,sas,det,1,20.5,136,109,1,...,0,0,0,0,3,3,,0,0,
3,Drew Eubanks,5679,12/03/19,sas,hou,1,20.5,135,133,1,...,0,0,1,0,1,1,,0,0,
4,Drew Eubanks,5679,12/01/19,sas,det,0,17.0,98,132,0,...,0,1,2,0,4,6,66.7%,1,1,100.0%


In [4]:
fantasy_df.set_index('Date', inplace=True)

In [5]:
fantasy_df.columns

Index(['Name', 'GID', 'Team', 'Opp', 'Home', 'GTime(ET)', 'Team pts',
       'Opp pts', 'Win-Lose', 'Start', 'MP', 'FDP', 'DD', 'TD', 'FD Sal',
       'FD pos', 'pos', 'ADI', 'VMI', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
       '3P', 'FGM', 'FGA', 'FG%', 'FTM', 'FTA', 'FT%'],
      dtype='object')

##Data Integration

We will split the allplayerGameLogs into training and test sets. And we need to use two helpful functions to extract player-level and team-level features to train the prediction model for fantasy points from a player. We also need another helpful function to aggregate corresponding information from the test set for the testing purpose.

In [23]:
def aggr(group):
    test_df = pd.DataFrame()    
    test_df['LastFDP'] = group['FDP'][-1:]
    test_df['AvgFDP'] = group['FDP'].mean()
    test_df['AvgPTS'] = group['PTS'].mean()
    test_df['LastPT'] = group['PTS'][-1:]
    test_df['AvgMIN'] = group['MIN'].mean()
    test_df['LastMIN'] = group['MIN'][-1:]
    test_df['AvgFGM'] = group['FGM'].mean()
    test_df['LastFGM'] = group['FGM'][-1:]
    test_df['AvgFGA'] = group['FGA'].mean()
    test_df['LastFGA'] = group['FGA'][-1:]
    test_df['Avg3PTS'] = group['3P'].mean()
    test_df['Last3PTS'] = group['3P'][-1:]
    test_df['AvgREB'] = group['TRB'].mean()
    test_df['LastREB'] = group['TRB'][-1:]
    test_df['AvgAST'] = group['AST'].mean()
    test_df['LastAST'] = group['AST'][-1:]
    test_df['AvgSTL'] = group['STL'].mean()
    test_df['AvgTOV'] = group['TOV'].mean() 
    test_df['LastTOV'] = group['TOV'][-1:]
    #group['NumDouBL'] = group['DouBL'].sum()
    #group['NumTriBL'] = group['TriBL'].sum()

    test_df['Last3GameAvgFDP'] = group['FDP'][-3:].mean()
    test_df['Last3GameAvgMIN'] = group['MIN'][-3:].mean()
    test_df['Last3GameAvgPTS'] = group['PTS'][-3:].mean()
    
    num_team = len(group['Team'].unique())
    if(num_team==1):
        test_df['Name'] = group['Name'].unique()
        test_df['Player_ID'] = group['GID'].unique()
        test_df['Team'] = group['Team'].unique()[0]
        test_df['position'] = group['pos'].unique()[0]
    else:
        test_df['Name'] = group['Name'].unique()
        test_df['Player_ID'] = group['Player_ID'].unique()
        test_df['Team'] = group['Team'].unique()[num_team-1]
        test_df['position'] = group['pos'].unique()       
    
    return(test_df)
    
def aggr_stats(date,fantasy_df):
    interest_columns = ['Name','GID','Team','pos','MIN','PTS','FGM','FGA', '3P', \
                        'TRB','AST','STL','TOV','DD','TD','FDP']
    tmp = fantasy_df.ix['2019-10-22':date]
    
    playerID_tmp = tmp.reset_index().copy()
    tmp.grouped = playerID_tmp[interest_columns].groupby('GID')
    Newdf = pd.DataFrame()
    ids = playerID_tmp['GID'].unique()
    
    for id in ids:
        group = tmp.grouped.get_group(id)
        df = aggr(group)
        Newdf = pd.concat([Newdf,df],axis=0)
    
    bins = [-10, 10, 20, 30, 40, 100]
    group_names = ['benchPlayer','belowAvg','average','advanced','top']
    Newdf['Rank']= pd.cut(Newdf['AvgFDP'],bins,labels=group_names)
    
    return(Newdf)

NameError: name 'Newdf' is not defined

In [17]:
def aggr_teamVSteam(group):
        group['TeamStdVSFDP'] = group['FDP'].std()
        group['TeamAvgVSFDP'] = group['FDP'].mean()
        group['TeamMaxVSFDP'] = group['FDP'].max()
        return group

def aggr_team(group):
        group['TeamStdFDP'] = group['TeamStdVSFDP'].mean()
        group['TeamAvgFDP'] = group['TeamAvgVSFDP'].mean()
        group['TeamMaxFDP'] = group['TeamMaxVSFDP'].mean()
        return group    

def generate_team_features(playerGameLogs, playerFeatureTable, date):
    tmp = playerGameLogs['2019-10-27': date]
    tmp = tmp.reset_index()
    bad_players = playerFeatureTable[playerFeatureTable.Rank=='benchPlayer']['GID']
    interest_cols = ['Name','GID','Team','Opp','pos','FDP','MIN']
    tmp = tmp[interest_cols]
    tmp = tmp[~tmp['GID'].isin(bad_players)]
    
    newdf = tmp.copy()
    newdf_grouped = newdf.groupby(['Team','Opp'])
        
    Newdf = newdf_grouped.apply(aggr_teamVSteam)
    Newdf.drop(['Name','GID','MIN','FDP','pos'],inplace=True,axis=1)
    Newdf.drop_duplicates(['Team','Opp'],inplace=True)
    
    Newdf.drop('Opp',axis=1,inplace=True)
    
    Newdf2 = Newdf.copy()
    Newdf2_grouped = Newdf2.groupby('Team')
    
    Newdf_overall = Newdf2_grouped.apply(aggr_team)
    Newdf_overall.drop(['TeamStdVSFDP','TeamAvgVSFDP','TeamMaxVSFDP'],inplace=True,axis=1)
    Newdf_overall.drop_duplicates('Team',inplace=True)
    
    return(Newdf_overall)

In [18]:
def drop_y(df):
    # list comprehension of the cols that end with '_y'
    to_drop = [x for x in df if x.endswith('_y')]
    df.drop(to_drop, axis=1, inplace=True)

def rename_x(df):
    for col in df:
        if col.endswith('_x'):
            df.rename(columns={col:col.rstrip('_x')}, inplace=True)

In [25]:
def get_train_test(train_date, test_date): #format like'2/10/2019'
    train_date_index = pd.date_range(start='11/10/2019', end=train_date, freq='D')
    train_df = pd.DataFrame()
    
    alldates = fantasy_df.index
    trydates = pd.date_range(start='10/22/2019', end='2/02/2020', freq='D')
    s = set(alldates)
    nodates = [x for x in trydates if x not in s]
    
    for idx in train_date_index:
        tmp_idx = idx+1
        if tmp_idx not in nodates and idx not in nodates:
            #aggregate the statistics from players -> player-level features
            trainLogs = fantasy_df.ix['2019-10-22':idx]
            train_player_df = aggr_stats(idx,trainLogs)   
            #next we need to collect the player's next game Fantasy Points.
            next_date = idx + 1
            tmpLogs = fantasy_df[['Name', 'GID','Team','Opp','Home','FDP']].ix[next_date]
            tmpLogs.rename(columns={'FDP':'NewGameFDP'},inplace=True)
            #join the tmpLogs and player festure table by GID, which is based on the players on a new game day
            newgame_df = pd.merge(tmpLogs,train_player_df,how='inner',on='GID')
            drop_y(newgame_df)
            rename_x(newgame_df)

            #get the team features table 
            train_team_df = generate_team_features(fantasy_df, train_player_df, idx)
            newgame_df = pd.merge(newgame_df,train_team_df,how='left',on='Team')
            train_df = pd.concat([train_df,newgame_df],axis=0)

    test_date_index = pd.date_range(start=train_date, end=test_date, freq='D')[1:]
    start_test_date = pd.date_range(start=train_date, end=test_date, freq='D')[0]
    test_df = pd.DataFrame()
    for idx in test_date_index:
        tmp_idx = idx+1
        if tmp_idx not in nodates and idx not in nodates:
            #aggregate the statistics from players -> player-level features
            testLogs = fantasy_df.ix[start_test_date:idx]
            test_player_df = aggr_stats(idx,testLogs)   
            #next we need to collect the player's next game Fantasy Points.
            next_date = idx + 1
            tmpLogs = fantasy_df[['Name', 'GID','Team','Opp','Home','FDP']].ix[next_date]
            tmpLogs.rename(columns={'FDP':'NewGameFDP'},inplace=True)
            #join the tmpLogs and player festure table by GID, which is based on the players on a new game day
            newgame_df = pd.merge(tmpLogs,train_player_df,how='inner',on='GID')
            drop_y(newgame_df)
            rename_x(newgame_df)

            #get the team features table 
            test_team_df = generate_team_features(fantasy_df, test_player_df, idx)
            newgame_df = pd.merge(newgame_df,test_team_df,how='left',on='Team')
            test_df = pd.concat([test_df,newgame_df],axis=0)                
        
    return(train_df, test_df)


In [26]:
train_set, test_set = get_train_test('1/18/2020', '1/31/2020')

  # This is added back by InteractiveShellApp.init_path()


In [27]:
train_set.shape

(0, 0)

In [22]:
test_set.shape

(0, 0)

In [14]:
train_set.head(5).transpose()

In [13]:
train_set = train_set[train_set.Rank!='benchPlayer']

AttributeError: 'DataFrame' object has no attribute 'Rank'

In [100]:
test_set = test_set[test_set.Rank!='benchPlayer']

In [101]:
with open('../Data/train_set_02_29.pickle', 'wb') as handle:
  pickle.dump(train_set, handle)

In [102]:
with open('../Data/test_set_02_29.pickle', 'wb') as handle:
  pickle.dump(test_set, handle)

##Prepocessing

In [90]:
with open('../Data/train_set_02_29.pickle', 'rb') as handle:
  train_set = pickle.load(handle)

In [91]:
with open('../Data/test_set_02_29.pickle', 'rb') as handle:
  test_set = pickle.load(handle)

In [67]:
test_set.shape

(1511, 39)

In [92]:
train_set.shape

(14503, 39)

In [82]:
#Combine into data:
train_set['source']= 'train'
test_set['source'] = 'test'
data=pd.concat([train_set, test_set],ignore_index=True)
data.shape

(16014, 40)

####Note that in our problem, we don't care about the benchplayers. So we don't have to train on those data, which might bias our prediction.

In [83]:
data = data[data.Rank!='benchPlayer']

In [84]:
data.shape

(12902, 40)

###Check Missing values

In [78]:
data.apply(lambda x: sum(x.isnull()))

fullName              0
Player_ID             0
Team                  0
OpponentTeam          0
HomeGame              0
NewGameFanPTs         0
LastFanPTs            0
AvgFanPTs             0
AvgPTS                0
LastPT                0
AvgMIN                0
LastMIN               0
AvgFGM                0
LastFGM               0
AvgFGA                0
LastFGA               0
AvgFG3M               0
LastFG3M              0
AvgFG3A               0
LastFG3A              0
AvgREB                0
LastREB               0
AvgAST                0
LastAST               0
AvgSTL                0
AvgTOV                0
LastTOV               0
AvgPF                 0
LastPF                0
AvgPLUS_MINUS         0
LastPLUS_MINUS        0
Last3GameAvgFanPTs    0
Last3GameAvgMIN       0
Last3GameAvgPTS       0
position1             0
Rank                  0
TeamStdFanPTs         0
TeamAvgFanPTs         0
TeamMaxFanPTs         0
source                0
dtype: int64

###Look at categories of all object variables

In [79]:
var = ['Team','OpponentTeam', 'position1','Rank']
for v in var:
    print '\nFrequency count for variable %s'%v
    print data[v].value_counts()


Frequency count for variable Team
PHI    528
DAL    497
NYK    493
LAL    475
HOU    470
DEN    465
SAS    457
POR    451
NOP    449
DET    445
GSW    443
ORL    432
IND    430
BOS    427
MIN    426
MEM    424
TOR    424
WAS    423
PHX    418
CHI    412
CLE    410
ATL    408
LAC    407
CHA    405
BKN    399
SAC    395
OKC    377
UTA    375
MIA    374
MIL    363
Name: Team, dtype: int64

Frequency count for variable OpponentTeam
BOS    458
BKN    457
DET    450
MIN    449
MIL    446
LAL    444
POR    443
PHI    442
DAL    442
PHX    442
NYK    436
UTA    435
IND    435
CHA    434
DEN    432
ATL    429
MEM    429
HOU    427
WAS    426
SAS    425
CLE    424
OKC    416
NOP    415
LAC    414
SAC    411
TOR    411
GSW    409
ORL    408
MIA    407
CHI    406
Name: OpponentTeam, dtype: int64

Frequency count for variable position1
PG    2853
SG    2739
PF    2645
C     2407
SF    2258
Name: position1, dtype: int64

Frequency count for variable Rank
belowAvg    5991
average     4161
advanced  

###One-Hot Encoding

In [85]:
var_to_encode = ['Team','OpponentTeam','position1','HomeGame','Rank']
data = pd.get_dummies(data, columns=var_to_encode)
data.columns

Index([     u'fullName',     u'Player_ID', u'NewGameFanPTs',    u'LastFanPTs',
           u'AvgFanPTs',        u'AvgPTS',        u'LastPT',        u'AvgMIN',
             u'LastMIN',        u'AvgFGM',
       ...
        u'position1_PF',  u'position1_PG',  u'position1_SF',  u'position1_SG',
          u'HomeGame_0',    u'HomeGame_1', u'Rank_advanced',  u'Rank_average',
       u'Rank_belowAvg',      u'Rank_top'],
      dtype='object', length=106)

In [86]:
data.head(5).transpose()

Unnamed: 0,0,1,2,4,5
fullName,Jamal Crawford,DeAndre Jordan,JJ Hickson,Mike Conley,Marc Gasol
Player_ID,2037,201599,201581,201144,201188
NewGameFanPTs,12.5,23.75,19.5,24,42.75
LastFanPTs,15.25,38.5,44.25,32.5,29.25
AvgFanPTs,15.6071,36.8214,21.35,28.5938,26.625
AvgPTS,10,10.1429,10,13.75,13.25
LastPT,13,13,19,16,18
AvgMIN,20.8571,32.1429,18,31,30
LastMIN,20,36,30,35,37
AvgFGM,3.28571,4.14286,4.2,4.625,4.625


###Separate train & test:

In [87]:
train = data.loc[data['source']=='train']
test = data.loc[data['source']=='test']

In [88]:
train.drop('source',axis=1,inplace=True)
test.drop('source',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [89]:
train.to_csv('../Data/train_modified_0229.csv',index=False)
test.to_csv('../Data/test_modified_0229.csv',index=False)