# CS 109 Final Project: Breaking Daily Fantasy Basketball

![alt text](http://cdn.playbuzz.com/cdn/b83ad51b-f33e-4b06-879a-8b3f8a509b3e/9f71ec8b-2e9b-4bd4-80ff-ce0f555c5653.jpg)

### Hundreds of Competitions and Millions of Dollars a Day
####Overview and Motivation
####Related Work
####Initial Questions
####Exploratory Data Analysis
####Final Analysis

In [1]:
#load gamelog_data
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import json
import requests
import datetime

def season_subset(df, year_season_start, year_season_end = None):
    df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])
    if year_season_end is None:
        year_season_end = year_season_start + 1
    df_gt = df[df.GAME_DATE > datetime.date(year_season_start,9,1)]
    df_lt = df_gt[df_gt.GAME_DATE < datetime.date(year_season_end,9,1)]
    return df_lt.sort_values("GAME_DATE") if not df_lt.empty else None

In [10]:
post85df = pd.read_csv('./gamelogs/master_post86df.csv')
post85df = post85df.drop('VIDEO_AVAILABLE',1)
df85_15 = season_subset(post85df,1985,2015)
by_player = df85_15.groupby("PLAYER_NAME")
MELOadvanceddf = pd.read_csv('./usage_stats/master_advanced.csv')

In [11]:
df85_15["FANTASY_ZSCORE"] = by_player["FANTASY_PTS"].apply(lambda x: ((x - x.mean())/x.std()))
df85_15["i_ZSCORE_OVER"] = df85_15["FANTASY_ZSCORE"].map(lambda x: 1 if x > 1 else 0)
df85_15["SEASON_MIN"] = by_player['MIN'].apply(lambda x: x.map(lambda y: x.sum()))
df85_15["GAMES_PLAYED"] = by_player["PLAYER_NAME"].apply(lambda x: x.map(lambda y: len(x)))
for x in ['FG_PCT', 'FG3_PCT', 'FT_PCT']:
    df85_15[x] = df85_15[x].map(lambda y: 0 if np.isnan(y) else y)
df85_15["WL"] = [1 if v == "W" else 0 for v in df85_15.copy()["WL"]]

opp_home = df85_15.MATCHUP.map(lambda x: (x[-3:],0) if "@" in x else (x[-3:],1))
df85_15["OPP"] = opp_home.map(lambda x: x[0])
df85_15["i_HOME"] = opp_home.map(lambda x: x[1])

In [12]:
#Add player bio data for age,weight,height
player_bios_df = pd.read_csv("./player_bios/player_bios.csv")
player_bios_df = player_bios_df.rename(columns = {'PERSON_ID': 'PLAYER_ID', 'DISPLAY_FIRST_LAST': 'PLAYER_NAME'})
player_bios_df["BIRTHDATE"] = pd.to_datetime(player_bios_df["BIRTHDATE"])
player_bios_df['AGE'] = player_bios_df["BIRTHDATE"].map(lambda x: round((pd.to_datetime('today') - x).days / 365.,2))
player_bios_df["WEIGHT"] = player_bios_df["WEIGHT"].astype('str')
player_bios_df["HEIGHT"] = player_bios_df["HEIGHT"].astype('str')
player_bios_df["WEIGHT"] = player_bios_df["WEIGHT"].map(lambda x:  float(x) if x != 'nan' else 0.)
player_bios_df["HEIGHT"] = player_bios_df["HEIGHT"].map(lambda x: (12.*float(x[0]) + float(x[2:])) if x != 'nan' else 0.)

by_player = df85_15.groupby("PLAYER_NAME")

In [13]:
def get_player_bio(name, col_name):
    return float(player_bios_df[player_bios_df.PLAYER_NAME == name][col_name])

df85_15["AGE"] = by_player["PLAYER_NAME"].apply(lambda x: x.replace(x.iloc[0],get_player_bio(x.iloc[0],"AGE")))
df85_15["WEIGHT"] = by_player["PLAYER_NAME"].apply(lambda x: x.replace(x.iloc[0],get_player_bio(x.iloc[0],"WEIGHT")))
df85_15["HEIGHT"] = by_player["PLAYER_NAME"].apply(lambda x: x.replace(x.iloc[0],get_player_bio(x.iloc[0],"HEIGHT")))

In [14]:
#Integrate ELO Rankings
elo_df = pd.read_csv("./gamelogs/all_elo.csv")
elo_df["date_game"] = pd.to_datetime(elo_df["date_game"])
elo_df["game_location"] = elo_df["game_location"].map(lambda x: 1 if x == "H" else 0)
elo_df = elo_df[elo_df["is_playoffs"] == 0]

curr = elo_df.columns.tolist()
cols = [curr[i] for i in [5,8,11,13,14,17,19,21]]
elo_df = elo_df[cols]
elo_df = elo_df.rename(columns={'date_game': 'GAME_DATE',
                                'team_id':'TEAM_ABBREVIATION',
                                'opp_id':'OPP', 
                                'game_location': 'i_HOME',
                                'elo_i':'ELO',
                                'opp_elo_i': 'OPP_ELO',
                                'win_equiv': 'EXP_WINS',
                                'forecast':'FORECAST'})

elo_df['SHIT'] = elo_df['OPP_ELO'].map(lambda x: 1 if x < 1400 else 0)
elo_df['OKAY'] = elo_df['OPP_ELO'].map(lambda x: 1 if 1400 <= x < 1600 else 0)
elo_df['GOOD'] = elo_df['OPP_ELO'].map(lambda x: 1 if 1600 <= x < 1700 else 0)
elo_df['GREAT'] = elo_df['OPP_ELO'].map(lambda x: 1 if 1700 <= x else 0)
df85_15 = df85_15.merge(season_subset(elo_df,1985,2015))


In [15]:
#Rearrange some columns in df85_15
curr = df85_15.columns.tolist()
cols = curr[:3] + curr[32:37] + curr[3:9] + curr[37:] + curr[9:32]
if len(curr) == len(cols):
    df85_15 = df85_15[cols]


name_pos = player_bios_df[["PLAYER_ID","POSITION","PLAYER_NAME"]]
df85_15 = df85_15.merge(name_pos)
df85_15["POSITION"] = df85_15.POSITION.map(lambda x: "Unknown" if pd.isnull(x) else x)

In [None]:
def calc_season_avg(df,col_list,(date_str1,date_str2)):
    date1, date2 = pd.to_datetime(date_str1), pd.to_datetime(date_str2)
    mask = lambda x: (date1 <= x) & (x <= date2)
    return df[df.GAME_DATE.apply(mask)].groupby(["PLAYER_NAME","SEASON_ID"])[col_list].mean().reset_index()

In [None]:
def ngames_colname(col_list, ngames):
    return map(lambda x: str(ngames) + 'D_' + x, col_list)

In [None]:
def last_ngames(df,ngames,game_date,col_list):
    ngames_df = df[df.GAME_DATE < game_date].nlargest(ngames, "GAME_DATE")
    ngames_col_list = ngames_colname(col_list,ngames)
    num_cols = len(ngames_col_list)
    date_player_tuples = [("GAME_DATE",game_date)]#,("PLAYER_NAME",df.PLAYER_NAME.iloc[0])]
    if ngames_df.empty:
        return dict(date_player_tuples + zip(ngames_col_list,np.array(0).repeat(num_cols)))
    else:
        return dict(date_player_tuples + zip(ngames_col_list,ngames_df[col_list].mean()))

In [None]:
def calc_ngame_avg(df,col_list,game_date_str,ngames):
    game_date = pd.to_datetime(game_date_str)
    season_id = df[df.GAME_DATE == game_date]["SEASON_ID"].iloc[0]
    return last_ngames(df[df.SEASON_ID == season_id],ngames,game_date,col_list)

In [None]:
def rolling_cols(df,col_list,ngames,rolling_kind):
    if rolling_kind == 'mean':
        rolling_func = lambda (a,b,c): pd.rolling_mean(a,b,min_periods = c)
    elif rolling_kind == 'sum':
        rolling_func = lambda (a,b,c): pd.rolling_sum(a,b,min_periods = c)
    else:
        return None 
    
    rolling_df = (df.groupby(["PLAYER_NAME","SEASON_ID"])
                    .apply(lambda x: add_game_date_pts_col(rolling_func((x[col_list],ngames,1)),x.GAME_DATE,x.FANTASY_PTS).reset_index(drop = True)))
    return rolling_df.reset_index().drop('level_2',axis = 1).rename(columns=dict(zip(col_list,map(lambda x: 'R_' + x,col_list))))

In [None]:
def add_game_date_pts_col(df,game_date_col,fantasy_pts_col):
    new_df = pd.concat([df,game_date_col], axis = 1)
    return new_df

In [None]:
def per_season_cumsum(df,col_list):
    cumsum_df = (df.groupby(["PLAYER_NAME","SEASON_ID"])
                   .apply(lambda x: add_game_date_col(x[col_list].cumsum(axis = 0), x.GAME_DATE).reset_index(drop = True)))
    return cumsum_df.reset_index().drop('level_2',axis = 1).rename(columns=dict(zip(col_list,map(lambda x: 'C_' + x,col_list))))

In [None]:
def per_season_cummean(df,col_list):
    cumsum_df = (df.groupby(["PLAYER_NAME","SEASON_ID"])
                   .apply(lambda x: add_game_date_pts_col(pd.expanding_mean(x[col_list], min_periods = 2), x.GAME_DATE, x.FANTASY_PTS).reset_index(drop = True)))
    return cumsum_df.reset_index().drop('level_2',axis = 1).rename(columns=dict(zip(col_list,map(lambda x: 'C_' + x,col_list))))

In [None]:
def enumerate_games(df):
    new_df = df.copy()
    new_df["GAME_NUM"] = range(1,len(df.GAME_DATE) + 1)
    return new_df

def sigmoidfun(x):
	return 1/(1+np.exp(-0.007*(x-800)))

def fantasy_avg_lastn(player_df,last_n_seasons,seasons):
    return player_df[[s in seasons[-last_n_seasons:] for s in player_df.SEASON_ID]]['FANTASY_PTS'].mean()    

def true_fantasy_mean(player_df,last_n_seasons):
    seasons = list(set(player_df.SEASON_ID))
    lastn_mean = fantasy_avg_lastn(player_df,last_n_seasons,seasons)
    return player_df.groupby("SEASON_ID").apply(lambda x: x.apply(lambda y: lastn_mean + sigmoidfun(y.MIN) * (y.C_FANTASY_PTS - lastn_mean),axis = 1))

def fantasy_resp(df):
    return df.groupby('PLAYER_NAME').apply(lambda x: true_fantasy_mean(x,5))

In [135]:
def timeseries_cv(df,lcols,resp_str,nfolds):
    resp_str = resp_str + '_RESP'
    train_size = test_df.shape[0]
    floor_fold_size = (train_size / nfolds)
    final_fold_size = floor_fold_size +  (train_size % nfolds)
    rest = train_size - final_fold_size
    rest_size = rest / (nfolds - 1)
    final_idx = lambda x: (final_fold_size * x, final_fold_size * (x + 1))
    rest_idx = lambda x: (rest_size * x, rest_size * (x + 1))
    folds_idx = map(lambda x: final_idx(x) if x == (nfolds - 1) else rest_idx(x),range(nfolds))
    folds = map(lambda (x,y): df.iloc[x:y], folds_idx)
    xtrain,ytrain = zip(*map(lambda x: (x[:-1][lcols].values, x[:-1][resp_str].values), folds))
    xtest,ytest = zip(*map(lambda x: (x[-1:][lcols].values, x[-1:][resp_str].values), folds))
    return xtrain,ytrain,xtest,ytest
    


In [134]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression as LR
from sklearn.decomposition import PCA
#train_test_split(xrange(df.shape[0]), train_size=0.7)

def mape(ypred, ytrue):
    """ returns the mean absolute percentage error """
    idx = ytrue != 0.0
    return 100*np.mean(np.abs(ypred[idx]-ytrue[idx])/ytrue[idx])

def run_classifier(df, mask, ewma_colresp,ewma_colfeats):
    dftouse = df.copy()

    ewma_feats = map(lambda x: 'EWMA_LOG_' + x , ewma_colfeats)
    STANDARDIZABLE = ['EWMA_LOG_' + ewma_colresp, 'EWMA_OPP_POS'] + ewma_feats
    for col in STANDARDIZABLE:
        print col
        valstrain=df[col].values[mask]
        valstest=df[col].values[~mask]
        scaler=StandardScaler().fit(valstrain)
        outtrain=scaler.transform(valstrain)
        outtest=scaler.fit_transform(valstest)
        out=np.empty(mask.shape[0])
        out[mask]=outtrain
        out[~mask]=outtest
        dftouse[col]=out

    lcols = STANDARDIZABLE + ["OKAY","GOOD","GREAT"]


    clf = LR()
    #cs=[.0001,.001,.01,.1,1,10]
    #n_estimators = [1,2,10,100,500,1000]
    #max_depth = [2,3,5,7,10]
    #pca = PCA(n_components=5)
    #feats = list(set(lcols) - set(['OKAY','GOOD','GREAT']))

    
    Xmatrix=dftouse[lcols].values#pca.fit_transform(np.array(dftouse[feats]))
    Yresp=dftouse[ewma_colresp + '_RESP'].values 
    Xmatrix_train=Xmatrix[mask]
    Xmatrix_test=Xmatrix[~mask]
    Yresp_train=Yresp[mask]
    Yresp_test=Yresp[~mask]

    #your code here
    # from sklearn.grid_search import GridSearchCV
    # #{'n_estimators':n_estimators,'max_depth':max_depth}
    # gs=GridSearchCV(clfsvm, param_grid={'C':cs}, cv=5)
    # gs.fit(Xmatrix_train, Yresp_train)
    # print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_

    # #calculate the accuracy here
    # best = gs.best_estimator_
    # best.fit(Xmatrix_train, Yresp_train)
    # best.score(Xmatrix_test, Yresp_test)
    return clf, Xmatrix_train, Yresp_train, Xmatrix_test, Yresp_test

In [None]:
#xtrain_cv, ytrain_cv, xtest_cv, ytest_cv = timeseries_cv(dftouse[mask],lcols,ewma_colresp,cv_folds)

In [4]:
def get_player_seasons(player_name, game_date,df,ewma_colresp, ewma_colfeat):
    player_df = df[df.PLAYER_NAME == player_name]
    player_df2 = pd.concat([player_df.reset_index(drop = True),player_df.groupby("SEASON_ID").apply(lambda x: np.log(pd.ewma(x[ewma_colresp], span = 3).shift(1) + 2.5).reset_index().drop('index',axis=1).rename(columns={ewma_colresp:'EWMA_LOG_' + ewma_colresp})).reset_index(drop = True)],axis = 1)
    for ewma_col in ewma_colfeat:
        player_df2['EWMA_LOG_' + ewma_col] = player_df2.groupby("SEASON_ID").apply(lambda x: np.log(pd.ewma(x[ewma_col], span = 3).shift(1) + 2.5).reset_index().drop('index',axis=1).rename(columns={ewma_col:'EWMA_LOG_' + ewma_col})).reset_index(drop = True)
    #1 if np.log(y[ewma_colresp] + 1) >= y['EWMA_LOG_' + ewma_colresp] else 0
    resp = player_df2.groupby('SEASON_ID').apply(lambda x: x.apply(lambda y: np.log(y[ewma_colresp] + 2.5), axis = 1).reset_index().drop('index',axis=1).rename(columns={0: ewma_colresp + '_RESP'})).reset_index(drop = True)
    player_df3 = pd.concat([player_df2,resp], axis = 1)
    player_df_final = player_df3.dropna()
    return player_df_final, np.array(player_df_final.GAME_DATE < game_date)

In [16]:
def filter_players_by_season_count(df,players):
    season_count = lambda x: len(set(df[df.PLAYER_NAME == x].SEASON_ID))
    sub_players = filter(lambda x: season_count(x) >= 2, players)
    return sub_players

def reduce_picks(player_name,game_date, df, ewma_colresp, ewma_colfeats):
    seasons = list(set(df[df.PLAYER_NAME == player_name].SEASON_ID))
    dftouse,mask = get_player_seasons(player_name, game_date, df, ewma_colresp, ewma_colfeats)
    clf,xtrain,ytrain,xtest,ytest = run_classifier(dftouse,mask,ewma_colresp, ewma_colfeats)
    clf.fit(xtrain,ytrain)
    print player_name
    print 'The error is %0.2f%%' % mape(clf.predict(xtest),ytest)
    dfreturn = dftouse[~mask].copy()
    dfreturn['PRED' + ewma_colresp] = clf.predict(xtest)
    return dfreturn

def min_season(df,players):
    season = sorted(map(lambda x: df[df.PLAYER_NAME == x].SEASON_ID.min(),players))[0]
    return season

def make_ewma_pos_df(df, game_date):
    game_day_df = df[(df.GAME_DATE == game_date)]
    sub_df = df[(df.GAME_DATE <= game_date)]
    potential_players = list(set(game_day_df.PLAYER_NAME))
    players = filter_players_by_season_count(sub_df[['PLAYER_NAME','SEASON_ID']],potential_players)
    lower_bound = min_season(sub_df[['PLAYER_NAME','SEASON_ID']],players)
    sub_df2 = sub_df[sub_df.SEASON_ID >= lower_bound]
    ewma_pos = sub_df2.groupby(["OPP",'SEASON_ID',"POSITION","GAME_DATE"]).apply(lambda x: x.FANTASY_PTS.sum())

    ewma_pos_df_temp = (ewma_pos.reset_index().rename(columns={0:'TOT_OPP_POS'})
                                .sort_values('GAME_DATE')
                                .groupby(["OPP",'SEASON_ID',"POSITION"])
                                .apply(lambda x: 
                                    pd.DataFrame(zip(x.GAME_DATE,[-5 if np.isinf(y) else y for y in np.log(pd.ewma(x.TOT_OPP_POS, span = 3).shift(1) + 2.5)]), 
                                    index = range(x.shape[0])))
                                .rename(columns={0:'GAME_DATE',1:'EWMA_OPP_POS'})
                                .reset_index(level = [0,1,2]))
    merge_on = ['OPP','GAME_DATE','POSITION','SEASON_ID']
    ewma_pos_df = pd.merge(sub_df2,ewma_pos_df_temp,left_on=merge_on, right_on=merge_on)
    league_avg_df = (ewma_pos_df.groupby(["SEASON_ID",'POSITION'])
                     .apply(lambda x: x['EWMA_OPP_POS'].mean())
                     .reset_index()
                     .rename(columns={0:'LEAGUE_AVG_POS'}))
    nan_dict = dict(reduce(lambda x,y: x + y.items(),[{(k1,k2):v} for k1,k2,v in league_avg_df.to_records(index = False)], []))
    nan_rows = pd.isnull(ewma_pos_df['EWMA_OPP_POS'])
    ewma_pos_df.loc[nan_rows,'EWMA_OPP_POS'] = ewma_pos_df[nan_rows].apply(lambda x: nan_dict[x.SEASON_ID - 1,x.POSITION] if x.SEASON_ID > lower_bound else float('nan'), axis = 1)
    return ewma_pos_df, players

def classify_players_ondate(df,players, game_date,ewma_colresp, ewma_colfeats):
    store_df = []
    for player in players:
        print player
        store_df.append(reduce_picks(player,game_date, df, ewma_colresp, ewma_colfeats))
    return pd.concat(store_df, axis = 0)

In [8]:
def make_player_pool(df,game_date,ewma_colresp, ewma_colfeats):
    ewma_pos_df, players = make_ewma_pos_df(df, game_date)
    return classify_players_ondate(ewma_pos_df, players,game_date,ewma_colresp,ewma_colfeats)

In [None]:
def CM_HEIGHT(x):
    MELO_HT = x['HEIGHT'] * 4.5
    return 'MELO_HT',MELO_HT

def CM_WEIGHT(x):
    MELO_WT = x['WEIGHT'] * 2.0
    return 'MELO_WT',MELO_WT

def CM_CAREER_MINUTES(x):
    MELO_CAREER_MIN = x['SEASON_MIN'] * 2.5
    return 'MELO_CAREER_MIN', MELO_CAREER_MIN

def CM_AGE(x):
    MELO_AGE = x['AGE']
    return 'MELO_AGE', MELO_AGE

def CM_MIN_PER(x):
    MIN_PER = x['MIN'] * 4.5
    return 'MELO_MIN_PER',MIN_PER

def CM_MIN_TOT(x):
    MIN_TOT = (x['MIN'] * x['GP']) * 7
    return 'MELO_MIN_TOT', MIN_TOT

def CM_TRUE_PER(x):
    TRUE_PER = x['TS_PCT'] * 6
    return 'MELO_TRUE_PER',TRUE_PER

def CM_USG_PER(x):
    USG_PER = x['USG_PCT'] * 6
    return 'MELO_USG_PER',USG_PER

def CM_AST_PER(x):
    AST_PER = x['AST_PCT'] * 5
    return 'MELO_AST_PCT', AST_PER

def CM_TO_PER(x):
    TO_PER= x['TM_TOV_PCT'] * 2.5
    return 'MELO_TO_PCT', TO_PER

def CM_REB_PER(x):
    REB_PER = x['REB_PCT'] * 5
    return 'MELO_REB_PCT', REB_PER

def CM_OFF_PM(x):
    OFF_PM= x['OFF_RATING'] * 3
    return 'OFF_PM', OFF_PM

def CM_DF_PM(x):
    DEF_PM= x['DEF_RATING'] * 3
    return 'DEF_PM', DEF_PM

def CM_3FEQ(x):
    MELO_3FEQ = x['3PT_FEQ'] * 3.5
    return 'MELO_3FEQ',MELO_3FEQ

def CM_FT_PER(x):
    MELO_FT_PER = x['FT_PER'] * 3.5
    return 'MELO_FT_PER',MELO_FT_PER

def weight_prop(cat_str, weight_dict):
    tot = sum(weight_dict.values())
    prop = weight_dict[cat_str] / tot
    return prop

def make_melo_sim(fab_std,cat_str):
    fab_std_player_idx = fab_std.set_index('PLAYER_NAME')
    fab_comp = pd.DataFrame(index=fab_std_player_idx.index.tolist(), columns=fab_std_player_idx.index.tolist())
    prop = weight_prop(cat_str, weights)
    melo_category = (fab_comp.apply(lambda x: fab_comp.columns,axis = 1)
             .apply(lambda x: fab_std_player_idx.loc[x.name][cat_str] - fab_std_player_idx.loc[x][cat_str], axis = 1)
             .applymap(lambda x: x**2 * prop))
    return melo_category

def fab_melo(player, comboMELO):
    root = comboMELO[comboMELO.PLAYER_NAME == player].sort_values('PLAYER_NAME')
    calc_melo_funcs = [CM_WEIGHT, CM_HEIGHT, CM_MIN_PER,CM_CAREER_MINUTES, CM_3FEQ, CM_MIN_TOT, CM_TRUE_PER, CM_USG_PER, CM_AST_PER, CM_TO_PER, CM_REB_PER, CM_OFF_PM, CM_DF_PM,CM_FT_PER,CM_AGE]
    result = root.groupby('SEASON_ID').apply(lambda x: pd.DataFrame(dict([('SEASON_ID',x.SEASON_ID),('PLAYER_NAME',x.PLAYER_NAME)] + map(lambda y: y(x),calc_melo_funcs))))
    return result

def zscore(col):
    return (col - col.mean())/col.std(ddof=0)
    
store_df = []
melo_advanced_df = pd.read_csv("./usage_stats/comboMELO.csv") 
players = set(season_subset(df85_15,1996,2015)['PLAYER_NAME'])
for player in players:
    store_df.append(fab_melo(player,melo_advanced_df))
FAB_MELO = pd.concat(store_df,axis = 0)
melo_cols = ["MELO_MIN_PER", "MELO_MIN_TOT", "DEF_PM","OFF_PM", "MELO_AST_PCT", "MELO_REB_PCT", "MELO_TO_PCT","MELO_USG_PER", "MELO_TRUE_PER","MELO_3FEQ","MELO_FT_PER","MELO_CAREER_MIN","MELO_WT","MELO_HT"]
weights = dict(zip(melo_cols,[4.5,7.0,3.0,3.0,5.0,5.0,2.5,6.0,6.0,3.5,3.5,2.5,2,4.5]))
FAB_MELO[melo_cols] = FAB_MELO[melo_cols].apply(zscore, axis =0)
get_top_ten(FAB_MELO[FAB_MELO.AGE == 26],weights,"Danny Green")

In [None]:
store_df2 = []
for i in range(6):
    store_df2.append(pd.read_csv('./DKSalaries/DKSalaries' + str(i) + '.csv'))
salary_df = pd.concat(store_df,axis = 0)
opt_players = list(set(salary_df.Name))
sampled_salary = salary_df.groupby("Name").apply(lambda x: x.sample(n=1)).reset_index(drop = True)
salary_dict = dict(zip(sampled_salary.Name, sampled_salary.Salary))
salary_dict

In [None]:
gafantasypts = singleday['FANTASY_PTS'].values
gaforwards = singleday['REAL_POSITION'].map(lambda x: 1 if x == 'Forward' else 0).values
gaguards = singleday['REAL_POSITION'].map(lambda x: 1 if x == 'Guard' else 0).values
gacenters = singleday['REAL_POSITION'].map(lambda x: 1 if x == 'Center' else 0).values
#gautil = np.ones(len(gacenters))
gasalaries = singleday['PLAYER_NAME'].map(lambda x: salary_dict[x]).values

In [None]:
small_data = zip(gasalaries, gafantasypts, gaforwards, gaguards, gacenters)#,gautil)

In [None]:
from pyeasyga import pyeasyga

ga = pyeasyga.GeneticAlgorithm(small_data)        # initialise the GA with data
ga.population_size = 200000
#ga.mutation_probability = 0.05
#ga.generations = 25

# define a fitness function
def fitness(individual, data):
    salaries, points, forwards, guards, centers = 0, 0, 0, 0, 0
    for (selected, item) in zip(individual, data):
        if selected:
            salaries += item[0]
            points += item[1]
            forwards += item[2]
            guards += item[3]
            centers += item[4]
    if salaries > 50000 or ((forwards > 4) and (guards > 4)) or ((guards > 4) and (centers > 2)) or ((forwards > 4) and (centers > 2)) or ((centers + gaurds + forwards) > 8):
        points = 0
    return points

ga.fitness_function = fitness               # set the GA's fitness function
ga.run()                                    # run the GA
print ga.best_individual()                  # print the GA's best solution

In [None]:
_,mask = ga.best_individual()
mask = np.array(mask) == 1
singleday[mask]

In [2]:
def prepareTimeSeriesCV(X_train, y_train, number_folds):
    k = int(np.floor(float(X_train.shape[0]) / number_folds))
    
    accuracies = np.zeros(number_folds-1)
    X_trainFolds, y_trainFolds, X_testFolds, y_testFolds = [], [], [], []
    
    for i in range(2, number_folds + 1):
        split = float(i-1)/i
        
        X = X_train[:(k*i)]
        y = y_train[:(k*i)]
        
        index = int(np.floor(X.shape[0] * split))
        
        X_trainFolds.append(X[:index])      
        y_trainFolds.append(y[:index])
        
        X_testFolds.append(X[(index + 1):])
        y_testFolds.append(y[(index + 1):])

    return X_trainFolds, y_trainFolds, X_testFolds, y_testFolds

In [None]:
Xmatrix_train=Xmatrix[mask]
Yresp_train=Yresp[mask]
X_train = Xmatrix_train
y_train = Yresp_train

In [73]:
folds = map(lambda x: test_df.iloc[236*x:236*(x+1)], range(test_df.shape[0] / 4))

In [125]:
xtrain,ytrain,xtest,ytest = timeseries_cv(test_df,"FANTASY_PTS_RESP",7)

In [128]:
xtest[0]

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,SEASON_MIN,GAMES_PLAYED,OPP,i_HOME,AGE,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,WEIGHT,HEIGHT,ELO,EXP_WINS,OPP_ELO,FORECAST,SHIT,OKAY,GOOD,GREAT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,FANTASY_ZSCORE,i_ZSCORE_OVER,POSITION,EWMA_OPP_POS,EWMA_LOG_FANTASY_PTS,EWMA_LOG_PTS,FANTASY_PTS_RESP
134,21997,977,Kobe Bryant,46780,1280,VAN,1,37.32,LAL,Los Angeles Lakers,29701106,1998-04-08,LAL vs. VAN,1,212,78,1692.6766,60.223038,1245.7842,0.958836,1,0,0,0,26,4,10,0.4,1,5,0.2,5,6,0.833,0,1,1,4,0,2,5,3,14,7,23.25,-1.30411,0,Forward-Guard,3.136437,3.825832,3.202615,3.401197


In [143]:
test_df = df[mask].sort_values('GAME_DATE').reset_index(drop = True)

In [137]:
import itertools
import operator
from sklearn.svm import LinearSVC, SVR
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#GridSearchCV, authored by David DiCiurcio
def davidsearchcv(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds, parameters, classifier):    
    templist = []
    paramlist = []
    outputs = {}
    clflist = []
    
    for i in parameters:
        templist.append(parameters[i])
    zlist = list(itertools.product(*templist))

    for i in zlist:
        paramlist.append(dict(zip(parameters.keys(),i)))

    counter = 0
    for i in paramlist:
        stringexec = ''
        for k in i:
            stringexec = stringexec+k+"="+str(i[k])+","
        exec "clf = "+classifier+"("+stringexec[:-1]+")"
        averageaccuracy = []
        for j in range(0,len(X_trainFolds)):
            clf.fit(X_trainFolds[j], y_trainFolds[j])
            averageaccuracy.append(clf.score(X_testFolds[j], y_testFolds[j]))
            clflist.append(clf)
        outputs[counter] = np.mean(averageaccuracy)
        counter = counter + 1
    accmaxindex = max(outputs.iteritems(), key=operator.itemgetter(1))[0]
    return clflist, paramlist, accmaxindex, outputs[accmaxindex]

In [139]:
x = [0.01, 0.1, 1., 10., 100.]
y = [500, 1000, 2000, 4000]
z = [5, 10, 20, 50]
parameters1={'C':x, 'max_iter':y}
parameters={'n_estimators':z}
#clflist, vala, valb, valc = davidsearchcv(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds, parameters,'ExtraTreesClassifier')

In [150]:
test_df.head()

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,SEASON_MIN,GAMES_PLAYED,OPP,i_HOME,AGE,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,WEIGHT,HEIGHT,ELO,EXP_WINS,OPP_ELO,FORECAST,SHIT,OKAY,GOOD,GREAT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,FANTASY_ZSCORE,i_ZSCORE_OVER,POSITION,EWMA_OPP_POS,EWMA_LOG_FANTASY_PTS,EWMA_LOG_PTS,FANTASY_PTS_RESP
0,21996,977,Kobe Bryant,46780,1280,MIN,1,37.32,LAL,Los Angeles Lakers,29600027,1996-11-03,LAL vs. MIN,1,212,78,1603.3704,52.646591,1396.9335,0.853709,1,0,0,0,6,0,1,0.0,0,0,0.0,0,0,0.0,0,1,1,0,0,1,1,1,0,-8,2.75,-2.710627,0,Forward-Guard,3.50778,2.718491,2.498472,2.484907
1,21996,977,Kobe Bryant,46780,1280,NYK,0,37.32,LAL,Los Angeles Lakers,29600031,1996-11-05,LAL @ NYK,1,212,78,1605.1874,53.564236,1567.5536,0.4112,0,1,0,0,3,0,1,0.0,0,0,0.0,1,2,0.5,0,0,0,0,0,0,1,0,1,-8,0.5,-2.865,0,Forward-Guard,3.50778,2.608475,2.119985,1.909543
2,21996,977,Kobe Bryant,46780,1280,CHH,0,37.32,LAL,Los Angeles Lakers,29600044,1996-11-06,LAL @ CHH,0,212,78,1614.7716,52.422737,1475.9551,0.55563,0,1,0,0,7,2,3,0.667,1,2,0.5,0,0,0.0,0,0,0,0,0,0,3,0,5,0,4.0,-2.624863,0,Forward-Guard,3.50778,2.318827,1.777527,1.658228
3,21996,977,Kobe Bryant,46780,1280,TOR,0,37.32,LAL,Los Angeles Lakers,29600057,1996-11-08,LAL @ TOR,0,212,78,1602.8699,51.741318,1320.639,0.740584,1,0,0,0,17,3,8,0.375,2,3,0.667,2,4,0.5,0,3,3,0,0,1,0,3,10,2,16.75,-1.750079,0,Forward-Guard,3.496508,2.042093,1.650112,2.818398
4,21996,977,Kobe Bryant,46780,1280,ATL,1,37.32,LAL,Los Angeles Lakers,29600072,1996-11-10,LAL vs. ATL,1,212,78,1595.8615,52.247692,1579.1018,0.661978,0,1,0,0,8,0,3,0.0,0,0,0.0,2,2,1.0,1,2,3,1,0,0,0,0,2,-4,7.25,-2.401879,0,Forward-Guard,2.902206,2.503769,2.061005,3.341093


In [153]:
#How to perform cross-validation and davidsearchcv
X_trainFolds, y_trainFolds, X_testFolds, y_testFolds = timeseries_cv(test_df,['EWMA_LOG_PTS','EWMA_OPP_POS'],'FANTASY_PTS',7)
clflist, vala, valb, valc = davidsearchcv(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds, parameters,'ExtraTreesClassifier')
#clflist1, vala1, valb1, valc1 = davidsearchcv(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds, parameters1,'LinearSVC')

ValueError: continuous is not supported

In [152]:
test_df.iloc[0:100]['EWMA_LOG_PTS'].values

array([ 2.49847242,  2.11998511,  1.77752731,  1.65011194,  2.0610054 ,
        2.36808947,  1.88531901,  1.9522088 ,  1.56278866,  1.81418228,
        1.46279868,  1.68806681,  2.67100634,  2.80198089,  2.60184688,
        2.52527362,  2.1397316 ,  1.87158344,  2.70800281,  2.46382299,
        2.74886082,  2.53070911,  2.60499854,  2.86309207,  2.56554928,
        2.32765819,  1.99836032,  2.24479444,  1.78661921,  2.22295933,
        1.92677238,  1.54407872,  1.27866108,  2.08516434,  1.66259127,
        2.0651497 ,  2.41530995,  2.7629562 ,  2.84335642,  2.6971008 ,
        2.53910498,  2.02606235,  1.61781906,  2.05040412,  2.09623972,
        2.69582648,  2.92632218,  2.71333223,  2.46722797,  2.68408942,
        2.71286725,  3.03429003,  2.8415962 ,  2.76137317,  2.36657129,
        2.20611697,  3.23867845,  2.55204595,  2.22075507,  2.94268305,
        2.78162039,  2.93731256,  3.02998153,  3.16166205,  3.04689591,
        2.93240514,  3.0025604 ,  2.66145421,  2.59588407,  2.80

In [None]:
#define list of parameters and range of parameters
searchlist = [(parameters,'ExtraTreesClassifier'),(parameters1,'LinearSVC')]
VCclfLst = []
for i in searchlist:
    clflist,_,accmaxindex,_ = davidsearchcv(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds,i[0],i[1])
    VCclfLst.append((i[1],clflist[accmaxindex]))

In [None]:
VCclflst = []
VCclflst.append(('ETC',clflist[valb]))
VCclflst.append(('LSVC',clflist1[valb1]))

In [None]:
from sklearn.ensemble import VotingClassifier

# clflist in form of ('lr', clf1)
# weightlist in form of [1, 2, 4]
# voting in form of 'soft' or 'hard'
def runVotingClassifier(Xtrain,ytrain,Xtest,ytest,clflist,weights,voting):
    vcaverage = []
    for i in range(0,len(Xtrain)):
        eclf = VotingClassifier(estimators=clflist,voting=voting,weights=weights)
        eclf.fit(Xtrain[i], ytrain[i])
        vcaverage.append(eclf.score(Xtest[i], ytest[i]))
    return np.mean(vcaverage)

In [None]:
runVotingClassifier(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds,VCclfLst,[1,1],'hard')
np.ones(len(searchlist))

In [None]:
def ClassifierComp(Xtrain, ytrain, kfolds, searchlst,weights,voting):
    X_trainFolds, y_trainFolds, X_testFolds, y_testFolds = prepareTimeSeriesCV(Xtrain, ytrain, kfolds)
    VCclfLst = []
    for i in searchlist:
        clflist,_,accmaxindex,_ = davidsearchcv(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds,i[0],i[1])
        VCclfLst.append((i[1],clflist[accmaxindex]))
    return runVotingClassifier(X_trainFolds, y_trainFolds, X_testFolds, y_testFolds,VCclfLst,weights,voting)

In [None]:
ClassifierComp(X_train, y_train, 10, searchlist,[1,1],'hard')

In [None]:
ewma_colfeats =['PTS','AST']
df,mask = get_player_seasons("Kobe Bryant",2005,2006,ewma_pos_df,ewma_colresp,ewma_colfeats)

Xmatrix=df[lcols].values
Yresp=df[ewma_colresp + '_RESP'].values 
Xmatrix_train=Xmatrix[mask]
Yresp_train=Yresp[mask]
X_train = Xmatrix_train
y_train = Yresp_train