# Background reading

### Resources

#### Predicting Results for Professional Basketball Using NBA API Data (2016)
http://cs229.stanford.edu/proj2016/report/PerriconeShawSwiechowicz-PredictingResultsforProfessionalBasketballUsingNBAAPIData.pdf

-  
- 

#### Predicting NBA games using neural networks (2009) 
http://www.perducosports.com/media/NBA_Article.pdf

### Ideas

# Load some data and take a look

In [1]:
%load_ext autoreload
%autoreload 2

In [8]:
import numpy as np
import pandas as pd
from nba_predict import *
from pandas.io.json import json_normalize
import json
import ast

In [3]:
# Read the raw game records from disk and wrap them in a dataframe
with open('F:/Documents/_DataScience/nba_data/test.json', 'r') as f:
    data = json.load(f)

raw_df = pd.DataFrame(data)

In [4]:
# Show which fields the raw records came with
raw_df.columns

Index(['box_score', 'day', 'home_city', 'home_line', 'home_name', 'summary',
       'vis_city', 'vis_line', 'vis_name'],
      dtype='object')

In [49]:
def only_dict(d):
    '''
    Turn the text form of a dictionary into a real python dict.

    The text is evaluated with ast.literal_eval, so it has to be written
    as a python literal; whatever that literal evaluates to is returned.
    '''
    parsed = ast.literal_eval(d)
    return parsed

def list_of_dicts(ld):
    '''
    Build a lookup table from the text form of a list of dictionaries.

    The text is parsed with ast.literal_eval. Every dictionary in the
    parsed list contributes one entry: its second value is used as the
    key and its first value as the mapped value. When two dictionaries
    share a key, the later one wins.
    '''
    mapping = {}
    for entry in ast.literal_eval(ld):
        values = list(entry.values())
        mapping[values[1]] = values[0]
    return mapping

def extract_json_column(df, col, prefix):
    '''
    Expand a column of dictionary records into a dataframe of its own.

    Each entry of df[col] is cast to text and parsed back with only_dict,
    the resulting records are flattened with json_normalize, and every
    column of the flattened frame gets `prefix` put in front of its name.
    '''
    as_text = df[col].astype("str")
    records = as_text.apply(only_dict).tolist()
    flattened = json_normalize(records)
    return flattened.add_prefix(prefix)

def fix_column_names(x):
    '''
    Normalise a column name from the file so it is easier to read:
    dashes become underscores, everything is lower-cased, and every
    'team_' fragment is dropped.
    '''
    return x.replace('-', '_').lower().replace('team_', '')

def extract_box_scores(df):
    '''
    Assemble one row per game from the raw records.

    The 'home_line' and 'vis_line' columns are expanded with
    extract_json_column (prefixed 'home_' and 'away_'), joined to each
    other and to the 'day' column on the row index, renamed through
    fix_column_names and stripped of the two city columns. 'day' is
    parsed as a date written like month_day_year, and the returned
    frame lists 'day', 'home_name' and 'away_name' first.
    '''
    home_stats = extract_json_column(df, 'home_line', 'home_')
    away_stats = extract_json_column(df, 'vis_line', 'away_')

    box = (
        home_stats
        .merge(away_stats, left_index=True, right_index=True)
        .merge(df[['day']], left_index=True, right_index=True)
        .rename(fix_column_names, axis='columns')
        .drop(['home_city', 'away_city'], axis=1)
    )

    # Identifying columns lead, everything else keeps its current order
    leading = ['day', 'home_name', 'away_name']
    remaining = list(box.columns)
    for name in leading:
        remaining.remove(name)
    ordered = leading + remaining

    box['day'] = pd.to_datetime(box['day'], format='%m_%d_%y')

    return box[ordered]

def get_season(df, first_season_end=2006, last_season_end=2018):
    '''
    Label every game with the season it was played in.

    A season runs from October 1st of one year (inclusive) up to
    October 1st of the following year (exclusive), so a game played
    exactly on October 1st belongs to the season that starts that day.

    Parameters
    ----------
    df : DataFrame
        Must have a datetime 'day' column. It is not modified.
    first_season_end, last_season_end : int
        Calendar years in which the earliest and the latest covered
        seasons end. The defaults cover 2005-2006 through 2017-2018.

    Returns
    -------
    DataFrame
        A copy of df with three extra columns:
        'season' (text such as '2016-2017'), 'season_max_date' and
        'season_min_date' (the exclusive end and inclusive start of that
        season). Games outside every covered season keep NaN / NaT.
    '''
    df2 = df.copy()
    days = df2['day']

    # Build the new columns separately so that empty frames and frames
    # without any matching game still end up with all three columns.
    season = pd.Series(float('nan'), index=df2.index, dtype=object)
    season_max_date = pd.Series(pd.NaT, index=df2.index, dtype='datetime64[ns]')
    season_min_date = pd.Series(pd.NaT, index=df2.index, dtype='datetime64[ns]')

    for year in range(first_season_end, last_season_end + 1):
        min_date = pd.to_datetime('{}-10-01'.format(year - 1))
        max_date = pd.to_datetime('{}-10-01'.format(year))

        # Half-open interval: seasons neither overlap nor leave a gap
        in_season = ((min_date <= days) & (days < max_date)).values

        season[in_season] = '{}-{}'.format(year - 1, year)
        season_max_date[in_season] = max_date
        season_min_date[in_season] = min_date

    df2['season'] = season
    df2['season_max_date'] = season_max_date
    df2['season_min_date'] = season_min_date
    return df2

# Per-game table, newest games first, with every game tagged by season
scores = get_season(
    extract_box_scores(raw_df).sort_values(by=['day'], ascending=False)
)

# scores2 = get_season_to_date_stats(scores)

# Unfinished sketch: stats over each team's most recent n games
# def get_last_n_to_date_stats(df, n):
#     for team in scores.home_name.unique():
#         # get home games
#         df[home_name == team]
#         # get away games


#         df[df.tail(3).mean() > df.mean()]

# Peek at the first 20 Wizards home games in the sorted table
scores.loc[
    scores.home_name == 'Wizards',
    ['season', 'day', 'season_max_date', 'home_wins', 'home_pts', 'away_pts']
].head(20)

Unnamed: 0,season,day,season_max_date,home_wins,home_pts,away_pts
1312,2016-2017,2017-03-25,2017-10-01,47,115,127
967,2016-2017,2017-03-22,2017-10-01,43,104,100
878,2016-2017,2017-03-08,2017-10-01,29,113,123
1111,2016-2017,2017-03-07,2017-10-01,21,127,131
541,2016-2017,2017-03-01,2017-10-01,36,106,114
415,2016-2017,2017-01-19,2017-10-01,19,110,113
515,2016-2017,2017-01-19,2017-10-01,19,110,113
52,2016-2017,2016-12-06,2017-10-01,7,116,124
119,2016-2017,2016-11-30,2017-10-01,12,126,115
883,2016-2017,2016-11-16,2017-10-01,2,109,102


In [47]:
# List the distinct column labels of the assembled scores table
scores.columns.unique()
# scores.home_name.unique()

Index(['day', 'home_name', 'away_name', 'home_ast', 'home_fg3_pct',
       'home_fg_pct', 'home_ft_pct', 'home_losses', 'home_pts',
       'home_pts_qtr1', 'home_pts_qtr2', 'home_pts_qtr3', 'home_pts_qtr4',
       'home_reb', 'home_tov', 'home_wins', 'away_ast', 'away_fg3_pct',
       'away_fg_pct', 'away_ft_pct', 'away_losses', 'away_pts',
       'away_pts_qtr1', 'away_pts_qtr2', 'away_pts_qtr3', 'away_pts_qtr4',
       'away_reb', 'away_tov', 'away_wins', 'season', 'season_max_date',
       'season_min_date'],
      dtype='object')

In [17]:
# Summarise the game dates to see how many games there are and what period they span
scores.day.describe()

count                    1635
unique                   1016
top       2017-02-08 00:00:00
freq                        6
first     2006-11-03 00:00:00
last      2017-03-26 00:00:00
Name: day, dtype: object