In [1]:
# Libraries
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import random
import urllib
import urllib2
import re
import scipy.stats as stats
import unicodedata
import datetime
import requests
import requests_cache
requests_cache.install_cache('demo_cache')
%matplotlib inline

## Helper functions

In [21]:
def get_reg_season(df, yr):
    """Select the rows of ``df`` treated as regular-season games for ``yr``.

    A row is kept when either of the following holds:

    * its ``datetime`` value is strictly before April 1 of ``yr``, or
    * its ``g_num`` value is greater than 7.

    Parameters
    ----------
    df : DataFrame with a ``datetime`` column comparable to ``datetime.date``
        and a numeric ``g_num`` column.
    yr : int, calendar year used to build the April 1 cutoff.

    Returns
    -------
    DataFrame containing only the matching rows (original index preserved).
    """
    # NOTE(review): presumably g_num restarts at 1 in the playoffs, which is
    # why "late date AND small g_num" marks a playoff game -- confirm.
    cutoff = datetime.date(yr, 4, 1)
    dated_before_cutoff = df['datetime'] < cutoff
    past_seventh_game = df['g_num'] > 7
    return df[dated_before_cutoff | past_seventh_game]

In [22]:
def get_playoffs(df, yr):
    """Select the rows of ``df`` treated as playoff games for ``yr``.

    A row is kept when both of the following hold:

    * its ``datetime`` value is on or after April 1 of ``yr``, and
    * its ``g_num`` value is at most 7.

    This is the logical complement of the condition used by
    ``get_reg_season`` (``datetime`` before April 1 OR ``g_num`` above 7),
    so a row with non-missing values falls into exactly one of the two
    subsets.

    Parameters
    ----------
    df : DataFrame with a ``datetime`` column comparable to ``datetime.date``
        and a numeric ``g_num`` column.
    yr : int, calendar year used to build the April 1 cutoff.

    Returns
    -------
    DataFrame containing only the matching rows (original index preserved).
    """
    # NOTE(review): presumably g_num restarts at 1 in the playoffs -- confirm.
    cutoff = datetime.date(yr, 4, 1)
    # `>=` rather than `>`: get_reg_season rejects a row dated exactly on the
    # cutoff with g_num <= 7, so it has to be accepted here or it would be
    # silently dropped from both subsets.
    on_or_after_cutoff = df['datetime'] >= cutoff
    within_seven_games = df['g_num'] <= 7
    return df[on_or_after_cutoff & within_seven_games]

In [28]:
def timestamp_to_datetime(timestamp):
    """Build a plain ``datetime.date`` from ``timestamp``.

    Only the ``year``, ``month`` and ``day`` attributes of ``timestamp`` are
    read; any time-of-day information it carries is discarded.

    Returns
    -------
    datetime.date for the same calendar day.
    """
    year, month, day = timestamp.year, timestamp.month, timestamp.day
    return datetime.date(year, month, day)

In [29]:
def fix_dates(df):
    """Return a re-indexed copy of ``df`` with an added ``datetime`` column.

    ``reset_index()`` produces a new frame (the previous index is moved into
    a column), so the frame passed in is not modified.  Each value of the
    ``date`` column is then converted with ``timestamp_to_datetime`` and the
    results are stored, in row order, in a new ``datetime`` column.

    Parameters
    ----------
    df : DataFrame with a ``date`` column whose values expose ``year``,
        ``month`` and ``day`` attributes.

    Returns
    -------
    The new DataFrame including the ``datetime`` column.
    """
    reindexed = df.reset_index()
    reindexed['datetime'] = [
        timestamp_to_datetime(stamp) for stamp in reindexed['date']
    ]
    return reindexed

In [52]:
# Read the 2003 per-game Gini spreadsheet and immediately attach the
# plain-date `datetime` column that the season filters compare against.
game_df = fix_dates(
    pd.read_excel('data/game_ginis/game_gini_2003_mp2.xlsx')
)

In [53]:
# Preview the first five rows to check the columns added by fix_dates.
game_df.head(n=5)

Unnamed: 0,index,date,g_id,g_num,gini_10,gini_2,gini_3,gini_4,gini_5,gini_6,gini_7,gini_8,gini_9,opp,team,season,datetime
0,0,2003-01-20,200301200NYK,39,0.571551,0.413203,0.415822,0.426961,0.411157,0.408177,0.419977,0.462596,0.517995,MIA,NYK,2003,2003-01-20
1,1,2003-01-17,200301170DEN,39,0.548925,0.286367,0.248595,0.217609,0.317703,0.390237,0.442998,0.486486,0.515693,CLE,DEN,2003,2003-01-17
2,2,2003-04-05,200304050CHI,77,0.616048,0.236509,0.184107,0.278255,0.302861,0.392473,0.455795,0.519566,0.56888,MIL,CHI,2003,2003-04-05
3,3,2003-03-11,200303110NYK,63,0.493934,0.044519,0.12018,0.1593,0.233726,0.316799,0.369994,0.439561,0.493934,NYK,MEM,2003,2003-03-11
4,4,2003-01-08,200301080UTA,34,0.504254,0.184042,0.246517,0.322625,0.356151,0.371337,0.393906,0.415897,0.46337,PHO,UTA,2003,2003-01-08


In [55]:
# Regular-season subset of the 2003 game table.
reg_df = get_reg_season(df=game_df, yr=2003)

In [60]:
# Ad-hoc check of the regular-season style filter, written out inline.
# NOTE(review): the cutoff here is March 1, 2003, whereas get_reg_season
# uses April 1 -- confirm the difference is intentional.
game_df.loc[
    (game_df['datetime'] < datetime.date(2003, 3, 1))
    | (game_df['g_num'] > 7)
]

Unnamed: 0,index,date,g_id,g_num,gini_10,gini_2,gini_3,gini_4,gini_5,gini_6,gini_7,gini_8,gini_9,opp,team,season,datetime
0,0,2003-01-20,200301200NYK,39,0.571551,0.413203,0.415822,0.426961,0.411157,0.408177,0.419977,0.462596,0.517995,MIA,NYK,2003,2003-01-20
1,1,2003-01-17,200301170DEN,39,0.548925,0.286367,0.248595,0.217609,0.317703,0.390237,0.442998,0.486486,0.515693,CLE,DEN,2003,2003-01-17
2,2,2003-04-05,200304050CHI,77,0.616048,0.236509,0.184107,0.278255,0.302861,0.392473,0.455795,0.519566,0.568880,MIL,CHI,2003,2003-04-05
3,3,2003-03-11,200303110NYK,63,0.493934,0.044519,0.120180,0.159300,0.233726,0.316799,0.369994,0.439561,0.493934,NYK,MEM,2003,2003-03-11
4,4,2003-01-08,200301080UTA,34,0.504254,0.184042,0.246517,0.322625,0.356151,0.371337,0.393906,0.415897,0.463370,PHO,UTA,2003,2003-01-08
5,5,2002-12-20,200212200DET,25,0.509844,0.064183,0.085588,0.094555,0.102592,0.236860,0.324275,0.396995,0.448575,CLE,DET,2003,2002-12-20
6,6,2002-11-15,200211150TOR,8,0.480534,0.128259,0.096723,0.085149,0.099393,0.218206,0.323158,0.406325,0.480534,DEN,TOR,2003,2002-11-15
7,7,2003-03-05,200303050TOR,61,0.482657,0.011284,0.055341,0.142980,0.185435,0.219642,0.310209,0.408750,0.482657,TOR,HOU,2003,2003-03-05
8,8,2002-11-29,200211290UTA,17,0.499389,0.273386,0.322175,0.320348,0.308041,0.305775,0.323172,0.372863,0.438703,UTA,MIN,2003,2002-11-29
9,9,2002-11-18,200211180NJN,10,0.282987,0.317014,0.247150,0.205212,0.214549,0.221397,0.221494,0.253166,0.270747,NJN,DEN,2003,2002-11-18


In [51]:
# Number of rows in the regular-season subset.
reg_df.shape[0]

2378

In [44]:
# Number of distinct game ids in the full game table.
np.unique(game_df['g_id']).size

1189

In [45]:
# Load the 2003 win/loss results spreadsheet.
wins_df = pd.read_excel(io='data/win_data/win_data_2003.xlsx')

In [19]:
# Preview the first five rows of the win data.
wins_df.head(n=5)

Unnamed: 0,date,g_id,score1,score2,team1,team2,winner,season
0,2002-10-29,200210290LAL,87,82,SAS,LAL,SAS,2003
0,2002-10-29,200210290ORL,88,95,PHI,ORL,ORL,2003
0,2002-10-29,200210290SAC,67,94,CLE,SAC,SAC,2003
0,2002-10-30,200210300BOS,99,96,CHI,BOS,CHI,2003
0,2002-10-30,200210300DET,77,86,NYK,DET,DET,2003


In [16]:
# Load the season-level results spreadsheet.
s_res = pd.read_excel(io='data/nba_results.xlsx')

In [17]:
# Preview the first five rows of the season results.
s_res.head(n=5)

Unnamed: 0,Team,W,L,WL_pct,PW,PL,PS/G,PA/G,yr,abbr,playoffs
0,Boston Celtics,53,29,0.646,48,34,108.0,105.4,2017,BOS,3
1,Cleveland Cavaliers,51,31,0.622,49,33,110.3,107.2,2017,CLE,4
2,Toronto Raptors,51,31,0.622,52,30,106.9,102.6,2017,TOR,2
3,Washington Wizards,49,33,0.598,46,36,109.2,107.4,2017,WAS,2
4,Atlanta Hawks,43,39,0.524,39,43,103.2,104.0,2017,ATL,1
