## Hypothesis Testing

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
# package with hypothesis tests
import scipy.stats as st

### Data

You can download the data from [**here**](https://drive.google.com/file/d/19b9lHlkixZhs8yka8zV0QFieao66dUcY/view?usp=sharing). The data contains results of NBA games from seasons 2013/2014 to 2015/2016.

In [2]:
NBA_df = pd.read_csv('nba_games_2013_2015.csv', sep=';')

In [3]:
NBA_df.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22015,1610612750,MIN,Minnesota Timberwolves,21501226,2016-04-13,MIN vs. NOP,W,240,144,56,86,0.651,13,28,0.464,19,23,0.826,5,38,43,41,14,8,13,20,35.0
1,22015,1610612749,MIL,Milwaukee Bucks,21501225,2016-04-13,MIL vs. IND,L,240,92,37,87,0.425,7,23,0.304,11,13,0.846,7,36,43,23,8,3,15,15,-5.0
2,22015,1610612738,BOS,Boston Celtics,21501217,2016-04-13,BOS vs. MIA,W,240,98,37,86,0.43,5,14,0.357,19,22,0.864,10,29,39,20,7,3,7,20,10.0
3,22015,1610612747,LAL,Los Angeles Lakers,21501228,2016-04-13,LAL vs. UTA,W,239,101,41,85,0.482,6,25,0.24,13,15,0.867,8,39,47,19,6,3,13,17,5.0
4,22015,1610612739,CLE,Cleveland Cavaliers,21501220,2016-04-13,CLE vs. DET,L,265,110,46,97,0.474,7,18,0.389,11,15,0.733,8,35,43,21,4,7,10,23,-2.0


In [4]:
NBA_df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [5]:
NBA_df['SEASON_ID'].value_counts()

22013    2460
22015    2460
22014    2460
Name: SEASON_ID, dtype: int64

### Task
Split the data into **3** separate files according to the season!

In [6]:
NBA_grouped = NBA_df.groupby(NBA_df.SEASON_ID)

In [7]:
season_1 = NBA_grouped.get_group(22013) 

In [8]:
season_2 = NBA_grouped.get_group(22014)

In [9]:
season_3 = NBA_grouped.get_group(22015)

### Task
Test the hypothesis that offensive productions of Cleveland Cavaliers and Golden State Warriors (teams that met in the finals that year) were distributed equally in 2015/2016.

Offensive production consists of two variables: PTS (Points) and FG_PCT (Field Goal Percentage). We need to do two separate hypothesis tests, one for each variable.

In [10]:
season_3.columns
# looking at columns PTS & FG_PCT

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [12]:
cavaliers = season_3['TEAM_NAME'] == 'Cleveland Cavaliers' 
warriors = season_3['TEAM_NAME'] == 'Golden State Warriors'
alpha = 0.05

In [13]:
test_PTS = st.ttest_ind(season_3[cavaliers]['PTS'], season_3[warriors]['PTS'])
if test_PTS.pvalue > alpha:
    print("do not reject null hypothesis")
else:
    print("reject null hypothesis")
test_PTS.pvalue

reject null hypothesis


1.4233420547764935e-08

In [14]:
test_FG_PCT = st.ttest_ind(season_3[cavaliers]['FG_PCT'], season_3[warriors]['FG_PCT'])
if test_FG_PCT.pvalue > alpha:
    print("do not reject null hypothesis")
else:
    print("reject null hypothesis")
test_FG_PCT.pvalue

reject null hypothesis


0.00206097581047554

### Task
Test the hypothesis that the number of points (PTS) scored by Cleveland Cavaliers changed significantly after the coach change in the season 2015/2016. **Coach Blatt was fired on 24th of Jan, 2016**. Use the data from seasons 2014/2015 and 2015/2016 - those are years when Cleveland was coached by Blatt. 

We have two possible solutions here:
- take the same amount of games from before and after and try t-test.
- take all the games from before and after and look for the right test to compare two samples with different sizes

In [15]:
# get all games in season 3 after blatt was fired
# get an equal amounts of game season 2
# season_3[after_fire_clev].count()
alpha = 0.05

In [16]:
after_fire_cav = (season_3['TEAM_NAME'] == 'Cleveland Cavaliers') & (season_3['GAME_DATE'] > '2016-01-24')
# theres 40 games
cav_s2 = (season_2['TEAM_NAME'] == 'Cleveland Cavaliers')
before_fire_cav = season_2[cav_s2][:40]

In [17]:
test_PTS_blatt = st.ttest_ind(season_3[after_fire_cav]['PTS'], before_fire_cav['PTS'])
if test_PTS_blatt.pvalue > alpha:
    print("do not reject null hypothesis")   # chance vocab
else:
    print("reject null hypothesis")
test_PTS_blatt

do not reject null hypothesis


Ttest_indResult(statistic=0.9642175437586037, pvalue=0.3379167999000421)

### Task
Download [**the similar dataset**](https://drive.google.com/file/d/1jY57bAOZp9y83b4W2PAoSH1uFARaxxls/view?usp=sharing) with scores from playoff games in 2016.

In [19]:
NBA_2016 = pd.read_csv('nba_playoff_games_2016.csv', sep=';')

In [20]:
NBA_2016.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42015,1610612739,CLE,Cleveland Cavaliers,41500407,2016-06-19,CLE @ GSW,W,241,93,33,82,0.402,6,25,0.24,21,25,0.84,9,39,48,17,7,6,11,15,4.0
1,42015,1610612744,GSW,Golden State Warriors,41500407,2016-06-19,GSW vs. CLE,L,239,89,32,83,0.386,15,41,0.366,10,13,0.769,7,32,39,22,7,5,10,23,-4.0
2,42015,1610612744,GSW,Golden State Warriors,41500406,2016-06-16,GSW @ CLE,L,238,101,33,82,0.402,15,39,0.385,20,29,0.69,9,26,35,19,5,3,14,25,-14.0
3,42015,1610612739,CLE,Cleveland Cavaliers,41500406,2016-06-16,CLE vs. GSW,W,240,115,40,77,0.519,10,27,0.37,25,32,0.781,8,37,45,24,12,7,10,25,14.0
4,42015,1610612739,CLE,Cleveland Cavaliers,41500405,2016-06-13,CLE @ GSW,W,241,112,44,83,0.53,10,24,0.417,14,23,0.609,8,33,41,15,11,9,16,22,15.0


### Task
Test the hypothesis that number of blocks (BLK) are from the same distribution in both, in playoffs and in the reguar season 2015/2016 for **Toronto Raptors**. We need to work with two samples with different sizes again.

In [21]:
playoff_raptors = NBA_2016['TEAM_NAME'] == 'Toronto Raptors'
# there's 20 playoff Raptors games
raptors = season_3['TEAM_NAME'] == 'Toronto Raptors'
season_3[raptors][0:20].head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
9,22015,1610612761,TOR,Toronto Raptors,21501218,2016-04-13,TOR @ BKN,W,241,103,41,93,0.441,13,37,0.351,8,15,0.533,16,41,57,24,5,10,9,12,7.0
28,22015,1610612761,TOR,Toronto Raptors,21501214,2016-04-12,TOR vs. PHI,W,239,122,48,95,0.505,10,25,0.4,16,19,0.842,16,36,52,25,11,5,13,20,24.0
71,22015,1610612761,TOR,Toronto Raptors,21501202,2016-04-10,TOR @ NYK,W,240,93,37,78,0.474,7,20,0.35,12,19,0.632,7,31,38,19,4,6,6,19,4.0
96,22015,1610612761,TOR,Toronto Raptors,21501183,2016-04-08,TOR vs. IND,W,241,111,37,68,0.544,10,22,0.455,27,38,0.711,5,31,36,21,4,5,16,19,13.0
110,22015,1610612761,TOR,Toronto Raptors,21501173,2016-04-07,TOR @ ATL,L,240,87,32,84,0.381,8,21,0.381,15,20,0.75,8,38,46,16,10,4,11,16,-8.0


In [208]:
alpha = 0.05

In [22]:
test_BLK = st.ttest_ind(NBA_2016[playoff_raptors]['BLK'], season_3[raptors][0:20]['BLK'])
if test_BLK.pvalue > alpha:
    print("do not reject null hypothesis")
else:
    print("reject null hypothesis")
test_BLK.pvalue    

reject null hypothesis


0.00013696208843433274

### Task
Test the hypothesis that the number of points (PTS) scored by Cleveland Cavaliers is equally distributed for all 3 seasons. In this case, we need a hypothesis test that compares more than 2 distributions at the same.

In [23]:
cavaliers1 = season_1['TEAM_NAME'] == 'Cleveland Cavaliers'

In [24]:
cavaliers2 = season_2['TEAM_NAME'] == 'Cleveland Cavaliers'

In [25]:
cavaliers3 = season_3['TEAM_NAME'] == 'Cleveland Cavaliers'

In [27]:
anova_cavaliers = st.f_oneway(season_1[cavaliers1]['PTS'], season_2[cavaliers2]['PTS'], season_3[cavaliers]['PTS'])
if anova_cavaliers.pvalue > alpha:
    print("do not rejecr null hypothesis null hypothesis")
else:
    print("reject null hypothesis")
anova_cavaliers.pvalue

reject null hypothesis


0.003087727119983984

#### Follow Up
Between which seasons can we see the significant difference?
+ unfortunatelly, this is not the output of the ANOVA test and further tests need to be run.
+ note that Lebron James came back to Cleveland prior to season 2014/2015. We can use this fact to interpret the results correctly.

In [230]:
# need post hoc analysis? haven't seen how to do that yet

In [237]:
# from our anova we know that the number of points (PTS) for cavaliers across the 3 seasons are not equally distributed
# however, we don't know which distributions differ

In [28]:
test1 = st.ttest_ind(season_2[cavaliers2]['PTS'], season_3[cavaliers3]['PTS'])
if test1.pvalue > alpha:
    print("do not reject null hypothesis")
else:
    print("reject null hypothese")
test1.pvalue

do not reject null hypothesis


0.5203507617734474

In [232]:
# since LeBron James came back in season 2, from test1 looks like season 2 & 3 PTS are equally distributed, so likely that
# their distribution differ from season 1 PTS