In [173]:
import pandas as pd
import numpy as np
import os
import datetime 
# package with hypothesis tests
import scipy.stats as st

## Data

You can download the data from [**here**](https://drive.google.com/file/d/19b9lHlkixZhs8yka8zV0QFieao66dUcY/view?usp=sharing). The file contains the results of NBA games from seasons 2013 to 2015.

In [174]:
# Load the regular-season game log; the file is semicolon-separated.
# Expected size noted by the author: 7380 rows.
data = pd.read_csv(
    "nba_games_2013_2015.csv",
    sep=';',
)
data.head(n=3)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22015,1610612750,MIN,Minnesota Timberwolves,21501226,2016-04-13,MIN vs. NOP,W,240,144,...,0.826,5,38,43,41,14,8,13,20,35.0
1,22015,1610612749,MIL,Milwaukee Bucks,21501225,2016-04-13,MIL vs. IND,L,240,92,...,0.846,7,36,43,23,8,3,15,15,-5.0
2,22015,1610612738,BOS,Boston Celtics,21501217,2016-04-13,BOS vs. MIA,W,240,98,...,0.864,10,29,39,20,7,3,7,20,10.0


In [185]:
# Parse the GAME_DATE strings into real timestamps.
# pd.to_datetime is used instead of .astype('datetime64') because pandas >= 2.0
# refuses to cast to a unit-less 'datetime64' dtype and raises TypeError.
data['date'] = pd.to_datetime(data['GAME_DATE'])

# Show the row labelled 0 as a sanity check (single .loc lookup, no chained indexing).
data.loc[0, 'date']

Timestamp('2016-04-13 00:00:00')

### Split the data into 3 separate files according to the season!

In [None]:
# One DataFrame per season, selected by SEASON_ID.
data_2013 = data.loc[data['SEASON_ID'].eq(22013)]
data_2014 = data.loc[data['SEASON_ID'].eq(22014)]
data_2015 = data.loc[data['SEASON_ID'].eq(22015)]

### Test the hypothesis that the offensive production of Cleveland Cavaliers and Golden State Warriors (teams from the finals) was equally distributed in 2015/2016

Do two separate tests for PTS and FG_PCT

In [43]:
# Season 2014/2015 (rows taken from data_2014)
CLE_2014_2015 = data_2014.loc[data_2014['TEAM_ABBREVIATION'].eq('CLE')]
GSW_2014_2015 = data_2014.loc[data_2014['TEAM_ABBREVIATION'].eq('GSW')]

# Season 2015/2016 (rows taken from data_2015)
CLE_2015_2016 = data_2015.loc[data_2015['TEAM_ABBREVIATION'].eq('CLE')]
GSW_2015_2016 = data_2015.loc[data_2015['TEAM_ABBREVIATION'].eq('GSW')]

In [143]:
# Two-sample t-tests on points per game, CLE vs. GSW, season 2015/2016.

# Student's t-test: both samples are assumed to share one variance
result = st.ttest_ind(
    CLE_2015_2016['PTS'],
    GSW_2015_2016['PTS'],
    equal_var=True,
)

# Welch's t-test: no equal-variance assumption
result2 = st.ttest_ind(
    CLE_2015_2016['PTS'],
    GSW_2015_2016['PTS'],
    equal_var=False,
)

for outcome in (result, result2):
    print(outcome)

Ttest_indResult(statistic=-5.973920595717936, pvalue=1.4233420547764935e-08)
Ttest_indResult(statistic=-5.973920595717936, pvalue=1.430265283752963e-08)


### Test whether the production (PTS) of Cleveland changed significantly after the coaching change in 2015/2016 - Use only data from seasons 2014/2015 and 2015/2016 - those are the seasons in which Cleveland was coached by Blatt
#### Coach Blatt was fired on 24th of Jan, 2016

We have two possible solutions here:
- take the same amount of games from before and after and try t-test.
- take all the games from before and after and look for the right test to compare two samples with different sizes

In [147]:
# Cutoff dates encoded as YYYYMMDD integers so they compare directly with the DATE column.
COACH_CHANGE_DATE = 20160124  # coaching-change date as given in the task statement
BLATT_ERA_START = 20140101    # lower bound kept from the original filter

# All Cleveland games from the two seasons under study.
coach_clev = pd.concat([CLE_2014_2015, CLE_2015_2016])

# "YYYY-MM-DD" -> YYYYMMDD integer, which sorts chronologically.
# regex=False makes the literal replacement explicit instead of relying on the
# version-dependent default of str.replace.
coach_clev['DATE'] = coach_clev['GAME_DATE'].str.replace('-', '', regex=False).astype(int)

# games not coached by Blatt (40 rows)
# The cutoff day itself is included here (>=) so that a game played on that date
# cannot silently fall into neither sample.
coach_after = coach_clev[coach_clev['DATE'] >= COACH_CHANGE_DATE]

# games coached by Blatt (124 rows)
coach_before = coach_clev[
    (coach_clev['DATE'] < COACH_CHANGE_DATE) & (coach_clev['DATE'] > BLATT_ERA_START)
]


In [168]:
# Shapiro-Wilk normality check on the two samples compared by the t-tests.
# The 40-game "before" sample is drawn in this cell (fixed seed) so the cell also
# runs on a fresh kernel; previously the name was only defined further down and a
# top-to-bottom run stopped here with a NameError.
coach_before_40 = coach_before.sample(n=40, random_state=99)

print(st.shapiro(coach_before_40['PTS']))
print(st.shapiro(coach_after['PTS']))

ShapiroResult(statistic=0.9810476303100586, pvalue=0.7283441424369812)
ShapiroResult(statistic=0.9679392576217651, pvalue=0.30906569957733154)


In [170]:
# Draw 40 "before" games (seeded) so both samples have the same size
coach_before_40 = coach_before.sample(n=40, random_state=99)

# Paired (dependent-samples) t-test, st.ttest_rel.
# NOTE(review): the two samples are matched only by position after random sampling,
# not by any natural pairing of games - confirm a paired test is really intended.
pooled = st.ttest_rel(
    coach_before_40['PTS'],
    coach_after['PTS'],
    nan_policy='omit',
)

# Independent-samples t-test with a common (pooled) variance, equal sample sizes
equal_var = st.ttest_ind(
    coach_before_40['PTS'],
    coach_after['PTS'],
    equal_var=True,
    nan_policy='omit',
)

# Welch's t-test on all "before" games: unequal variances and unequal sample sizes
non_equal_var = st.ttest_ind(
    coach_before['PTS'],
    coach_after['PTS'],
    equal_var=False,
    nan_policy='omit',
)

for outcome in (pooled, equal_var, non_equal_var):
    print(outcome)

# Rank-based Kruskal-Wallis test on the full samples
print(st.kruskal(coach_before['PTS'], coach_after['PTS']))

Ttest_relResult(statistic=-1.612235944130793, pvalue=0.11497489793264457)
Ttest_indResult(statistic=-1.640443513044006, pvalue=0.10493985629983593)
Ttest_indResult(statistic=-3.020359501985649, pvalue=0.003290500194598496)
KruskalResult(statistic=7.961718139705661, pvalue=0.004777703982670637)


Download the same dataset for playoffs games in 2016 from [**here**](https://drive.google.com/file/d/1jY57bAOZp9y83b4W2PAoSH1uFARaxxls/view?usp=sharing)

### Select Toronto Raptors and test the hypothesis that the number of blocks in the playoffs comes from the same distribution as in the regular season 2015/2016
we again have two samples with different sizes

In [149]:
# Load the 2016 playoff game log (semicolon-separated, like the regular-season file)
data2 = pd.read_csv(
    "nba_playoff_games_2016.csv",
    sep=';',
)

In [152]:
# Toronto blocks per game: playoff file (data2) vs. the data_2015 season frame
raptor_2016 = data2.loc[data2['TEAM_NAME'] == "Toronto Raptors", "BLK"]
raptor_2015 = data_2015.loc[data_2015['TEAM_NAME'] == 'Toronto Raptors', 'BLK']

In [156]:
# Welch's t-test: the two samples differ in size, so equal variances are not assumed
raptor = st.ttest_ind(
    raptor_2016,
    raptor_2015,
    nan_policy='omit',
    equal_var=False,
)
raptor

Ttest_indResult(statistic=-3.500438136870473, pvalue=0.0011403514552816168)

In [169]:
# Kruskal-Wallis H-test on the same two samples (rank-based, no normality assumption)
st.kruskal(
    raptor_2016,
    raptor_2015,
)

KruskalResult(statistic=7.483641456983013, pvalue=0.006226202691250736)

### Test the hypothesis that points per game are equally distributed in all 3 seasons for Cleveland
we need a hypothesis test to compare more than 2 distributions

In [161]:
# Cleveland points per game, one Series per season frame
CLE_2013 = data_2013.loc[data_2013['TEAM_ABBREVIATION'] == 'CLE', 'PTS']
CLE_2014 = data_2014.loc[data_2014['TEAM_ABBREVIATION'] == 'CLE', 'PTS']
CLE_2015 = data_2015.loc[data_2015['TEAM_ABBREVIATION'] == 'CLE', 'PTS']

In [162]:
# One-way ANOVA: compares the mean PTS of the three seasons in a single test
st.f_oneway(
    CLE_2013,
    CLE_2014,
    CLE_2015,
)

F_onewayResult(statistic=5.9200250318080885, pvalue=0.003087727119983984)

#### Between which seasons can we see a significant difference?
+ unfortunately, this is not part of the output of the ANOVA test, so further analysis needs to be applied in most cases
+ Note that Lebron James came back to Cleveland prior to season 2014/2015 (just for interpretation of the results)

In [163]:
# Pairwise follow-up: 2013 vs. 2014 (equal-variance t-test; p-value is not adjusted for multiple comparisons)
st.ttest_ind(CLE_2013, CLE_2014, equal_var=True)

Ttest_indResult(statistic=-2.508958204796911, pvalue=0.013091680534336523)

In [164]:
# Pairwise follow-up: 2014 vs. 2015 (equal-variance t-test; p-value is not adjusted for multiple comparisons)
st.ttest_ind(CLE_2014, CLE_2015, equal_var=True)

Ttest_indResult(statistic=-0.6442093460555935, pvalue=0.5203507617734474)

In [165]:
# Pairwise follow-up: 2013 vs. 2015 (equal-variance t-test; p-value is not adjusted for multiple comparisons)
st.ttest_ind(CLE_2013, CLE_2015, equal_var=True)

Ttest_indResult(statistic=-3.339057501969076, pvalue=0.001043164899206325)