<a href="https://colab.research.google.com/github/dimitramuni/NBA-Player-Profile-analysis/blob/main/Script/NBA_Hypothesis_Test_(T_test).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
import calendar as cal
import scipy.stats 

class NBA_HypothesisTtest:
    """Home-versus-away t-tests on one team's game log.

    The game log is a pandas DataFrame that must provide the following
    columns for the team under study:

        ['Date', 'Sep', 'Opp', 'Result',
         'TeamScore', 'OpponentScore', 'FG',
         'FGA', 'FG%', '3P', '3PA', '3P%',
         'FT', 'FTA', 'FT%', 'ORB', 'TRB',
         'AST', 'STL', 'BLK', 'TOV', 'PF']

    Reference:
    https://www.basketball-reference.com/teams/POR/1999/gamelog/
    """

    def __init__(self, df_gamelog):
        """Prepare the game log and locate the home and away games.

        Args:
            df_gamelog: game log with the columns listed in the class
                docstring. A copy is taken, so the caller's DataFrame is
                not modified.

        Attributes set:
            df_gamelog: the copied log with a datetime 'Date' column and
                the derived 'HomeAway' and 'DiffScore' columns.
            home_inds, away_inds: ``np.where`` results (1-tuples of
                positional row indices) for home and away games.
        """
        # Work on a copy so the derived columns do not leak into the
        # caller's frame.
        df_gamelog = df_gamelog.copy()

        # Rows whose 'Sep' value is '@' are labelled 'Away', all others
        # 'Home'.
        df_gamelog['HomeAway'] = np.where(
            df_gamelog['Sep'] == '@', 'Away', 'Home')

        df_gamelog['Date'] = pd.to_datetime(df_gamelog['Date'])

        # Score margin of the team against its opponent.
        df_gamelog['DiffScore'] = (
            df_gamelog['TeamScore'] - df_gamelog['OpponentScore'])

        self.df_gamelog = df_gamelog
        self.home_inds = np.where(df_gamelog['HomeAway'] == 'Home')
        self.away_inds = np.where(df_gamelog['HomeAway'] == 'Away')

    def ttest(self, x1, x2):
        """Return the absolute two-sample t statistic of x1 versus x2.

        The statistic is ``|mean(x1) - mean(x2)| /
        sqrt(var(x1)/n1 + var(x2)/n2)``, where the variances come from
        ``np.var`` with its default ``ddof`` (population variance).

        Args:
            x1, x2: sequences of numeric observations.

        Returns:
            The t statistic, or None when it is undefined: either sample
            is empty, or the denominator is not positive (which happens
            when both samples have zero variance).
        """
        n1 = len(x1)
        n2 = len(x2)
        if n1 == 0 or n2 == 0:
            return None

        std_err = np.sqrt((np.var(x1) / n1) + (np.var(x2) / n2))

        # `not (... > 0)` also catches NaN, not only zero.
        if not std_err > 0:
            return None

        return np.abs(np.mean(x1) - np.mean(x2)) / std_err

    def hypothesis_test(self, metric='DiffScore',
                        opponent_teams=('LAC', 'LAL',
                                        'PHO', 'SAS', 'GSW', 'DEN'),
                        significance_level=0.95,
                        no_games=10):
        """Test, per opponent, whether `metric` differs home versus away.

        For every opponent the first `no_games` home games and the same
        number of away games are compared with `ttest`. The null
        hypothesis is rejected when the statistic exceeds
        ``scipy.stats.t.ppf(significance_level, df=n1 + n2 - 2)``.
        The sample sizes and the final classification are printed;
        nothing is returned.

        NOTE(review): the statistic is an absolute value while the
        critical value is a one-sided quantile; this is kept as in the
        original so existing results do not change.

        Args:
            metric: column of the game log to compare.
            opponent_teams: iterable of opponent tags found in 'Opp'.
            significance_level: probability passed to the t quantile
                function (0.95 means the 95% quantile).
            no_games: maximum number of games used per venue; None
                means no cap.
        """
        # np.where on a 1-D condition yields a 1-tuple; take its array.
        home_games = self.df_gamelog.iloc[self.home_inds[0]]
        away_games = self.df_gamelog.iloc[self.away_inds[0]]

        # Team tags for which the null hypothesis is / is not rejected.
        h_reject = []
        h_dnreject = []

        for team in opponent_teams:

            print('\nTeam ', team)

            # Metric values of the games against this opponent.
            x_home = home_games.loc[home_games['Opp'] == team, metric]
            x_away = away_games.loc[away_games['Opp'] == team, metric]
            print(len(x_home))
            print(len(x_away))

            # Use the same number of games from each venue, capped at
            # no_games. A single min() keeps the two samples equal in
            # every case, including home < away with home > no_games.
            n_used = min(len(x_home), len(x_away))
            if no_games is not None:
                n_used = min(n_used, no_games)
            x_home = x_home.iloc[:n_used]
            x_away = x_away.iloc[:n_used]

            print(len(x_home))
            print(len(x_away))

            t_stat = self.ttest(x_home, x_away)

            # ttest returns None when the statistic is undefined (no
            # games, or no variance in either sample). There is then no
            # evidence against the null hypothesis.
            if t_stat is None:
                print('t statistic undefined for', team,
                      '- null hypothesis not rejected')
                h_dnreject.append(team)
                continue

            # Critical t value for the requested level and the degrees
            # of freedom of the two samples.
            n1 = len(x_home)
            n2 = len(x_away)
            t_val = scipy.stats.t.ppf(significance_level, df=n1 + n2 - 2)

            if t_stat > t_val:
                h_reject.append(team)
            else:
                h_dnreject.append(team)

        print('\nFor the metric', metric)
        print('Null Hypothesis is rejected ', h_reject)
        print('Null Hypothesis cannot be rejected', h_dnreject)

  

# Home-crowd advantage against a specific team using Student's t-test

* $H_0$: For Portland, playing at home (the Rose Garden) has no effect on winning the game against a given team.
* $H_1$: There is a statistically significant advantage to playing at home against a specific team.

In [None]:
# Load the game log, leaving out the matches from season 1998-99
# (the rows labelled 0 and 1).
df = pd.read_csv('/content/homeaway_updated.csv').drop(index=[0, 1])

# Build the tester and compare home vs. away score difference.
ob = NBA_HypothesisTtest(df)
ob.hypothesis_test(metric='DiffScore', significance_level=0.95)



Team  LAC
10
10
10
10

Team  LAL
10
10
10
10

Team  PHO
10
10
10
10

Team  SAS
10
10
10
10

Team  GSW
10
10
10
10

Team  DEN
10
10
10
10

For the metric DiffScore
Null Hypothesis is rejected  ['LAC', 'LAL']
Null Hypothesis cannot be rejected ['PHO', 'SAS', 'GSW', 'DEN']


# Hypothesis test: is playing at home or away against a certain team significant for FG%?

* $H_0$: For Portland, playing at home (the Rose Garden) has **no effect** on FG% against a certain team.
* $H_1$: There is a statistically significant difference in Portland's FG% against a certain team depending on whether the game is played at home or away.

In [None]:
# Repeat the home-vs-away test on the 'FG%' column, reusing the `ob`
# instance built in the earlier cell.
ob.hypothesis_test(metric='FG%',significance_level=0.95)
     


Team  LAC
10
10
10
10

Team  LAL
10
10
10
10

Team  PHO
10
10
10
10

Team  SAS
10
10
10
10

Team  GSW
10
10
10
10

Team  DEN
10
10
10
10

For the metric FG%
Null Hypothesis is rejected  ['LAC', 'DEN']
Null Hypothesis cannot be rejected ['LAL', 'PHO', 'SAS', 'GSW']


# Hypothesis test: is playing at home or away against a certain team significant for FT%?

* $H_0$: For Portland, playing at home (the Rose Garden) has **no effect** on FT% against a certain team.
* $H_1$: There is a statistically significant difference in Portland's FT% against a certain team depending on whether the game is played at home or away.

In [None]:
# Repeat the home-vs-away test on the 'FT%' column, reusing the `ob`
# instance built in the earlier cell.
ob.hypothesis_test(metric='FT%',significance_level=0.95)


Team  LAC
10
10
10
10

Team  LAL
10
10
10
10

Team  PHO
10
10
10
10

Team  SAS
10
10
10
10

Team  GSW
10
10
10
10

Team  DEN
10
10
10
10

For the metric FT%
Null Hypothesis is rejected  []
Null Hypothesis cannot be rejected ['LAC', 'LAL', 'PHO', 'SAS', 'GSW', 'DEN']


# Hypothesis test: is playing at home or away against a certain team significant for ORB?

* $H_0$: For Portland, playing at home (the Rose Garden) has **no effect** on ORB against a certain team.
* $H_1$: There is a statistically significant difference in Portland's ORB against a certain team depending on whether the game is played at home or away.

In [None]:
# Repeat the home-vs-away test on the 'ORB' column, reusing the `ob`
# instance built in the earlier cell.
ob.hypothesis_test(metric='ORB',significance_level=0.95)


Team  LAC
10
10
10
10

Team  LAL
10
10
10
10

Team  PHO
10
10
10
10

Team  SAS
10
10
10
10

Team  GSW
10
10
10
10

Team  DEN
10
10
10
10

For the metric ORB
Null Hypothesis is rejected  ['LAL', 'PHO', 'DEN']
Null Hypothesis cannot be rejected ['LAC', 'SAS', 'GSW']
